From 25f3d7effab632eb10d145f1a5aebf6515a04b98 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Feb 2018 22:05:38 -0800 Subject: rcu: Parallelize expedited grace-period initialization The latency of RCU expedited grace periods grows with increasing numbers of CPUs, eventually failing to be all that expedited. Much of the growth in latency is in the initialization phase, so this commit uses workqueues to carry out this initialization concurrently on a rcu_node-by-rcu_node basis. This change makes use of a new rcu_par_gp_wq because flushing a work item from another work item running from the same workqueue can result in deadlock. Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2a734692a581..23781fc90830 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4168,6 +4168,7 @@ static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp) } struct workqueue_struct *rcu_gp_wq; +struct workqueue_struct *rcu_par_gp_wq; void __init rcu_init(void) { @@ -4199,6 +4200,8 @@ void __init rcu_init(void) /* Create workqueue for expedited GPs and for Tree SRCU. */ rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); WARN_ON(!rcu_gp_wq); + rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); + WARN_ON(!rcu_par_gp_wq); } #include "tree_exp.h" -- cgit From cee4393989333795ae04dc9f3b83a578afe3fca6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 2 Mar 2018 16:35:27 -0800 Subject: rcu: Rename cond_resched_rcu_qs() to cond_resched_tasks_rcu_qs() Commit e31d28b6ab8f ("trace: Eliminate cond_resched_rcu_qs() in favor of cond_resched()") substituted cond_resched() for the earlier call to cond_resched_rcu_qs(). However, the new-age cond_resched() does not do anything to help RCU-tasks grace periods because (1) RCU-tasks is only enabled when CONFIG_PREEMPT=y and (2) cond_resched() is a complete no-op when preemption is enabled. This situation results in hangs when running the trace benchmarks. A number of potential fixes were discussed on LKML (https://lkml.kernel.org/r/20180224151240.0d63a059@vmware.local.home), including making cond_resched() not be a no-op; making cond_resched() not be a no-op, but only when running tracing benchmarks; reverting the aforementioned commit (which works because cond_resched_rcu_qs() does provide an RCU-tasks quiescent state; and adding a call to the scheduler/RCU rcu_note_voluntary_context_switch() function. All were deemed unsatisfactory, either due to added cond_resched() overhead or due to magic functions inviting cargo culting. This commit renames cond_resched_rcu_qs() to cond_resched_tasks_rcu_qs(), which provides a clear hint as to what this function is doing and why and where it should be used, and then replaces the call to cond_resched() with cond_resched_tasks_rcu_qs() in the trace benchmark's benchmark_event_kthread() function. Reported-by: Steven Rostedt Signed-off-by: Paul E. 
McKenney Tested-by: Nicholas Piggin --- include/linux/rcupdate.h | 4 ++-- kernel/rcu/rcuperf.c | 2 +- kernel/rcu/tree.c | 20 ++++++++++---------- kernel/rcu/tree_plugin.h | 4 ++-- kernel/rcu/update.c | 2 +- kernel/torture.c | 2 +- kernel/trace/trace_benchmark.c | 4 ++-- 7 files changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 36360d07f25b..19d235fefdb9 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -188,13 +188,13 @@ static inline void exit_tasks_rcu_finish(void) { } #endif /* #else #ifdef CONFIG_TASKS_RCU */ /** - * cond_resched_rcu_qs - Report potential quiescent states to RCU + * cond_resched_tasks_rcu_qs - Report potential quiescent states to RCU * * This macro resembles cond_resched(), except that it is defined to * report potential quiescent states to RCU-tasks even if the cond_resched() * machinery were to be shut off, as some advocate for PREEMPT kernels. */ -#define cond_resched_rcu_qs() \ +#define cond_resched_tasks_rcu_qs() \ do { \ if (!cond_resched()) \ rcu_note_voluntary_context_switch_lite(current); \ diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 777e7a6a0292..e232846516b3 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -369,7 +369,7 @@ static bool __maybe_unused torturing_tasks(void) */ static void rcu_perf_wait_shutdown(void) { - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); if (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters) return; while (!torture_must_stop()) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2a734692a581..c4db0e20b035 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1234,10 +1234,10 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) } /* - * Has this CPU encountered a cond_resched_rcu_qs() since the - * beginning of the grace period? For this to be the case, - * the CPU has to have noticed the current grace period. This - * might not be the case for nohz_full CPUs looping in the kernel. + * Has this CPU encountered a cond_resched() since the beginning + * of the grace period? For this to be the case, the CPU has to + * have noticed the current grace period. This might not be the + * case for nohz_full CPUs looping in the kernel. */ jtsq = jiffies_till_sched_qs; ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu); @@ -2049,7 +2049,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); raw_spin_unlock_irq_rcu_node(rnp); - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); } @@ -2151,7 +2151,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) sq = rcu_nocb_gp_get(rnp); raw_spin_unlock_irq_rcu_node(rnp); rcu_nocb_gp_cleanup(sq); - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); rcu_gp_slow(rsp, gp_cleanup_delay); } @@ -2202,7 +2202,7 @@ static int __noreturn rcu_gp_kthread(void *arg) /* Locking provides needed memory barrier. */ if (rcu_gp_init(rsp)) break; - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, @@ -2247,7 +2247,7 @@ static int __noreturn rcu_gp_kthread(void *arg) trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("fqsend")); - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); ret = 0; /* Force full wait till next FQS. 
*/ j = jiffies_till_next_fqs; @@ -2260,7 +2260,7 @@ static int __noreturn rcu_gp_kthread(void *arg) } } else { /* Deal with stray signal. */ - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, @@ -2782,7 +2782,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)) struct rcu_node *rnp; rcu_for_each_leaf_node(rsp, rnp) { - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); mask = 0; raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask == 0) { diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 12774c4fb546..dfc42c3d52d4 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1598,7 +1598,7 @@ static int rcu_oom_notify(struct notifier_block *self, for_each_online_cpu(cpu) { smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); } /* Unconditionally decrement: no need to wake ourselves up. */ @@ -2227,7 +2227,7 @@ static int rcu_nocb_kthread(void *arg) cl++; c++; local_bh_enable(); - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); list = next; } trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 68fa19a5e7bd..e401960c7f51 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -624,7 +624,7 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks); * grace period has elapsed, in other words after all currently * executing rcu-tasks read-side critical sections have elapsed. These * read-side critical sections are delimited by calls to schedule(), - * cond_resched_rcu_qs(), idle execution, userspace execution, calls + * cond_resched_tasks_rcu_qs(), idle execution, userspace execution, calls * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched(). * * This is a very specialized primitive, intended only for a few uses in diff --git a/kernel/torture.c b/kernel/torture.c index 37b94012a3f8..3de1efbecd6a 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -574,7 +574,7 @@ void stutter_wait(const char *title) { int spt; - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); spt = READ_ONCE(stutter_pause_test); for (; spt; spt = READ_ONCE(stutter_pause_test)) { if (spt == 1) { diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 22fee766081b..80e0b2aca703 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -159,13 +159,13 @@ static int benchmark_event_kthread(void *arg) * wants to run, schedule in, but if the CPU is idle, * we'll keep burning cycles. * - * Note the _rcu_qs() version of cond_resched() will + * Note the tasks_rcu_qs() version of cond_resched() will * notify synchronize_rcu_tasks() that this thread has * passed a quiescent state for rcu_tasks. Otherwise * this thread will never voluntarily schedule which would * block synchronize_rcu_tasks() indefinitely. */ - cond_resched(); + cond_resched_tasks_rcu_qs(); } return 0; -- cgit From 5b4c11d54b1b8d0714702006e00441ada59889a9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 13 Apr 2018 17:11:44 -0700 Subject: rcu: Add leaf-node macros This commit adds rcu_first_leaf_node() that returns a pointer to the first leaf rcu_node structure in the specified RCU flavor and an rcu_is_leaf_node() that returns true iff the specified rcu_node structure is a leaf. This commit also uses these macros where appropriate. Signed-off-by: Paul E. 
McKenney Tested-by: Nicholas Piggin --- kernel/rcu/rcu.h | 11 ++++++++--- kernel/rcu/tree.c | 4 ++-- kernel/rcu/tree_plugin.h | 4 ++-- 3 files changed, 12 insertions(+), 7 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 7a693e31184a..5b5bb9ee2e20 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -270,6 +270,12 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) } } +/* Returns first leaf rcu_node of the specified RCU flavor. */ +#define rcu_first_leaf_node(rsp) ((rsp)->level[rcu_num_lvls - 1]) + +/* Is this rcu_node a leaf? */ +#define rcu_is_leaf_node(rnp) ((rnp)->level == rcu_num_lvls - 1) + /* * Do a full breadth-first scan of the rcu_node structures for the * specified rcu_state structure. @@ -284,8 +290,7 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) * rcu_node tree with but one rcu_node structure, this loop is a no-op. */ #define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ - for ((rnp) = &(rsp)->node[0]; \ - (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++) + for ((rnp) = &(rsp)->node[0]; !rcu_is_leaf_node(rsp, rnp); (rnp)++) /* * Scan the leaves of the rcu_node hierarchy for the specified rcu_state @@ -294,7 +299,7 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) * It is still a leaf node, even if it is also the root node. */ #define rcu_for_each_leaf_node(rsp, rnp) \ - for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ + for ((rnp) = rcu_first_leaf_node(rsp); \ (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c4db0e20b035..b22d2e1ca5c0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2398,7 +2398,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, return; } WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */ - WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1 && + WARN_ON_ONCE(!rcu_is_leaf_node(rnp) && rcu_preempt_blocked_readers_cgp(rnp)); rnp->qsmask &= ~mask; trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, @@ -4056,7 +4056,7 @@ static void __init rcu_init_one(struct rcu_state *rsp) init_swait_queue_head(&rsp->gp_wq); init_swait_queue_head(&rsp->expedited_wq); - rnp = rsp->level[rcu_num_lvls - 1]; + rnp = rcu_first_leaf_node(rsp); for_each_possible_cpu(i) { while (i > rnp->grphi) rnp++; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index d37b9bb3f481..b999032e9466 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -182,7 +182,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) raw_lockdep_assert_held_rcu_node(rnp); WARN_ON_ONCE(rdp->mynode != rnp); - WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1); + WARN_ON_ONCE(!rcu_is_leaf_node(rnp)); /* * Decide where to queue the newly blocked task. In theory, @@ -533,7 +533,7 @@ void rcu_read_unlock_special(struct task_struct *t) rnp = t->rcu_blocked_node; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ WARN_ON_ONCE(rnp != t->rcu_blocked_node); - WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1); + WARN_ON_ONCE(!rcu_is_leaf_node(rnp)); empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = sync_rcu_preempt_exp_done(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ -- cgit From 9036c2ffd596261d2067fc2d693dc4f0d7a51214 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 10 Apr 2018 21:17:56 -0700 Subject: rcu: Improve non-root rcu_cbs_completed() accuracy When rcu_cbs_completed() is invoked on a non-root rcu_node structure, it unconditionally assumes that two grace periods must complete before the callbacks at hand can be invoked. This is overly conservative because if that non-root rcu_node structure believes that no grace period is in progress, and if the corresponding rcu_state structure's ->gpnum field has not yet been incremented, then these callbacks may safely be invoked after only one grace period has completed. This change is required to permit grace-period start requests to use funnel locking, which is in turn permitted to reduce root rcu_node ->lock contention, which has been observed by Nick Piggin. Furthermore, such contention will likely be increased by the merging of RCU-bh, RCU-preempt, and RCU-sched, so it makes sense to take steps to decrease it. This commit therefore improves the accuracy of rcu_cbs_completed() when invoked on a non-root rcu_node structure as described above. Reported-by: Nicholas Piggin Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2a734692a581..f5ca72f2ed43 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1641,6 +1641,21 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp, if (rcu_get_root(rsp) == rnp && rnp->gpnum == rnp->completed) return rnp->completed + 1; + /* + * If the current rcu_node structure believes that RCU is + * idle, and if the rcu_state structure does not yet reflect + * the start of a new grace period, then the next grace period + * will suffice. The memory barrier is needed to accurately + * sample the rsp->gpnum, and pairs with the second lock + * acquisition in rcu_gp_init(), which is augmented with + * smp_mb__after_unlock_lock() for this purpose. + */ + if (rnp->gpnum == rnp->completed) { + smp_mb(); /* See above block comment. */ + if (READ_ONCE(rsp->gpnum) == rnp->completed) + return rnp->completed + 1; + } + /* * Otherwise, wait for a possible partial grace period and * then the subsequent full grace period. -- cgit From 825a9911f6447299a69edacecc81fa2cdc5290a7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Apr 2018 15:54:32 -0700 Subject: rcu: Make rcu_start_future_gp()'s grace-period check more precise The rcu_start_future_gp() function uses a sloppy check for a grace period being in progress, which works today because there are a number of code sequences that resolve the resulting races. However, some of these race-resolution code sequences must acquire the root rcu_node structure's ->lock, and contention on that lock has started manifesting. This commit therefore makes rcu_start_future_gp() check more precise, eliminating the sloppy lockless check of the rcu_state structure's ->gpnum and ->completed fields. The effect is that rcu_start_future_gp() will sometimes unnecessarily attempt to start a new grace period, but this overhead will be reduced later using funnel locking. Reported-by: Nicholas Piggin Signed-off-by: Paul E. 
McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f5ca72f2ed43..4bbba17422cd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1705,20 +1705,12 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, } /* - * If either this rcu_node structure or the root rcu_node structure - * believe that a grace period is in progress, then we must wait - * for the one following, which is in "c". Because our request - * will be noticed at the end of the current grace period, we don't - * need to explicitly start one. We only do the lockless check - * of rnp_root's fields if the current rcu_node structure thinks - * there is no grace period in flight, and because we hold rnp->lock, - * the only possible change is when rnp_root's two fields are - * equal, in which case rnp_root->gpnum might be concurrently - * incremented. But that is OK, as it will just result in our - * doing some extra useless work. + * If this rcu_node structure believes that a grace period is in + * progress, then we must wait for the one following, which is in + * "c". Because our request will be noticed at the end of the + * current grace period, we don't need to explicitly start one. */ - if (rnp->gpnum != rnp->completed || - READ_ONCE(rnp_root->gpnum) != READ_ONCE(rnp_root->completed)) { + if (rnp->gpnum != rnp->completed) { rnp->need_future_gp[c & 0x1]++; trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); goto out; -- cgit From c91a8675b9cc697c725b6d97fcc7f157f4a989d0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 Apr 2018 11:11:39 -0700 Subject: rcu: Add accessor macros for the ->need_future_gp[] array Accessors for the ->need_future_gp[] array are currently open-coded, which makes them difficult to change. To improve maintainability, this commit adds need_future_gp_mask() to compute the indexing mask from the array size, need_future_gp_element() to access the element corresponding to the specified grace-period number, and need_any_future_gp() to determine if any future grace period is needed. This commit also applies need_future_gp_element() to existing open-coded single-element accesses. Signed-off-by: Paul E. 
McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 16 +++++++--------- kernel/rcu/tree.h | 15 +++++++++++++++ kernel/rcu/tree_plugin.h | 2 +- 3 files changed, 23 insertions(+), 10 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4bbba17422cd..79fb99951a0c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -718,11 +718,9 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) static int rcu_future_needs_gp(struct rcu_state *rsp) { struct rcu_node *rnp = rcu_get_root(rsp); - int idx = (READ_ONCE(rnp->completed) + 1) & 0x1; - int *fp = &rnp->need_future_gp[idx]; lockdep_assert_irqs_disabled(); - return READ_ONCE(*fp); + return READ_ONCE(need_future_gp_element(rnp, rnp->completed)); } /* @@ -1699,7 +1697,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, */ c = rcu_cbs_completed(rdp->rsp, rnp); trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf")); - if (rnp->need_future_gp[c & 0x1]) { + if (need_future_gp_element(rnp, c)) { trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf")); goto out; } @@ -1711,7 +1709,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, * current grace period, we don't need to explicitly start one. */ if (rnp->gpnum != rnp->completed) { - rnp->need_future_gp[c & 0x1]++; + need_future_gp_element(rnp, c)++; trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); goto out; } @@ -1737,13 +1735,13 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, * If the needed for the required grace period is already * recorded, trace and leave. */ - if (rnp_root->need_future_gp[c & 0x1]) { + if (need_future_gp_element(rnp_root, c)) { trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartedroot")); goto unlock_out; } /* Record the need for the future grace period. */ - rnp_root->need_future_gp[c & 0x1]++; + need_future_gp_element(rnp_root, c)++; /* If a grace period is not already in progress, start one. */ if (rnp_root->gpnum != rnp_root->completed) { @@ -1771,8 +1769,8 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) int needmore; struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - rnp->need_future_gp[c & 0x1] = 0; - needmore = rnp->need_future_gp[(c + 1) & 0x1]; + need_future_gp_element(rnp, c) = 0; + needmore = need_future_gp_element(rnp, c + 1); trace_rcu_future_gp(rnp, rdp, c, needmore ? TPS("CleanupMore") : TPS("Cleanup")); return needmore; diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index f491ab4f2e8e..18b091474ffa 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -159,6 +159,21 @@ struct rcu_node { wait_queue_head_t exp_wq[4]; } ____cacheline_internodealigned_in_smp; +/* Accessors for ->need_future_gp[] array. */ +#define need_future_gp_mask() \ + (ARRAY_SIZE(((struct rcu_node *)NULL)->need_future_gp) - 1) +#define need_future_gp_element(rnp, c) \ + ((rnp)->need_future_gp[(c) & need_future_gp_mask()]) +#define need_any_future_gp(rnp) \ +({ \ + int __i; \ + bool __nonzero = false; \ + \ + for (__i = 0; __i < ARRAY_SIZE((rnp)->need_future_gp); __i++) \ + __nonzero = __nonzero || (rnp)->need_future_gp[__i]; \ + __nonzero; \ +}) + /* * Bitmasks in an rcu_node cover the interval [grplo, grphi] of CPU IDs, and * are indexed relative to this interval rather than the global CPU ID space. 
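As a standalone illustration of the accessors added to tree.h above (an illustrative userspace sketch with made-up names, not kernel code and not part of this patch), the following shows how need_future_gp_element() maps an arbitrary grace-period number onto the small per-node array and how need_any_future_gp() scans all of its slots:

/*
 * Userspace sketch of the ->need_future_gp[] accessors.  The struct and
 * helper names here are illustrative stand-ins, not the real rcu_node
 * definitions.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_FUTURE_GP 4	/* Mirrors need_future_gp[4] in struct rcu_node. */

struct gp_ring {
	int need_future_gp[NR_FUTURE_GP];
};

/* Index mask derived from the array size, as need_future_gp_mask() does. */
static unsigned long gp_mask(void)
{
	return NR_FUTURE_GP - 1;
}

/* Slot for grace-period number "c", as need_future_gp_element() does. */
static int *gp_element(struct gp_ring *r, unsigned long c)
{
	return &r->need_future_gp[c & gp_mask()];
}

/* True if any future grace period is requested, as need_any_future_gp() does. */
static bool gp_any(struct gp_ring *r)
{
	int i;

	for (i = 0; i < NR_FUTURE_GP; i++)
		if (r->need_future_gp[i])
			return true;
	return false;
}

int main(void)
{
	struct gp_ring r = { { 0 } };

	*gp_element(&r, 1000) += 1;	/* Request grace period number 1000. */
	printf("GP 1000 lands in slot %lu, any future GP needed: %d\n",
	       1000 & gp_mask(), gp_any(&r));
	*gp_element(&r, 1000) = 0;	/* Grace period 1000 has completed. */
	printf("any future GP needed now: %d\n", gp_any(&r));
	return 0;
}

Because the array size is a power of two, deriving the mask from ARRAY_SIZE() lets any grace-period number select its slot with a single AND, which is what frees the later patches in this series from the hard-coded "& 0x1" indexing.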
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 84fbee4686d3..640ea927d8a4 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1790,7 +1790,7 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) */ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) { - rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq; + need_future_gp_element(rnp, rnp->completed + 1) += nrq; } static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) -- cgit From 5fe0a56298e674358ff2740a6288bf21509d895d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Apr 2018 16:41:20 -0700 Subject: rcu: Make rcu_gp_kthread() check for early-boot activity The rcu_gp_kthread() function immediately sleeps waiting to be notified of the need for a new grace period, which currently works because there are a number of code sequences that will provide the needed wakeup later. However, some of these code sequences need to acquire the root rcu_node structure's ->lock, and contention on that lock has started manifesting. This commit therefore makes rcu_gp_kthread() check for early-boot activity when it starts up, omitting the initial sleep in that case. Reported-by: Nicholas Piggin Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 79fb99951a0c..497f139056c7 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2192,6 +2192,12 @@ static int __noreturn rcu_gp_kthread(void *arg) struct rcu_state *rsp = arg; struct rcu_node *rnp = rcu_get_root(rsp); + /* Check for early-boot work. */ + raw_spin_lock_irq_rcu_node(rnp); + if (need_any_future_gp(rnp)) + WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT); + raw_spin_unlock_irq_rcu_node(rnp); + rcu_bind_gp_kthread(); for (;;) { -- cgit From fb31340f8a43a6f2e871164822ef4979b36232ae Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 12 Apr 2018 07:20:30 -0700 Subject: rcu: Make rcu_gp_cleanup() more accurately predict need for new GP Currently, rcu_gp_cleanup() scans the rcu_node tree in order to reset state to reflect the end of the grace period. It also checks to see whether a new grace period is needed, but in a number of cases, rather than directly cause the new grace period to be immediately started, it instead leaves the grace-period-needed state where various fail-safes can find it. This works fine, but results in higher contention on the root rcu_node structure's ->lock, which is undesirable, and contention on that lock has recently become noticeable. This commit therefore makes rcu_gp_cleanup() immediately start a new grace period if there is any need for one. It is quite possible that it will later be necessary to throttle the grace-period rate, but that can be dealt with when and if. Reported-by: Nicholas Piggin Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 16 ++++++++++------ kernel/rcu/tree.h | 1 - kernel/rcu/tree_plugin.h | 17 ----------------- 3 files changed, 10 insertions(+), 24 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 497f139056c7..afc5e32f0da4 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1763,14 +1763,14 @@ out: * Clean up any old requests for the just-ended grace period. Also return * whether any additional grace periods have been requested. 
*/ -static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +static bool rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { int c = rnp->completed; - int needmore; + bool needmore; struct rcu_data *rdp = this_cpu_ptr(rsp->rda); need_future_gp_element(rnp, c) = 0; - needmore = need_future_gp_element(rnp, c + 1); + needmore = need_any_future_gp(rnp); trace_rcu_future_gp(rnp, rdp, c, needmore ? TPS("CleanupMore") : TPS("Cleanup")); return needmore; @@ -2113,7 +2113,6 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) { unsigned long gp_duration; bool needgp = false; - int nocb = 0; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); struct swait_queue_head *sq; @@ -2152,7 +2151,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) if (rnp == rdp->mynode) needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; /* smp_mb() provided by prior unlock-lock pair. */ - nocb += rcu_future_gp_cleanup(rsp, rnp); + needgp = rcu_future_gp_cleanup(rsp, rnp) || needgp; sq = rcu_nocb_gp_get(rnp); raw_spin_unlock_irq_rcu_node(rnp); rcu_nocb_gp_cleanup(sq); @@ -2162,13 +2161,18 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) } rnp = rcu_get_root(rsp); raw_spin_lock_irq_rcu_node(rnp); /* Order GP before ->completed update. */ - rcu_nocb_gp_set(rnp, nocb); /* Declare grace period done. */ WRITE_ONCE(rsp->completed, rsp->gpnum); trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); rsp->gp_state = RCU_GP_IDLE; + /* Check for GP requests since above loop. */ rdp = this_cpu_ptr(rsp->rda); + if (need_any_future_gp(rnp)) { + trace_rcu_future_gp(rnp, rdp, rsp->completed - 1, + TPS("CleanupMore")); + needgp = true; + } /* Advance CBs to reduce false positives below. */ needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp; if (needgp || cpu_needs_another_gp(rsp, rdp)) { diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 18b091474ffa..bd1103763551 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -469,7 +469,6 @@ static void print_cpu_stall_info_end(void); static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu); -static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); static void rcu_init_one_nocb(struct rcu_node *rnp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 640ea927d8a4..313b77d9cf06 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1780,19 +1780,6 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) swake_up_all(sq); } -/* - * Set the root rcu_node structure's ->need_future_gp field - * based on the sum of those of all rcu_node structures. This does - * double-count the root rcu_node structure's requests, but this - * is necessary to handle the possibility of a rcu_nocb_kthread() - * having awakened during the time that the rcu_node structures - * were being updated for the end of the previous grace period. 
- */ -static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) -{ - need_future_gp_element(rnp, rnp->completed + 1) += nrq; -} - static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) { return &rnp->nocb_gp_wq[rnp->completed & 0x1]; @@ -2495,10 +2482,6 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) { } -static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) -{ -} - static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) { return NULL; -- cgit From 0ae94e00ce40e4447080ab7675220f725c690330 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 Apr 2018 14:14:42 -0700 Subject: rcu: Make rcu_future_needs_gp() check all ->need_future_gps[] elements Currently, the rcu_future_needs_gp() function checks only the current element of the ->need_future_gps[] array, which might miss elements that were offset from the expected element, for example, due to races with the start or the end of a grace period. This commit therefore makes rcu_future_needs_gp() use the need_any_future_gp() macro to check all of the elements of this array. Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 2 +- kernel/rcu/tree.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index afc5e32f0da4..b05ab6379562 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -720,7 +720,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); lockdep_assert_irqs_disabled(); - return READ_ONCE(need_future_gp_element(rnp, rnp->completed)); + return need_any_future_gp(rnp); } /* diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 952cd0c223fe..123c30eac8b5 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -169,7 +169,8 @@ struct rcu_node { bool __nonzero = false; \ \ for (__i = 0; __i < ARRAY_SIZE((rnp)->need_future_gp); __i++) \ - __nonzero = __nonzero || (rnp)->need_future_gp[__i]; \ + __nonzero = __nonzero || \ + READ_ONCE((rnp)->need_future_gp[__i]); \ __nonzero; \ }) -- cgit From 6f576e281690316270275bbef17c79ea304ad511 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 Apr 2018 16:50:31 -0700 Subject: rcu: Convert ->need_future_gp[] array to boolean There is no longer any need for ->need_future_gp[] to count the number of requests for future grace periods, so this commit converts the additions to assignments to "true" and reduces the size of each element to one byte. While we are in the area, fix an obsolete comment. Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 6 +++--- kernel/rcu/tree.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b05ab6379562..6ef1f2b4a6d3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1709,7 +1709,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, * current grace period, we don't need to explicitly start one. */ if (rnp->gpnum != rnp->completed) { - need_future_gp_element(rnp, c)++; + need_future_gp_element(rnp, c) = true; trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); goto out; } @@ -1741,7 +1741,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, } /* Record the need for the future grace period. */ - need_future_gp_element(rnp_root, c)++; + need_future_gp_element(rnp_root, c) = true; /* If a grace period is not already in progress, start one. 
*/ if (rnp_root->gpnum != rnp_root->completed) { @@ -1769,7 +1769,7 @@ static bool rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) bool needmore; struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - need_future_gp_element(rnp, c) = 0; + need_future_gp_element(rnp, c) = false; needmore = need_any_future_gp(rnp); trace_rcu_future_gp(rnp, rdp, c, needmore ? TPS("CleanupMore") : TPS("Cleanup")); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 123c30eac8b5..9f97fd7f648c 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -150,7 +150,7 @@ struct rcu_node { struct swait_queue_head nocb_gp_wq[2]; /* Place for rcu_nocb_kthread() to wait GP. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ - int need_future_gp[4]; /* Counts of upcoming no-CB GP requests. */ + u8 need_future_gp[4]; /* Counts of upcoming GP requests. */ raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; spinlock_t exp_lock ____cacheline_internodealigned_in_smp; -- cgit From ec4eaccef4af28376345554580606a43d7392ed8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 22 Apr 2018 08:49:24 -0700 Subject: rcu: Make rcu_migrate_callbacks wake GP kthread when needed The rcu_migrate_callbacks() function invokes rcu_advance_cbs() twice, ignoring the return value. This is OK at present because of failsafe code that does the wakeup when needed. However, this failsafe code acquires the root rcu_node structure's lock frequently, while rcu_migrate_callbacks() does so only once per CPU-offline operation. This commit therefore makes rcu_migrate_callbacks() wake up the RCU GP kthread when either call to rcu_advance_cbs() returns true, thus removing the need for the failsafe code. Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6ef1f2b4a6d3..f75eb5174021 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3876,6 +3876,7 @@ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) struct rcu_data *my_rdp; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); + bool needwake; if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist)) return; /* No callbacks to migrate. */ @@ -3887,12 +3888,15 @@ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) return; } raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ - rcu_advance_cbs(rsp, rnp_root, rdp); /* Leverage recent GPs. */ - rcu_advance_cbs(rsp, rnp_root, my_rdp); /* Assign GP to pending CBs. */ + /* Leverage recent GPs and set GP for new callbacks. */ + needwake = rcu_advance_cbs(rsp, rnp_root, rdp) || + rcu_advance_cbs(rsp, rnp_root, my_rdp); rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist); WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist)); raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags); + if (needwake) + rcu_gp_kthread_wake(rsp); WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || !rcu_segcblist_empty(&rdp->cblist), "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", -- cgit From a6058d85a2b24fa40ce7f0d7683989ec47b603b9 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 11 Apr 2018 14:33:18 -0700 Subject: rcu: Avoid __call_rcu_core() root rcu_node ->lock acquisition When __call_rcu_core() notices excessive numbers of callbacks pending on the current CPU, we know that at least one of them is not yet classified, namely the one that was just now queued. Therefore, it is not necessary to invoke rcu_start_gp() and thus not necessary to acquire the root rcu_node structure's ->lock. This commit therefore replaces the rcu_start_gp() with rcu_accelerate_cbs(), thus replacing an acquisition of the root rcu_node structure's ->lock with that of this CPU's leaf rcu_node structure. This decreases contention on the root rcu_node structure's ->lock. Reported-by: Nicholas Piggin Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f75eb5174021..6396a3d10be9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2988,11 +2988,11 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, /* Start a new grace period if one not already started. */ if (!rcu_gp_in_progress(rsp)) { - struct rcu_node *rnp_root = rcu_get_root(rsp); + struct rcu_node *rnp = rdp->mynode; - raw_spin_lock_rcu_node(rnp_root); - needwake = rcu_start_gp(rsp); - raw_spin_unlock_rcu_node(rnp_root); + raw_spin_lock_rcu_node(rnp); + needwake = rcu_accelerate_cbs(rsp, rnp, rdp); + raw_spin_unlock_rcu_node(rnp); if (needwake) rcu_gp_kthread_wake(rsp); } else { -- cgit From bd7af8463b9fae02b4c7d7248a088ca685ef184c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Apr 2018 09:51:20 -0700 Subject: rcu: Switch __rcu_process_callbacks() to rcu_accelerate_cbs() The __rcu_process_callbacks() function currently checks to see if the current CPU needs a grace period and also if there is any other reason to kick off a new grace period. This is one of the fail-safe checks that has been rendered unnecessary by the changes that increase the accuracy of rcu_gp_cleanup()'s estimate as to whether another grace period is required. Because this particular fail-safe involved acquiring the root rcu_node structure's ->lock, which has seen excessive contention in real life, this fail-safe needs to go. However, one check must remain, namely the check for newly arrived RCU callbacks that have not yet been associated with a grace period. One might hope that the checks in __note_gp_changes(), which is invoked indirectly from rcu_check_quiescent_state(), would suffice, but this function won't be invoked at all if RCU is idle. It is therefore necessary to replace the fail-safe checks with a simpler check for newly arrived callbacks during an RCU idle period, which is exactly what this commit does. This change removes the final call to rcu_start_gp(), so this function is removed as well. Note that lockless use of cpu_needs_another_gp() is racy, but that these races are harmless in this case. If RCU really is idle, the values will not change, so the return value from cpu_needs_another_gp() will be correct. If RCU is not idle, the resulting redundant call to rcu_accelerate_cbs() will be harmless, and might even have the benefit of reducing grace-period latency a bit. This commit also moves interrupt disabling into the "if" statement to improve real-time response a bit. Reported-by: Nicholas Piggin Signed-off-by: Paul E. 
McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 53 +++++++++++++++-------------------------------------- 1 file changed, 15 insertions(+), 38 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6396a3d10be9..fbacc486ed4c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2334,34 +2334,6 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, return true; } -/* - * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's - * callbacks. Note that rcu_start_gp_advanced() cannot do this because it - * is invoked indirectly from rcu_advance_cbs(), which would result in - * endless recursion -- or would do so if it wasn't for the self-deadlock - * that is encountered beforehand. - * - * Returns true if the grace-period kthread needs to be awakened. - */ -static bool rcu_start_gp(struct rcu_state *rsp) -{ - struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - struct rcu_node *rnp = rcu_get_root(rsp); - bool ret = false; - - /* - * If there is no grace period in progress right now, any - * callbacks we have up to this point will be satisfied by the - * next grace period. Also, advancing the callbacks reduces the - * probability of false positives from cpu_needs_another_gp() - * resulting in pointless grace periods. So, advance callbacks - * then start the grace period! - */ - ret = rcu_advance_cbs(rsp, rnp, rdp) || ret; - ret = rcu_start_gp_advanced(rsp, rnp, rdp) || ret; - return ret; -} - /* * Report a full set of quiescent states to the specified rcu_state data * structure. Invoke rcu_gp_kthread_wake() to awaken the grace-period @@ -2889,22 +2861,27 @@ __rcu_process_callbacks(struct rcu_state *rsp) unsigned long flags; bool needwake; struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); + struct rcu_node *rnp; WARN_ON_ONCE(!rdp->beenonline); /* Update RCU state based on any recent quiescent states. */ rcu_check_quiescent_state(rsp, rdp); - /* Does this CPU require a not-yet-started grace period? */ - local_irq_save(flags); - if (cpu_needs_another_gp(rsp, rdp)) { - raw_spin_lock_rcu_node(rcu_get_root(rsp)); /* irqs disabled. */ - needwake = rcu_start_gp(rsp); - raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); - if (needwake) - rcu_gp_kthread_wake(rsp); - } else { - local_irq_restore(flags); + /* No grace period and unregistered callbacks? */ + if (!rcu_gp_in_progress(rsp) && + rcu_segcblist_is_enabled(&rdp->cblist)) { + local_irq_save(flags); + if (rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) { + local_irq_restore(flags); + } else { + rnp = rdp->mynode; + raw_spin_lock_rcu_node(rnp); /* irqs disabled. */ + needwake = rcu_accelerate_cbs(rsp, rnp, rdp); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + if (needwake) + rcu_gp_kthread_wake(rsp); + } } /* If there are callbacks ready, invoke them. */ -- cgit From a508aa597ec2f046c00b8809f887f90cf1aaa47f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Apr 2018 09:05:50 -0700 Subject: rcu: Cleanup, don't put ->completed into an int It is true that currently only the low-order two bits are used, so there should be no problem given modern machines and compilers, but good hygiene and maintainability dictates use of an unsigned long instead of an int. This commit therefore makes this change. Signed-off-by: Paul E. 
McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index fbacc486ed4c..c7b1e6b2a3da 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1765,7 +1765,7 @@ out: */ static bool rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { - int c = rnp->completed; + unsigned long c = rnp->completed; bool needmore; struct rcu_data *rdp = this_cpu_ptr(rsp->rda); -- cgit From a824a287f6eaec65f1cf7aedfd5f6b69d2d3858f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Apr 2018 09:40:32 -0700 Subject: rcu: Clear request other than RCU_GP_FLAG_INIT at GP end Once the grace period has ended, any RCU_GP_FLAG_FQS requests are irrelevant: The grace period has ended, so there is no longer any point in forcing quiescent states in order to try to make it end sooner. This commit therefore causes rcu_gp_cleanup() to clear any bits other than RCU_GP_FLAG_INIT from ->gp_flags at the end of the grace period. Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c7b1e6b2a3da..25dbbc753fef 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2181,6 +2181,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) READ_ONCE(rsp->gpnum), TPS("newreq")); } + WRITE_ONCE(rsp->gp_flags, rsp->gp_flags & RCU_GP_FLAG_INIT); raw_spin_unlock_irq_rcu_node(rnp); } -- cgit From d5cd96851d520e5caff13ddf99e3b2b759ae3b1d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 12 Apr 2018 10:45:06 -0700 Subject: rcu: Inline rcu_start_gp_advanced() into rcu_start_future_gp() The rcu_start_gp_advanced() is invoked only from rcu_start_future_gp() and much of its code is redundant when invoked from that context. This commit therefore inlines rcu_start_gp_advanced() into rcu_start_future_gp(), then removes rcu_start_gp_advanced(). Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 56 ++++++++++++------------------------------------------- 1 file changed, 12 insertions(+), 44 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 25dbbc753fef..4433f68a1c7b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -524,8 +524,6 @@ module_param(rcu_kick_kthreads, bool, 0644); static ulong jiffies_till_sched_qs = HZ / 10; module_param(jiffies_till_sched_qs, ulong, 0444); -static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, - struct rcu_data *rdp); static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)); static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(void); @@ -1679,7 +1677,8 @@ static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, * rcu_node structure's ->need_future_gp field. Returns true if there * is reason to awaken the grace-period kthread. * - * The caller must hold the specified rcu_node structure's ->lock. + * The caller must hold the specified rcu_node structure's ->lock, which + * is why the caller is responsible for waking the grace-period kthread. 
*/ static bool __maybe_unused rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, @@ -1687,7 +1686,8 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, { unsigned long c; bool ret = false; - struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); + struct rcu_state *rsp = rdp->rsp; + struct rcu_node *rnp_root = rcu_get_root(rsp); raw_lockdep_assert_held_rcu_node(rnp); @@ -1695,7 +1695,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, * Pick up grace-period number for new callbacks. If this * grace period is already marked as needed, return to the caller. */ - c = rcu_cbs_completed(rdp->rsp, rnp); + c = rcu_cbs_completed(rsp, rnp); trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf")); if (need_future_gp_element(rnp, c)) { trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf")); @@ -1727,7 +1727,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, * period in progress, it will be smaller than the one we obtained * earlier. Adjust callbacks as needed. */ - c = rcu_cbs_completed(rdp->rsp, rnp_root); + c = rcu_cbs_completed(rsp, rnp_root); if (!rcu_is_nocb_cpu(rdp->cpu)) (void)rcu_segcblist_accelerate(&rdp->cblist, c); @@ -1748,7 +1748,12 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot")); } else { trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot")); - ret = rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); + if (!rsp->gp_kthread) + goto unlock_out; /* No grace-period kthread yet! */ + WRITE_ONCE(rsp->gp_flags, rsp->gp_flags | RCU_GP_FLAG_INIT); + trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), + TPS("newreq")); + ret = true; /* Caller must wake GP kthread. */ } unlock_out: if (rnp != rnp_root) @@ -2298,43 +2303,6 @@ static int __noreturn rcu_gp_kthread(void *arg) } } -/* - * Start a new RCU grace period if warranted, re-initializing the hierarchy - * in preparation for detecting the next grace period. The caller must hold - * the root node's ->lock and hard irqs must be disabled. - * - * Note that it is legal for a dying CPU (which is marked as offline) to - * invoke this function. This can happen when the dying CPU reports its - * quiescent state. - * - * Returns true if the grace-period kthread must be awakened. - */ -static bool -rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, - struct rcu_data *rdp) -{ - raw_lockdep_assert_held_rcu_node(rnp); - if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { - /* - * Either we have not yet spawned the grace-period - * task, this CPU does not need another grace period, - * or a grace period is already in progress. - * Either way, don't start a new grace period. - */ - return false; - } - WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT); - trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), - TPS("newreq")); - - /* - * We can't do wakeups while holding the rnp->lock, as that - * could cause possible deadlocks with the rq->lock. Defer - * the wakeup to our caller. - */ - return true; -} - /* * Report a full set of quiescent states to the specified rcu_state data * structure. Invoke rcu_gp_kthread_wake() to awaken the grace-period -- cgit From 41e80595abfc608eb0fe5148bcaed1ed78d7a6b7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 12 Apr 2018 11:24:09 -0700 Subject: rcu: Make rcu_start_future_gp() caller select grace period The rcu_accelerate_cbs() function selects a grace-period target, which it uses to have rcu_segcblist_accelerate() assign numbers to recently queued callbacks. 
Then it invokes rcu_start_future_gp(), which selects a grace-period target again, which is a bit pointless. This commit therefore changes rcu_start_future_gp() to take the grace-period target as a parameter, thus avoiding double selection. This commit also changes the name of rcu_start_future_gp() to rcu_start_this_gp() to reflect this change in functionality, and also makes a similar change to the name of trace_rcu_future_gp(). Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 53 ++++++++++++++++++++---------------------------- kernel/rcu/tree_plugin.h | 9 ++++---- 2 files changed, 27 insertions(+), 35 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4433f68a1c7b..94519c7d552f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1659,12 +1659,9 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp, return rnp->completed + 2; } -/* - * Trace-event helper function for rcu_start_future_gp() and - * rcu_nocb_wait_gp(). - */ -static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, - unsigned long c, const char *s) +/* Trace-event wrapper function for trace_rcu_future_grace_period. */ +static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, + unsigned long c, const char *s) { trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, rnp->completed, c, rnp->level, @@ -1672,33 +1669,27 @@ static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, } /* - * Start some future grace period, as needed to handle newly arrived + * Start the specified grace period, as needed to handle newly arrived * callbacks. The required future grace periods are recorded in each - * rcu_node structure's ->need_future_gp field. Returns true if there + * rcu_node structure's ->need_future_gp[] field. Returns true if there * is reason to awaken the grace-period kthread. * * The caller must hold the specified rcu_node structure's ->lock, which * is why the caller is responsible for waking the grace-period kthread. */ -static bool __maybe_unused -rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, - unsigned long *c_out) +static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, + unsigned long c) { - unsigned long c; bool ret = false; struct rcu_state *rsp = rdp->rsp; struct rcu_node *rnp_root = rcu_get_root(rsp); raw_lockdep_assert_held_rcu_node(rnp); - /* - * Pick up grace-period number for new callbacks. If this - * grace period is already marked as needed, return to the caller. - */ - c = rcu_cbs_completed(rsp, rnp); - trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf")); + /* If the specified GP is already known needed, return to caller. */ + trace_rcu_this_gp(rnp, rdp, c, TPS("Startleaf")); if (need_future_gp_element(rnp, c)) { - trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf")); + trace_rcu_this_gp(rnp, rdp, c, TPS("Prestartleaf")); goto out; } @@ -1710,7 +1701,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, */ if (rnp->gpnum != rnp->completed) { need_future_gp_element(rnp, c) = true; - trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); + trace_rcu_this_gp(rnp, rdp, c, TPS("Startedleaf")); goto out; } @@ -1736,7 +1727,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, * recorded, trace and leave. 
*/ if (need_future_gp_element(rnp_root, c)) { - trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartedroot")); + trace_rcu_this_gp(rnp, rdp, c, TPS("Prestartedroot")); goto unlock_out; } @@ -1745,9 +1736,9 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, /* If a grace period is not already in progress, start one. */ if (rnp_root->gpnum != rnp_root->completed) { - trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot")); + trace_rcu_this_gp(rnp, rdp, c, TPS("Startedleafroot")); } else { - trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot")); + trace_rcu_this_gp(rnp, rdp, c, TPS("Startedroot")); if (!rsp->gp_kthread) goto unlock_out; /* No grace-period kthread yet! */ WRITE_ONCE(rsp->gp_flags, rsp->gp_flags | RCU_GP_FLAG_INIT); @@ -1759,8 +1750,6 @@ unlock_out: if (rnp != rnp_root) raw_spin_unlock_rcu_node(rnp_root); out: - if (c_out != NULL) - *c_out = c; return ret; } @@ -1776,8 +1765,8 @@ static bool rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) need_future_gp_element(rnp, c) = false; needmore = need_any_future_gp(rnp); - trace_rcu_future_gp(rnp, rdp, c, - needmore ? TPS("CleanupMore") : TPS("Cleanup")); + trace_rcu_this_gp(rnp, rdp, c, + needmore ? TPS("CleanupMore") : TPS("Cleanup")); return needmore; } @@ -1812,6 +1801,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp) static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { + unsigned long c; bool ret = false; raw_lockdep_assert_held_rcu_node(rnp); @@ -1830,8 +1820,9 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, * accelerating callback invocation to an earlier grace-period * number. */ - if (rcu_segcblist_accelerate(&rdp->cblist, rcu_cbs_completed(rsp, rnp))) - ret = rcu_start_future_gp(rnp, rdp, NULL); + c = rcu_cbs_completed(rsp, rnp); + if (rcu_segcblist_accelerate(&rdp->cblist, c)) + ret = rcu_start_this_gp(rnp, rdp, c); /* Trace depending on how much we were able to accelerate. */ if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL)) @@ -2174,8 +2165,8 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) /* Check for GP requests since above loop. */ rdp = this_cpu_ptr(rsp->rda); if (need_any_future_gp(rnp)) { - trace_rcu_future_gp(rnp, rdp, rsp->completed - 1, - TPS("CleanupMore")); + trace_rcu_this_gp(rnp, rdp, rsp->completed - 1, + TPS("CleanupMore")); needgp = true; } /* Advance CBs to reduce false positives below. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 313b77d9cf06..322777492fff 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2035,7 +2035,8 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) struct rcu_node *rnp = rdp->mynode; raw_spin_lock_irqsave_rcu_node(rnp, flags); - needwake = rcu_start_future_gp(rnp, rdp, &c); + c = rcu_cbs_completed(rdp->rsp, rnp); + needwake = rcu_start_this_gp(rnp, rdp, c); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (needwake) rcu_gp_kthread_wake(rdp->rsp); @@ -2044,7 +2045,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) * Wait for the grace period. Do so interruptibly to avoid messing * up the load average. 
*/ - trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait")); + trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait")); for (;;) { swait_event_interruptible( rnp->nocb_gp_wq[c & 0x1], @@ -2052,9 +2053,9 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) if (likely(d)) break; WARN_ON(signal_pending(current)); - trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait")); + trace_rcu_this_gp(rnp, rdp, c, TPS("ResumeWait")); } - trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait")); + trace_rcu_this_gp(rnp, rdp, c, TPS("EndWait")); smp_mb(); /* Ensure that CB invocation happens after GP end. */ } -- cgit From 360e0da67eab610b0efd53cbab3e1535095e7aa4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 12 Apr 2018 11:50:41 -0700 Subject: rcu: Add funnel locking to rcu_start_this_gp() The rcu_start_this_gp() function had a simple form of funnel locking that used only the leaves and root of the rcu_node tree, which is fine for systems with only a few hundred CPUs, but sub-optimal for systems having thousands of CPUs. This commit therefore adds full-tree funnel locking. This variant of funnel locking is unusual in the following ways: 1. The leaf-level rcu_node structure's ->lock is held throughout. Other funnel-locking implementations drop the leaf-level lock before progressing to the next level of the tree. 2. Funnel locking can be started at the root, which is convenient for code that already holds the root rcu_node structure's ->lock. Other funnel-locking implementations start at the leaves. 3. If an rcu_node structure other than the initial one believes that a grace period is in progress, it is not necessary to go further up the tree. This is because grace-period cleanup scans the full tree, so that marking the need for a subsequent grace period anywhere in the tree suffices -- but only if a grace period is currently in progress. 4. It is possible that the RCU grace-period kthread has not yet started, and this case must be handled appropriately. However, the general approach of using a tree to control lock contention is still in place. Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 92 +++++++++++++++++++++---------------------------------- 1 file changed, 35 insertions(+), 57 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 94519c7d552f..d3c769502929 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1682,74 +1682,52 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, { bool ret = false; struct rcu_state *rsp = rdp->rsp; - struct rcu_node *rnp_root = rcu_get_root(rsp); - - raw_lockdep_assert_held_rcu_node(rnp); - - /* If the specified GP is already known needed, return to caller. */ - trace_rcu_this_gp(rnp, rdp, c, TPS("Startleaf")); - if (need_future_gp_element(rnp, c)) { - trace_rcu_this_gp(rnp, rdp, c, TPS("Prestartleaf")); - goto out; - } + struct rcu_node *rnp_root; /* - * If this rcu_node structure believes that a grace period is in - * progress, then we must wait for the one following, which is in - * "c". Because our request will be noticed at the end of the - * current grace period, we don't need to explicitly start one. + * Use funnel locking to either acquire the root rcu_node + * structure's lock or bail out if the need for this grace period + * has already been recorded -- or has already started. If there + * is already a grace period in progress in a non-leaf node, no + * recording is needed because the end of the grace period will + * scan the leaf rcu_node structures. 
Note that rnp->lock must + * not be released. */ - if (rnp->gpnum != rnp->completed) { - need_future_gp_element(rnp, c) = true; - trace_rcu_this_gp(rnp, rdp, c, TPS("Startedleaf")); - goto out; + raw_lockdep_assert_held_rcu_node(rnp); + trace_rcu_this_gp(rnp, rdp, c, TPS("Startleaf")); + for (rnp_root = rnp; 1; rnp_root = rnp_root->parent) { + if (rnp_root != rnp) + raw_spin_lock_rcu_node(rnp_root); + if (need_future_gp_element(rnp_root, c) || + ULONG_CMP_GE(rnp_root->gpnum, c) || + (rnp != rnp_root && + rnp_root->gpnum != rnp_root->completed)) { + trace_rcu_this_gp(rnp_root, rdp, c, TPS("Prestarted")); + goto unlock_out; + } + need_future_gp_element(rnp_root, c) = true; + if (rnp_root != rnp && rnp_root->parent != NULL) + raw_spin_unlock_rcu_node(rnp_root); + if (!rnp_root->parent) + break; /* At root, and perhaps also leaf. */ } - /* - * There might be no grace period in progress. If we don't already - * hold it, acquire the root rcu_node structure's lock in order to - * start one (if needed). - */ - if (rnp != rnp_root) - raw_spin_lock_rcu_node(rnp_root); - - /* - * Get a new grace-period number. If there really is no grace - * period in progress, it will be smaller than the one we obtained - * earlier. Adjust callbacks as needed. - */ - c = rcu_cbs_completed(rsp, rnp_root); - if (!rcu_is_nocb_cpu(rdp->cpu)) - (void)rcu_segcblist_accelerate(&rdp->cblist, c); - - /* - * If the needed for the required grace period is already - * recorded, trace and leave. - */ - if (need_future_gp_element(rnp_root, c)) { - trace_rcu_this_gp(rnp, rdp, c, TPS("Prestartedroot")); + /* If GP already in progress, just leave, otherwise start one. */ + if (rnp_root->gpnum != rnp_root->completed) { + trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedleafroot")); goto unlock_out; } - - /* Record the need for the future grace period. */ - need_future_gp_element(rnp_root, c) = true; - - /* If a grace period is not already in progress, start one. */ - if (rnp_root->gpnum != rnp_root->completed) { - trace_rcu_this_gp(rnp, rdp, c, TPS("Startedleafroot")); - } else { - trace_rcu_this_gp(rnp, rdp, c, TPS("Startedroot")); - if (!rsp->gp_kthread) - goto unlock_out; /* No grace-period kthread yet! */ - WRITE_ONCE(rsp->gp_flags, rsp->gp_flags | RCU_GP_FLAG_INIT); - trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), - TPS("newreq")); - ret = true; /* Caller must wake GP kthread. */ + trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedroot")); + WRITE_ONCE(rsp->gp_flags, rsp->gp_flags | RCU_GP_FLAG_INIT); + if (!rsp->gp_kthread) { + trace_rcu_this_gp(rnp_root, rdp, c, TPS("NoGPkthread")); + goto unlock_out; } + trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("newreq")); + ret = true; /* Caller must wake GP kthread. */ unlock_out: if (rnp != rnp_root) raw_spin_unlock_rcu_node(rnp_root); -out: return ret; } -- cgit From 665f08f1ce9cf608a9435e11d66f55be4e72540a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Apr 2018 12:23:34 -0700 Subject: rcu: Make rcu_start_this_gp() check for out-of-range requests If rcu_start_this_gp() is invoked with a requested grace period more than three in the future, then either the ->need_future_gp[] array needs to be bigger or the caller needs to be repaired. This commit therefore adds a WARN_ON_ONCE() checking for this condition. Signed-off-by: Paul E. 
McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d3c769502929..07bccb1f0c87 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1698,6 +1698,8 @@ static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, for (rnp_root = rnp; 1; rnp_root = rnp_root->parent) { if (rnp_root != rnp) raw_spin_lock_rcu_node(rnp_root); + WARN_ON_ONCE(ULONG_CMP_LT(rnp_root->gpnum + + need_future_gp_mask(), c)); if (need_future_gp_element(rnp_root, c) || ULONG_CMP_GE(rnp_root->gpnum, c) || (rnp != rnp_root && -- cgit From 384f77f4cb765707216ea43f9122580d8a07be7d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 12 Apr 2018 16:16:45 -0700 Subject: rcu: The rcu_gp_cleanup() function does not need cpu_needs_another_gp() All of the cpu_needs_another_gp() function's checks (except for newly arrived callbacks) have been subsumed into the rcu_gp_cleanup() function's scan of the rcu_node tree. This commit therefore drops the call to cpu_needs_another_gp(). The check for newly arrived callbacks is supplied by rcu_accelerate_cbs(). Any needed advancing (as in the earlier rcu_advance_cbs() call) will be supplied when the corresponding CPU becomes aware of the end of the now-completed grace period. Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 07bccb1f0c87..7776d709e060 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2150,11 +2150,9 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) needgp = true; } /* Advance CBs to reduce false positives below. */ - needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp; - if (needgp || cpu_needs_another_gp(rsp, rdp)) { + if (!rcu_accelerate_cbs(rsp, rnp, rdp) && needgp) { WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT); - trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gpnum), + trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("newreq")); } WRITE_ONCE(rsp->gp_flags, rsp->gp_flags & RCU_GP_FLAG_INIT); -- cgit From c1935209df8c903fc3a33143223338826fa54bd1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 12 Apr 2018 16:29:13 -0700 Subject: rcu: Simplify and inline cpu_needs_another_gp() Now that RCU no longer relies on failsafe checks, cpu_needs_another_gp() can be greatly simplified. This simplification eliminates the last call to rcu_future_needs_gp() and to rcu_segcblist_future_gp_needed(), both of which can then be eliminated. And then, because cpu_needs_another_gp() is called only from __rcu_pending(), it can be inlined and eliminated. This commit carries out the simplification, inlining, and elimination called out above. Signed-off-by: Paul E.
McKenney Tested-by: Nicholas Piggin --- kernel/rcu/rcu_segcblist.c | 18 ------------------ kernel/rcu/rcu_segcblist.h | 2 -- kernel/rcu/tree.c | 40 +++------------------------------------- 3 files changed, 3 insertions(+), 57 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 88cba7c2956c..5aff271adf1e 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -403,24 +403,6 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) return true; } -/* - * Scan the specified rcu_segcblist structure for callbacks that need - * a grace period later than the one specified by "seq". We don't look - * at the RCU_DONE_TAIL or RCU_NEXT_TAIL segments because they don't - * have a grace-period sequence number. - */ -bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, - unsigned long seq) -{ - int i; - - for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) - if (rsclp->tails[i - 1] != rsclp->tails[i] && - ULONG_CMP_LT(seq, rsclp->gp_seq[i])) - return true; - return false; -} - /* * Merge the source rcu_segcblist structure into the destination * rcu_segcblist structure, then initialize the source. Any pending diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 581c12b63544..948470cef385 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -134,7 +134,5 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, struct rcu_cblist *rclp); void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq); bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq); -bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, - unsigned long seq); void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp, struct rcu_segcblist *src_rsclp); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7776d709e060..020a0fe2dbee 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -708,42 +708,6 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) return &rsp->node[0]; } -/* - * Is there any need for future grace periods? - * Interrupts must be disabled. If the caller does not hold the root - * rnp_node structure's ->lock, the results are advisory only. - */ -static int rcu_future_needs_gp(struct rcu_state *rsp) -{ - struct rcu_node *rnp = rcu_get_root(rsp); - - lockdep_assert_irqs_disabled(); - return need_any_future_gp(rnp); -} - -/* - * Does the current CPU require a not-yet-started grace period? - * The caller must have disabled interrupts to prevent races with - * normal callback registry. - */ -static bool -cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) -{ - lockdep_assert_irqs_disabled(); - if (rcu_gp_in_progress(rsp)) - return false; /* No, a grace period is already in progress. */ - if (rcu_future_needs_gp(rsp)) - return true; /* Yes, a no-CBs CPU needs one. */ - if (!rcu_segcblist_is_enabled(&rdp->cblist)) - return false; /* No, this is a no-CBs (or offline) CPU. */ - if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) - return true; /* Yes, CPU has newly registered callbacks. */ - if (rcu_segcblist_future_gp_needed(&rdp->cblist, - READ_ONCE(rsp->completed))) - return true; /* Yes, CBs for future grace period. */ - return false; /* No grace period needed. */ -} - /* * Enter an RCU extended quiescent state, which can be either the * idle loop or adaptive-tickless usermode execution. 
@@ -3298,7 +3262,9 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) return 1; /* Has RCU gone idle with this CPU needing another grace period? */ - if (cpu_needs_another_gp(rsp, rdp)) + if (!rcu_gp_in_progress(rsp) && + rcu_segcblist_is_enabled(&rdp->cblist) && + !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) return 1; /* Has another RCU grace period completed? */ -- cgit From a458360af63a36424c9f607015f0858aacb84a19 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Apr 2018 15:59:55 -0700 Subject: rcu: Drop early GP request check from rcu_gp_kthread() Now that grace-period requests use funnel locking and now that they set ->gp_flags to RCU_GP_FLAG_INIT even when the RCU grace-period kthread has not yet started, rcu_gp_kthread() no longer needs to check need_any_future_gp() at startup time. This commit therefore removes this check. Signed-off-by: Paul E. McKenney Tested-by: Nicholas Piggin --- kernel/rcu/tree.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 020a0fe2dbee..ed238886e6ca 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2135,12 +2135,6 @@ static int __noreturn rcu_gp_kthread(void *arg) struct rcu_state *rsp = arg; struct rcu_node *rnp = rcu_get_root(rsp); - /* Check for early-boot work. */ - raw_spin_lock_irq_rcu_node(rnp); - if (need_any_future_gp(rnp)) - WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT); - raw_spin_unlock_irq_rcu_node(rnp); - rcu_bind_gp_kthread(); for (;;) { -- cgit From f64c6013a2029316ea552f99823d116ee5f5f955 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 22 May 2018 09:50:53 -0700 Subject: rcu/x86: Provide early rcu_cpu_starting() callback The x86/mtrr code does horrific things because hardware. It uses stop_machine_from_inactive_cpu(), which does a wakeup (of the stopper thread on another CPU), which uses RCU, all before the CPU is onlined. RCU complains about this, because wakeups use RCU and RCU does (rightfully) not consider offline CPUs for grace-periods. Fix this by initializing RCU way early in the MTRR case. Tested-by: Mike Galbraith Signed-off-by: Peter Zijlstra Signed-off-by: Paul E. McKenney [ paulmck: Add !SMP support, per 0day Test Robot report. 
] --- arch/x86/kernel/cpu/mtrr/main.c | 4 ++++ include/linux/rcupdate.h | 1 - include/linux/rcutiny.h | 1 + include/linux/rcutree.h | 1 + kernel/rcu/tree.c | 9 +++++++++ 5 files changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel/rcu/tree.c') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 7468de429087..3ea0047beb40 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -793,6 +794,9 @@ void mtrr_ap_init(void) if (!use_intel() || mtrr_aps_delayed_init) return; + + rcu_cpu_starting(smp_processor_id()); + /* * Ideally we should hold mtrr_mutex here to avoid mtrr entries * changed, but this routine will be called in cpu boot time, diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 19d235fefdb9..e679b175b411 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -108,7 +108,6 @@ void rcu_sched_qs(void); void rcu_bh_qs(void); void rcu_check_callbacks(int user); void rcu_report_dead(unsigned int cpu); -void rcu_cpu_starting(unsigned int cpu); void rcutree_migrate_callbacks(int cpu); #ifdef CONFIG_RCU_STALL_COMMON diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index ce9beec35e34..7b3c82e8a625 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -132,5 +132,6 @@ static inline void rcu_all_qs(void) { barrier(); } #define rcutree_offline_cpu NULL #define rcutree_dead_cpu NULL #define rcutree_dying_cpu NULL +static inline void rcu_cpu_starting(unsigned int cpu) { } #endif /* __LINUX_RCUTINY_H */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 448f20f27396..914655848ef6 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -101,5 +101,6 @@ int rcutree_online_cpu(unsigned int cpu); int rcutree_offline_cpu(unsigned int cpu); int rcutree_dead_cpu(unsigned int cpu); int rcutree_dying_cpu(unsigned int cpu); +void rcu_cpu_starting(unsigned int cpu); #endif /* __LINUX_RCUTREE_H */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4fccdfa25716..aa7cade1b9f3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3665,6 +3665,8 @@ int rcutree_dead_cpu(unsigned int cpu) return 0; } +static DEFINE_PER_CPU(int, rcu_cpu_started); + /* * Mark the specified CPU as being online so that subsequent grace periods * (both expedited and normal) will wait on it. Note that this means that @@ -3686,6 +3688,11 @@ void rcu_cpu_starting(unsigned int cpu) struct rcu_node *rnp; struct rcu_state *rsp; + if (per_cpu(rcu_cpu_started, cpu)) + return; + + per_cpu(rcu_cpu_started, cpu) = 1; + for_each_rcu_flavor(rsp) { rdp = per_cpu_ptr(rsp->rda, cpu); rnp = rdp->mynode; @@ -3742,6 +3749,8 @@ void rcu_report_dead(unsigned int cpu) preempt_enable(); for_each_rcu_flavor(rsp) rcu_cleanup_dying_idle_cpu(cpu, rsp); + + per_cpu(rcu_cpu_started, cpu) = 0; } /* Migrate the dead CPU's callbacks to the current CPU. */ -- cgit
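Editorial aside on the last patch above: the per-CPU rcu_cpu_started flag is what makes rcu_cpu_starting() safe to call twice, once early from the MTRR path and again during normal CPU-hotplug onlining. The following stand-alone sketch shows the same idempotent-guard pattern in ordinary user-space C; the array, function names, and printout are placeholders rather than kernel APIs, and single-threaded bring-up (as in early boot) is assumed.

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

/* Stand-in for DEFINE_PER_CPU(int, rcu_cpu_started). */
static bool cpu_started[NR_CPUS];

/* One-time per-CPU online bookkeeping; safe to call more than once. */
static void cpu_starting(int cpu)
{
	if (cpu_started[cpu])
		return;		/* An earlier caller (e.g. the MTRR path) already ran this. */
	cpu_started[cpu] = true;
	printf("CPU %d: marked online for grace-period purposes\n", cpu);
}

/* Offlining clears the guard so the next onlining repeats the bookkeeping. */
static void cpu_report_dead(int cpu)
{
	cpu_started[cpu] = false;
}

int main(void)
{
	cpu_starting(1);	/* early call, as from mtrr_ap_init() */
	cpu_starting(1);	/* later hotplug notification: now a no-op */
	cpu_report_dead(1);
	cpu_starting(1);	/* after an offline/online cycle the work runs again */
	return 0;
}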