diff options
Diffstat (limited to 'kernel/rcu/tree.c')
-rw-r--r-- | kernel/rcu/tree.c | 536 |
1 files changed, 370 insertions, 166 deletions
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f78ee759af9c..da6f5213fb74 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -70,19 +70,6 @@ #endif #define MODULE_PARAM_PREFIX "rcutree." -#ifndef data_race -#define data_race(expr) \ - ({ \ - expr; \ - }) -#endif -#ifndef ASSERT_EXCLUSIVE_WRITER -#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) -#endif -#ifndef ASSERT_EXCLUSIVE_ACCESS -#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) -#endif - /* Data structures. */ /* @@ -96,6 +83,9 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { .dynticks_nesting = 1, .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), +#ifdef CONFIG_RCU_NOCB_CPU + .cblist.flags = SEGCBLIST_SOFTIRQ_ONLY, +#endif }; static struct rcu_state rcu_state = { .level = { &rcu_state.node[0] }, @@ -113,8 +103,10 @@ static struct rcu_state rcu_state = { static bool dump_tree; module_param(dump_tree, bool, 0444); /* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */ -static bool use_softirq = true; +static bool use_softirq = !IS_ENABLED(CONFIG_PREEMPT_RT); +#ifndef CONFIG_PREEMPT_RT module_param(use_softirq, bool, 0444); +#endif /* Control rcu_node-tree auto-balancing at boot time. */ static bool rcu_fanout_exact; module_param(rcu_fanout_exact, bool, 0444); @@ -178,13 +170,19 @@ module_param(gp_init_delay, int, 0444); static int gp_cleanup_delay; module_param(gp_cleanup_delay, int, 0444); +// Add delay to rcu_read_unlock() for strict grace periods. +static int rcu_unlock_delay; +#ifdef CONFIG_RCU_STRICT_GRACE_PERIOD +module_param(rcu_unlock_delay, int, 0444); +#endif + /* * This rcu parameter is runtime-read-only. It reflects * a minimum allowed number of objects which can be cached * per-CPU. Object size is equal to one page. This value * can be changed at boot time. */ -static int rcu_min_cached_objs = 2; +static int rcu_min_cached_objs = 5; module_param(rcu_min_cached_objs, int, 0444); /* Retrieve RCU kthreads priority for rcutorture */ @@ -348,6 +346,14 @@ static bool rcu_dynticks_in_eqs(int snap) return !(snap & RCU_DYNTICK_CTRL_CTR); } +/* Return true if the specified CPU is currently idle from an RCU viewpoint. */ +bool rcu_is_idle_cpu(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + + return rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp)); +} + /* * Return true if the CPU corresponding to the specified rcu_data * structure has spent some time in an extended quiescent state since @@ -416,7 +422,7 @@ bool rcu_eqs_special_set(int cpu) * * The caller must have disabled interrupts and must not be idle. */ -void rcu_momentary_dyntick_idle(void) +notrace void rcu_momentary_dyntick_idle(void) { int special; @@ -468,24 +474,25 @@ static int rcu_is_cpu_rrupt_from_idle(void) return __this_cpu_read(rcu_data.dynticks_nesting) == 0; } -#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch ... */ -#define DEFAULT_MAX_RCU_BLIMIT 10000 /* ... even during callback flood. */ +#define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 1000 : 10) + // Maximum callbacks per rcu_do_batch ... +#define DEFAULT_MAX_RCU_BLIMIT 10000 // ... even during callback flood. static long blimit = DEFAULT_RCU_BLIMIT; -#define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */ +#define DEFAULT_RCU_QHIMARK 10000 // If this many pending, ignore blimit. static long qhimark = DEFAULT_RCU_QHIMARK; -#define DEFAULT_RCU_QLOMARK 100 /* Once only this many pending, use blimit. */ +#define DEFAULT_RCU_QLOMARK 100 // Once only this many pending, use blimit. static long qlowmark = DEFAULT_RCU_QLOMARK; #define DEFAULT_RCU_QOVLD_MULT 2 #define DEFAULT_RCU_QOVLD (DEFAULT_RCU_QOVLD_MULT * DEFAULT_RCU_QHIMARK) -static long qovld = DEFAULT_RCU_QOVLD; /* If this many pending, hammer QS. */ -static long qovld_calc = -1; /* No pre-initialization lock acquisitions! */ +static long qovld = DEFAULT_RCU_QOVLD; // If this many pending, hammer QS. +static long qovld_calc = -1; // No pre-initialization lock acquisitions! module_param(blimit, long, 0444); module_param(qhimark, long, 0444); module_param(qlowmark, long, 0444); module_param(qovld, long, 0444); -static ulong jiffies_till_first_fqs = ULONG_MAX; +static ulong jiffies_till_first_fqs = IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 0 : ULONG_MAX; static ulong jiffies_till_next_fqs = ULONG_MAX; static bool rcu_kick_kthreads; static int rcu_divisor = 7; @@ -552,12 +559,12 @@ static int param_set_next_fqs_jiffies(const char *val, const struct kernel_param return ret; } -static struct kernel_param_ops first_fqs_jiffies_ops = { +static const struct kernel_param_ops first_fqs_jiffies_ops = { .set = param_set_first_fqs_jiffies, .get = param_get_ulong, }; -static struct kernel_param_ops next_fqs_jiffies_ops = { +static const struct kernel_param_ops next_fqs_jiffies_ops = { .set = param_set_next_fqs_jiffies, .get = param_get_ulong, }; @@ -642,7 +649,6 @@ static noinstr void rcu_eqs_enter(bool user) trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); rdp = this_cpu_ptr(&rcu_data); - do_nocb_deferred_wakeup(rdp); rcu_prepare_for_idle(); rcu_preempt_deferred_qs(current); @@ -676,6 +682,50 @@ void rcu_idle_enter(void) EXPORT_SYMBOL_GPL(rcu_idle_enter); #ifdef CONFIG_NO_HZ_FULL + +#if !defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK) +/* + * An empty function that will trigger a reschedule on + * IRQ tail once IRQs get re-enabled on userspace/guest resume. + */ +static void late_wakeup_func(struct irq_work *work) +{ +} + +static DEFINE_PER_CPU(struct irq_work, late_wakeup_work) = + IRQ_WORK_INIT(late_wakeup_func); + +/* + * If either: + * + * 1) the task is about to enter in guest mode and $ARCH doesn't support KVM generic work + * 2) the task is about to enter in user mode and $ARCH doesn't support generic entry. + * + * In these cases the late RCU wake ups aren't supported in the resched loops and our + * last resort is to fire a local irq_work that will trigger a reschedule once IRQs + * get re-enabled again. + */ +noinstr static void rcu_irq_work_resched(void) +{ + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + + if (IS_ENABLED(CONFIG_GENERIC_ENTRY) && !(current->flags & PF_VCPU)) + return; + + if (IS_ENABLED(CONFIG_KVM_XFER_TO_GUEST_WORK) && (current->flags & PF_VCPU)) + return; + + instrumentation_begin(); + if (do_nocb_deferred_wakeup(rdp) && need_resched()) { + irq_work_queue(this_cpu_ptr(&late_wakeup_work)); + } + instrumentation_end(); +} + +#else +static inline void rcu_irq_work_resched(void) { } +#endif + /** * rcu_user_enter - inform RCU that we are resuming userspace. * @@ -690,8 +740,16 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter); noinstr void rcu_user_enter(void) { lockdep_assert_irqs_disabled(); + + /* + * Other than generic entry implementation, we may be past the last + * rescheduling opportunity in the entry code. Trigger a self IPI + * that will fire and reschedule once we resume in user/guest mode. + */ + rcu_irq_work_resched(); rcu_eqs_enter(true); } + #endif /* CONFIG_NO_HZ_FULL */ /** @@ -934,8 +992,8 @@ void __rcu_irq_enter_check_tick(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - // Enabling the tick is unsafe in NMI handlers. - if (WARN_ON_ONCE(in_nmi())) + // If we're here from NMI there's nothing to do. + if (in_nmi()) return; RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), @@ -1092,11 +1150,6 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) } } -noinstr bool __rcu_is_watching(void) -{ - return !rcu_dynticks_curr_cpu_in_eqs(); -} - /** * rcu_is_watching - see if RCU thinks that the current CPU is not idle * @@ -1104,8 +1157,11 @@ noinstr bool __rcu_is_watching(void) * CPU can safely enter RCU read-side critical sections. In other words, * if the current CPU is not in its idle loop or is in an interrupt or * NMI handler, return true. + * + * Make notrace because it can be called by the internal functions of + * ftrace, and making this notrace removes unnecessary recursion calls. */ -bool rcu_is_watching(void) +notrace bool rcu_is_watching(void) { bool ret; @@ -1160,7 +1216,7 @@ bool rcu_lockdep_current_cpu_online(void) preempt_disable_notrace(); rdp = this_cpu_ptr(&rcu_data); rnp = rdp->mynode; - if (rdp->grpmask & rcu_rnp_online_cpus(rnp)) + if (rdp->grpmask & rcu_rnp_online_cpus(rnp) || READ_ONCE(rnp->ofl_seq) & 0x1) ret = true; preempt_enable_notrace(); return ret; @@ -1229,13 +1285,28 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) return 1; } - /* If waiting too long on an offline CPU, complain. */ - if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp)) && - time_after(jiffies, rcu_state.gp_start + HZ)) { + /* + * Complain if a CPU that is considered to be offline from RCU's + * perspective has not yet reported a quiescent state. After all, + * the offline CPU should have reported a quiescent state during + * the CPU-offline process, or, failing that, by rcu_gp_init() + * if it ran concurrently with either the CPU going offline or the + * last task on a leaf rcu_node structure exiting its RCU read-side + * critical section while all CPUs corresponding to that structure + * are offline. This added warning detects bugs in any of these + * code paths. + * + * The rcu_node structure's ->lock is held here, which excludes + * the relevant portions the CPU-hotplug code, the grace-period + * initialization code, and the rcu_read_unlock() code paths. + * + * For more detail, please refer to the "Hotplug CPU" section + * of RCU's Requirements documentation. + */ + if (WARN_ON_ONCE(!(rdp->grpmask & rcu_rnp_online_cpus(rnp)))) { bool onl; struct rcu_node *rnp1; - WARN_ON(1); /* Offline CPUs are supposed to report QS! */ pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n", __func__, rnp->grplo, rnp->grphi, rnp->level, (long)rnp->gp_seq, (long)rnp->completedqs); @@ -1307,8 +1378,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) if (IS_ENABLED(CONFIG_IRQ_WORK) && !rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq && (rnp->ffmask & rdp->grpmask)) { - init_irq_work(&rdp->rcu_iw, rcu_iw_handler); - atomic_set(&rdp->rcu_iw.flags, IRQ_WORK_HARD_IRQ); rdp->rcu_iw_pending = true; rdp->rcu_iw_gp_seq = rnp->gp_seq; irq_work_queue_on(&rdp->rcu_iw, rdp->cpu); @@ -1482,6 +1551,8 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp) if (!rcu_segcblist_pend_cbs(&rdp->cblist)) return false; + trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPreAcc")); + /* * Callbacks are often registered with incomplete grace-period * information. Something about the fact that getting exact @@ -1498,9 +1569,12 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp) /* Trace depending on how much we were able to accelerate. */ if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL)) - trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("AccWaitCB")); + trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccWaitCB")); else - trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("AccReadyCB")); + trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccReadyCB")); + + trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPostAcc")); + return ret; } @@ -1576,6 +1650,19 @@ static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp, } /* + * In CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, attempt to generate a + * quiescent state. This is intended to be invoked when the CPU notices + * a new grace period. + */ +static void rcu_strict_gp_check_qs(void) +{ + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) { + rcu_read_lock(); + rcu_read_unlock(); + } +} + +/* * Update CPU-local rcu_data state to record the beginnings and ends of * grace periods. The caller must hold the ->lock of the leaf rcu_node * structure corresponding to the current CPU, and must have irqs disabled. @@ -1585,8 +1672,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) { bool ret = false; bool need_qs; - const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - rcu_segcblist_is_offloaded(&rdp->cblist); + const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist); raw_lockdep_assert_held_rcu_node(rnp); @@ -1645,6 +1731,7 @@ static void note_gp_changes(struct rcu_data *rdp) } needwake = __note_gp_changes(rnp, rdp); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + rcu_strict_gp_check_qs(); if (needwake) rcu_gp_kthread_wake(); } @@ -1683,10 +1770,20 @@ static void rcu_gp_torture_wait(void) } /* + * Handler for on_each_cpu() to invoke the target CPU's RCU core + * processing. + */ +static void rcu_strict_gp_boundary(void *unused) +{ + invoke_rcu_core(); +} + +/* * Initialize a new grace period. Return false if no grace period required. */ static bool rcu_gp_init(void) { + unsigned long firstseq; unsigned long flags; unsigned long oldmask; unsigned long mask; @@ -1720,13 +1817,22 @@ static bool rcu_gp_init(void) raw_spin_unlock_irq_rcu_node(rnp); /* - * Apply per-leaf buffered online and offline operations to the - * rcu_node tree. Note that this new grace period need not wait - * for subsequent online CPUs, and that quiescent-state forcing - * will handle subsequent offline CPUs. + * Apply per-leaf buffered online and offline operations to + * the rcu_node tree. Note that this new grace period need not + * wait for subsequent online CPUs, and that RCU hooks in the CPU + * offlining path, when combined with checks in this function, + * will handle CPUs that are currently going offline or that will + * go offline later. Please also refer to "Hotplug CPU" section + * of RCU's Requirements documentation. */ - rcu_state.gp_state = RCU_GP_ONOFF; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF); rcu_for_each_leaf_node(rnp) { + smp_mb(); // Pair with barriers used when updating ->ofl_seq to odd values. + firstseq = READ_ONCE(rnp->ofl_seq); + if (firstseq & 0x1) + while (firstseq == READ_ONCE(rnp->ofl_seq)) + schedule_timeout_idle(1); // Can't wake unless RCU is watching. + smp_mb(); // Pair with barriers used when updating ->ofl_seq to even values. raw_spin_lock(&rcu_state.ofl_lock); raw_spin_lock_irq_rcu_node(rnp); if (rnp->qsmaskinit == rnp->qsmaskinitnext && @@ -1785,7 +1891,7 @@ static bool rcu_gp_init(void) * The grace period cannot complete until the initialization * process finishes, because this kthread handles both. */ - rcu_state.gp_state = RCU_GP_INIT; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_INIT); rcu_for_each_node_breadth_first(rnp) { rcu_gp_slow(gp_init_delay); raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -1810,6 +1916,10 @@ static bool rcu_gp_init(void) WRITE_ONCE(rcu_state.gp_activity, jiffies); } + // If strict, make all CPUs aware of new grace period. + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + on_each_cpu(rcu_strict_gp_boundary, NULL, 0); + return true; } @@ -1880,17 +1990,22 @@ static void rcu_gp_fqs_loop(void) ret = 0; for (;;) { if (!ret) { - rcu_state.jiffies_force_qs = jiffies + j; + WRITE_ONCE(rcu_state.jiffies_force_qs, jiffies + j); + /* + * jiffies_force_qs before RCU_GP_WAIT_FQS state + * update; required for stall checks. + */ + smp_wmb(); WRITE_ONCE(rcu_state.jiffies_kick_kthreads, jiffies + (j ? 3 * j : 2)); } trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqswait")); - rcu_state.gp_state = RCU_GP_WAIT_FQS; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_FQS); ret = swait_event_idle_timeout_exclusive( rcu_state.gp_wq, rcu_gp_fqs_check_wake(&gf), j); rcu_gp_torture_wait(); - rcu_state.gp_state = RCU_GP_DOING_FQS; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_DOING_FQS); /* Locking provides needed memory barriers. */ /* If grace period done, leave loop. */ if (!READ_ONCE(rnp->qsmask) && @@ -1898,7 +2013,7 @@ static void rcu_gp_fqs_loop(void) break; /* If time for quiescent-state forcing, do it. */ if (!time_after(rcu_state.jiffies_force_qs, jiffies) || - (gf & RCU_GP_FLAG_FQS)) { + (gf & (RCU_GP_FLAG_FQS | RCU_GP_FLAG_OVLD))) { trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqsstart")); rcu_gp_fqs(first_gp_fqs); @@ -2004,7 +2119,7 @@ static void rcu_gp_cleanup(void) trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("end")); rcu_seq_end(&rcu_state.gp_seq); ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); - rcu_state.gp_state = RCU_GP_IDLE; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_IDLE); /* Check for GP requests since above loop. */ rdp = this_cpu_ptr(&rcu_data); if (!needgp && ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) { @@ -2013,8 +2128,7 @@ static void rcu_gp_cleanup(void) needgp = true; } /* Advance CBs to reduce false positives below. */ - offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - rcu_segcblist_is_offloaded(&rdp->cblist); + offloaded = rcu_segcblist_is_offloaded(&rdp->cblist); if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) { WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT); WRITE_ONCE(rcu_state.gp_req_activity, jiffies); @@ -2026,6 +2140,10 @@ static void rcu_gp_cleanup(void) rcu_state.gp_flags & RCU_GP_FLAG_INIT); } raw_spin_unlock_irq_rcu_node(rnp); + + // If strict, make all CPUs aware of the end of the old grace period. + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + on_each_cpu(rcu_strict_gp_boundary, NULL, 0); } /* @@ -2040,12 +2158,12 @@ static int __noreturn rcu_gp_kthread(void *unused) for (;;) { trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("reqwait")); - rcu_state.gp_state = RCU_GP_WAIT_GPS; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_GPS); swait_event_idle_exclusive(rcu_state.gp_wq, READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_INIT); rcu_gp_torture_wait(); - rcu_state.gp_state = RCU_GP_DONE_GPS; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_DONE_GPS); /* Locking provides needed memory barrier. */ if (rcu_gp_init()) break; @@ -2060,9 +2178,9 @@ static int __noreturn rcu_gp_kthread(void *unused) rcu_gp_fqs_loop(); /* Handle grace-period end. */ - rcu_state.gp_state = RCU_GP_CLEANUP; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_CLEANUP); rcu_gp_cleanup(); - rcu_state.gp_state = RCU_GP_CLEANED; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_CLEANED); } } @@ -2204,15 +2322,15 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) * structure. This must be called from the specified CPU. */ static void -rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) +rcu_report_qs_rdp(struct rcu_data *rdp) { unsigned long flags; unsigned long mask; bool needwake = false; - const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - rcu_segcblist_is_offloaded(&rdp->cblist); + const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist); struct rcu_node *rnp; + WARN_ON_ONCE(rdp->cpu != smp_processor_id()); rnp = rdp->mynode; raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rdp->cpu_no_qs.b.norm || rdp->gp_seq != rnp->gp_seq || @@ -2229,8 +2347,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) return; } mask = rdp->grpmask; - if (rdp->cpu == smp_processor_id()) - rdp->core_needs_qs = false; + rdp->core_needs_qs = false; if ((rnp->qsmask & mask) == 0) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } else { @@ -2279,7 +2396,7 @@ rcu_check_quiescent_state(struct rcu_data *rdp) * Tell RCU we are done (but rcu_report_qs_rdp() will be the * judge of that). */ - rcu_report_qs_rdp(rdp->cpu, rdp); + rcu_report_qs_rdp(rdp); } /* @@ -2360,6 +2477,7 @@ int rcutree_dead_cpu(unsigned int cpu) if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) return 0; + WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1); /* Adjust any no-longer-needed kthreads. */ rcu_boost_kthread_setaffinity(rnp, -1); /* Do any needed no-CB deferred wakeups from this CPU. */ @@ -2376,12 +2494,13 @@ int rcutree_dead_cpu(unsigned int cpu) */ static void rcu_do_batch(struct rcu_data *rdp) { + int div; + bool __maybe_unused empty; unsigned long flags; - const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - rcu_segcblist_is_offloaded(&rdp->cblist); + const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist); struct rcu_head *rhp; struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); - long bl, count; + long bl, count = 0; long pending, tlimit = 0; /* If no callbacks are ready, just return. */ @@ -2404,22 +2523,32 @@ static void rcu_do_batch(struct rcu_data *rdp) rcu_nocb_lock(rdp); WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); pending = rcu_segcblist_n_cbs(&rdp->cblist); - bl = max(rdp->blimit, pending >> rcu_divisor); - if (unlikely(bl > 100)) - tlimit = local_clock() + rcu_resched_ns; + div = READ_ONCE(rcu_divisor); + div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div; + bl = max(rdp->blimit, pending >> div); + if (unlikely(bl > 100)) { + long rrn = READ_ONCE(rcu_resched_ns); + + rrn = rrn < NSEC_PER_MSEC ? NSEC_PER_MSEC : rrn > NSEC_PER_SEC ? NSEC_PER_SEC : rrn; + tlimit = local_clock() + rrn; + } trace_rcu_batch_start(rcu_state.name, rcu_segcblist_n_cbs(&rdp->cblist), bl); rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); if (offloaded) rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist); + + trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbDequeued")); rcu_nocb_unlock_irqrestore(rdp, flags); /* Invoke callbacks. */ tick_dep_set_task(current, TICK_DEP_BIT_RCU); rhp = rcu_cblist_dequeue(&rcl); + for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) { rcu_callback_t f; + count++; debug_rcu_head_unqueue(rhp); rcu_lock_acquire(&rcu_callback_map); @@ -2433,21 +2562,19 @@ static void rcu_do_batch(struct rcu_data *rdp) /* * Stop only if limit reached and CPU has something to do. - * Note: The rcl structure counts down from zero. */ - if (-rcl.len >= bl && !offloaded && + if (count >= bl && !offloaded && (need_resched() || (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) break; if (unlikely(tlimit)) { /* only call local_clock() every 32 callbacks */ - if (likely((-rcl.len & 31) || local_clock() < tlimit)) + if (likely((count & 31) || local_clock() < tlimit)) continue; /* Exceeded the time limit, so leave. */ break; } - if (offloaded) { - WARN_ON_ONCE(in_serving_softirq()); + if (!in_serving_softirq()) { local_bh_enable(); lockdep_assert_irqs_enabled(); cond_resched_tasks_rcu_qs(); @@ -2458,15 +2585,13 @@ static void rcu_do_batch(struct rcu_data *rdp) local_irq_save(flags); rcu_nocb_lock(rdp); - count = -rcl.len; rdp->n_cbs_invoked += count; trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(), is_idle_task(current), rcu_is_callbacks_kthread()); /* Update counts and requeue any remaining callbacks. */ rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl); - smp_mb(); /* List handling before counting for rcu_barrier(). */ - rcu_segcblist_insert_count(&rdp->cblist, &rcl); + rcu_segcblist_add_len(&rdp->cblist, -count); /* Reinstate batch limit if we have worked down the excess. */ count = rcu_segcblist_n_cbs(&rdp->cblist); @@ -2484,9 +2609,12 @@ static void rcu_do_batch(struct rcu_data *rdp) * The following usually indicates a double call_rcu(). To track * this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. */ - WARN_ON_ONCE(count == 0 && !rcu_segcblist_empty(&rdp->cblist)); + empty = rcu_segcblist_empty(&rdp->cblist); + WARN_ON_ONCE(count == 0 && !empty); WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - count != 0 && rcu_segcblist_empty(&rdp->cblist)); + count != 0 && empty); + WARN_ON_ONCE(count == 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) != 0); + WARN_ON_ONCE(!empty && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0); rcu_nocb_unlock_irqrestore(rdp, flags); @@ -2507,6 +2635,7 @@ static void rcu_do_batch(struct rcu_data *rdp) void rcu_sched_clock_irq(int user) { trace_rcu_utilization(TPS("Start scheduler-tick")); + lockdep_assert_irqs_disabled(); raw_cpu_inc(rcu_data.ticks_this_gp); /* The load-acquire pairs with the store-release setting to true. */ if (smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) { @@ -2520,6 +2649,7 @@ void rcu_sched_clock_irq(int user) rcu_flavor_sched_clock_irq(user); if (rcu_pending(user)) invoke_rcu_core(); + lockdep_assert_irqs_disabled(); trace_rcu_utilization(TPS("End scheduler-tick")); } @@ -2547,8 +2677,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) raw_spin_lock_irqsave_rcu_node(rnp, flags); rcu_state.cbovldnext |= !!rnp->cbovldmask; if (rnp->qsmask == 0) { - if (!IS_ENABLED(CONFIG_PREEMPT_RCU) || - rcu_preempt_blocked_readers_cgp(rnp)) { + if (rcu_preempt_blocked_readers_cgp(rnp)) { /* * No point in scanning bits because they * are all zero. But we might need to @@ -2616,14 +2745,21 @@ void rcu_force_quiescent_state(void) } EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); +// Workqueue handler for an RCU reader for kernels enforcing struct RCU +// grace periods. +static void strict_work_handler(struct work_struct *work) +{ + rcu_read_lock(); + rcu_read_unlock(); +} + /* Perform RCU core processing work for the current CPU. */ static __latent_entropy void rcu_core(void) { unsigned long flags; struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; - const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - rcu_segcblist_is_offloaded(&rdp->cblist); + const bool do_batch = !rcu_segcblist_completely_offloaded(&rdp->cblist); if (cpu_is_offline(smp_processor_id())) return; @@ -2643,23 +2779,27 @@ static __latent_entropy void rcu_core(void) /* No grace period and unregistered callbacks? */ if (!rcu_gp_in_progress() && - rcu_segcblist_is_enabled(&rdp->cblist) && !offloaded) { - local_irq_save(flags); + rcu_segcblist_is_enabled(&rdp->cblist) && do_batch) { + rcu_nocb_lock_irqsave(rdp, flags); if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) rcu_accelerate_cbs_unlocked(rnp, rdp); - local_irq_restore(flags); + rcu_nocb_unlock_irqrestore(rdp, flags); } rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check()); /* If there are callbacks ready, invoke them. */ - if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist) && + if (do_batch && rcu_segcblist_ready_cbs(&rdp->cblist) && likely(READ_ONCE(rcu_scheduler_fully_active))) rcu_do_batch(rdp); /* Do any needed deferred wakeups of rcuo kthreads. */ do_nocb_deferred_wakeup(rdp); trace_rcu_utilization(TPS("End RCU core")); + + // If strict GPs, schedule an RCU reader in a clean environment. + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + queue_work_on(rdp->cpu, rcu_gp_wq, &rdp->strict_work); } static void rcu_core_si(struct softirq_action *h) @@ -2872,6 +3012,7 @@ static void check_cb_ovld(struct rcu_data *rdp) static void __call_rcu(struct rcu_head *head, rcu_callback_t func) { + static atomic_t doublefrees; unsigned long flags; struct rcu_data *rdp; bool was_alldone; @@ -2885,8 +3026,10 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) * Use rcu:rcu_callback trace event to find the previous * time callback was passed to __call_rcu(). */ - WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pS()!!!\n", - head, head->func); + if (atomic_inc_return(&doublefrees) < 4) { + pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func); + mem_dump_obj(head); + } WRITE_ONCE(head->func, rcu_leak_callback); return; } @@ -2920,9 +3063,10 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) trace_rcu_callback(rcu_state.name, head, rcu_segcblist_n_cbs(&rdp->cblist)); + trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued")); + /* Go handle any RCU core processing required. */ - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) { + if (unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) { __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */ } else { __call_rcu_core(rdp, head, flags); @@ -3022,6 +3166,15 @@ struct kfree_rcu_cpu_work { * @monitor_todo: Tracks whether a @monitor_work delayed work is pending * @initialized: The @rcu_work fields have been initialized * @count: Number of objects for which GP not started + * @bkvcache: + * A simple cache list that contains objects for reuse purpose. + * In order to save some per-cpu space the list is singular. + * Even though it is lockless an access has to be protected by the + * per-cpu lock. + * @page_cache_work: A work to refill the cache when it is empty + * @work_in_progress: Indicates that page_cache_work is running + * @hrtimer: A hrtimer for scheduling a page_cache_work + * @nr_bkv_objs: number of allocated objects at @bkvcache. * * This is a per-CPU structure. The reason that it is not included in * the rcu_data structure is to permit this code to be extracted from @@ -3038,13 +3191,10 @@ struct kfree_rcu_cpu { bool initialized; int count; - /* - * A simple cache list that contains objects for - * reuse purpose. In order to save some per-cpu - * space the list is singular. Even though it is - * lockless an access has to be protected by the - * per-cpu lock. - */ + struct work_struct page_cache_work; + atomic_t work_in_progress; + struct hrtimer hrtimer; + struct llist_head bkvcache; int nr_bkv_objs; }; @@ -3162,10 +3312,10 @@ static void kfree_rcu_work(struct work_struct *work) } rcu_lock_release(&rcu_callback_map); - krcp = krc_this_cpu_lock(&flags); + raw_spin_lock_irqsave(&krcp->lock, flags); if (put_cached_bnode(krcp, bkvhead[i])) bkvhead[i] = NULL; - krc_this_cpu_unlock(krcp, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); if (bkvhead[i]) free_page((unsigned long) bkvhead[i]); @@ -3292,6 +3442,57 @@ static void kfree_rcu_monitor(struct work_struct *work) raw_spin_unlock_irqrestore(&krcp->lock, flags); } +static enum hrtimer_restart +schedule_page_work_fn(struct hrtimer *t) +{ + struct kfree_rcu_cpu *krcp = + container_of(t, struct kfree_rcu_cpu, hrtimer); + + queue_work(system_highpri_wq, &krcp->page_cache_work); + return HRTIMER_NORESTART; +} + +static void fill_page_cache_func(struct work_struct *work) +{ + struct kvfree_rcu_bulk_data *bnode; + struct kfree_rcu_cpu *krcp = + container_of(work, struct kfree_rcu_cpu, + page_cache_work); + unsigned long flags; + bool pushed; + int i; + + for (i = 0; i < rcu_min_cached_objs; i++) { + bnode = (struct kvfree_rcu_bulk_data *) + __get_free_page(GFP_KERNEL | __GFP_NOWARN); + + if (bnode) { + raw_spin_lock_irqsave(&krcp->lock, flags); + pushed = put_cached_bnode(krcp, bnode); + raw_spin_unlock_irqrestore(&krcp->lock, flags); + + if (!pushed) { + free_page((unsigned long) bnode); + break; + } + } + } + + atomic_set(&krcp->work_in_progress, 0); +} + +static void +run_page_cache_worker(struct kfree_rcu_cpu *krcp) +{ + if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && + !atomic_xchg(&krcp->work_in_progress, 1)) { + hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + krcp->hrtimer.function = schedule_page_work_fn; + hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL); + } +} + static inline bool kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr) { @@ -3308,32 +3509,8 @@ kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr) if (!krcp->bkvhead[idx] || krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) { bnode = get_cached_bnode(krcp); - if (!bnode) { - /* - * To keep this path working on raw non-preemptible - * sections, prevent the optional entry into the - * allocator as it uses sleeping locks. In fact, even - * if the caller of kfree_rcu() is preemptible, this - * path still is not, as krcp->lock is a raw spinlock. - * With additional page pre-allocation in the works, - * hitting this return is going to be much less likely. - */ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - return false; - - /* - * NOTE: For one argument of kvfree_rcu() we can - * drop the lock and get the page in sleepable - * context. That would allow to maintain an array - * for the CONFIG_PREEMPT_RT as well if no cached - * pages are available. - */ - bnode = (struct kvfree_rcu_bulk_data *) - __get_free_page(GFP_NOWAIT | __GFP_NOWARN); - } - /* Switch to emergency path. */ - if (unlikely(!bnode)) + if (!bnode) return false; /* Initialize the new block. */ @@ -3397,12 +3574,11 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) goto unlock_return; } - /* - * Under high memory pressure GFP_NOWAIT can fail, - * in that case the emergency path is maintained. - */ + kasan_record_aux_stack(ptr); success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr); if (!success) { + run_page_cache_worker(krcp); + if (head == NULL) // Inline if kvfree_rcu(one_arg) call. goto unlock_return; @@ -3445,7 +3621,7 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) unsigned long count = 0; /* Snapshot count of all CPUs */ - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); count += READ_ONCE(krcp->count); @@ -3460,7 +3636,7 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) int cpu, freed = 0; unsigned long flags; - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { int count; struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); @@ -3493,7 +3669,7 @@ void __init kfree_rcu_scheduler_running(void) int cpu; unsigned long flags; - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); raw_spin_lock_irqsave(&krcp->lock, flags); @@ -3512,7 +3688,7 @@ void __init kfree_rcu_scheduler_running(void) * During early boot, any blocking grace-period wait automatically * implies a grace period. Later on, this is never the case for PREEMPTION. * - * Howevr, because a context switch is a grace period for !PREEMPTION, any + * However, because a context switch is a grace period for !PREEMPTION, any * blocking grace-period wait automatically implies a grace period if * there is only one CPU online at any point time during execution of * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to @@ -3528,7 +3704,20 @@ static int rcu_blocking_is_gp(void) return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE; might_sleep(); /* Check for RCU read-side critical section. */ preempt_disable(); - ret = num_online_cpus() <= 1; + /* + * If the rcu_state.n_online_cpus counter is equal to one, + * there is only one CPU, and that CPU sees all prior accesses + * made by any CPU that was online at the time of its access. + * Furthermore, if this counter is equal to one, its value cannot + * change until after the preempt_enable() below. + * + * Furthermore, if rcu_state.n_online_cpus is equal to one here, + * all later CPUs (both this one and any that come online later + * on) are guaranteed to see all accesses prior to this point + * in the code, without the need for additional memory barriers. + * Those memory barriers are provided by CPU-hotplug code. + */ + ret = READ_ONCE(rcu_state.n_online_cpus) <= 1; preempt_enable(); return ret; } @@ -3573,7 +3762,7 @@ void synchronize_rcu(void) lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_rcu() in RCU read-side critical section"); if (rcu_blocking_is_gp()) - return; + return; // Context allows vacuous grace periods. if (rcu_gp_is_expedited()) synchronize_rcu_expedited(); else @@ -3635,6 +3824,8 @@ static int rcu_pending(int user) struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; + lockdep_assert_irqs_disabled(); + /* Check for CPU stalls, if enabled. */ check_cpu_stall(rdp); @@ -3652,13 +3843,13 @@ static int rcu_pending(int user) return 1; /* Does this CPU have callbacks ready to invoke? */ - if (rcu_segcblist_ready_cbs(&rdp->cblist)) + if (!rcu_segcblist_is_offloaded(&rdp->cblist) && + rcu_segcblist_ready_cbs(&rdp->cblist)) return 1; /* Has RCU gone idle with this CPU needing another grace period? */ if (!gp_in_progress && rcu_segcblist_is_enabled(&rdp->cblist) && - (!IS_ENABLED(CONFIG_RCU_NOCB_CPU) || - !rcu_segcblist_is_offloaded(&rdp->cblist)) && + !rcu_segcblist_is_offloaded(&rdp->cblist) && !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) return 1; @@ -3857,6 +4048,7 @@ rcu_boot_init_percpu_data(int cpu) /* Set up local state, ensuring consistent view of global state. */ rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); + INIT_WORK(&rdp->strict_work, strict_work_handler); WARN_ON_ONCE(rdp->dynticks_nesting != 1); WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp))); rdp->rcu_ofl_gp_seq = rcu_state.gp_seq; @@ -3888,12 +4080,18 @@ int rcutree_prepare_cpu(unsigned int cpu) rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rcu_state.n_force_qs; rdp->blimit = blimit; - if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */ - !rcu_segcblist_is_offloaded(&rdp->cblist)) - rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */ rcu_dynticks_eqs_online(); raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ + /* + * Lock in case the CB/GP kthreads are still around handling + * old callbacks (longer term we should flush all callbacks + * before completing CPU offline) + */ + rcu_nocb_lock(rdp); + if (rcu_segcblist_empty(&rdp->cblist)) /* No early-boot CBs? */ + rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ + rcu_nocb_unlock(rdp); /* * Add CPU to leaf rcu_node pending-online bitmask. Any needed @@ -3908,11 +4106,13 @@ int rcutree_prepare_cpu(unsigned int cpu) rdp->cpu_no_qs.b.norm = true; rdp->core_needs_qs = false; rdp->rcu_iw_pending = false; + rdp->rcu_iw = IRQ_WORK_INIT_HARD(rcu_iw_handler); rdp->rcu_iw_gp_seq = rdp->gp_seq - 1; trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rcu_prepare_kthreads(cpu); rcu_spawn_cpu_nocb_kthread(cpu); + WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus + 1); return 0; } @@ -3975,8 +4175,6 @@ int rcutree_offline_cpu(unsigned int cpu) return 0; } -static DEFINE_PER_CPU(int, rcu_cpu_started); - /* * Mark the specified CPU as being online so that subsequent grace periods * (both expedited and normal) will wait on it. Note that this means that @@ -3996,14 +4194,16 @@ void rcu_cpu_starting(unsigned int cpu) struct rcu_node *rnp; bool newcpu; - if (per_cpu(rcu_cpu_started, cpu)) + rdp = per_cpu_ptr(&rcu_data, cpu); + if (rdp->cpu_started) return; + rdp->cpu_started = true; - per_cpu(rcu_cpu_started, cpu) = 1; - - rdp = per_cpu_ptr(&rcu_data, cpu); rnp = rdp->mynode; mask = rdp->grpmask; + WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1); + WARN_ON_ONCE(!(rnp->ofl_seq & 0x1)); + smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier(). raw_spin_lock_irqsave_rcu_node(rnp, flags); WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask); newcpu = !(rnp->expmaskinitnext & mask); @@ -4014,17 +4214,21 @@ void rcu_cpu_starting(unsigned int cpu) rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */ rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq); rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags); - if (rnp->qsmask & mask) { /* RCU waiting on incoming CPU? */ + + /* An incoming CPU should never be blocking a grace period. */ + if (WARN_ON_ONCE(rnp->qsmask & mask)) { /* RCU waiting on incoming CPU? */ rcu_disable_urgency_upon_qs(rdp); /* Report QS -after- changing ->qsmaskinitnext! */ rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); } else { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } + smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier(). + WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1); + WARN_ON_ONCE(rnp->ofl_seq & 0x1); smp_mb(); /* Ensure RCU read-side usage follows above initialization. */ } -#ifdef CONFIG_HOTPLUG_CPU /* * The outgoing function has no further need of RCU, so remove it from * the rcu_node tree's ->qsmaskinitnext bit masks. @@ -4040,6 +4244,9 @@ void rcu_report_dead(unsigned int cpu) struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ + // Do any dangling deferred wakeups. + do_nocb_deferred_wakeup(rdp); + /* QS for any half-done expedited grace period. */ preempt_disable(); rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); @@ -4048,6 +4255,9 @@ void rcu_report_dead(unsigned int cpu) /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; + WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1); + WARN_ON_ONCE(!(rnp->ofl_seq & 0x1)); + smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier(). raw_spin_lock(&rcu_state.ofl_lock); raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq); @@ -4060,10 +4270,14 @@ void rcu_report_dead(unsigned int cpu) WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); raw_spin_unlock(&rcu_state.ofl_lock); + smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier(). + WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1); + WARN_ON_ONCE(rnp->ofl_seq & 0x1); - per_cpu(rcu_cpu_started, cpu) = 0; + rdp->cpu_started = false; } +#ifdef CONFIG_HOTPLUG_CPU /* * The outgoing CPU has just passed through the dying-idle state, and we * are being invoked from the CPU that was IPIed to continue the offline @@ -4396,24 +4610,14 @@ static void __init kfree_rcu_batch_init(void) for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - struct kvfree_rcu_bulk_data *bnode; for (i = 0; i < KFREE_N_BATCHES; i++) { INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); krcp->krw_arr[i].krcp = krcp; } - for (i = 0; i < rcu_min_cached_objs; i++) { - bnode = (struct kvfree_rcu_bulk_data *) - __get_free_page(GFP_NOWAIT | __GFP_NOWARN); - - if (bnode) - put_cached_bnode(krcp, bnode); - else - pr_err("Failed to preallocate for %d CPU!\n", cpu); - } - INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); + INIT_WORK(&krcp->page_cache_work, fill_page_cache_func); krcp->initialized = true; } if (register_shrinker(&kfree_rcu_shrinker)) |