aboutsummaryrefslogtreecommitdiff
path: root/kernel/rcu/tree.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/rcu/tree.c')
-rw-r--r--kernel/rcu/tree.c384
1 files changed, 250 insertions, 134 deletions
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f78ee759af9c..40e5e3dd253e 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -70,19 +70,6 @@
#endif
#define MODULE_PARAM_PREFIX "rcutree."
-#ifndef data_race
-#define data_race(expr) \
- ({ \
- expr; \
- })
-#endif
-#ifndef ASSERT_EXCLUSIVE_WRITER
-#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0)
-#endif
-#ifndef ASSERT_EXCLUSIVE_ACCESS
-#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0)
-#endif
-
/* Data structures. */
/*
@@ -178,13 +165,19 @@ module_param(gp_init_delay, int, 0444);
static int gp_cleanup_delay;
module_param(gp_cleanup_delay, int, 0444);
+// Add delay to rcu_read_unlock() for strict grace periods.
+static int rcu_unlock_delay;
+#ifdef CONFIG_RCU_STRICT_GRACE_PERIOD
+module_param(rcu_unlock_delay, int, 0444);
+#endif
+
/*
* This rcu parameter is runtime-read-only. It reflects
* a minimum allowed number of objects which can be cached
* per-CPU. Object size is equal to one page. This value
* can be changed at boot time.
*/
-static int rcu_min_cached_objs = 2;
+static int rcu_min_cached_objs = 5;
module_param(rcu_min_cached_objs, int, 0444);
/* Retrieve RCU kthreads priority for rcutorture */
@@ -348,6 +341,14 @@ static bool rcu_dynticks_in_eqs(int snap)
return !(snap & RCU_DYNTICK_CTRL_CTR);
}
+/* Return true if the specified CPU is currently idle from an RCU viewpoint. */
+bool rcu_is_idle_cpu(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
+ return rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp));
+}
+
/*
* Return true if the CPU corresponding to the specified rcu_data
* structure has spent some time in an extended quiescent state since
@@ -416,7 +417,7 @@ bool rcu_eqs_special_set(int cpu)
*
* The caller must have disabled interrupts and must not be idle.
*/
-void rcu_momentary_dyntick_idle(void)
+notrace void rcu_momentary_dyntick_idle(void)
{
int special;
@@ -468,24 +469,25 @@ static int rcu_is_cpu_rrupt_from_idle(void)
return __this_cpu_read(rcu_data.dynticks_nesting) == 0;
}
-#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch ... */
-#define DEFAULT_MAX_RCU_BLIMIT 10000 /* ... even during callback flood. */
+#define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 1000 : 10)
+ // Maximum callbacks per rcu_do_batch ...
+#define DEFAULT_MAX_RCU_BLIMIT 10000 // ... even during callback flood.
static long blimit = DEFAULT_RCU_BLIMIT;
-#define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */
+#define DEFAULT_RCU_QHIMARK 10000 // If this many pending, ignore blimit.
static long qhimark = DEFAULT_RCU_QHIMARK;
-#define DEFAULT_RCU_QLOMARK 100 /* Once only this many pending, use blimit. */
+#define DEFAULT_RCU_QLOMARK 100 // Once only this many pending, use blimit.
static long qlowmark = DEFAULT_RCU_QLOMARK;
#define DEFAULT_RCU_QOVLD_MULT 2
#define DEFAULT_RCU_QOVLD (DEFAULT_RCU_QOVLD_MULT * DEFAULT_RCU_QHIMARK)
-static long qovld = DEFAULT_RCU_QOVLD; /* If this many pending, hammer QS. */
-static long qovld_calc = -1; /* No pre-initialization lock acquisitions! */
+static long qovld = DEFAULT_RCU_QOVLD; // If this many pending, hammer QS.
+static long qovld_calc = -1; // No pre-initialization lock acquisitions!
module_param(blimit, long, 0444);
module_param(qhimark, long, 0444);
module_param(qlowmark, long, 0444);
module_param(qovld, long, 0444);
-static ulong jiffies_till_first_fqs = ULONG_MAX;
+static ulong jiffies_till_first_fqs = IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 0 : ULONG_MAX;
static ulong jiffies_till_next_fqs = ULONG_MAX;
static bool rcu_kick_kthreads;
static int rcu_divisor = 7;
@@ -552,12 +554,12 @@ static int param_set_next_fqs_jiffies(const char *val, const struct kernel_param
return ret;
}
-static struct kernel_param_ops first_fqs_jiffies_ops = {
+static const struct kernel_param_ops first_fqs_jiffies_ops = {
.set = param_set_first_fqs_jiffies,
.get = param_get_ulong,
};
-static struct kernel_param_ops next_fqs_jiffies_ops = {
+static const struct kernel_param_ops next_fqs_jiffies_ops = {
.set = param_set_next_fqs_jiffies,
.get = param_get_ulong,
};
@@ -934,8 +936,8 @@ void __rcu_irq_enter_check_tick(void)
{
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- // Enabling the tick is unsafe in NMI handlers.
- if (WARN_ON_ONCE(in_nmi()))
+ // If we're here from NMI there's nothing to do.
+ if (in_nmi())
return;
RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(),
@@ -1092,11 +1094,6 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp)
}
}
-noinstr bool __rcu_is_watching(void)
-{
- return !rcu_dynticks_curr_cpu_in_eqs();
-}
-
/**
* rcu_is_watching - see if RCU thinks that the current CPU is not idle
*
@@ -1104,8 +1101,11 @@ noinstr bool __rcu_is_watching(void)
* CPU can safely enter RCU read-side critical sections. In other words,
* if the current CPU is not in its idle loop or is in an interrupt or
* NMI handler, return true.
+ *
+ * Make notrace because it can be called by the internal functions of
+ * ftrace, and making this notrace removes unnecessary recursion calls.
*/
-bool rcu_is_watching(void)
+notrace bool rcu_is_watching(void)
{
bool ret;
@@ -1160,7 +1160,7 @@ bool rcu_lockdep_current_cpu_online(void)
preempt_disable_notrace();
rdp = this_cpu_ptr(&rcu_data);
rnp = rdp->mynode;
- if (rdp->grpmask & rcu_rnp_online_cpus(rnp))
+ if (rdp->grpmask & rcu_rnp_online_cpus(rnp) || READ_ONCE(rnp->ofl_seq) & 0x1)
ret = true;
preempt_enable_notrace();
return ret;
@@ -1229,13 +1229,28 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
return 1;
}
- /* If waiting too long on an offline CPU, complain. */
- if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp)) &&
- time_after(jiffies, rcu_state.gp_start + HZ)) {
+ /*
+ * Complain if a CPU that is considered to be offline from RCU's
+ * perspective has not yet reported a quiescent state. After all,
+ * the offline CPU should have reported a quiescent state during
+ * the CPU-offline process, or, failing that, by rcu_gp_init()
+ * if it ran concurrently with either the CPU going offline or the
+ * last task on a leaf rcu_node structure exiting its RCU read-side
+ * critical section while all CPUs corresponding to that structure
+ * are offline. This added warning detects bugs in any of these
+ * code paths.
+ *
+ * The rcu_node structure's ->lock is held here, which excludes
+ * the relevant portions the CPU-hotplug code, the grace-period
+ * initialization code, and the rcu_read_unlock() code paths.
+ *
+ * For more detail, please refer to the "Hotplug CPU" section
+ * of RCU's Requirements documentation.
+ */
+ if (WARN_ON_ONCE(!(rdp->grpmask & rcu_rnp_online_cpus(rnp)))) {
bool onl;
struct rcu_node *rnp1;
- WARN_ON(1); /* Offline CPUs are supposed to report QS! */
pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n",
__func__, rnp->grplo, rnp->grphi, rnp->level,
(long)rnp->gp_seq, (long)rnp->completedqs);
@@ -1307,8 +1322,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
if (IS_ENABLED(CONFIG_IRQ_WORK) &&
!rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq &&
(rnp->ffmask & rdp->grpmask)) {
- init_irq_work(&rdp->rcu_iw, rcu_iw_handler);
- atomic_set(&rdp->rcu_iw.flags, IRQ_WORK_HARD_IRQ);
rdp->rcu_iw_pending = true;
rdp->rcu_iw_gp_seq = rnp->gp_seq;
irq_work_queue_on(&rdp->rcu_iw, rdp->cpu);
@@ -1498,9 +1511,10 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
/* Trace depending on how much we were able to accelerate. */
if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL))
- trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("AccWaitCB"));
+ trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccWaitCB"));
else
- trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("AccReadyCB"));
+ trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccReadyCB"));
+
return ret;
}
@@ -1576,6 +1590,19 @@ static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp,
}
/*
+ * In CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, attempt to generate a
+ * quiescent state. This is intended to be invoked when the CPU notices
+ * a new grace period.
+ */
+static void rcu_strict_gp_check_qs(void)
+{
+ if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) {
+ rcu_read_lock();
+ rcu_read_unlock();
+ }
+}
+
+/*
* Update CPU-local rcu_data state to record the beginnings and ends of
* grace periods. The caller must hold the ->lock of the leaf rcu_node
* structure corresponding to the current CPU, and must have irqs disabled.
@@ -1585,8 +1612,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
{
bool ret = false;
bool need_qs;
- const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
- rcu_segcblist_is_offloaded(&rdp->cblist);
+ const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
raw_lockdep_assert_held_rcu_node(rnp);
@@ -1645,6 +1671,7 @@ static void note_gp_changes(struct rcu_data *rdp)
}
needwake = __note_gp_changes(rnp, rdp);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ rcu_strict_gp_check_qs();
if (needwake)
rcu_gp_kthread_wake();
}
@@ -1683,10 +1710,20 @@ static void rcu_gp_torture_wait(void)
}
/*
+ * Handler for on_each_cpu() to invoke the target CPU's RCU core
+ * processing.
+ */
+static void rcu_strict_gp_boundary(void *unused)
+{
+ invoke_rcu_core();
+}
+
+/*
* Initialize a new grace period. Return false if no grace period required.
*/
static bool rcu_gp_init(void)
{
+ unsigned long firstseq;
unsigned long flags;
unsigned long oldmask;
unsigned long mask;
@@ -1720,13 +1757,22 @@ static bool rcu_gp_init(void)
raw_spin_unlock_irq_rcu_node(rnp);
/*
- * Apply per-leaf buffered online and offline operations to the
- * rcu_node tree. Note that this new grace period need not wait
- * for subsequent online CPUs, and that quiescent-state forcing
- * will handle subsequent offline CPUs.
+ * Apply per-leaf buffered online and offline operations to
+ * the rcu_node tree. Note that this new grace period need not
+ * wait for subsequent online CPUs, and that RCU hooks in the CPU
+ * offlining path, when combined with checks in this function,
+ * will handle CPUs that are currently going offline or that will
+ * go offline later. Please also refer to "Hotplug CPU" section
+ * of RCU's Requirements documentation.
*/
rcu_state.gp_state = RCU_GP_ONOFF;
rcu_for_each_leaf_node(rnp) {
+ smp_mb(); // Pair with barriers used when updating ->ofl_seq to odd values.
+ firstseq = READ_ONCE(rnp->ofl_seq);
+ if (firstseq & 0x1)
+ while (firstseq == READ_ONCE(rnp->ofl_seq))
+ schedule_timeout_idle(1); // Can't wake unless RCU is watching.
+ smp_mb(); // Pair with barriers used when updating ->ofl_seq to even values.
raw_spin_lock(&rcu_state.ofl_lock);
raw_spin_lock_irq_rcu_node(rnp);
if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
@@ -1810,6 +1856,10 @@ static bool rcu_gp_init(void)
WRITE_ONCE(rcu_state.gp_activity, jiffies);
}
+ // If strict, make all CPUs aware of new grace period.
+ if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
+ on_each_cpu(rcu_strict_gp_boundary, NULL, 0);
+
return true;
}
@@ -1898,7 +1948,7 @@ static void rcu_gp_fqs_loop(void)
break;
/* If time for quiescent-state forcing, do it. */
if (!time_after(rcu_state.jiffies_force_qs, jiffies) ||
- (gf & RCU_GP_FLAG_FQS)) {
+ (gf & (RCU_GP_FLAG_FQS | RCU_GP_FLAG_OVLD))) {
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("fqsstart"));
rcu_gp_fqs(first_gp_fqs);
@@ -2013,8 +2063,7 @@ static void rcu_gp_cleanup(void)
needgp = true;
}
/* Advance CBs to reduce false positives below. */
- offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
- rcu_segcblist_is_offloaded(&rdp->cblist);
+ offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) {
WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT);
WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
@@ -2026,6 +2075,10 @@ static void rcu_gp_cleanup(void)
rcu_state.gp_flags & RCU_GP_FLAG_INIT);
}
raw_spin_unlock_irq_rcu_node(rnp);
+
+ // If strict, make all CPUs aware of the end of the old grace period.
+ if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
+ on_each_cpu(rcu_strict_gp_boundary, NULL, 0);
}
/*
@@ -2204,15 +2257,15 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
* structure. This must be called from the specified CPU.
*/
static void
-rcu_report_qs_rdp(int cpu, struct rcu_data *rdp)
+rcu_report_qs_rdp(struct rcu_data *rdp)
{
unsigned long flags;
unsigned long mask;
bool needwake = false;
- const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
- rcu_segcblist_is_offloaded(&rdp->cblist);
+ const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
struct rcu_node *rnp;
+ WARN_ON_ONCE(rdp->cpu != smp_processor_id());
rnp = rdp->mynode;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rdp->cpu_no_qs.b.norm || rdp->gp_seq != rnp->gp_seq ||
@@ -2229,8 +2282,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp)
return;
}
mask = rdp->grpmask;
- if (rdp->cpu == smp_processor_id())
- rdp->core_needs_qs = false;
+ rdp->core_needs_qs = false;
if ((rnp->qsmask & mask) == 0) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
} else {
@@ -2279,7 +2331,7 @@ rcu_check_quiescent_state(struct rcu_data *rdp)
* Tell RCU we are done (but rcu_report_qs_rdp() will be the
* judge of that).
*/
- rcu_report_qs_rdp(rdp->cpu, rdp);
+ rcu_report_qs_rdp(rdp);
}
/*
@@ -2360,6 +2412,7 @@ int rcutree_dead_cpu(unsigned int cpu)
if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
return 0;
+ WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1);
/* Adjust any no-longer-needed kthreads. */
rcu_boost_kthread_setaffinity(rnp, -1);
/* Do any needed no-CB deferred wakeups from this CPU. */
@@ -2376,9 +2429,9 @@ int rcutree_dead_cpu(unsigned int cpu)
*/
static void rcu_do_batch(struct rcu_data *rdp)
{
+ int div;
unsigned long flags;
- const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
- rcu_segcblist_is_offloaded(&rdp->cblist);
+ const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
struct rcu_head *rhp;
struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
long bl, count;
@@ -2404,9 +2457,15 @@ static void rcu_do_batch(struct rcu_data *rdp)
rcu_nocb_lock(rdp);
WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
pending = rcu_segcblist_n_cbs(&rdp->cblist);
- bl = max(rdp->blimit, pending >> rcu_divisor);
- if (unlikely(bl > 100))
- tlimit = local_clock() + rcu_resched_ns;
+ div = READ_ONCE(rcu_divisor);
+ div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div;
+ bl = max(rdp->blimit, pending >> div);
+ if (unlikely(bl > 100)) {
+ long rrn = READ_ONCE(rcu_resched_ns);
+
+ rrn = rrn < NSEC_PER_MSEC ? NSEC_PER_MSEC : rrn > NSEC_PER_SEC ? NSEC_PER_SEC : rrn;
+ tlimit = local_clock() + rrn;
+ }
trace_rcu_batch_start(rcu_state.name,
rcu_segcblist_n_cbs(&rdp->cblist), bl);
rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
@@ -2547,8 +2606,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rcu_state.cbovldnext |= !!rnp->cbovldmask;
if (rnp->qsmask == 0) {
- if (!IS_ENABLED(CONFIG_PREEMPT_RCU) ||
- rcu_preempt_blocked_readers_cgp(rnp)) {
+ if (rcu_preempt_blocked_readers_cgp(rnp)) {
/*
* No point in scanning bits because they
* are all zero. But we might need to
@@ -2616,14 +2674,21 @@ void rcu_force_quiescent_state(void)
}
EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
+// Workqueue handler for an RCU reader for kernels enforcing struct RCU
+// grace periods.
+static void strict_work_handler(struct work_struct *work)
+{
+ rcu_read_lock();
+ rcu_read_unlock();
+}
+
/* Perform RCU core processing work for the current CPU. */
static __latent_entropy void rcu_core(void)
{
unsigned long flags;
struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
- const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
- rcu_segcblist_is_offloaded(&rdp->cblist);
+ const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
if (cpu_is_offline(smp_processor_id()))
return;
@@ -2660,6 +2725,10 @@ static __latent_entropy void rcu_core(void)
/* Do any needed deferred wakeups of rcuo kthreads. */
do_nocb_deferred_wakeup(rdp);
trace_rcu_utilization(TPS("End RCU core"));
+
+ // If strict GPs, schedule an RCU reader in a clean environment.
+ if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
+ queue_work_on(rdp->cpu, rcu_gp_wq, &rdp->strict_work);
}
static void rcu_core_si(struct softirq_action *h)
@@ -2921,8 +2990,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
rcu_segcblist_n_cbs(&rdp->cblist));
/* Go handle any RCU core processing required. */
- if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
- unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) {
+ if (unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) {
__call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */
} else {
__call_rcu_core(rdp, head, flags);
@@ -3022,6 +3090,15 @@ struct kfree_rcu_cpu_work {
* @monitor_todo: Tracks whether a @monitor_work delayed work is pending
* @initialized: The @rcu_work fields have been initialized
* @count: Number of objects for which GP not started
+ * @bkvcache:
+ * A simple cache list that contains objects for reuse purpose.
+ * In order to save some per-cpu space the list is singular.
+ * Even though it is lockless an access has to be protected by the
+ * per-cpu lock.
+ * @page_cache_work: A work to refill the cache when it is empty
+ * @work_in_progress: Indicates that page_cache_work is running
+ * @hrtimer: A hrtimer for scheduling a page_cache_work
+ * @nr_bkv_objs: number of allocated objects at @bkvcache.
*
* This is a per-CPU structure. The reason that it is not included in
* the rcu_data structure is to permit this code to be extracted from
@@ -3038,13 +3115,10 @@ struct kfree_rcu_cpu {
bool initialized;
int count;
- /*
- * A simple cache list that contains objects for
- * reuse purpose. In order to save some per-cpu
- * space the list is singular. Even though it is
- * lockless an access has to be protected by the
- * per-cpu lock.
- */
+ struct work_struct page_cache_work;
+ atomic_t work_in_progress;
+ struct hrtimer hrtimer;
+
struct llist_head bkvcache;
int nr_bkv_objs;
};
@@ -3162,10 +3236,10 @@ static void kfree_rcu_work(struct work_struct *work)
}
rcu_lock_release(&rcu_callback_map);
- krcp = krc_this_cpu_lock(&flags);
+ raw_spin_lock_irqsave(&krcp->lock, flags);
if (put_cached_bnode(krcp, bkvhead[i]))
bkvhead[i] = NULL;
- krc_this_cpu_unlock(krcp, flags);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
if (bkvhead[i])
free_page((unsigned long) bkvhead[i]);
@@ -3292,6 +3366,57 @@ static void kfree_rcu_monitor(struct work_struct *work)
raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
+static enum hrtimer_restart
+schedule_page_work_fn(struct hrtimer *t)
+{
+ struct kfree_rcu_cpu *krcp =
+ container_of(t, struct kfree_rcu_cpu, hrtimer);
+
+ queue_work(system_highpri_wq, &krcp->page_cache_work);
+ return HRTIMER_NORESTART;
+}
+
+static void fill_page_cache_func(struct work_struct *work)
+{
+ struct kvfree_rcu_bulk_data *bnode;
+ struct kfree_rcu_cpu *krcp =
+ container_of(work, struct kfree_rcu_cpu,
+ page_cache_work);
+ unsigned long flags;
+ bool pushed;
+ int i;
+
+ for (i = 0; i < rcu_min_cached_objs; i++) {
+ bnode = (struct kvfree_rcu_bulk_data *)
+ __get_free_page(GFP_KERNEL | __GFP_NOWARN);
+
+ if (bnode) {
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ pushed = put_cached_bnode(krcp, bnode);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ if (!pushed) {
+ free_page((unsigned long) bnode);
+ break;
+ }
+ }
+ }
+
+ atomic_set(&krcp->work_in_progress, 0);
+}
+
+static void
+run_page_cache_worker(struct kfree_rcu_cpu *krcp)
+{
+ if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
+ !atomic_xchg(&krcp->work_in_progress, 1)) {
+ hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL);
+ krcp->hrtimer.function = schedule_page_work_fn;
+ hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
+ }
+}
+
static inline bool
kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
{
@@ -3308,32 +3433,8 @@ kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
if (!krcp->bkvhead[idx] ||
krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
bnode = get_cached_bnode(krcp);
- if (!bnode) {
- /*
- * To keep this path working on raw non-preemptible
- * sections, prevent the optional entry into the
- * allocator as it uses sleeping locks. In fact, even
- * if the caller of kfree_rcu() is preemptible, this
- * path still is not, as krcp->lock is a raw spinlock.
- * With additional page pre-allocation in the works,
- * hitting this return is going to be much less likely.
- */
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- return false;
-
- /*
- * NOTE: For one argument of kvfree_rcu() we can
- * drop the lock and get the page in sleepable
- * context. That would allow to maintain an array
- * for the CONFIG_PREEMPT_RT as well if no cached
- * pages are available.
- */
- bnode = (struct kvfree_rcu_bulk_data *)
- __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
- }
-
/* Switch to emergency path. */
- if (unlikely(!bnode))
+ if (!bnode)
return false;
/* Initialize the new block. */
@@ -3397,12 +3498,10 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
goto unlock_return;
}
- /*
- * Under high memory pressure GFP_NOWAIT can fail,
- * in that case the emergency path is maintained.
- */
success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr);
if (!success) {
+ run_page_cache_worker(krcp);
+
if (head == NULL)
// Inline if kvfree_rcu(one_arg) call.
goto unlock_return;
@@ -3445,7 +3544,7 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
unsigned long count = 0;
/* Snapshot count of all CPUs */
- for_each_online_cpu(cpu) {
+ for_each_possible_cpu(cpu) {
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
count += READ_ONCE(krcp->count);
@@ -3460,7 +3559,7 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
int cpu, freed = 0;
unsigned long flags;
- for_each_online_cpu(cpu) {
+ for_each_possible_cpu(cpu) {
int count;
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
@@ -3493,7 +3592,7 @@ void __init kfree_rcu_scheduler_running(void)
int cpu;
unsigned long flags;
- for_each_online_cpu(cpu) {
+ for_each_possible_cpu(cpu) {
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
raw_spin_lock_irqsave(&krcp->lock, flags);
@@ -3512,7 +3611,7 @@ void __init kfree_rcu_scheduler_running(void)
* During early boot, any blocking grace-period wait automatically
* implies a grace period. Later on, this is never the case for PREEMPTION.
*
- * Howevr, because a context switch is a grace period for !PREEMPTION, any
+ * However, because a context switch is a grace period for !PREEMPTION, any
* blocking grace-period wait automatically implies a grace period if
* there is only one CPU online at any point time during execution of
* either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to
@@ -3528,7 +3627,20 @@ static int rcu_blocking_is_gp(void)
return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE;
might_sleep(); /* Check for RCU read-side critical section. */
preempt_disable();
- ret = num_online_cpus() <= 1;
+ /*
+ * If the rcu_state.n_online_cpus counter is equal to one,
+ * there is only one CPU, and that CPU sees all prior accesses
+ * made by any CPU that was online at the time of its access.
+ * Furthermore, if this counter is equal to one, its value cannot
+ * change until after the preempt_enable() below.
+ *
+ * Furthermore, if rcu_state.n_online_cpus is equal to one here,
+ * all later CPUs (both this one and any that come online later
+ * on) are guaranteed to see all accesses prior to this point
+ * in the code, without the need for additional memory barriers.
+ * Those memory barriers are provided by CPU-hotplug code.
+ */
+ ret = READ_ONCE(rcu_state.n_online_cpus) <= 1;
preempt_enable();
return ret;
}
@@ -3573,7 +3685,7 @@ void synchronize_rcu(void)
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_rcu() in RCU read-side critical section");
if (rcu_blocking_is_gp())
- return;
+ return; // Context allows vacuous grace periods.
if (rcu_gp_is_expedited())
synchronize_rcu_expedited();
else
@@ -3652,13 +3764,13 @@ static int rcu_pending(int user)
return 1;
/* Does this CPU have callbacks ready to invoke? */
- if (rcu_segcblist_ready_cbs(&rdp->cblist))
+ if (!rcu_segcblist_is_offloaded(&rdp->cblist) &&
+ rcu_segcblist_ready_cbs(&rdp->cblist))
return 1;
/* Has RCU gone idle with this CPU needing another grace period? */
if (!gp_in_progress && rcu_segcblist_is_enabled(&rdp->cblist) &&
- (!IS_ENABLED(CONFIG_RCU_NOCB_CPU) ||
- !rcu_segcblist_is_offloaded(&rdp->cblist)) &&
+ !rcu_segcblist_is_offloaded(&rdp->cblist) &&
!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
return 1;
@@ -3857,6 +3969,7 @@ rcu_boot_init_percpu_data(int cpu)
/* Set up local state, ensuring consistent view of global state. */
rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
+ INIT_WORK(&rdp->strict_work, strict_work_handler);
WARN_ON_ONCE(rdp->dynticks_nesting != 1);
WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp)));
rdp->rcu_ofl_gp_seq = rcu_state.gp_seq;
@@ -3908,11 +4021,13 @@ int rcutree_prepare_cpu(unsigned int cpu)
rdp->cpu_no_qs.b.norm = true;
rdp->core_needs_qs = false;
rdp->rcu_iw_pending = false;
+ rdp->rcu_iw = IRQ_WORK_INIT_HARD(rcu_iw_handler);
rdp->rcu_iw_gp_seq = rdp->gp_seq - 1;
trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl"));
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
rcu_prepare_kthreads(cpu);
rcu_spawn_cpu_nocb_kthread(cpu);
+ WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus + 1);
return 0;
}
@@ -3975,8 +4090,6 @@ int rcutree_offline_cpu(unsigned int cpu)
return 0;
}
-static DEFINE_PER_CPU(int, rcu_cpu_started);
-
/*
* Mark the specified CPU as being online so that subsequent grace periods
* (both expedited and normal) will wait on it. Note that this means that
@@ -3996,14 +4109,16 @@ void rcu_cpu_starting(unsigned int cpu)
struct rcu_node *rnp;
bool newcpu;
- if (per_cpu(rcu_cpu_started, cpu))
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ if (rdp->cpu_started)
return;
+ rdp->cpu_started = true;
- per_cpu(rcu_cpu_started, cpu) = 1;
-
- rdp = per_cpu_ptr(&rcu_data, cpu);
rnp = rdp->mynode;
mask = rdp->grpmask;
+ WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
+ WARN_ON_ONCE(!(rnp->ofl_seq & 0x1));
+ smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
raw_spin_lock_irqsave_rcu_node(rnp, flags);
WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask);
newcpu = !(rnp->expmaskinitnext & mask);
@@ -4014,17 +4129,21 @@ void rcu_cpu_starting(unsigned int cpu)
rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */
rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq);
rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags);
- if (rnp->qsmask & mask) { /* RCU waiting on incoming CPU? */
+
+ /* An incoming CPU should never be blocking a grace period. */
+ if (WARN_ON_ONCE(rnp->qsmask & mask)) { /* RCU waiting on incoming CPU? */
rcu_disable_urgency_upon_qs(rdp);
/* Report QS -after- changing ->qsmaskinitnext! */
rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
} else {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
+ smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
+ WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
+ WARN_ON_ONCE(rnp->ofl_seq & 0x1);
smp_mb(); /* Ensure RCU read-side usage follows above initialization. */
}
-#ifdef CONFIG_HOTPLUG_CPU
/*
* The outgoing function has no further need of RCU, so remove it from
* the rcu_node tree's ->qsmaskinitnext bit masks.
@@ -4048,6 +4167,9 @@ void rcu_report_dead(unsigned int cpu)
/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
mask = rdp->grpmask;
+ WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
+ WARN_ON_ONCE(!(rnp->ofl_seq & 0x1));
+ smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
raw_spin_lock(&rcu_state.ofl_lock);
raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
@@ -4060,10 +4182,14 @@ void rcu_report_dead(unsigned int cpu)
WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
raw_spin_unlock(&rcu_state.ofl_lock);
+ smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
+ WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
+ WARN_ON_ONCE(rnp->ofl_seq & 0x1);
- per_cpu(rcu_cpu_started, cpu) = 0;
+ rdp->cpu_started = false;
}
+#ifdef CONFIG_HOTPLUG_CPU
/*
* The outgoing CPU has just passed through the dying-idle state, and we
* are being invoked from the CPU that was IPIed to continue the offline
@@ -4396,24 +4522,14 @@ static void __init kfree_rcu_batch_init(void)
for_each_possible_cpu(cpu) {
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
- struct kvfree_rcu_bulk_data *bnode;
for (i = 0; i < KFREE_N_BATCHES; i++) {
INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
krcp->krw_arr[i].krcp = krcp;
}
- for (i = 0; i < rcu_min_cached_objs; i++) {
- bnode = (struct kvfree_rcu_bulk_data *)
- __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
-
- if (bnode)
- put_cached_bnode(krcp, bnode);
- else
- pr_err("Failed to preallocate for %d CPU!\n", cpu);
- }
-
INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
+ INIT_WORK(&krcp->page_cache_work, fill_page_cache_func);
krcp->initialized = true;
}
if (register_shrinker(&kfree_rcu_shrinker))