aboutsummaryrefslogtreecommitdiff
path: root/kernel/rcu/tree.c
diff options
context:
space:
mode:
authorDmitry Torokhov <[email protected]>2020-12-14 16:27:23 -0800
committerDmitry Torokhov <[email protected]>2020-12-14 16:27:23 -0800
commit4b4193256c8d3bc3a5397b5cd9494c2ad386317d (patch)
treeb6b070f8893384b5d563fc616018e7d5644b2ece /kernel/rcu/tree.c
parentcffdd6d90482316e18d686060a4397902ea04bd2 (diff)
parent92f0a3a22c7a4c936277ece3a0215955a2d52238 (diff)
Merge branch 'next' into for-linus
Prepare input updates for 5.11 merge window.
Diffstat (limited to 'kernel/rcu/tree.c')
-rw-r--r--kernel/rcu/tree.c826
1 files changed, 625 insertions, 201 deletions
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index d9a49cd6065a..f78ee759af9c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -57,6 +57,9 @@
#include <linux/slab.h>
#include <linux/sched/isolation.h>
#include <linux/sched/clock.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/kasan.h>
#include "../time/tick-internal.h"
#include "tree.h"
@@ -67,6 +70,19 @@
#endif
#define MODULE_PARAM_PREFIX "rcutree."
+#ifndef data_race
+#define data_race(expr) \
+ ({ \
+ expr; \
+ })
+#endif
+#ifndef ASSERT_EXCLUSIVE_WRITER
+#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0)
+#endif
+#ifndef ASSERT_EXCLUSIVE_ACCESS
+#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0)
+#endif
+
/* Data structures. */
/*
@@ -75,9 +91,6 @@
*/
#define RCU_DYNTICK_CTRL_MASK 0x1
#define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1)
-#ifndef rcu_eqs_special_exit
-#define rcu_eqs_special_exit() do { } while (0)
-#endif
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
.dynticks_nesting = 1,
@@ -100,7 +113,7 @@ static struct rcu_state rcu_state = {
static bool dump_tree;
module_param(dump_tree, bool, 0444);
/* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */
-static bool use_softirq = 1;
+static bool use_softirq = true;
module_param(use_softirq, bool, 0444);
/* Control rcu_node-tree auto-balancing at boot time. */
static bool rcu_fanout_exact;
@@ -165,6 +178,15 @@ module_param(gp_init_delay, int, 0444);
static int gp_cleanup_delay;
module_param(gp_cleanup_delay, int, 0444);
+/*
+ * This rcu parameter is runtime-read-only. It reflects
+ * a minimum allowed number of objects which can be cached
+ * per-CPU. Object size is equal to one page. This value
+ * can be changed at boot time.
+ */
+static int rcu_min_cached_objs = 2;
+module_param(rcu_min_cached_objs, int, 0444);
+
/* Retrieve RCU kthreads priority for rcutorture */
int rcu_get_gp_kthreads_prio(void)
{
@@ -225,9 +247,11 @@ void rcu_softirq_qs(void)
/*
* Record entry into an extended quiescent state. This is only to be
- * called when not already in an extended quiescent state.
+ * called when not already in an extended quiescent state, that is,
+ * RCU is watching prior to the call to this function and is no longer
+ * watching upon return.
*/
-static void rcu_dynticks_eqs_enter(void)
+static noinstr void rcu_dynticks_eqs_enter(void)
{
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
int seq;
@@ -237,8 +261,9 @@ static void rcu_dynticks_eqs_enter(void)
* critical sections, and we also must force ordering with the
* next idle sojourn.
*/
- seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
- /* Better be in an extended quiescent state! */
+ rcu_dynticks_task_trace_enter(); // Before ->dynticks update!
+ seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
+ // RCU is no longer watching. Better be in extended quiescent state!
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
(seq & RCU_DYNTICK_CTRL_CTR));
/* Better not have special action (TLB flush) pending! */
@@ -248,9 +273,10 @@ static void rcu_dynticks_eqs_enter(void)
/*
* Record exit from an extended quiescent state. This is only to be
- * called from an extended quiescent state.
+ * called from an extended quiescent state, that is, RCU is not watching
+ * prior to the call to this function and is watching upon return.
*/
-static void rcu_dynticks_eqs_exit(void)
+static noinstr void rcu_dynticks_eqs_exit(void)
{
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
int seq;
@@ -260,14 +286,14 @@ static void rcu_dynticks_eqs_exit(void)
* and we also must force ordering with the next RCU read-side
* critical section.
*/
- seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
+ seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
+ // RCU is now watching. Better not be in an extended quiescent state!
+ rcu_dynticks_task_trace_exit(); // After ->dynticks update!
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
!(seq & RCU_DYNTICK_CTRL_CTR));
if (seq & RCU_DYNTICK_CTRL_MASK) {
- atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks);
+ arch_atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks);
smp_mb__after_atomic(); /* _exit after clearing mask. */
- /* Prefer duplicate flushes to losing a flush. */
- rcu_eqs_special_exit();
}
}
@@ -295,11 +321,11 @@ static void rcu_dynticks_eqs_online(void)
*
* No ordering, as we are sampling CPU-local information.
*/
-static bool rcu_dynticks_curr_cpu_in_eqs(void)
+static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void)
{
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- return !(atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR);
+ return !(arch_atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR);
}
/*
@@ -333,6 +359,28 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap)
}
/*
+ * Return true if the referenced integer is zero while the specified
+ * CPU remains within a single extended quiescent state.
+ */
+bool rcu_dynticks_zero_in_eqs(int cpu, int *vp)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ int snap;
+
+ // If not quiescent, force back to earlier extended quiescent state.
+ snap = atomic_read(&rdp->dynticks) & ~(RCU_DYNTICK_CTRL_MASK |
+ RCU_DYNTICK_CTRL_CTR);
+
+ smp_rmb(); // Order ->dynticks and *vp reads.
+ if (READ_ONCE(*vp))
+ return false; // Non-zero, so report failure;
+ smp_rmb(); // Order *vp read and ->dynticks re-read.
+
+ // If still in the same extended quiescent state, we are good!
+ return snap == (atomic_read(&rdp->dynticks) & ~RCU_DYNTICK_CTRL_MASK);
+}
+
+/*
* Set the special (bottom) bit of the specified CPU so that it
* will take special action (such as flushing its TLB) on the
* next exit from an extended quiescent state. Returns true if
@@ -382,16 +430,23 @@ void rcu_momentary_dyntick_idle(void)
EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle);
/**
- * rcu_is_cpu_rrupt_from_idle - see if interrupted from idle
+ * rcu_is_cpu_rrupt_from_idle - see if 'interrupted' from idle
*
* If the current CPU is idle and running at a first-level (not nested)
- * interrupt from idle, return true. The caller must have at least
- * disabled preemption.
+ * interrupt, or directly, from idle, return true.
+ *
+ * The caller must have at least disabled IRQs.
*/
static int rcu_is_cpu_rrupt_from_idle(void)
{
- /* Called only from within the scheduling-clock interrupt */
- lockdep_assert_in_irq();
+ long nesting;
+
+ /*
+ * Usually called from the tick; but also used from smp_function_call()
+ * for expedited grace periods. This latter can result in running from
+ * the idle task, instead of an actual IPI.
+ */
+ lockdep_assert_irqs_disabled();
/* Check for counter underflows */
RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) < 0,
@@ -400,9 +455,15 @@ static int rcu_is_cpu_rrupt_from_idle(void)
"RCU dynticks_nmi_nesting counter underflow/zero!");
/* Are we at first interrupt nesting level? */
- if (__this_cpu_read(rcu_data.dynticks_nmi_nesting) != 1)
+ nesting = __this_cpu_read(rcu_data.dynticks_nmi_nesting);
+ if (nesting > 1)
return false;
+ /*
+ * If we're not in an interrupt, we must be in the idle task!
+ */
+ WARN_ON_ONCE(!nesting && !is_idle_task(current));
+
/* Does CPU appear to be idle from an RCU standpoint? */
return __this_cpu_read(rcu_data.dynticks_nesting) == 0;
}
@@ -562,7 +623,7 @@ EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
* the possibility of usermode upcalls having messed up our count
* of interrupt nesting level during the prior busy period.
*/
-static void rcu_eqs_enter(bool user)
+static noinstr void rcu_eqs_enter(bool user)
{
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
@@ -571,19 +632,28 @@ static void rcu_eqs_enter(bool user)
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
rdp->dynticks_nesting == 0);
if (rdp->dynticks_nesting != 1) {
+ // RCU will still be watching, so just do accounting and leave.
rdp->dynticks_nesting--;
return;
}
lockdep_assert_irqs_disabled();
+ instrumentation_begin();
trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks));
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
rdp = this_cpu_ptr(&rcu_data);
do_nocb_deferred_wakeup(rdp);
rcu_prepare_for_idle();
rcu_preempt_deferred_qs(current);
+
+ // instrumentation for the noinstr rcu_dynticks_eqs_enter()
+ instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
+
+ instrumentation_end();
WRITE_ONCE(rdp->dynticks_nesting, 0); /* Avoid irq-access tearing. */
+ // RCU is watching here ...
rcu_dynticks_eqs_enter();
+ // ... but is no longer watching here.
rcu_dynticks_task_enter();
}
@@ -603,6 +673,7 @@ void rcu_idle_enter(void)
lockdep_assert_irqs_disabled();
rcu_eqs_enter(false);
}
+EXPORT_SYMBOL_GPL(rcu_idle_enter);
#ifdef CONFIG_NO_HZ_FULL
/**
@@ -616,26 +687,29 @@ void rcu_idle_enter(void)
* If you add or remove a call to rcu_user_enter(), be sure to test with
* CONFIG_RCU_EQS_DEBUG=y.
*/
-void rcu_user_enter(void)
+noinstr void rcu_user_enter(void)
{
lockdep_assert_irqs_disabled();
rcu_eqs_enter(true);
}
#endif /* CONFIG_NO_HZ_FULL */
-/*
+/**
+ * rcu_nmi_exit - inform RCU of exit from NMI context
+ *
* If we are returning from the outermost NMI handler that interrupted an
* RCU-idle period, update rdp->dynticks and rdp->dynticks_nmi_nesting
* to let the RCU grace-period handling know that the CPU is back to
* being RCU-idle.
*
- * If you add or remove a call to rcu_nmi_exit_common(), be sure to test
+ * If you add or remove a call to rcu_nmi_exit(), be sure to test
* with CONFIG_RCU_EQS_DEBUG=y.
*/
-static __always_inline void rcu_nmi_exit_common(bool irq)
+noinstr void rcu_nmi_exit(void)
{
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
+ instrumentation_begin();
/*
* Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
* (We are exiting an NMI handler, so RCU better be paying attention
@@ -653,6 +727,7 @@ static __always_inline void rcu_nmi_exit_common(bool irq)
atomic_read(&rdp->dynticks));
WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */
rdp->dynticks_nmi_nesting - 2);
+ instrumentation_end();
return;
}
@@ -660,27 +735,22 @@ static __always_inline void rcu_nmi_exit_common(bool irq)
trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks));
WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
- if (irq)
+ if (!in_nmi())
rcu_prepare_for_idle();
+ // instrumentation for the noinstr rcu_dynticks_eqs_enter()
+ instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
+ instrumentation_end();
+
+ // RCU is watching here ...
rcu_dynticks_eqs_enter();
+ // ... but is no longer watching here.
- if (irq)
+ if (!in_nmi())
rcu_dynticks_task_enter();
}
/**
- * rcu_nmi_exit - inform RCU of exit from NMI context
- *
- * If you add or remove a call to rcu_nmi_exit(), be sure to test
- * with CONFIG_RCU_EQS_DEBUG=y.
- */
-void rcu_nmi_exit(void)
-{
- rcu_nmi_exit_common(false);
-}
-
-/**
* rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
*
* Exit from an interrupt handler, which might possibly result in entering
@@ -699,11 +769,51 @@ void rcu_nmi_exit(void)
* If you add or remove a call to rcu_irq_exit(), be sure to test with
* CONFIG_RCU_EQS_DEBUG=y.
*/
-void rcu_irq_exit(void)
+void noinstr rcu_irq_exit(void)
+{
+ lockdep_assert_irqs_disabled();
+ rcu_nmi_exit();
+}
+
+/**
+ * rcu_irq_exit_preempt - Inform RCU that current CPU is exiting irq
+ * towards in kernel preemption
+ *
+ * Same as rcu_irq_exit() but has a sanity check that scheduling is safe
+ * from RCU point of view. Invoked from return from interrupt before kernel
+ * preemption.
+ */
+void rcu_irq_exit_preempt(void)
+{
+ lockdep_assert_irqs_disabled();
+ rcu_nmi_exit();
+
+ RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0,
+ "RCU dynticks_nesting counter underflow/zero!");
+ RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) !=
+ DYNTICK_IRQ_NONIDLE,
+ "Bad RCU dynticks_nmi_nesting counter\n");
+ RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(),
+ "RCU in extended quiescent state!");
+}
+
+#ifdef CONFIG_PROVE_RCU
+/**
+ * rcu_irq_exit_check_preempt - Validate that scheduling is possible
+ */
+void rcu_irq_exit_check_preempt(void)
{
lockdep_assert_irqs_disabled();
- rcu_nmi_exit_common(true);
+
+ RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0,
+ "RCU dynticks_nesting counter underflow/zero!");
+ RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) !=
+ DYNTICK_IRQ_NONIDLE,
+ "Bad RCU dynticks_nmi_nesting counter\n");
+ RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(),
+ "RCU in extended quiescent state!");
}
+#endif /* #ifdef CONFIG_PROVE_RCU */
/*
* Wrapper for rcu_irq_exit() where interrupts are enabled.
@@ -728,7 +838,7 @@ void rcu_irq_exit_irqson(void)
* allow for the possibility of usermode upcalls messing up our count of
* interrupt nesting level during the busy period that is just now starting.
*/
-static void rcu_eqs_exit(bool user)
+static void noinstr rcu_eqs_exit(bool user)
{
struct rcu_data *rdp;
long oldval;
@@ -738,17 +848,26 @@ static void rcu_eqs_exit(bool user)
oldval = rdp->dynticks_nesting;
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
if (oldval) {
+ // RCU was already watching, so just do accounting and leave.
rdp->dynticks_nesting++;
return;
}
rcu_dynticks_task_exit();
+ // RCU is not watching here ...
rcu_dynticks_eqs_exit();
+ // ... but is watching here.
+ instrumentation_begin();
+
+ // instrumentation for the noinstr rcu_dynticks_eqs_exit()
+ instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
+
rcu_cleanup_after_idle();
trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks));
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
WRITE_ONCE(rdp->dynticks_nesting, 1);
WARN_ON_ONCE(rdp->dynticks_nmi_nesting);
WRITE_ONCE(rdp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE);
+ instrumentation_end();
}
/**
@@ -768,6 +887,7 @@ void rcu_idle_exit(void)
rcu_eqs_exit(false);
local_irq_restore(flags);
}
+EXPORT_SYMBOL_GPL(rcu_idle_exit);
#ifdef CONFIG_NO_HZ_FULL
/**
@@ -779,15 +899,75 @@ void rcu_idle_exit(void)
* If you add or remove a call to rcu_user_exit(), be sure to test with
* CONFIG_RCU_EQS_DEBUG=y.
*/
-void rcu_user_exit(void)
+void noinstr rcu_user_exit(void)
{
rcu_eqs_exit(1);
}
+
+/**
+ * __rcu_irq_enter_check_tick - Enable scheduler tick on CPU if RCU needs it.
+ *
+ * The scheduler tick is not normally enabled when CPUs enter the kernel
+ * from nohz_full userspace execution. After all, nohz_full userspace
+ * execution is an RCU quiescent state and the time executing in the kernel
+ * is quite short. Except of course when it isn't. And it is not hard to
+ * cause a large system to spend tens of seconds or even minutes looping
+ * in the kernel, which can cause a number of problems, include RCU CPU
+ * stall warnings.
+ *
+ * Therefore, if a nohz_full CPU fails to report a quiescent state
+ * in a timely manner, the RCU grace-period kthread sets that CPU's
+ * ->rcu_urgent_qs flag with the expectation that the next interrupt or
+ * exception will invoke this function, which will turn on the scheduler
+ * tick, which will enable RCU to detect that CPU's quiescent states,
+ * for example, due to cond_resched() calls in CONFIG_PREEMPT=n kernels.
+ * The tick will be disabled once a quiescent state is reported for
+ * this CPU.
+ *
+ * Of course, in carefully tuned systems, there might never be an
+ * interrupt or exception. In that case, the RCU grace-period kthread
+ * will eventually cause one to happen. However, in less carefully
+ * controlled environments, this function allows RCU to get what it
+ * needs without creating otherwise useless interruptions.
+ */
+void __rcu_irq_enter_check_tick(void)
+{
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
+
+ // Enabling the tick is unsafe in NMI handlers.
+ if (WARN_ON_ONCE(in_nmi()))
+ return;
+
+ RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(),
+ "Illegal rcu_irq_enter_check_tick() from extended quiescent state");
+
+ if (!tick_nohz_full_cpu(rdp->cpu) ||
+ !READ_ONCE(rdp->rcu_urgent_qs) ||
+ READ_ONCE(rdp->rcu_forced_tick)) {
+ // RCU doesn't need nohz_full help from this CPU, or it is
+ // already getting that help.
+ return;
+ }
+
+ // We get here only when not in an extended quiescent state and
+ // from interrupts (as opposed to NMIs). Therefore, (1) RCU is
+ // already watching and (2) The fact that we are in an interrupt
+ // handler and that the rcu_node lock is an irq-disabled lock
+ // prevents self-deadlock. So we can safely recheck under the lock.
+ // Note that the nohz_full state currently cannot change.
+ raw_spin_lock_rcu_node(rdp->mynode);
+ if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) {
+ // A nohz_full CPU is in the kernel and RCU needs a
+ // quiescent state. Turn on the tick!
+ WRITE_ONCE(rdp->rcu_forced_tick, true);
+ tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
+ }
+ raw_spin_unlock_rcu_node(rdp->mynode);
+}
#endif /* CONFIG_NO_HZ_FULL */
/**
- * rcu_nmi_enter_common - inform RCU of entry to NMI context
- * @irq: Is this call from rcu_irq_enter?
+ * rcu_nmi_enter - inform RCU of entry to NMI context
*
* If the CPU was idle from RCU's viewpoint, update rdp->dynticks and
* rdp->dynticks_nmi_nesting to let the RCU grace-period handling know
@@ -795,10 +975,10 @@ void rcu_user_exit(void)
* long as the nesting level does not overflow an int. (You will probably
* run out of stack space first.)
*
- * If you add or remove a call to rcu_nmi_enter_common(), be sure to test
+ * If you add or remove a call to rcu_nmi_enter(), be sure to test
* with CONFIG_RCU_EQS_DEBUG=y.
*/
-static __always_inline void rcu_nmi_enter_common(bool irq)
+noinstr void rcu_nmi_enter(void)
{
long incby = 2;
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
@@ -816,45 +996,44 @@ static __always_inline void rcu_nmi_enter_common(bool irq)
*/
if (rcu_dynticks_curr_cpu_in_eqs()) {
- if (irq)
+ if (!in_nmi())
rcu_dynticks_task_exit();
+ // RCU is not watching here ...
rcu_dynticks_eqs_exit();
+ // ... but is watching here.
- if (irq)
+ if (!in_nmi()) {
+ instrumentation_begin();
rcu_cleanup_after_idle();
+ instrumentation_end();
+ }
+
+ instrumentation_begin();
+ // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs()
+ instrument_atomic_read(&rdp->dynticks, sizeof(rdp->dynticks));
+ // instrumentation for the noinstr rcu_dynticks_eqs_exit()
+ instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
incby = 1;
- } else if (irq && tick_nohz_full_cpu(rdp->cpu) &&
- rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE &&
- READ_ONCE(rdp->rcu_urgent_qs) &&
- !READ_ONCE(rdp->rcu_forced_tick)) {
- raw_spin_lock_rcu_node(rdp->mynode);
- // Recheck under lock.
- if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) {
- WRITE_ONCE(rdp->rcu_forced_tick, true);
- tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
- }
- raw_spin_unlock_rcu_node(rdp->mynode);
+ } else if (!in_nmi()) {
+ instrumentation_begin();
+ rcu_irq_enter_check_tick();
+ instrumentation_end();
+ } else {
+ instrumentation_begin();
}
+
trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
rdp->dynticks_nmi_nesting,
rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks));
+ instrumentation_end();
WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */
rdp->dynticks_nmi_nesting + incby);
barrier();
}
/**
- * rcu_nmi_enter - inform RCU of entry to NMI context
- */
-void rcu_nmi_enter(void)
-{
- rcu_nmi_enter_common(false);
-}
-NOKPROBE_SYMBOL(rcu_nmi_enter);
-
-/**
* rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
*
* Enter an interrupt handler, which might possibly result in exiting
@@ -876,10 +1055,10 @@ NOKPROBE_SYMBOL(rcu_nmi_enter);
* If you add or remove a call to rcu_irq_enter(), be sure to test with
* CONFIG_RCU_EQS_DEBUG=y.
*/
-void rcu_irq_enter(void)
+noinstr void rcu_irq_enter(void)
{
lockdep_assert_irqs_disabled();
- rcu_nmi_enter_common(true);
+ rcu_nmi_enter();
}
/*
@@ -913,6 +1092,11 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp)
}
}
+noinstr bool __rcu_is_watching(void)
+{
+ return !rcu_dynticks_curr_cpu_in_eqs();
+}
+
/**
* rcu_is_watching - see if RCU thinks that the current CPU is not idle
*
@@ -921,7 +1105,7 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp)
* if the current CPU is not in its idle loop or is in an interrupt or
* NMI handler, return true.
*/
-bool notrace rcu_is_watching(void)
+bool rcu_is_watching(void)
{
bool ret;
@@ -973,12 +1157,12 @@ bool rcu_lockdep_current_cpu_online(void)
if (in_nmi() || !rcu_scheduler_fully_active)
return true;
- preempt_disable();
+ preempt_disable_notrace();
rdp = this_cpu_ptr(&rcu_data);
rnp = rdp->mynode;
if (rdp->grpmask & rcu_rnp_online_cpus(rnp))
ret = true;
- preempt_enable();
+ preempt_enable_notrace();
return ret;
}
EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
@@ -1217,7 +1401,7 @@ static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp,
trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("NoGPkthread"));
goto unlock_out;
}
- trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("newreq"));
+ trace_rcu_grace_period(rcu_state.name, data_race(rcu_state.gp_seq), TPS("newreq"));
ret = true; /* Caller must wake GP kthread. */
unlock_out:
/* Push furthest requested GP to leaf node and rcu_data structure. */
@@ -1470,7 +1654,32 @@ static void rcu_gp_slow(int delay)
if (delay > 0 &&
!(rcu_seq_ctr(rcu_state.gp_seq) %
(rcu_num_nodes * PER_RCU_NODE_PERIOD * delay)))
- schedule_timeout_uninterruptible(delay);
+ schedule_timeout_idle(delay);
+}
+
+static unsigned long sleep_duration;
+
+/* Allow rcutorture to stall the grace-period kthread. */
+void rcu_gp_set_torture_wait(int duration)
+{
+ if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST) && duration > 0)
+ WRITE_ONCE(sleep_duration, duration);
+}
+EXPORT_SYMBOL_GPL(rcu_gp_set_torture_wait);
+
+/* Actually implement the aforementioned wait. */
+static void rcu_gp_torture_wait(void)
+{
+ unsigned long duration;
+
+ if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST))
+ return;
+ duration = xchg(&sleep_duration, 0UL);
+ if (duration > 0) {
+ pr_alert("%s: Waiting %lu jiffies\n", __func__, duration);
+ schedule_timeout_idle(duration);
+ pr_alert("%s: Wait complete\n", __func__);
+ }
}
/*
@@ -1506,6 +1715,7 @@ static bool rcu_gp_init(void)
record_gp_stall_check_time();
/* Record GP times before starting GP, hence rcu_seq_start(). */
rcu_seq_start(&rcu_state.gp_seq);
+ ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start"));
raw_spin_unlock_irq_rcu_node(rnp);
@@ -1611,12 +1821,16 @@ static bool rcu_gp_fqs_check_wake(int *gfp)
{
struct rcu_node *rnp = rcu_get_root();
- /* Someone like call_rcu() requested a force-quiescent-state scan. */
+ // If under overload conditions, force an immediate FQS scan.
+ if (*gfp & RCU_GP_FLAG_OVLD)
+ return true;
+
+ // Someone like call_rcu() requested a force-quiescent-state scan.
*gfp = READ_ONCE(rcu_state.gp_flags);
if (*gfp & RCU_GP_FLAG_FQS)
return true;
- /* The current grace period has completed. */
+ // The current grace period has completed.
if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp))
return true;
@@ -1654,13 +1868,15 @@ static void rcu_gp_fqs(bool first_time)
static void rcu_gp_fqs_loop(void)
{
bool first_gp_fqs;
- int gf;
+ int gf = 0;
unsigned long j;
int ret;
struct rcu_node *rnp = rcu_get_root();
first_gp_fqs = true;
j = READ_ONCE(jiffies_till_first_fqs);
+ if (rcu_state.cbovld)
+ gf = RCU_GP_FLAG_OVLD;
ret = 0;
for (;;) {
if (!ret) {
@@ -1673,6 +1889,7 @@ static void rcu_gp_fqs_loop(void)
rcu_state.gp_state = RCU_GP_WAIT_FQS;
ret = swait_event_idle_timeout_exclusive(
rcu_state.gp_wq, rcu_gp_fqs_check_wake(&gf), j);
+ rcu_gp_torture_wait();
rcu_state.gp_state = RCU_GP_DOING_FQS;
/* Locking provides needed memory barriers. */
/* If grace period done, leave loop. */
@@ -1680,12 +1897,16 @@ static void rcu_gp_fqs_loop(void)
!rcu_preempt_blocked_readers_cgp(rnp))
break;
/* If time for quiescent-state forcing, do it. */
- if (ULONG_CMP_GE(jiffies, rcu_state.jiffies_force_qs) ||
+ if (!time_after(rcu_state.jiffies_force_qs, jiffies) ||
(gf & RCU_GP_FLAG_FQS)) {
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("fqsstart"));
rcu_gp_fqs(first_gp_fqs);
- first_gp_fqs = false;
+ gf = 0;
+ if (first_gp_fqs) {
+ first_gp_fqs = false;
+ gf = rcu_state.cbovld ? RCU_GP_FLAG_OVLD : 0;
+ }
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("fqsend"));
cond_resched_tasks_rcu_qs();
@@ -1705,6 +1926,7 @@ static void rcu_gp_fqs_loop(void)
j = 1;
else
j = rcu_state.jiffies_force_qs - j;
+ gf = 0;
}
}
}
@@ -1781,6 +2003,7 @@ static void rcu_gp_cleanup(void)
/* Declare grace period done, trace first to use old GP number. */
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("end"));
rcu_seq_end(&rcu_state.gp_seq);
+ ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
rcu_state.gp_state = RCU_GP_IDLE;
/* Check for GP requests since above loop. */
rdp = this_cpu_ptr(&rcu_data);
@@ -1821,6 +2044,7 @@ static int __noreturn rcu_gp_kthread(void *unused)
swait_event_idle_exclusive(rcu_state.gp_wq,
READ_ONCE(rcu_state.gp_flags) &
RCU_GP_FLAG_INIT);
+ rcu_gp_torture_wait();
rcu_state.gp_state = RCU_GP_DONE_GPS;
/* Locking provides needed memory barrier. */
if (rcu_gp_init())
@@ -2235,6 +2459,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
local_irq_save(flags);
rcu_nocb_lock(rdp);
count = -rcl.len;
+ rdp->n_cbs_invoked += count;
trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(),
is_idle_task(current), rcu_is_callbacks_kthread());
@@ -2518,7 +2743,7 @@ static void rcu_cpu_kthread(unsigned int cpu)
}
*statusp = RCU_KTHREAD_YIELDING;
trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
- schedule_timeout_interruptible(2);
+ schedule_timeout_idle(2);
trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
*statusp = RCU_KTHREAD_WAITING;
}
@@ -2668,6 +2893,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
head->func = func;
head->next = NULL;
local_irq_save(flags);
+ kasan_record_aux_stack(head);
rdp = this_cpu_ptr(&rcu_data);
/* Add the callback to our list. */
@@ -2686,8 +2912,8 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
return; // Enqueued onto ->nocb_bypass, so just leave.
// If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
rcu_segcblist_enqueue(&rdp->cblist, head);
- if (__is_kfree_rcu_offset((unsigned long)func))
- trace_rcu_kfree_callback(rcu_state.name, head,
+ if (__is_kvfree_rcu_offset((unsigned long)func))
+ trace_rcu_kvfree_callback(rcu_state.name, head,
(unsigned long)func,
rcu_segcblist_n_cbs(&rdp->cblist));
else
@@ -2749,53 +2975,53 @@ EXPORT_SYMBOL_GPL(call_rcu);
/* Maximum number of jiffies to wait before draining a batch. */
#define KFREE_DRAIN_JIFFIES (HZ / 50)
#define KFREE_N_BATCHES 2
-
-/*
- * This macro defines how many entries the "records" array
- * will contain. It is based on the fact that the size of
- * kfree_rcu_bulk_data structure becomes exactly one page.
- */
-#define KFREE_BULK_MAX_ENTR ((PAGE_SIZE / sizeof(void *)) - 3)
+#define FREE_N_CHANNELS 2
/**
- * struct kfree_rcu_bulk_data - single block to store kfree_rcu() pointers
+ * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
* @nr_records: Number of active pointers in the array
- * @records: Array of the kfree_rcu() pointers
* @next: Next bulk object in the block chain
- * @head_free_debug: For debug, when CONFIG_DEBUG_OBJECTS_RCU_HEAD is set
+ * @records: Array of the kvfree_rcu() pointers
*/
-struct kfree_rcu_bulk_data {
+struct kvfree_rcu_bulk_data {
unsigned long nr_records;
- void *records[KFREE_BULK_MAX_ENTR];
- struct kfree_rcu_bulk_data *next;
- struct rcu_head *head_free_debug;
+ struct kvfree_rcu_bulk_data *next;
+ void *records[];
};
+/*
+ * This macro defines how many entries the "records" array
+ * will contain. It is based on the fact that the size of
+ * kvfree_rcu_bulk_data structure becomes exactly one page.
+ */
+#define KVFREE_BULK_MAX_ENTR \
+ ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
+
/**
* struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
* @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
* @head_free: List of kfree_rcu() objects waiting for a grace period
- * @bhead_free: Bulk-List of kfree_rcu() objects waiting for a grace period
+ * @bkvhead_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
* @krcp: Pointer to @kfree_rcu_cpu structure
*/
struct kfree_rcu_cpu_work {
struct rcu_work rcu_work;
struct rcu_head *head_free;
- struct kfree_rcu_bulk_data *bhead_free;
+ struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS];
struct kfree_rcu_cpu *krcp;
};
/**
* struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
* @head: List of kfree_rcu() objects not yet waiting for a grace period
- * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period
- * @bcached: Keeps at most one object for later reuse when build chain blocks
+ * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
* @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
* @lock: Synchronize access to this structure
* @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
* @monitor_todo: Tracks whether a @monitor_work delayed work is pending
- * @initialized: The @lock and @rcu_work fields have been initialized
+ * @initialized: The @rcu_work fields have been initialized
+ * @count: Number of objects for which GP not started
*
* This is a per-CPU structure. The reason that it is not included in
* the rcu_data structure is to permit this code to be extracted from
@@ -2804,26 +3030,84 @@ struct kfree_rcu_cpu_work {
*/
struct kfree_rcu_cpu {
struct rcu_head *head;
- struct kfree_rcu_bulk_data *bhead;
- struct kfree_rcu_bulk_data *bcached;
+ struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS];
struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
- spinlock_t lock;
+ raw_spinlock_t lock;
struct delayed_work monitor_work;
bool monitor_todo;
bool initialized;
+ int count;
+
+ /*
+ * A simple cache list that contains objects for
+ * reuse purpose. In order to save some per-cpu
+ * space the list is singular. Even though it is
+ * lockless an access has to be protected by the
+ * per-cpu lock.
+ */
+ struct llist_head bkvcache;
+ int nr_bkv_objs;
};
-static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc);
+static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock),
+};
static __always_inline void
-debug_rcu_head_unqueue_bulk(struct rcu_head *head)
+debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead)
{
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
- for (; head; head = head->next)
- debug_rcu_head_unqueue(head);
+ int i;
+
+ for (i = 0; i < bhead->nr_records; i++)
+ debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i]));
#endif
}
+static inline struct kfree_rcu_cpu *
+krc_this_cpu_lock(unsigned long *flags)
+{
+ struct kfree_rcu_cpu *krcp;
+
+ local_irq_save(*flags); // For safely calling this_cpu_ptr().
+ krcp = this_cpu_ptr(&krc);
+ raw_spin_lock(&krcp->lock);
+
+ return krcp;
+}
+
+static inline void
+krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
+{
+ raw_spin_unlock(&krcp->lock);
+ local_irq_restore(flags);
+}
+
+static inline struct kvfree_rcu_bulk_data *
+get_cached_bnode(struct kfree_rcu_cpu *krcp)
+{
+ if (!krcp->nr_bkv_objs)
+ return NULL;
+
+ krcp->nr_bkv_objs--;
+ return (struct kvfree_rcu_bulk_data *)
+ llist_del_first(&krcp->bkvcache);
+}
+
+static inline bool
+put_cached_bnode(struct kfree_rcu_cpu *krcp,
+ struct kvfree_rcu_bulk_data *bnode)
+{
+ // Check the limit.
+ if (krcp->nr_bkv_objs >= rcu_min_cached_objs)
+ return false;
+
+ llist_add((struct llist_node *) bnode, &krcp->bkvcache);
+ krcp->nr_bkv_objs++;
+ return true;
+
+}
+
/*
* This function is invoked in workqueue context after a grace period.
* It frees all the objects queued on ->bhead_free or ->head_free.
@@ -2831,38 +3115,63 @@ debug_rcu_head_unqueue_bulk(struct rcu_head *head)
static void kfree_rcu_work(struct work_struct *work)
{
unsigned long flags;
+ struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS], *bnext;
struct rcu_head *head, *next;
- struct kfree_rcu_bulk_data *bhead, *bnext;
struct kfree_rcu_cpu *krcp;
struct kfree_rcu_cpu_work *krwp;
+ int i, j;
krwp = container_of(to_rcu_work(work),
struct kfree_rcu_cpu_work, rcu_work);
krcp = krwp->krcp;
- spin_lock_irqsave(&krcp->lock, flags);
- head = krwp->head_free;
- krwp->head_free = NULL;
- bhead = krwp->bhead_free;
- krwp->bhead_free = NULL;
- spin_unlock_irqrestore(&krcp->lock, flags);
-
- /* "bhead" is now private, so traverse locklessly. */
- for (; bhead; bhead = bnext) {
- bnext = bhead->next;
- debug_rcu_head_unqueue_bulk(bhead->head_free_debug);
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ // Channels 1 and 2.
+ for (i = 0; i < FREE_N_CHANNELS; i++) {
+ bkvhead[i] = krwp->bkvhead_free[i];
+ krwp->bkvhead_free[i] = NULL;
+ }
- rcu_lock_acquire(&rcu_callback_map);
- trace_rcu_invoke_kfree_bulk_callback(rcu_state.name,
- bhead->nr_records, bhead->records);
+ // Channel 3.
+ head = krwp->head_free;
+ krwp->head_free = NULL;
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ // Handle two first channels.
+ for (i = 0; i < FREE_N_CHANNELS; i++) {
+ for (; bkvhead[i]; bkvhead[i] = bnext) {
+ bnext = bkvhead[i]->next;
+ debug_rcu_bhead_unqueue(bkvhead[i]);
+
+ rcu_lock_acquire(&rcu_callback_map);
+ if (i == 0) { // kmalloc() / kfree().
+ trace_rcu_invoke_kfree_bulk_callback(
+ rcu_state.name, bkvhead[i]->nr_records,
+ bkvhead[i]->records);
+
+ kfree_bulk(bkvhead[i]->nr_records,
+ bkvhead[i]->records);
+ } else { // vmalloc() / vfree().
+ for (j = 0; j < bkvhead[i]->nr_records; j++) {
+ trace_rcu_invoke_kvfree_callback(
+ rcu_state.name,
+ bkvhead[i]->records[j], 0);
+
+ vfree(bkvhead[i]->records[j]);
+ }
+ }
+ rcu_lock_release(&rcu_callback_map);
- kfree_bulk(bhead->nr_records, bhead->records);
- rcu_lock_release(&rcu_callback_map);
+ krcp = krc_this_cpu_lock(&flags);
+ if (put_cached_bnode(krcp, bkvhead[i]))
+ bkvhead[i] = NULL;
+ krc_this_cpu_unlock(krcp, flags);
- if (cmpxchg(&krcp->bcached, NULL, bhead))
- free_page((unsigned long) bhead);
+ if (bkvhead[i])
+ free_page((unsigned long) bkvhead[i]);
- cond_resched_tasks_rcu_qs();
+ cond_resched_tasks_rcu_qs();
+ }
}
/*
@@ -2872,14 +3181,15 @@ static void kfree_rcu_work(struct work_struct *work)
*/
for (; head; head = next) {
unsigned long offset = (unsigned long)head->func;
+ void *ptr = (void *)head - offset;
next = head->next;
- debug_rcu_head_unqueue(head);
+ debug_rcu_head_unqueue((struct rcu_head *)ptr);
rcu_lock_acquire(&rcu_callback_map);
- trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset);
+ trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset);
- if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset)))
- kfree((void *)head - offset);
+ if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
+ kvfree(ptr);
rcu_lock_release(&rcu_callback_map);
cond_resched_tasks_rcu_qs();
@@ -2895,8 +3205,8 @@ static void kfree_rcu_work(struct work_struct *work)
static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
{
struct kfree_rcu_cpu_work *krwp;
- bool queued = false;
- int i;
+ bool repeat = false;
+ int i, j;
lockdep_assert_held(&krcp->lock);
@@ -2904,38 +3214,48 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
krwp = &(krcp->krw_arr[i]);
/*
- * Try to detach bhead or head and attach it over any
+ * Try to detach bkvhead or head and attach it over any
* available corresponding free channel. It can be that
* a previous RCU batch is in progress, it means that
* immediately to queue another one is not possible so
* return false to tell caller to retry.
*/
- if ((krcp->bhead && !krwp->bhead_free) ||
+ if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) ||
+ (krcp->bkvhead[1] && !krwp->bkvhead_free[1]) ||
(krcp->head && !krwp->head_free)) {
- /* Channel 1. */
- if (!krwp->bhead_free) {
- krwp->bhead_free = krcp->bhead;
- krcp->bhead = NULL;
+ // Channel 1 corresponds to SLAB ptrs.
+ // Channel 2 corresponds to vmalloc ptrs.
+ for (j = 0; j < FREE_N_CHANNELS; j++) {
+ if (!krwp->bkvhead_free[j]) {
+ krwp->bkvhead_free[j] = krcp->bkvhead[j];
+ krcp->bkvhead[j] = NULL;
+ }
}
- /* Channel 2. */
+ // Channel 3 corresponds to emergency path.
if (!krwp->head_free) {
krwp->head_free = krcp->head;
krcp->head = NULL;
}
+ WRITE_ONCE(krcp->count, 0);
+
/*
- * One work is per one batch, so there are two "free channels",
- * "bhead_free" and "head_free" the batch can handle. It can be
- * that the work is in the pending state when two channels have
- * been detached following each other, one by one.
+ * One work is per one batch, so there are three
+ * "free channels", the batch can handle. It can
+ * be that the work is in the pending state when
+ * channels have been detached following by each
+ * other.
*/
queue_rcu_work(system_wq, &krwp->rcu_work);
- queued = true;
}
+
+ // Repeat if any "free" corresponding channel is still busy.
+ if (krcp->bkvhead[0] || krcp->bkvhead[1] || krcp->head)
+ repeat = true;
}
- return queued;
+ return !repeat;
}
static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
@@ -2945,14 +3265,14 @@ static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
krcp->monitor_todo = false;
if (queue_kfree_rcu_work(krcp)) {
// Success! Our job is done here.
- spin_unlock_irqrestore(&krcp->lock, flags);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
return;
}
// Previous RCU batch still in progress, try again later.
krcp->monitor_todo = true;
schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
- spin_unlock_irqrestore(&krcp->lock, flags);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
/*
@@ -2965,32 +3285,50 @@ static void kfree_rcu_monitor(struct work_struct *work)
struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu,
monitor_work.work);
- spin_lock_irqsave(&krcp->lock, flags);
+ raw_spin_lock_irqsave(&krcp->lock, flags);
if (krcp->monitor_todo)
kfree_rcu_drain_unlock(krcp, flags);
else
- spin_unlock_irqrestore(&krcp->lock, flags);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
static inline bool
-kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
- struct rcu_head *head, rcu_callback_t func)
+kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
{
- struct kfree_rcu_bulk_data *bnode;
+ struct kvfree_rcu_bulk_data *bnode;
+ int idx;
if (unlikely(!krcp->initialized))
return false;
lockdep_assert_held(&krcp->lock);
+ idx = !!is_vmalloc_addr(ptr);
/* Check if a new block is required. */
- if (!krcp->bhead ||
- krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) {
- bnode = xchg(&krcp->bcached, NULL);
+ if (!krcp->bkvhead[idx] ||
+ krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
+ bnode = get_cached_bnode(krcp);
if (!bnode) {
- WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE);
+ /*
+ * To keep this path working on raw non-preemptible
+ * sections, prevent the optional entry into the
+ * allocator as it uses sleeping locks. In fact, even
+ * if the caller of kfree_rcu() is preemptible, this
+ * path still is not, as krcp->lock is a raw spinlock.
+ * With additional page pre-allocation in the works,
+ * hitting this return is going to be much less likely.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ return false;
- bnode = (struct kfree_rcu_bulk_data *)
+ /*
+ * NOTE: For one argument of kvfree_rcu() we can
+ * drop the lock and get the page in sleepable
+ * context. That would allow to maintain an array
+ * for the CONFIG_PREEMPT_RT as well if no cached
+ * pages are available.
+ */
+ bnode = (struct kvfree_rcu_bulk_data *)
__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
}
@@ -3000,53 +3338,62 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
/* Initialize the new block. */
bnode->nr_records = 0;
- bnode->next = krcp->bhead;
- bnode->head_free_debug = NULL;
+ bnode->next = krcp->bkvhead[idx];
/* Attach it to the head. */
- krcp->bhead = bnode;
+ krcp->bkvhead[idx] = bnode;
}
-#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
- head->func = func;
- head->next = krcp->bhead->head_free_debug;
- krcp->bhead->head_free_debug = head;
-#endif
-
/* Finally insert. */
- krcp->bhead->records[krcp->bhead->nr_records++] =
- (void *) head - (unsigned long) func;
+ krcp->bkvhead[idx]->records
+ [krcp->bkvhead[idx]->nr_records++] = ptr;
return true;
}
/*
- * Queue a request for lazy invocation of kfree_bulk()/kfree() after a grace
- * period. Please note there are two paths are maintained, one is the main one
- * that uses kfree_bulk() interface and second one is emergency one, that is
- * used only when the main path can not be maintained temporary, due to memory
- * pressure.
+ * Queue a request for lazy invocation of appropriate free routine after a
+ * grace period. Please note there are three paths are maintained, two are the
+ * main ones that use array of pointers interface and third one is emergency
+ * one, that is used only when the main path can not be maintained temporary,
+ * due to memory pressure.
*
- * Each kfree_call_rcu() request is added to a batch. The batch will be drained
+ * Each kvfree_call_rcu() request is added to a batch. The batch will be drained
* every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will
* be free'd in workqueue context. This allows us to: batch requests together to
- * reduce the number of grace periods during heavy kfree_rcu() load.
+ * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load.
*/
-void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
+void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
{
unsigned long flags;
struct kfree_rcu_cpu *krcp;
+ bool success;
+ void *ptr;
- local_irq_save(flags); // For safely calling this_cpu_ptr().
- krcp = this_cpu_ptr(&krc);
- if (krcp->initialized)
- spin_lock(&krcp->lock);
+ if (head) {
+ ptr = (void *) head - (unsigned long) func;
+ } else {
+ /*
+ * Please note there is a limitation for the head-less
+ * variant, that is why there is a clear rule for such
+ * objects: it can be used from might_sleep() context
+ * only. For other places please embed an rcu_head to
+ * your data.
+ */
+ might_sleep();
+ ptr = (unsigned long *) func;
+ }
+
+ krcp = krc_this_cpu_lock(&flags);
// Queue the object but don't yet schedule the batch.
- if (debug_rcu_head_queue(head)) {
+ if (debug_rcu_head_queue(ptr)) {
// Probable double kfree_rcu(), just leak.
WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
__func__, head);
+
+ // Mark as success and leave.
+ success = true;
goto unlock_return;
}
@@ -3054,12 +3401,20 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
* Under high memory pressure GFP_NOWAIT can fail,
* in that case the emergency path is maintained.
*/
- if (unlikely(!kfree_call_rcu_add_ptr_to_bulk(krcp, head, func))) {
+ success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr);
+ if (!success) {
+ if (head == NULL)
+ // Inline if kvfree_rcu(one_arg) call.
+ goto unlock_return;
+
head->func = func;
head->next = krcp->head;
krcp->head = head;
+ success = true;
}
+ WRITE_ONCE(krcp->count, krcp->count + 1);
+
// Set timer to drain after KFREE_DRAIN_JIFFIES.
if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
!krcp->monitor_todo) {
@@ -3068,11 +3423,70 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
}
unlock_return:
- if (krcp->initialized)
- spin_unlock(&krcp->lock);
- local_irq_restore(flags);
+ krc_this_cpu_unlock(krcp, flags);
+
+ /*
+ * Inline kvfree() after synchronize_rcu(). We can do
+ * it from might_sleep() context only, so the current
+ * CPU can pass the QS state.
+ */
+ if (!success) {
+ debug_rcu_head_unqueue((struct rcu_head *) ptr);
+ synchronize_rcu();
+ kvfree(ptr);
+ }
+}
+EXPORT_SYMBOL_GPL(kvfree_call_rcu);
+
+static unsigned long
+kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+ int cpu;
+ unsigned long count = 0;
+
+ /* Snapshot count of all CPUs */
+ for_each_online_cpu(cpu) {
+ struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+
+ count += READ_ONCE(krcp->count);
+ }
+
+ return count;
}
-EXPORT_SYMBOL_GPL(kfree_call_rcu);
+
+static unsigned long
+kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+ int cpu, freed = 0;
+ unsigned long flags;
+
+ for_each_online_cpu(cpu) {
+ int count;
+ struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+
+ count = krcp->count;
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ if (krcp->monitor_todo)
+ kfree_rcu_drain_unlock(krcp, flags);
+ else
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ sc->nr_to_scan -= count;
+ freed += count;
+
+ if (sc->nr_to_scan <= 0)
+ break;
+ }
+
+ return freed == 0 ? SHRINK_STOP : freed;
+}
+
+static struct shrinker kfree_rcu_shrinker = {
+ .count_objects = kfree_rcu_shrink_count,
+ .scan_objects = kfree_rcu_shrink_scan,
+ .batch = 0,
+ .seeks = DEFAULT_SEEKS,
+};
void __init kfree_rcu_scheduler_running(void)
{
@@ -3082,15 +3496,15 @@ void __init kfree_rcu_scheduler_running(void)
for_each_online_cpu(cpu) {
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
- spin_lock_irqsave(&krcp->lock, flags);
+ raw_spin_lock_irqsave(&krcp->lock, flags);
if (!krcp->head || krcp->monitor_todo) {
- spin_unlock_irqrestore(&krcp->lock, flags);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
continue;
}
krcp->monitor_todo = true;
schedule_delayed_work_on(cpu, &krcp->monitor_work,
KFREE_DRAIN_JIFFIES);
- spin_unlock_irqrestore(&krcp->lock, flags);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
}
@@ -3578,10 +3992,9 @@ void rcu_cpu_starting(unsigned int cpu)
{
unsigned long flags;
unsigned long mask;
- int nbits;
- unsigned long oldmask;
struct rcu_data *rdp;
struct rcu_node *rnp;
+ bool newcpu;
if (per_cpu(rcu_cpu_started, cpu))
return;
@@ -3593,12 +4006,11 @@ void rcu_cpu_starting(unsigned int cpu)
mask = rdp->grpmask;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask);
- oldmask = rnp->expmaskinitnext;
+ newcpu = !(rnp->expmaskinitnext & mask);
rnp->expmaskinitnext |= mask;
- oldmask ^= rnp->expmaskinitnext;
- nbits = bitmap_weight(&oldmask, BITS_PER_LONG);
/* Allow lockless access for expedited grace periods. */
- smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + nbits); /* ^^^ */
+ smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + newcpu); /* ^^^ */
+ ASSERT_EXCLUSIVE_WRITER(rcu_state.ncpus);
rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */
rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq);
rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags);
@@ -3984,16 +4396,28 @@ static void __init kfree_rcu_batch_init(void)
for_each_possible_cpu(cpu) {
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+ struct kvfree_rcu_bulk_data *bnode;
- spin_lock_init(&krcp->lock);
for (i = 0; i < KFREE_N_BATCHES; i++) {
INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
krcp->krw_arr[i].krcp = krcp;
}
+ for (i = 0; i < rcu_min_cached_objs; i++) {
+ bnode = (struct kvfree_rcu_bulk_data *)
+ __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
+
+ if (bnode)
+ put_cached_bnode(krcp, bnode);
+ else
+ pr_err("Failed to preallocate for %d CPU!\n", cpu);
+ }
+
INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
krcp->initialized = true;
}
+ if (register_shrinker(&kfree_rcu_shrinker))
+ pr_err("Failed to register kfree_rcu() shrinker!\n");
}
void __init rcu_init(void)