diff options
Diffstat (limited to 'kernel/rcu/tree.c')
| -rw-r--r-- | kernel/rcu/tree.c | 422 |
1 files changed, 209 insertions, 213 deletions
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8e78b2430c16..bce848e50512 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -32,6 +32,8 @@ #include <linux/export.h> #include <linux/completion.h> #include <linux/moduleparam.h> +#include <linux/panic.h> +#include <linux/panic_notifier.h> #include <linux/percpu.h> #include <linux/notifier.h> #include <linux/cpu.h> @@ -72,17 +74,10 @@ /* Data structures. */ -/* - * Steal a bit from the bottom of ->dynticks for idle entry/exit - * control. Initially this is for TLB flushing. - */ -#define RCU_DYNTICK_CTRL_MASK 0x1 -#define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1) - static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { .dynticks_nesting = 1, .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, - .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), + .dynticks = ATOMIC_INIT(1), #ifdef CONFIG_RCU_NOCB_CPU .cblist.flags = SEGCBLIST_SOFTIRQ_ONLY, #endif @@ -186,6 +181,17 @@ module_param(rcu_unlock_delay, int, 0444); static int rcu_min_cached_objs = 5; module_param(rcu_min_cached_objs, int, 0444); +// A page shrinker can ask for pages to be freed to make them +// available for other parts of the system. This usually happens +// under low memory conditions, and in that case we should also +// defer page-cache filling for a short time period. +// +// The default value is 5 seconds, which is long enough to reduce +// interference with the shrinker while it asks other systems to +// drain their caches. +static int rcu_delay_page_cache_fill_msec = 5000; +module_param(rcu_delay_page_cache_fill_msec, int, 0444); + /* Retrieve RCU kthreads priority for rcutorture */ int rcu_get_gp_kthreads_prio(void) { @@ -202,7 +208,7 @@ EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio); * the need for long delays to increase some race probabilities with the * need for fast grace periods to increase other race probabilities. */ -#define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays. */ +#define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays for debugging. */ /* * Compute the mask of online CPUs for the specified rcu_node structure. @@ -242,6 +248,16 @@ void rcu_softirq_qs(void) { rcu_qs(); rcu_preempt_deferred_qs(current); + rcu_tasks_qs(current, false); +} + +/* + * Increment the current CPU's rcu_data structure's ->dynticks field + * with ordering. Return the new value. + */ +static noinline noinstr unsigned long rcu_dynticks_inc(int incby) +{ + return arch_atomic_add_return(incby, this_cpu_ptr(&rcu_data.dynticks)); } /* @@ -252,7 +268,6 @@ void rcu_softirq_qs(void) */ static noinstr void rcu_dynticks_eqs_enter(void) { - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); int seq; /* @@ -261,13 +276,9 @@ static noinstr void rcu_dynticks_eqs_enter(void) * next idle sojourn. */ rcu_dynticks_task_trace_enter(); // Before ->dynticks update! - seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); + seq = rcu_dynticks_inc(1); // RCU is no longer watching. Better be in extended quiescent state! - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - (seq & RCU_DYNTICK_CTRL_CTR)); - /* Better not have special action (TLB flush) pending! */ - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - (seq & RCU_DYNTICK_CTRL_MASK)); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & 0x1)); } /* @@ -277,7 +288,6 @@ static noinstr void rcu_dynticks_eqs_enter(void) */ static noinstr void rcu_dynticks_eqs_exit(void) { - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); int seq; /* @@ -285,15 +295,10 @@ static noinstr void rcu_dynticks_eqs_exit(void) * and we also must force ordering with the next RCU read-side * critical section. */ - seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); + seq = rcu_dynticks_inc(1); // RCU is now watching. Better not be in an extended quiescent state! rcu_dynticks_task_trace_exit(); // After ->dynticks update! - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - !(seq & RCU_DYNTICK_CTRL_CTR)); - if (seq & RCU_DYNTICK_CTRL_MASK) { - arch_atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks); - smp_mb__after_atomic(); /* _exit after clearing mask. */ - } + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & 0x1)); } /* @@ -310,9 +315,9 @@ static void rcu_dynticks_eqs_online(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - if (atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR) + if (atomic_read(&rdp->dynticks) & 0x1) return; - atomic_add(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); + rcu_dynticks_inc(1); } /* @@ -322,9 +327,7 @@ static void rcu_dynticks_eqs_online(void) */ static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void) { - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - - return !(arch_atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR); + return !(atomic_read(this_cpu_ptr(&rcu_data.dynticks)) & 0x1); } /* @@ -333,9 +336,8 @@ static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void) */ static int rcu_dynticks_snap(struct rcu_data *rdp) { - int snap = atomic_add_return(0, &rdp->dynticks); - - return snap & ~RCU_DYNTICK_CTRL_MASK; + smp_mb(); // Fundamental RCU ordering guarantee. + return atomic_read_acquire(&rdp->dynticks); } /* @@ -344,7 +346,7 @@ static int rcu_dynticks_snap(struct rcu_data *rdp) */ static bool rcu_dynticks_in_eqs(int snap) { - return !(snap & RCU_DYNTICK_CTRL_CTR); + return !(snap & 0x1); } /* Return true if the specified CPU is currently idle from an RCU viewpoint. */ @@ -375,8 +377,7 @@ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) int snap; // If not quiescent, force back to earlier extended quiescent state. - snap = atomic_read(&rdp->dynticks) & ~(RCU_DYNTICK_CTRL_MASK | - RCU_DYNTICK_CTRL_CTR); + snap = atomic_read(&rdp->dynticks) & ~0x1; smp_rmb(); // Order ->dynticks and *vp reads. if (READ_ONCE(*vp)) @@ -384,32 +385,7 @@ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) smp_rmb(); // Order *vp read and ->dynticks re-read. // If still in the same extended quiescent state, we are good! - return snap == (atomic_read(&rdp->dynticks) & ~RCU_DYNTICK_CTRL_MASK); -} - -/* - * Set the special (bottom) bit of the specified CPU so that it - * will take special action (such as flushing its TLB) on the - * next exit from an extended quiescent state. Returns true if - * the bit was successfully set, or false if the CPU was not in - * an extended quiescent state. - */ -bool rcu_eqs_special_set(int cpu) -{ - int old; - int new; - int new_old; - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - - new_old = atomic_read(&rdp->dynticks); - do { - old = new_old; - if (old & RCU_DYNTICK_CTRL_CTR) - return false; - new = old | RCU_DYNTICK_CTRL_MASK; - new_old = atomic_cmpxchg(&rdp->dynticks, old, new); - } while (new_old != old); - return true; + return snap == atomic_read(&rdp->dynticks); } /* @@ -425,13 +401,12 @@ bool rcu_eqs_special_set(int cpu) */ notrace void rcu_momentary_dyntick_idle(void) { - int special; + int seq; raw_cpu_write(rcu_data.rcu_need_heavy_qs, false); - special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR, - &this_cpu_ptr(&rcu_data)->dynticks); + seq = rcu_dynticks_inc(2); /* It is illegal to call this from idle state. */ - WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR)); + WARN_ON_ONCE(!(seq & 0x1)); rcu_preempt_deferred_qs(current); } EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle); @@ -833,28 +808,6 @@ void noinstr rcu_irq_exit(void) rcu_nmi_exit(); } -/** - * rcu_irq_exit_preempt - Inform RCU that current CPU is exiting irq - * towards in kernel preemption - * - * Same as rcu_irq_exit() but has a sanity check that scheduling is safe - * from RCU point of view. Invoked from return from interrupt before kernel - * preemption. - */ -void rcu_irq_exit_preempt(void) -{ - lockdep_assert_irqs_disabled(); - rcu_nmi_exit(); - - RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0, - "RCU dynticks_nesting counter underflow/zero!"); - RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) != - DYNTICK_IRQ_NONIDLE, - "Bad RCU dynticks_nmi_nesting counter\n"); - RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), - "RCU in extended quiescent state!"); -} - #ifdef CONFIG_PROVE_RCU /** * rcu_irq_exit_check_preempt - Validate that scheduling is possible @@ -959,7 +912,7 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit); */ void noinstr rcu_user_exit(void) { - rcu_eqs_exit(1); + rcu_eqs_exit(true); } /** @@ -1225,7 +1178,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); #endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */ /* - * We are reporting a quiescent state on behalf of some other CPU, so + * When trying to report a quiescent state on behalf of some other CPU, * it is our responsibility to check for and handle potential overflow * of the rcu_node ->gp_seq counter with respect to the rcu_data counters. * After all, the CPU might be in deep idle state, and thus executing no @@ -1333,7 +1286,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) */ jtsq = READ_ONCE(jiffies_to_sched_qs); ruqp = per_cpu_ptr(&rcu_data.rcu_urgent_qs, rdp->cpu); - rnhqp = &per_cpu(rcu_data.rcu_need_heavy_qs, rdp->cpu); + rnhqp = per_cpu_ptr(&rcu_data.rcu_need_heavy_qs, rdp->cpu); if (!READ_ONCE(*rnhqp) && (time_after(jiffies, rcu_state.gp_start + jtsq * 2) || time_after(jiffies, rcu_state.jiffies_resched) || @@ -1780,7 +1733,7 @@ static void rcu_strict_gp_boundary(void *unused) /* * Initialize a new grace period. Return false if no grace period required. */ -static bool rcu_gp_init(void) +static noinline_for_stack bool rcu_gp_init(void) { unsigned long firstseq; unsigned long flags; @@ -1974,7 +1927,7 @@ static void rcu_gp_fqs(bool first_time) /* * Loop doing repeated quiescent-state forcing until the grace period ends. */ -static void rcu_gp_fqs_loop(void) +static noinline_for_stack void rcu_gp_fqs_loop(void) { bool first_gp_fqs; int gf = 0; @@ -2001,8 +1954,8 @@ static void rcu_gp_fqs_loop(void) trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqswait")); WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_FQS); - ret = swait_event_idle_timeout_exclusive( - rcu_state.gp_wq, rcu_gp_fqs_check_wake(&gf), j); + (void)swait_event_idle_timeout_exclusive(rcu_state.gp_wq, + rcu_gp_fqs_check_wake(&gf), j); rcu_gp_torture_wait(); WRITE_ONCE(rcu_state.gp_state, RCU_GP_DOING_FQS); /* Locking provides needed memory barriers. */ @@ -2048,7 +2001,7 @@ static void rcu_gp_fqs_loop(void) /* * Clean up after the old grace period. */ -static void rcu_gp_cleanup(void) +static noinline void rcu_gp_cleanup(void) { int cpu; bool needgp = false; @@ -2479,9 +2432,6 @@ int rcutree_dead_cpu(unsigned int cpu) WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1); /* Adjust any no-longer-needed kthreads. */ rcu_boost_kthread_setaffinity(rnp, -1); - /* Do any needed no-CB deferred wakeups from this CPU. */ - do_nocb_deferred_wakeup(per_cpu_ptr(&rcu_data, cpu)); - // Stop-machine done, so allow nohz_full to disable tick. tick_dep_clear(TICK_DEP_BIT_RCU); return 0; @@ -2489,7 +2439,7 @@ int rcutree_dead_cpu(unsigned int cpu) /* * Invoke any RCU callbacks that have made it to the end of their grace - * period. Thottle as specified by rdp->blimit. + * period. Throttle as specified by rdp->blimit. */ static void rcu_do_batch(struct rcu_data *rdp) { @@ -2629,7 +2579,7 @@ static void rcu_do_batch(struct rcu_data *rdp) * state, for example, user mode or idle loop. It also schedules RCU * core processing. If the current grace period has gone on too long, * it will ask the scheduler to manufacture a context switch for the sole - * purpose of providing a providing the needed quiescent state. + * purpose of providing the needed quiescent state. */ void rcu_sched_clock_irq(int user) { @@ -2911,7 +2861,6 @@ static int __init rcu_spawn_core_kthreads(void) "%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__); return 0; } -early_initcall(rcu_spawn_core_kthreads); /* * Handle any core-RCU processing required by a call_rcu() invocation. @@ -3082,12 +3031,14 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) * period elapses, in other words after all pre-existing RCU read-side * critical sections have completed. However, the callback function * might well execute concurrently with RCU read-side critical sections - * that started after call_rcu() was invoked. RCU read-side critical - * sections are delimited by rcu_read_lock() and rcu_read_unlock(), and - * may be nested. In addition, regions of code across which interrupts, - * preemption, or softirqs have been disabled also serve as RCU read-side - * critical sections. This includes hardware interrupt handlers, softirq - * handlers, and NMI handlers. + * that started after call_rcu() was invoked. + * + * RCU read-side critical sections are delimited by rcu_read_lock() + * and rcu_read_unlock(), and may be nested. In addition, but only in + * v5.0 and later, regions of code across which interrupts, preemption, + * or softirqs have been disabled also serve as RCU read-side critical + * sections. This includes hardware interrupt handlers, softirq handlers, + * and NMI handlers. * * Note that all CPUs must agree that the grace period extended beyond * all pre-existing RCU read-side critical section. On systems with more @@ -3107,6 +3058,9 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) * between the call to call_rcu() and the invocation of "func()" -- even * if CPU A and CPU B are the same CPU (but again only if the system has * more than one CPU). + * + * Implementation of these memory-ordering guarantees is described here: + * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst. */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { @@ -3171,6 +3125,7 @@ struct kfree_rcu_cpu_work { * Even though it is lockless an access has to be protected by the * per-cpu lock. * @page_cache_work: A work to refill the cache when it is empty + * @backoff_page_cache_fill: Delay cache refills * @work_in_progress: Indicates that page_cache_work is running * @hrtimer: A hrtimer for scheduling a page_cache_work * @nr_bkv_objs: number of allocated objects at @bkvcache. @@ -3190,7 +3145,8 @@ struct kfree_rcu_cpu { bool initialized; int count; - struct work_struct page_cache_work; + struct delayed_work page_cache_work; + atomic_t backoff_page_cache_fill; atomic_t work_in_progress; struct hrtimer hrtimer; @@ -3237,7 +3193,7 @@ get_cached_bnode(struct kfree_rcu_cpu *krcp) if (!krcp->nr_bkv_objs) return NULL; - krcp->nr_bkv_objs--; + WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - 1); return (struct kvfree_rcu_bulk_data *) llist_del_first(&krcp->bkvcache); } @@ -3251,14 +3207,33 @@ put_cached_bnode(struct kfree_rcu_cpu *krcp, return false; llist_add((struct llist_node *) bnode, &krcp->bkvcache); - krcp->nr_bkv_objs++; + WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + 1); return true; +} +static int +drain_page_cache(struct kfree_rcu_cpu *krcp) +{ + unsigned long flags; + struct llist_node *page_list, *pos, *n; + int freed = 0; + + raw_spin_lock_irqsave(&krcp->lock, flags); + page_list = llist_del_all(&krcp->bkvcache); + WRITE_ONCE(krcp->nr_bkv_objs, 0); + raw_spin_unlock_irqrestore(&krcp->lock, flags); + + llist_for_each_safe(pos, n, page_list) { + free_page((unsigned long)pos); + freed++; + } + + return freed; } /* * This function is invoked in workqueue context after a grace period. - * It frees all the objects queued on ->bhead_free or ->head_free. + * It frees all the objects queued on ->bkvhead_free or ->head_free. */ static void kfree_rcu_work(struct work_struct *work) { @@ -3285,7 +3260,7 @@ static void kfree_rcu_work(struct work_struct *work) krwp->head_free = NULL; raw_spin_unlock_irqrestore(&krcp->lock, flags); - // Handle two first channels. + // Handle the first two channels. for (i = 0; i < FREE_N_CHANNELS; i++) { for (; bkvhead[i]; bkvhead[i] = bnext) { bnext = bkvhead[i]->next; @@ -3323,9 +3298,11 @@ static void kfree_rcu_work(struct work_struct *work) } /* - * Emergency case only. It can happen under low memory - * condition when an allocation gets failed, so the "bulk" - * path can not be temporary maintained. + * This is used when the "bulk" path can not be used for the + * double-argument of kvfree_rcu(). This happens when the + * page-cache is empty, which means that objects are instead + * queued on a linked list through their rcu_head structures. + * This list is named "Channel 3". */ for (; head; head = next) { unsigned long offset = (unsigned long)head->func; @@ -3345,34 +3322,31 @@ static void kfree_rcu_work(struct work_struct *work) } /* - * Schedule the kfree batch RCU work to run in workqueue context after a GP. - * - * This function is invoked by kfree_rcu_monitor() when the KFREE_DRAIN_JIFFIES - * timeout has been reached. + * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. */ -static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) +static void kfree_rcu_monitor(struct work_struct *work) { - struct kfree_rcu_cpu_work *krwp; - bool repeat = false; + struct kfree_rcu_cpu *krcp = container_of(work, + struct kfree_rcu_cpu, monitor_work.work); + unsigned long flags; int i, j; - lockdep_assert_held(&krcp->lock); + raw_spin_lock_irqsave(&krcp->lock, flags); + // Attempt to start a new batch. for (i = 0; i < KFREE_N_BATCHES; i++) { - krwp = &(krcp->krw_arr[i]); + struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]); - /* - * Try to detach bkvhead or head and attach it over any - * available corresponding free channel. It can be that - * a previous RCU batch is in progress, it means that - * immediately to queue another one is not possible so - * return false to tell caller to retry. - */ + // Try to detach bkvhead or head and attach it over any + // available corresponding free channel. It can be that + // a previous RCU batch is in progress, it means that + // immediately to queue another one is not possible so + // in that case the monitor work is rearmed. if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) || (krcp->bkvhead[1] && !krwp->bkvhead_free[1]) || (krcp->head && !krwp->head_free)) { - // Channel 1 corresponds to SLAB ptrs. - // Channel 2 corresponds to vmalloc ptrs. + // Channel 1 corresponds to the SLAB-pointer bulk path. + // Channel 2 corresponds to vmalloc-pointer bulk path. for (j = 0; j < FREE_N_CHANNELS; j++) { if (!krwp->bkvhead_free[j]) { krwp->bkvhead_free[j] = krcp->bkvhead[j]; @@ -3380,7 +3354,8 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) } } - // Channel 3 corresponds to emergency path. + // Channel 3 corresponds to both SLAB and vmalloc + // objects queued on the linked list. if (!krwp->head_free) { krwp->head_free = krcp->head; krcp->head = NULL; @@ -3388,65 +3363,35 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) WRITE_ONCE(krcp->count, 0); - /* - * One work is per one batch, so there are three - * "free channels", the batch can handle. It can - * be that the work is in the pending state when - * channels have been detached following by each - * other. - */ + // One work is per one batch, so there are three + // "free channels", the batch can handle. It can + // be that the work is in the pending state when + // channels have been detached following by each + // other. queue_rcu_work(system_wq, &krwp->rcu_work); } - - // Repeat if any "free" corresponding channel is still busy. - if (krcp->bkvhead[0] || krcp->bkvhead[1] || krcp->head) - repeat = true; } - return !repeat; -} - -static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp, - unsigned long flags) -{ - // Attempt to start a new batch. - krcp->monitor_todo = false; - if (queue_kfree_rcu_work(krcp)) { - // Success! Our job is done here. - raw_spin_unlock_irqrestore(&krcp->lock, flags); - return; - } + // If there is nothing to detach, it means that our job is + // successfully done here. In case of having at least one + // of the channels that is still busy we should rearm the + // work to repeat an attempt. Because previous batches are + // still in progress. + if (!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head) + krcp->monitor_todo = false; + else + schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); - // Previous RCU batch still in progress, try again later. - krcp->monitor_todo = true; - schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); raw_spin_unlock_irqrestore(&krcp->lock, flags); } -/* - * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. - * It invokes kfree_rcu_drain_unlock() to attempt to start another batch. - */ -static void kfree_rcu_monitor(struct work_struct *work) -{ - unsigned long flags; - struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu, - monitor_work.work); - - raw_spin_lock_irqsave(&krcp->lock, flags); - if (krcp->monitor_todo) - kfree_rcu_drain_unlock(krcp, flags); - else - raw_spin_unlock_irqrestore(&krcp->lock, flags); -} - static enum hrtimer_restart schedule_page_work_fn(struct hrtimer *t) { struct kfree_rcu_cpu *krcp = container_of(t, struct kfree_rcu_cpu, hrtimer); - queue_work(system_highpri_wq, &krcp->page_cache_work); + queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0); return HRTIMER_NORESTART; } @@ -3455,12 +3400,16 @@ static void fill_page_cache_func(struct work_struct *work) struct kvfree_rcu_bulk_data *bnode; struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu, - page_cache_work); + page_cache_work.work); unsigned long flags; + int nr_pages; bool pushed; int i; - for (i = 0; i < rcu_min_cached_objs; i++) { + nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ? + 1 : rcu_min_cached_objs; + + for (i = 0; i < nr_pages; i++) { bnode = (struct kvfree_rcu_bulk_data *) __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); @@ -3477,6 +3426,7 @@ static void fill_page_cache_func(struct work_struct *work) } atomic_set(&krcp->work_in_progress, 0); + atomic_set(&krcp->backoff_page_cache_fill, 0); } static void @@ -3484,10 +3434,15 @@ run_page_cache_worker(struct kfree_rcu_cpu *krcp) { if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && !atomic_xchg(&krcp->work_in_progress, 1)) { - hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - krcp->hrtimer.function = schedule_page_work_fn; - hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL); + if (atomic_read(&krcp->backoff_page_cache_fill)) { + queue_delayed_work(system_wq, + &krcp->page_cache_work, + msecs_to_jiffies(rcu_delay_page_cache_fill_msec)); + } else { + hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + krcp->hrtimer.function = schedule_page_work_fn; + hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL); + } } } @@ -3552,11 +3507,11 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, } /* - * Queue a request for lazy invocation of appropriate free routine after a - * grace period. Please note there are three paths are maintained, two are the - * main ones that use array of pointers interface and third one is emergency - * one, that is used only when the main path can not be maintained temporary, - * due to memory pressure. + * Queue a request for lazy invocation of the appropriate free routine + * after a grace period. Please note that three paths are maintained, + * two for the common case using arrays of pointers and a third one that + * is used only when the main paths cannot be used, for example, due to + * memory pressure. * * Each kvfree_call_rcu() request is added to a batch. The batch will be drained * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will @@ -3645,6 +3600,8 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); count += READ_ONCE(krcp->count); + count += READ_ONCE(krcp->nr_bkv_objs); + atomic_set(&krcp->backoff_page_cache_fill, 1); } return count; @@ -3654,18 +3611,14 @@ static unsigned long kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { int cpu, freed = 0; - unsigned long flags; for_each_possible_cpu(cpu) { int count; struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); count = krcp->count; - raw_spin_lock_irqsave(&krcp->lock, flags); - if (krcp->monitor_todo) - kfree_rcu_drain_unlock(krcp, flags); - else - raw_spin_unlock_irqrestore(&krcp->lock, flags); + count += drain_page_cache(krcp); + kfree_rcu_monitor(&krcp->monitor_work.work); sc->nr_to_scan -= count; freed += count; @@ -3693,7 +3646,8 @@ void __init kfree_rcu_scheduler_running(void) struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); raw_spin_lock_irqsave(&krcp->lock, flags); - if (!krcp->head || krcp->monitor_todo) { + if ((!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head) || + krcp->monitor_todo) { raw_spin_unlock_irqrestore(&krcp->lock, flags); continue; } @@ -3750,10 +3704,12 @@ static int rcu_blocking_is_gp(void) * read-side critical sections have completed. Note, however, that * upon return from synchronize_rcu(), the caller might well be executing * concurrently with new RCU read-side critical sections that began while - * synchronize_rcu() was waiting. RCU read-side critical sections are - * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. - * In addition, regions of code across which interrupts, preemption, or - * softirqs have been disabled also serve as RCU read-side critical + * synchronize_rcu() was waiting. + * + * RCU read-side critical sections are delimited by rcu_read_lock() + * and rcu_read_unlock(), and may be nested. In addition, but only in + * v5.0 and later, regions of code across which interrupts, preemption, + * or softirqs have been disabled also serve as RCU read-side critical * sections. This includes hardware interrupt handlers, softirq handlers, * and NMI handlers. * @@ -3774,6 +3730,9 @@ static int rcu_blocking_is_gp(void) * to have executed a full memory barrier during the execution of * synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but * again only if the system has more than one CPU). + * + * Implementation of these memory-ordering guarantees is described here: + * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst. */ void synchronize_rcu(void) { @@ -3844,11 +3803,11 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); /** * poll_state_synchronize_rcu - Conditionally wait for an RCU grace period * - * @oldstate: return from call to get_state_synchronize_rcu() or start_poll_synchronize_rcu() + * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu() * * If a full RCU grace period has elapsed since the earlier call from * which oldstate was obtained, return @true, otherwise return @false. - * If @false is returned, it is the caller's responsibilty to invoke this + * If @false is returned, it is the caller's responsibility to invoke this * function later on until it does return @true. Alternatively, the caller * can explicitly wait for a grace period, for example, by passing @oldstate * to cond_synchronize_rcu() or by directly invoking synchronize_rcu(). @@ -3860,6 +3819,11 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); * (many hours even on 32-bit systems) should check them occasionally * and either refresh them or set a flag indicating that the grace period * has completed. + * + * This function provides the same memory-ordering guarantees that + * would be provided by a synchronize_rcu() that was invoked at the call + * to the function that provided @oldstate, and that returned at the end + * of this function. */ bool poll_state_synchronize_rcu(unsigned long oldstate) { @@ -3874,7 +3838,7 @@ EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); /** * cond_synchronize_rcu - Conditionally wait for an RCU grace period * - * @oldstate: return value from earlier call to get_state_synchronize_rcu() + * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu() * * If a full RCU grace period has elapsed since the earlier call to * get_state_synchronize_rcu() or start_poll_synchronize_rcu(), just return. @@ -3884,6 +3848,11 @@ EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); * counter wrap is harmless. If the counter wraps, we have waited for * more than 2 billion grace periods (and way more on a 64-bit system!), * so waiting for one additional grace period should be just fine. + * + * This function provides the same memory-ordering guarantees that + * would be provided by a synchronize_rcu() that was invoked at the call + * to the function that provided @oldstate, and that returned at the end + * of this function. */ void cond_synchronize_rcu(unsigned long oldstate) { @@ -3911,7 +3880,7 @@ static int rcu_pending(int user) check_cpu_stall(rdp); /* Does this CPU need a deferred NOCB wakeup? */ - if (rcu_nocb_need_deferred_wakeup(rdp)) + if (rcu_nocb_need_deferred_wakeup(rdp, RCU_NOCB_WAKE)) return 1; /* Is this a nohz_full CPU in userspace or idle? (Ignore RCU if so.) */ @@ -4039,7 +4008,7 @@ void rcu_barrier(void) */ init_completion(&rcu_state.barrier_completion); atomic_set(&rcu_state.barrier_cpu_count, 2); - get_online_cpus(); + cpus_read_lock(); /* * Force each CPU with callbacks to register a new callback. @@ -4070,7 +4039,7 @@ void rcu_barrier(void) rcu_state.barrier_sequence); } } - put_online_cpus(); + cpus_read_unlock(); /* * Now that we have an rcu_barrier_callback() callback on each @@ -4094,7 +4063,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier); /* * Propagate ->qsinitmask bits up the rcu_node tree to account for the * first CPU in a given leaf rcu_node structure coming online. The caller - * must hold the corresponding leaf rcu_node ->lock with interrrupts + * must hold the corresponding leaf rcu_node ->lock with interrupts * disabled. */ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) @@ -4189,7 +4158,7 @@ int rcutree_prepare_cpu(unsigned int cpu) rdp->rcu_iw_gp_seq = rdp->gp_seq - 1; trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - rcu_prepare_kthreads(cpu); + rcu_spawn_one_boost_kthread(rnp); rcu_spawn_cpu_nocb_kthread(cpu); WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus + 1); @@ -4472,6 +4441,7 @@ static int __init rcu_spawn_gp_kthread(void) wake_up_process(t); rcu_spawn_nocb_kthreads(); rcu_spawn_boost_kthreads(); + rcu_spawn_core_kthreads(); return 0; } early_initcall(rcu_spawn_gp_kthread); @@ -4582,11 +4552,25 @@ static void __init rcu_init_one(void) * replace the definitions in tree.h because those are needed to size * the ->node array in the rcu_state structure. */ -static void __init rcu_init_geometry(void) +void rcu_init_geometry(void) { ulong d; int i; + static unsigned long old_nr_cpu_ids; int rcu_capacity[RCU_NUM_LVLS]; + static bool initialized; + + if (initialized) { + /* + * Warn if setup_nr_cpu_ids() had not yet been invoked, + * unless nr_cpus_ids == NR_CPUS, in which case who cares? + */ + WARN_ON_ONCE(old_nr_cpu_ids != nr_cpu_ids); + return; + } + + old_nr_cpu_ids = nr_cpu_ids; + initialized = true; /* * Initialize any unspecified boot parameters. @@ -4687,6 +4671,18 @@ static void __init kfree_rcu_batch_init(void) int cpu; int i; + /* Clamp it to [0:100] seconds interval. */ + if (rcu_delay_page_cache_fill_msec < 0 || + rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) { + + rcu_delay_page_cache_fill_msec = + clamp(rcu_delay_page_cache_fill_msec, 0, + (int) (100 * MSEC_PER_SEC)); + + pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n", + rcu_delay_page_cache_fill_msec); + } + for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); @@ -4696,7 +4692,7 @@ static void __init kfree_rcu_batch_init(void) } INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); - INIT_WORK(&krcp->page_cache_work, fill_page_cache_func); + INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func); krcp->initialized = true; } if (register_shrinker(&kfree_rcu_shrinker)) @@ -4730,12 +4726,11 @@ void __init rcu_init(void) rcutree_online_cpu(cpu); } - /* Create workqueue for expedited GPs and for Tree SRCU. */ + /* Create workqueue for Tree SRCU and for expedited GPs. */ rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); WARN_ON(!rcu_gp_wq); rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); WARN_ON(!rcu_par_gp_wq); - srcu_init(); /* Fill in default value for rcutree.qovld boot parameter. */ /* -After- the rcu_node ->lock fields are initialized! */ @@ -4747,4 +4742,5 @@ void __init rcu_init(void) #include "tree_stall.h" #include "tree_exp.h" +#include "tree_nocb.h" #include "tree_plugin.h" |