diff options
Diffstat (limited to 'kernel/rcu')
| -rw-r--r-- | kernel/rcu/Kconfig | 18 | ||||
| -rw-r--r-- | kernel/rcu/rcu.h | 16 | ||||
| -rw-r--r-- | kernel/rcu/rcuscale.c | 282 | ||||
| -rw-r--r-- | kernel/rcu/rcutorture.c | 7 | ||||
| -rw-r--r-- | kernel/rcu/refscale.c | 37 | ||||
| -rw-r--r-- | kernel/rcu/tasks.h | 148 | ||||
| -rw-r--r-- | kernel/rcu/tree.c | 147 | ||||
| -rw-r--r-- | kernel/rcu/tree_exp.h | 2 | ||||
| -rw-r--r-- | kernel/rcu/tree_nocb.h | 56 | ||||
| -rw-r--r-- | kernel/rcu/tree_plugin.h | 4 | ||||
| -rw-r--r-- | kernel/rcu/tree_stall.h | 2 |
11 files changed, 526 insertions, 193 deletions
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 9071182b1284..bdd7eadb33d8 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -314,4 +314,22 @@ config RCU_LAZY To save power, batch RCU callbacks and flush after delay, memory pressure, or callback list growing too big. +config RCU_DOUBLE_CHECK_CB_TIME + bool "RCU callback-batch backup time check" + depends on RCU_EXPERT + default n + help + Use this option to provide more precise enforcement of the + rcutree.rcu_resched_ns module parameter in situations where + a single RCU callback might run for hundreds of microseconds, + thus defeating the 32-callback batching used to amortize the + cost of the fine-grained but expensive local_clock() function. + + This option rounds rcutree.rcu_resched_ns up to the next + jiffy, and overrides the 32-callback batching if this limit + is exceeded. + + Say Y here if you need tighter callback-limit enforcement. + Say N here if you are unsure. + endmenu # "RCU Subsystem" diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 4a1b9622598b..98e13be411af 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -493,7 +493,6 @@ static inline void rcu_expedite_gp(void) { } static inline void rcu_unexpedite_gp(void) { } static inline void rcu_async_hurry(void) { } static inline void rcu_async_relax(void) { } -static inline void rcu_request_urgent_qs_task(struct task_struct *t) { } #else /* #ifdef CONFIG_TINY_RCU */ bool rcu_gp_is_normal(void); /* Internal RCU use. */ bool rcu_gp_is_expedited(void); /* Internal RCU use. */ @@ -508,9 +507,16 @@ void show_rcu_tasks_gp_kthreads(void); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ static inline void show_rcu_tasks_gp_kthreads(void) {} #endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ -void rcu_request_urgent_qs_task(struct task_struct *t); #endif /* #else #ifdef CONFIG_TINY_RCU */ +#ifdef CONFIG_TASKS_RCU +struct task_struct *get_rcu_tasks_gp_kthread(void); +#endif // # ifdef CONFIG_TASKS_RCU + +#ifdef CONFIG_TASKS_RUDE_RCU +struct task_struct *get_rcu_tasks_rude_gp_kthread(void); +#endif // # ifdef CONFIG_TASKS_RUDE_RCU + #define RCU_SCHEDULER_INACTIVE 0 #define RCU_SCHEDULER_INIT 1 #define RCU_SCHEDULER_RUNNING 2 @@ -642,4 +648,10 @@ void show_rcu_tasks_trace_gp_kthread(void); static inline void show_rcu_tasks_trace_gp_kthread(void) {} #endif +#ifdef CONFIG_TINY_RCU +static inline bool rcu_cpu_beenfullyonline(int cpu) { return true; } +#else +bool rcu_cpu_beenfullyonline(int cpu); +#endif + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index e82ec9f9a5d8..ffdb30495e3c 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -84,15 +84,17 @@ MODULE_AUTHOR("Paul E. McKenney <[email protected]>"); #endif torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); -torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader"); +torture_param(int, gp_async_max, 1000, "Max # outstanding waits per writer"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); +torture_param(int, minruntime, 0, "Minimum run time (s)"); torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, nwriters, -1, "Number of RCU updater threads"); torture_param(bool, shutdown, RCUSCALE_SHUTDOWN, "Shutdown at end of scalability tests."); torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); +torture_param(int, writer_holdoff_jiffies, 0, "Holdoff (jiffies) between GPs, zero to disable"); torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() scale test?"); torture_param(int, kfree_mult, 1, "Multiple of kfree_obj size to allocate."); torture_param(int, kfree_by_call_rcu, 0, "Use call_rcu() to emulate kfree_rcu()?"); @@ -139,6 +141,7 @@ struct rcu_scale_ops { void (*gp_barrier)(void); void (*sync)(void); void (*exp_sync)(void); + struct task_struct *(*rso_gp_kthread)(void); const char *name; }; @@ -295,6 +298,7 @@ static struct rcu_scale_ops tasks_ops = { .gp_barrier = rcu_barrier_tasks, .sync = synchronize_rcu_tasks, .exp_sync = synchronize_rcu_tasks, + .rso_gp_kthread = get_rcu_tasks_gp_kthread, .name = "tasks" }; @@ -306,6 +310,44 @@ static struct rcu_scale_ops tasks_ops = { #endif // #else // #ifdef CONFIG_TASKS_RCU +#ifdef CONFIG_TASKS_RUDE_RCU + +/* + * Definitions for RCU-tasks-rude scalability testing. + */ + +static int tasks_rude_scale_read_lock(void) +{ + return 0; +} + +static void tasks_rude_scale_read_unlock(int idx) +{ +} + +static struct rcu_scale_ops tasks_rude_ops = { + .ptype = RCU_TASKS_RUDE_FLAVOR, + .init = rcu_sync_scale_init, + .readlock = tasks_rude_scale_read_lock, + .readunlock = tasks_rude_scale_read_unlock, + .get_gp_seq = rcu_no_completed, + .gp_diff = rcu_seq_diff, + .async = call_rcu_tasks_rude, + .gp_barrier = rcu_barrier_tasks_rude, + .sync = synchronize_rcu_tasks_rude, + .exp_sync = synchronize_rcu_tasks_rude, + .rso_gp_kthread = get_rcu_tasks_rude_gp_kthread, + .name = "tasks-rude" +}; + +#define TASKS_RUDE_OPS &tasks_rude_ops, + +#else // #ifdef CONFIG_TASKS_RUDE_RCU + +#define TASKS_RUDE_OPS + +#endif // #else // #ifdef CONFIG_TASKS_RUDE_RCU + #ifdef CONFIG_TASKS_TRACE_RCU /* @@ -334,6 +376,7 @@ static struct rcu_scale_ops tasks_tracing_ops = { .gp_barrier = rcu_barrier_tasks_trace, .sync = synchronize_rcu_tasks_trace, .exp_sync = synchronize_rcu_tasks_trace, + .rso_gp_kthread = get_rcu_tasks_trace_gp_kthread, .name = "tasks-tracing" }; @@ -410,10 +453,12 @@ rcu_scale_writer(void *arg) { int i = 0; int i_max; + unsigned long jdone; long me = (long)arg; struct rcu_head *rhp = NULL; bool started = false, done = false, alldone = false; u64 t; + DEFINE_TORTURE_RANDOM(tr); u64 *wdp; u64 *wdpp = writer_durations[me]; @@ -424,7 +469,7 @@ rcu_scale_writer(void *arg) sched_set_fifo_low(current); if (holdoff) - schedule_timeout_uninterruptible(holdoff * HZ); + schedule_timeout_idle(holdoff * HZ); /* * Wait until rcu_end_inkernel_boot() is called for normal GP tests @@ -445,9 +490,12 @@ rcu_scale_writer(void *arg) } } + jdone = jiffies + minruntime * HZ; do { if (writer_holdoff) udelay(writer_holdoff); + if (writer_holdoff_jiffies) + schedule_timeout_idle(torture_random(&tr) % writer_holdoff_jiffies + 1); wdp = &wdpp[i]; *wdp = ktime_get_mono_fast_ns(); if (gp_async) { @@ -475,7 +523,7 @@ retry: if (!started && atomic_read(&n_rcu_scale_writer_started) >= nrealwriters) started = true; - if (!done && i >= MIN_MEAS) { + if (!done && i >= MIN_MEAS && time_after(jiffies, jdone)) { done = true; sched_set_normal(current, 0); pr_alert("%s%s rcu_scale_writer %ld has %d measurements\n", @@ -518,91 +566,8 @@ static void rcu_scale_print_module_parms(struct rcu_scale_ops *cur_ops, const char *tag) { pr_alert("%s" SCALE_FLAG - "--- %s: nreaders=%d nwriters=%d verbose=%d shutdown=%d\n", - scale_type, tag, nrealreaders, nrealwriters, verbose, shutdown); -} - -static void -rcu_scale_cleanup(void) -{ - int i; - int j; - int ngps = 0; - u64 *wdp; - u64 *wdpp; - - /* - * Would like warning at start, but everything is expedited - * during the mid-boot phase, so have to wait till the end. - */ - if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) - SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); - if (rcu_gp_is_normal() && gp_exp) - SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); - if (gp_exp && gp_async) - SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!"); - - if (torture_cleanup_begin()) - return; - if (!cur_ops) { - torture_cleanup_end(); - return; - } - - if (reader_tasks) { - for (i = 0; i < nrealreaders; i++) - torture_stop_kthread(rcu_scale_reader, - reader_tasks[i]); - kfree(reader_tasks); - } - - if (writer_tasks) { - for (i = 0; i < nrealwriters; i++) { - torture_stop_kthread(rcu_scale_writer, - writer_tasks[i]); - if (!writer_n_durations) - continue; - j = writer_n_durations[i]; - pr_alert("%s%s writer %d gps: %d\n", - scale_type, SCALE_FLAG, i, j); - ngps += j; - } - pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n", - scale_type, SCALE_FLAG, - t_rcu_scale_writer_started, t_rcu_scale_writer_finished, - t_rcu_scale_writer_finished - - t_rcu_scale_writer_started, - ngps, - rcuscale_seq_diff(b_rcu_gp_test_finished, - b_rcu_gp_test_started)); - for (i = 0; i < nrealwriters; i++) { - if (!writer_durations) - break; - if (!writer_n_durations) - continue; - wdpp = writer_durations[i]; - if (!wdpp) - continue; - for (j = 0; j < writer_n_durations[i]; j++) { - wdp = &wdpp[j]; - pr_alert("%s%s %4d writer-duration: %5d %llu\n", - scale_type, SCALE_FLAG, - i, j, *wdp); - if (j % 100 == 0) - schedule_timeout_uninterruptible(1); - } - kfree(writer_durations[i]); - } - kfree(writer_tasks); - kfree(writer_durations); - kfree(writer_n_durations); - } - - /* Do torture-type-specific cleanup operations. */ - if (cur_ops->cleanup != NULL) - cur_ops->cleanup(); - - torture_cleanup_end(); + "--- %s: gp_async=%d gp_async_max=%d gp_exp=%d holdoff=%d minruntime=%d nreaders=%d nwriters=%d writer_holdoff=%d writer_holdoff_jiffies=%d verbose=%d shutdown=%d\n", + scale_type, tag, gp_async, gp_async_max, gp_exp, holdoff, minruntime, nrealreaders, nrealwriters, writer_holdoff, writer_holdoff_jiffies, verbose, shutdown); } /* @@ -625,20 +590,6 @@ static int compute_real(int n) } /* - * RCU scalability shutdown kthread. Just waits to be awakened, then shuts - * down system. - */ -static int -rcu_scale_shutdown(void *arg) -{ - wait_event_idle(shutdown_wq, atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters); - smp_mb(); /* Wake before output. */ - rcu_scale_cleanup(); - kernel_power_off(); - return -EINVAL; -} - -/* * kfree_rcu() scalability tests: Start a kfree_rcu() loop on all CPUs for number * of iterations and measure total time and number of GP for all iterations to complete. */ @@ -653,6 +604,8 @@ static struct task_struct **kfree_reader_tasks; static int kfree_nrealthreads; static atomic_t n_kfree_scale_thread_started; static atomic_t n_kfree_scale_thread_ended; +static struct task_struct *kthread_tp; +static u64 kthread_stime; struct kfree_obj { char kfree_obj[8]; @@ -798,6 +751,10 @@ kfree_scale_init(void) unsigned long jif_start; unsigned long orig_jif; + pr_alert("%s" SCALE_FLAG + "--- kfree_rcu_test: kfree_mult=%d kfree_by_call_rcu=%d kfree_nthreads=%d kfree_alloc_num=%d kfree_loops=%d kfree_rcu_test_double=%d kfree_rcu_test_single=%d\n", + scale_type, kfree_mult, kfree_by_call_rcu, kfree_nthreads, kfree_alloc_num, kfree_loops, kfree_rcu_test_double, kfree_rcu_test_single); + // Also, do a quick self-test to ensure laziness is as much as // expected. if (kfree_by_call_rcu && !IS_ENABLED(CONFIG_RCU_LAZY)) { @@ -874,13 +831,127 @@ unwind: return firsterr; } +static void +rcu_scale_cleanup(void) +{ + int i; + int j; + int ngps = 0; + u64 *wdp; + u64 *wdpp; + + /* + * Would like warning at start, but everything is expedited + * during the mid-boot phase, so have to wait till the end. + */ + if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) + SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); + if (rcu_gp_is_normal() && gp_exp) + SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); + if (gp_exp && gp_async) + SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!"); + + // If built-in, just report all of the GP kthread's CPU time. + if (IS_BUILTIN(CONFIG_RCU_SCALE_TEST) && !kthread_tp && cur_ops->rso_gp_kthread) + kthread_tp = cur_ops->rso_gp_kthread(); + if (kthread_tp) { + u32 ns; + u64 us; + + kthread_stime = kthread_tp->stime - kthread_stime; + us = div_u64_rem(kthread_stime, 1000, &ns); + pr_info("rcu_scale: Grace-period kthread CPU time: %llu.%03u us\n", us, ns); + show_rcu_gp_kthreads(); + } + if (kfree_rcu_test) { + kfree_scale_cleanup(); + return; + } + + if (torture_cleanup_begin()) + return; + if (!cur_ops) { + torture_cleanup_end(); + return; + } + + if (reader_tasks) { + for (i = 0; i < nrealreaders; i++) + torture_stop_kthread(rcu_scale_reader, + reader_tasks[i]); + kfree(reader_tasks); + } + + if (writer_tasks) { + for (i = 0; i < nrealwriters; i++) { + torture_stop_kthread(rcu_scale_writer, + writer_tasks[i]); + if (!writer_n_durations) + continue; + j = writer_n_durations[i]; + pr_alert("%s%s writer %d gps: %d\n", + scale_type, SCALE_FLAG, i, j); + ngps += j; + } + pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n", + scale_type, SCALE_FLAG, + t_rcu_scale_writer_started, t_rcu_scale_writer_finished, + t_rcu_scale_writer_finished - + t_rcu_scale_writer_started, + ngps, + rcuscale_seq_diff(b_rcu_gp_test_finished, + b_rcu_gp_test_started)); + for (i = 0; i < nrealwriters; i++) { + if (!writer_durations) + break; + if (!writer_n_durations) + continue; + wdpp = writer_durations[i]; + if (!wdpp) + continue; + for (j = 0; j < writer_n_durations[i]; j++) { + wdp = &wdpp[j]; + pr_alert("%s%s %4d writer-duration: %5d %llu\n", + scale_type, SCALE_FLAG, + i, j, *wdp); + if (j % 100 == 0) + schedule_timeout_uninterruptible(1); + } + kfree(writer_durations[i]); + } + kfree(writer_tasks); + kfree(writer_durations); + kfree(writer_n_durations); + } + + /* Do torture-type-specific cleanup operations. */ + if (cur_ops->cleanup != NULL) + cur_ops->cleanup(); + + torture_cleanup_end(); +} + +/* + * RCU scalability shutdown kthread. Just waits to be awakened, then shuts + * down system. + */ +static int +rcu_scale_shutdown(void *arg) +{ + wait_event_idle(shutdown_wq, atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters); + smp_mb(); /* Wake before output. */ + rcu_scale_cleanup(); + kernel_power_off(); + return -EINVAL; +} + static int __init rcu_scale_init(void) { long i; int firsterr = 0; static struct rcu_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS TASKS_TRACING_OPS + &rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS TASKS_RUDE_OPS TASKS_TRACING_OPS }; if (!torture_init_begin(scale_type, verbose)) @@ -905,6 +976,11 @@ rcu_scale_init(void) if (cur_ops->init) cur_ops->init(); + if (cur_ops->rso_gp_kthread) { + kthread_tp = cur_ops->rso_gp_kthread(); + if (kthread_tp) + kthread_stime = kthread_tp->stime; + } if (kfree_rcu_test) return kfree_scale_init(); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 147551c23baf..ade42d6a9d9b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1581,6 +1581,7 @@ rcu_torture_writer(void *arg) rcu_access_pointer(rcu_torture_current) != &rcu_tortures[i]) { tracing_off(); + show_rcu_gp_kthreads(); WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count); rcu_ftrace_dump(DUMP_ALL); } @@ -1876,7 +1877,7 @@ static int rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) { int mask = rcutorture_extend_mask_max(); - unsigned long randmask1 = torture_random(trsp) >> 8; + unsigned long randmask1 = torture_random(trsp); unsigned long randmask2 = randmask1 >> 3; unsigned long preempts = RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED; unsigned long preempts_irq = preempts | RCUTORTURE_RDR_IRQ; @@ -1935,7 +1936,7 @@ rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp, if (!((mask - 1) & mask)) return rtrsp; /* Current RCU reader not extendable. */ /* Bias towards larger numbers of loops. */ - i = (torture_random(trsp) >> 3); + i = torture_random(trsp); i = ((i | (i >> 3)) & RCUTORTURE_RDR_MAX_LOOPS) + 1; for (j = 0; j < i; j++) { mask = rcutorture_extend_mask(*readstate, trsp); @@ -2136,7 +2137,7 @@ static int rcu_nocb_toggle(void *arg) toggle_fuzz = NSEC_PER_USEC; do { r = torture_random(&rand); - cpu = (r >> 4) % (maxcpu + 1); + cpu = (r >> 1) % (maxcpu + 1); if (r & 0x1) { rcu_nocb_cpu_offload(cpu); atomic_long_inc(&n_nocb_offload); diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 1970ce5f22d4..91a0fd0d4d9a 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -528,6 +528,38 @@ static struct ref_scale_ops clock_ops = { .name = "clock" }; +static void ref_jiffies_section(const int nloops) +{ + u64 x = 0; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) + x += jiffies; + preempt_enable(); + stopopts = x; +} + +static void ref_jiffies_delay_section(const int nloops, const int udl, const int ndl) +{ + u64 x = 0; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + x += jiffies; + un_delay(udl, ndl); + } + preempt_enable(); + stopopts = x; +} + +static struct ref_scale_ops jiffies_ops = { + .readsection = ref_jiffies_section, + .delaysection = ref_jiffies_delay_section, + .name = "jiffies" +}; + //////////////////////////////////////////////////////////////////////// // // Methods leveraging SLAB_TYPESAFE_BY_RCU. @@ -1047,7 +1079,7 @@ ref_scale_init(void) int firsterr = 0; static struct ref_scale_ops *scale_ops[] = { &rcu_ops, &srcu_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops, - &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, + &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, &jiffies_ops, &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops, }; @@ -1107,12 +1139,11 @@ ref_scale_init(void) VERBOSE_SCALEOUT("Starting %d reader threads", nreaders); for (i = 0; i < nreaders; i++) { + init_waitqueue_head(&reader_tasks[i].wq); firsterr = torture_create_kthread(ref_scale_reader, (void *)i, reader_tasks[i].task); if (torture_init_error(firsterr)) goto unwind; - - init_waitqueue_head(&(reader_tasks[i].wq)); } // Main Task diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 5f4fc8184dd0..8d65f7d576a3 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -25,6 +25,8 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); * @cblist: Callback list. * @lock: Lock protecting per-CPU callback list. * @rtp_jiffies: Jiffies counter value for statistics. + * @lazy_timer: Timer to unlazify callbacks. + * @urgent_gp: Number of additional non-lazy grace periods. * @rtp_n_lock_retries: Rough lock-contention statistic. * @rtp_work: Work queue for invoking callbacks. * @rtp_irq_work: IRQ work queue for deferred wakeups. @@ -38,6 +40,8 @@ struct rcu_tasks_percpu { raw_spinlock_t __private lock; unsigned long rtp_jiffies; unsigned long rtp_n_lock_retries; + struct timer_list lazy_timer; + unsigned int urgent_gp; struct work_struct rtp_work; struct irq_work rtp_irq_work; struct rcu_head barrier_q_head; @@ -51,7 +55,6 @@ struct rcu_tasks_percpu { * @cbs_wait: RCU wait allowing a new callback to get kthread's attention. * @cbs_gbl_lock: Lock protecting callback list. * @tasks_gp_mutex: Mutex protecting grace period, needed during mid-boot dead zone. - * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. * @gp_func: This flavor's grace-period-wait function. * @gp_state: Grace period's most recent state transition (debugging). * @gp_sleep: Per-grace-period sleep to prevent CPU-bound looping. @@ -61,6 +64,8 @@ struct rcu_tasks_percpu { * @tasks_gp_seq: Number of grace periods completed since boot. * @n_ipis: Number of IPIs sent to encourage grace periods to end. * @n_ipis_fails: Number of IPI-send failures. + * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. + * @lazy_jiffies: Number of jiffies to allow callbacks to be lazy. * @pregp_func: This flavor's pre-grace-period function (optional). * @pertask_func: This flavor's per-task scan function (optional). * @postscan_func: This flavor's post-task scan function (optional). @@ -92,6 +97,7 @@ struct rcu_tasks { unsigned long n_ipis; unsigned long n_ipis_fails; struct task_struct *kthread_ptr; + unsigned long lazy_jiffies; rcu_tasks_gp_func_t gp_func; pregp_func_t pregp_func; pertask_func_t pertask_func; @@ -127,6 +133,7 @@ static struct rcu_tasks rt_name = \ .gp_func = gp, \ .call_func = call, \ .rtpcpu = &rt_name ## __percpu, \ + .lazy_jiffies = DIV_ROUND_UP(HZ, 4), \ .name = n, \ .percpu_enqueue_shift = order_base_2(CONFIG_NR_CPUS), \ .percpu_enqueue_lim = 1, \ @@ -139,9 +146,7 @@ static struct rcu_tasks rt_name = \ #ifdef CONFIG_TASKS_RCU /* Track exiting tasks in order to allow them to be waited for. */ DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); -#endif -#ifdef CONFIG_TASKS_RCU /* Report delay in synchronize_srcu() completion in rcu_tasks_postscan(). */ static void tasks_rcu_exit_srcu_stall(struct timer_list *unused); static DEFINE_TIMER(tasks_rcu_exit_srcu_stall_timer, tasks_rcu_exit_srcu_stall); @@ -171,6 +176,8 @@ static int rcu_task_contend_lim __read_mostly = 100; module_param(rcu_task_contend_lim, int, 0444); static int rcu_task_collapse_lim __read_mostly = 10; module_param(rcu_task_collapse_lim, int, 0444); +static int rcu_task_lazy_lim __read_mostly = 32; +module_param(rcu_task_lazy_lim, int, 0444); /* RCU tasks grace-period state for debugging. */ #define RTGS_INIT 0 @@ -229,7 +236,7 @@ static const char *tasks_gp_state_getname(struct rcu_tasks *rtp) #endif /* #ifndef CONFIG_TINY_RCU */ // Initialize per-CPU callback lists for the specified flavor of -// Tasks RCU. +// Tasks RCU. Do not enqueue callbacks before this function is invoked. static void cblist_init_generic(struct rcu_tasks *rtp) { int cpu; @@ -237,11 +244,9 @@ static void cblist_init_generic(struct rcu_tasks *rtp) int lim; int shift; - raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); if (rcu_task_enqueue_lim < 0) { rcu_task_enqueue_lim = 1; rcu_task_cb_adjust = true; - pr_info("%s: Setting adjustable number of callback queues.\n", __func__); } else if (rcu_task_enqueue_lim == 0) { rcu_task_enqueue_lim = 1; } @@ -261,18 +266,46 @@ static void cblist_init_generic(struct rcu_tasks *rtp) WARN_ON_ONCE(!rtpcp); if (cpu) raw_spin_lock_init(&ACCESS_PRIVATE(rtpcp, lock)); - raw_spin_lock_rcu_node(rtpcp); // irqs already disabled. + local_irq_save(flags); // serialize initialization if (rcu_segcblist_empty(&rtpcp->cblist)) rcu_segcblist_init(&rtpcp->cblist); + local_irq_restore(flags); INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq); rtpcp->cpu = cpu; rtpcp->rtpp = rtp; if (!rtpcp->rtp_blkd_tasks.next) INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks); - raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled. } - raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); - pr_info("%s: Setting shift to %d and lim to %d.\n", __func__, data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim)); + + pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d.\n", rtp->name, + data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim), rcu_task_cb_adjust); +} + +// Compute wakeup time for lazy callback timer. +static unsigned long rcu_tasks_lazy_time(struct rcu_tasks *rtp) +{ + return jiffies + rtp->lazy_jiffies; +} + +// Timer handler that unlazifies lazy callbacks. +static void call_rcu_tasks_generic_timer(struct timer_list *tlp) +{ + unsigned long flags; + bool needwake = false; + struct rcu_tasks *rtp; + struct rcu_tasks_percpu *rtpcp = from_timer(rtpcp, tlp, lazy_timer); + + rtp = rtpcp->rtpp; + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + if (!rcu_segcblist_empty(&rtpcp->cblist) && rtp->lazy_jiffies) { + if (!rtpcp->urgent_gp) + rtpcp->urgent_gp = 1; + needwake = true; + mod_timer(&rtpcp->lazy_timer, rcu_tasks_lazy_time(rtp)); + } + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + if (needwake) + rcuwait_wake_up(&rtp->cbs_wait); } // IRQ-work handler that does deferred wakeup for call_rcu_tasks_generic(). @@ -291,6 +324,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, { int chosen_cpu; unsigned long flags; + bool havekthread = smp_load_acquire(&rtp->kthread_ptr); int ideal_cpu; unsigned long j; bool needadjust = false; @@ -315,12 +349,19 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, READ_ONCE(rtp->percpu_enqueue_lim) != nr_cpu_ids) needadjust = true; // Defer adjustment to avoid deadlock. } - if (!rcu_segcblist_is_enabled(&rtpcp->cblist)) { - raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled. - cblist_init_generic(rtp); - raw_spin_lock_rcu_node(rtpcp); // irqs already disabled. + // Queuing callbacks before initialization not yet supported. + if (WARN_ON_ONCE(!rcu_segcblist_is_enabled(&rtpcp->cblist))) + rcu_segcblist_init(&rtpcp->cblist); + needwake = (func == wakeme_after_rcu) || + (rcu_segcblist_n_cbs(&rtpcp->cblist) == rcu_task_lazy_lim); + if (havekthread && !needwake && !timer_pending(&rtpcp->lazy_timer)) { + if (rtp->lazy_jiffies) + mod_timer(&rtpcp->lazy_timer, rcu_tasks_lazy_time(rtp)); + else + needwake = rcu_segcblist_empty(&rtpcp->cblist); } - needwake = rcu_segcblist_empty(&rtpcp->cblist); + if (needwake) + rtpcp->urgent_gp = 3; rcu_segcblist_enqueue(&rtpcp->cblist, rhp); raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); if (unlikely(needadjust)) { @@ -414,9 +455,14 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) } rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq)); (void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq)); - if (rcu_segcblist_pend_cbs(&rtpcp->cblist)) + if (rtpcp->urgent_gp > 0 && rcu_segcblist_pend_cbs(&rtpcp->cblist)) { + if (rtp->lazy_jiffies) + rtpcp->urgent_gp--; needgpcb |= 0x3; - if (!rcu_segcblist_empty(&rtpcp->cblist)) + } else if (rcu_segcblist_empty(&rtpcp->cblist)) { + rtpcp->urgent_gp = 0; + } + if (rcu_segcblist_ready_cbs(&rtpcp->cblist)) needgpcb |= 0x1; raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); } @@ -463,6 +509,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu { int cpu; int cpunext; + int cpuwq; unsigned long flags; int len; struct rcu_head *rhp; @@ -473,11 +520,13 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu cpunext = cpu * 2 + 1; if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) { rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext); - queue_work_on(cpunext, system_wq, &rtpcp_next->rtp_work); + cpuwq = rcu_cpu_beenfullyonline(cpunext) ? cpunext : WORK_CPU_UNBOUND; + queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work); cpunext++; if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) { rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext); - queue_work_on(cpunext, system_wq, &rtpcp_next->rtp_work); + cpuwq = rcu_cpu_beenfullyonline(cpunext) ? cpunext : WORK_CPU_UNBOUND; + queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work); } } @@ -521,10 +570,12 @@ static void rcu_tasks_one_gp(struct rcu_tasks *rtp, bool midboot) if (unlikely(midboot)) { needgpcb = 0x2; } else { + mutex_unlock(&rtp->tasks_gp_mutex); set_tasks_gp_state(rtp, RTGS_WAIT_CBS); rcuwait_wait_event(&rtp->cbs_wait, (needgpcb = rcu_tasks_need_gpcb(rtp)), TASK_IDLE); + mutex_lock(&rtp->tasks_gp_mutex); } if (needgpcb & 0x2) { @@ -545,11 +596,19 @@ static void rcu_tasks_one_gp(struct rcu_tasks *rtp, bool midboot) // RCU-tasks kthread that detects grace periods and invokes callbacks. static int __noreturn rcu_tasks_kthread(void *arg) { + int cpu; struct rcu_tasks *rtp = arg; + for_each_possible_cpu(cpu) { + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + + timer_setup(&rtpcp->lazy_timer, call_rcu_tasks_generic_timer, 0); + rtpcp->urgent_gp = 1; + } + /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ housekeeping_affine(current, HK_TYPE_RCU); - WRITE_ONCE(rtp->kthread_ptr, current); // Let GPs start! + smp_store_release(&rtp->kthread_ptr, current); // Let GPs start! /* * Each pass through the following loop makes one check for @@ -631,16 +690,22 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) { int cpu; bool havecbs = false; + bool haveurgent = false; + bool haveurgentcbs = false; for_each_possible_cpu(cpu) { struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); - if (!data_race(rcu_segcblist_empty(&rtpcp->cblist))) { + if (!data_race(rcu_segcblist_empty(&rtpcp->cblist))) havecbs = true; + if (data_race(rtpcp->urgent_gp)) + haveurgent = true; + if (!data_race(rcu_segcblist_empty(&rtpcp->cblist)) && data_race(rtpcp->urgent_gp)) + haveurgentcbs = true; + if (havecbs && haveurgent && haveurgentcbs) break; - } } - pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c %s\n", + pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c%c%c l:%lu %s\n", rtp->kname, tasks_gp_state_getname(rtp), data_race(rtp->gp_state), jiffies - data_race(rtp->gp_jiffies), @@ -648,6 +713,9 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) data_race(rtp->n_ipis_fails), data_race(rtp->n_ipis), ".k"[!!data_race(rtp->kthread_ptr)], ".C"[havecbs], + ".u"[haveurgent], + ".U"[haveurgentcbs], + rtp->lazy_jiffies, s); } #endif // #ifndef CONFIG_TINY_RCU @@ -1016,11 +1084,16 @@ void rcu_barrier_tasks(void) } EXPORT_SYMBOL_GPL(rcu_barrier_tasks); +int rcu_tasks_lazy_ms = -1; +module_param(rcu_tasks_lazy_ms, int, 0444); + static int __init rcu_spawn_tasks_kthread(void) { cblist_init_generic(&rcu_tasks); rcu_tasks.gp_sleep = HZ / 10; rcu_tasks.init_fract = HZ / 10; + if (rcu_tasks_lazy_ms >= 0) + rcu_tasks.lazy_jiffies = msecs_to_jiffies(rcu_tasks_lazy_ms); rcu_tasks.pregp_func = rcu_tasks_pregp_step; rcu_tasks.pertask_func = rcu_tasks_pertask; rcu_tasks.postscan_func = rcu_tasks_postscan; @@ -1038,6 +1111,12 @@ void show_rcu_tasks_classic_gp_kthread(void) EXPORT_SYMBOL_GPL(show_rcu_tasks_classic_gp_kthread); #endif // !defined(CONFIG_TINY_RCU) +struct task_struct *get_rcu_tasks_gp_kthread(void) +{ + return rcu_tasks.kthread_ptr; +} +EXPORT_SYMBOL_GPL(get_rcu_tasks_gp_kthread); + /* * Contribute to protect against tasklist scan blind spot while the * task is exiting and may be removed from the tasklist. See @@ -1169,10 +1248,15 @@ void rcu_barrier_tasks_rude(void) } EXPORT_SYMBOL_GPL(rcu_barrier_tasks_rude); +int rcu_tasks_rude_lazy_ms = -1; +module_param(rcu_tasks_rude_lazy_ms, int, 0444); + static int __init rcu_spawn_tasks_rude_kthread(void) { cblist_init_generic(&rcu_tasks_rude); rcu_tasks_rude.gp_sleep = HZ / 10; + if (rcu_tasks_rude_lazy_ms >= 0) + rcu_tasks_rude.lazy_jiffies = msecs_to_jiffies(rcu_tasks_rude_lazy_ms); rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude); return 0; } @@ -1184,6 +1268,13 @@ void show_rcu_tasks_rude_gp_kthread(void) } EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread); #endif // !defined(CONFIG_TINY_RCU) + +struct task_struct *get_rcu_tasks_rude_gp_kthread(void) +{ + return rcu_tasks_rude.kthread_ptr; +} +EXPORT_SYMBOL_GPL(get_rcu_tasks_rude_gp_kthread); + #endif /* #ifdef CONFIG_TASKS_RUDE_RCU */ //////////////////////////////////////////////////////////////////////// @@ -1789,6 +1880,9 @@ void rcu_barrier_tasks_trace(void) } EXPORT_SYMBOL_GPL(rcu_barrier_tasks_trace); +int rcu_tasks_trace_lazy_ms = -1; +module_param(rcu_tasks_trace_lazy_ms, int, 0444); + static int __init rcu_spawn_tasks_trace_kthread(void) { cblist_init_generic(&rcu_tasks_trace); @@ -1803,6 +1897,8 @@ static int __init rcu_spawn_tasks_trace_kthread(void) if (rcu_tasks_trace.init_fract <= 0) rcu_tasks_trace.init_fract = 1; } + if (rcu_tasks_trace_lazy_ms >= 0) + rcu_tasks_trace.lazy_jiffies = msecs_to_jiffies(rcu_tasks_trace_lazy_ms); rcu_tasks_trace.pregp_func = rcu_tasks_trace_pregp_step; rcu_tasks_trace.postscan_func = rcu_tasks_trace_postscan; rcu_tasks_trace.holdouts_func = check_all_holdout_tasks_trace; @@ -1826,6 +1922,12 @@ void show_rcu_tasks_trace_gp_kthread(void) EXPORT_SYMBOL_GPL(show_rcu_tasks_trace_gp_kthread); #endif // !defined(CONFIG_TINY_RCU) +struct task_struct *get_rcu_tasks_trace_gp_kthread(void) +{ + return rcu_tasks_trace.kthread_ptr; +} +EXPORT_SYMBOL_GPL(get_rcu_tasks_trace_gp_kthread); + #else /* #ifdef CONFIG_TASKS_TRACE_RCU */ static void exit_tasks_rcu_finish_trace(struct task_struct *t) { } #endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f52ff7241041..cb1caefa8bd0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -632,7 +632,7 @@ void __rcu_irq_enter_check_tick(void) // prevents self-deadlock. So we can safely recheck under the lock. // Note that the nohz_full state currently cannot change. raw_spin_lock_rcu_node(rdp->mynode); - if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { + if (READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) { // A nohz_full CPU is in the kernel and RCU needs a // quiescent state. Turn on the tick! WRITE_ONCE(rdp->rcu_forced_tick, true); @@ -677,12 +677,16 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) } /** - * rcu_is_watching - see if RCU thinks that the current CPU is not idle + * rcu_is_watching - RCU read-side critical sections permitted on current CPU? * - * Return true if RCU is watching the running CPU, which means that this - * CPU can safely enter RCU read-side critical sections. In other words, - * if the current CPU is not in its idle loop or is in an interrupt or - * NMI handler, return true. + * Return @true if RCU is watching the running CPU and @false otherwise. + * An @true return means that this CPU can safely enter RCU read-side + * critical sections. + * + * Although calls to rcu_is_watching() from most parts of the kernel + * will return @true, there are important exceptions. For example, if the + * current CPU is deep within its idle loop, in kernel entry/exit code, + * or offline, rcu_is_watching() will return @false. * * Make notrace because it can be called by the internal functions of * ftrace, and making this notrace removes unnecessary recursion calls. @@ -2046,19 +2050,35 @@ rcu_check_quiescent_state(struct rcu_data *rdp) rcu_report_qs_rdp(rdp); } +/* Return true if callback-invocation time limit exceeded. */ +static bool rcu_do_batch_check_time(long count, long tlimit, + bool jlimit_check, unsigned long jlimit) +{ + // Invoke local_clock() only once per 32 consecutive callbacks. + return unlikely(tlimit) && + (!likely(count & 31) || + (IS_ENABLED(CONFIG_RCU_DOUBLE_CHECK_CB_TIME) && + jlimit_check && time_after(jiffies, jlimit))) && + local_clock() >= tlimit; +} + /* * Invoke any RCU callbacks that have made it to the end of their grace * period. Throttle as specified by rdp->blimit. */ static void rcu_do_batch(struct rcu_data *rdp) { + long bl; + long count = 0; int div; bool __maybe_unused empty; unsigned long flags; - struct rcu_head *rhp; + unsigned long jlimit; + bool jlimit_check = false; + long pending; struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); - long bl, count = 0; - long pending, tlimit = 0; + struct rcu_head *rhp; + long tlimit = 0; /* If no callbacks are ready, just return. */ if (!rcu_segcblist_ready_cbs(&rdp->cblist)) { @@ -2082,11 +2102,15 @@ static void rcu_do_batch(struct rcu_data *rdp) div = READ_ONCE(rcu_divisor); div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div; bl = max(rdp->blimit, pending >> div); - if (in_serving_softirq() && unlikely(bl > 100)) { + if ((in_serving_softirq() || rdp->rcu_cpu_kthread_status == RCU_KTHREAD_RUNNING) && + (IS_ENABLED(CONFIG_RCU_DOUBLE_CHECK_CB_TIME) || unlikely(bl > 100))) { + const long npj = NSEC_PER_SEC / HZ; long rrn = READ_ONCE(rcu_resched_ns); rrn = rrn < NSEC_PER_MSEC ? NSEC_PER_MSEC : rrn > NSEC_PER_SEC ? NSEC_PER_SEC : rrn; tlimit = local_clock() + rrn; + jlimit = jiffies + (rrn + npj + 1) / npj; + jlimit_check = true; } trace_rcu_batch_start(rcu_state.name, rcu_segcblist_n_cbs(&rdp->cblist), bl); @@ -2126,21 +2150,23 @@ static void rcu_do_batch(struct rcu_data *rdp) * Make sure we don't spend too much time here and deprive other * softirq vectors of CPU cycles. */ - if (unlikely(tlimit)) { - /* only call local_clock() every 32 callbacks */ - if (likely((count & 31) || local_clock() < tlimit)) - continue; - /* Exceeded the time limit, so leave. */ + if (rcu_do_batch_check_time(count, tlimit, jlimit_check, jlimit)) break; - } } else { - // In rcuoc context, so no worries about depriving - // other softirq vectors of CPU cycles. + // In rcuc/rcuoc context, so no worries about + // depriving other softirq vectors of CPU cycles. local_bh_enable(); lockdep_assert_irqs_enabled(); cond_resched_tasks_rcu_qs(); lockdep_assert_irqs_enabled(); local_bh_disable(); + // But rcuc kthreads can delay quiescent-state + // reporting, so check time limits for them. + if (rdp->rcu_cpu_kthread_status == RCU_KTHREAD_RUNNING && + rcu_do_batch_check_time(count, tlimit, jlimit_check, jlimit)) { + rdp->rcu_cpu_has_work = 1; + break; + } } } @@ -2459,12 +2485,12 @@ static void rcu_cpu_kthread(unsigned int cpu) *statusp = RCU_KTHREAD_RUNNING; local_irq_disable(); work = *workp; - *workp = 0; + WRITE_ONCE(*workp, 0); local_irq_enable(); if (work) rcu_core(); local_bh_enable(); - if (*workp == 0) { + if (!READ_ONCE(*workp)) { trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); *statusp = RCU_KTHREAD_WAITING; return; @@ -2756,7 +2782,7 @@ EXPORT_SYMBOL_GPL(call_rcu); */ struct kvfree_rcu_bulk_data { struct list_head list; - unsigned long gp_snap; + struct rcu_gp_oldstate gp_snap; unsigned long nr_records; void *records[]; }; @@ -2773,6 +2799,7 @@ struct kvfree_rcu_bulk_data { * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period * @head_free: List of kfree_rcu() objects waiting for a grace period + * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees. * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period * @krcp: Pointer to @kfree_rcu_cpu structure */ @@ -2780,6 +2807,7 @@ struct kvfree_rcu_bulk_data { struct kfree_rcu_cpu_work { struct rcu_work rcu_work; struct rcu_head *head_free; + struct rcu_gp_oldstate head_free_gp_snap; struct list_head bulk_head_free[FREE_N_CHANNELS]; struct kfree_rcu_cpu *krcp; }; @@ -2900,6 +2928,9 @@ drain_page_cache(struct kfree_rcu_cpu *krcp) struct llist_node *page_list, *pos, *n; int freed = 0; + if (!rcu_min_cached_objs) + return 0; + raw_spin_lock_irqsave(&krcp->lock, flags); page_list = llist_del_all(&krcp->bkvcache); WRITE_ONCE(krcp->nr_bkv_objs, 0); @@ -2920,24 +2951,25 @@ kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp, unsigned long flags; int i; - debug_rcu_bhead_unqueue(bnode); - - rcu_lock_acquire(&rcu_callback_map); - if (idx == 0) { // kmalloc() / kfree(). - trace_rcu_invoke_kfree_bulk_callback( - rcu_state.name, bnode->nr_records, - bnode->records); - - kfree_bulk(bnode->nr_records, bnode->records); - } else { // vmalloc() / vfree(). - for (i = 0; i < bnode->nr_records; i++) { - trace_rcu_invoke_kvfree_callback( - rcu_state.name, bnode->records[i], 0); - - vfree(bnode->records[i]); + if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) { + debug_rcu_bhead_unqueue(bnode); + rcu_lock_acquire(&rcu_callback_map); + if (idx == 0) { // kmalloc() / kfree(). + trace_rcu_invoke_kfree_bulk_callback( + rcu_state.name, bnode->nr_records, + bnode->records); + + kfree_bulk(bnode->nr_records, bnode->records); + } else { // vmalloc() / vfree(). + for (i = 0; i < bnode->nr_records; i++) { + trace_rcu_invoke_kvfree_callback( + rcu_state.name, bnode->records[i], 0); + + vfree(bnode->records[i]); + } } + rcu_lock_release(&rcu_callback_map); } - rcu_lock_release(&rcu_callback_map); raw_spin_lock_irqsave(&krcp->lock, flags); if (put_cached_bnode(krcp, bnode)) @@ -2984,6 +3016,7 @@ static void kfree_rcu_work(struct work_struct *work) struct rcu_head *head; struct kfree_rcu_cpu *krcp; struct kfree_rcu_cpu_work *krwp; + struct rcu_gp_oldstate head_gp_snap; int i; krwp = container_of(to_rcu_work(work), @@ -2998,6 +3031,7 @@ static void kfree_rcu_work(struct work_struct *work) // Channel 3. head = krwp->head_free; krwp->head_free = NULL; + head_gp_snap = krwp->head_free_gp_snap; raw_spin_unlock_irqrestore(&krcp->lock, flags); // Handle the first two channels. @@ -3014,7 +3048,8 @@ static void kfree_rcu_work(struct work_struct *work) * queued on a linked list through their rcu_head structures. * This list is named "Channel 3". */ - kvfree_rcu_list(head); + if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap))) + kvfree_rcu_list(head); } static bool @@ -3081,7 +3116,7 @@ kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp) INIT_LIST_HEAD(&bulk_ready[i]); list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) { - if (!poll_state_synchronize_rcu(bnode->gp_snap)) + if (!poll_state_synchronize_rcu_full(&bnode->gp_snap)) break; atomic_sub(bnode->nr_records, &krcp->bulk_count[i]); @@ -3146,6 +3181,7 @@ static void kfree_rcu_monitor(struct work_struct *work) // objects queued on the linked list. if (!krwp->head_free) { krwp->head_free = krcp->head; + get_state_synchronize_rcu_full(&krwp->head_free_gp_snap); atomic_set(&krcp->head_count, 0); WRITE_ONCE(krcp->head, NULL); } @@ -3194,7 +3230,7 @@ static void fill_page_cache_func(struct work_struct *work) nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ? 1 : rcu_min_cached_objs; - for (i = 0; i < nr_pages; i++) { + for (i = READ_ONCE(krcp->nr_bkv_objs); i < nr_pages; i++) { bnode = (struct kvfree_rcu_bulk_data *) __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); @@ -3218,6 +3254,10 @@ static void fill_page_cache_func(struct work_struct *work) static void run_page_cache_worker(struct kfree_rcu_cpu *krcp) { + // If cache disabled, bail out. + if (!rcu_min_cached_objs) + return; + if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && !atomic_xchg(&krcp->work_in_progress, 1)) { if (atomic_read(&krcp->backoff_page_cache_fill)) { @@ -3272,7 +3312,7 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, // scenarios. bnode = (struct kvfree_rcu_bulk_data *) __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); - *krcp = krc_this_cpu_lock(flags); + raw_spin_lock_irqsave(&(*krcp)->lock, *flags); } if (!bnode) @@ -3285,7 +3325,7 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, // Finally insert and update the GP for this page. bnode->records[bnode->nr_records++] = ptr; - bnode->gp_snap = get_state_synchronize_rcu(); + get_state_synchronize_rcu_full(&bnode->gp_snap); atomic_inc(&(*krcp)->bulk_count[idx]); return true; @@ -4283,7 +4323,6 @@ int rcutree_prepare_cpu(unsigned int cpu) */ rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ - rdp->beenonline = true; /* We have now been online. */ rdp->gp_seq = READ_ONCE(rnp->gp_seq); rdp->gp_seq_needed = rdp->gp_seq; rdp->cpu_no_qs.b.norm = true; @@ -4311,6 +4350,16 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoing) } /* + * Has the specified (known valid) CPU ever been fully online? + */ +bool rcu_cpu_beenfullyonline(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + + return smp_load_acquire(&rdp->beenonline); +} + +/* * Near the end of the CPU-online process. Pretty much all services * enabled, and the CPU is now very much alive. */ @@ -4368,15 +4417,16 @@ int rcutree_offline_cpu(unsigned int cpu) * Note that this function is special in that it is invoked directly * from the incoming CPU rather than from the cpuhp_step mechanism. * This is because this function must be invoked at a precise location. + * This incoming CPU must not have enabled interrupts yet. */ void rcu_cpu_starting(unsigned int cpu) { - unsigned long flags; unsigned long mask; struct rcu_data *rdp; struct rcu_node *rnp; bool newcpu; + lockdep_assert_irqs_disabled(); rdp = per_cpu_ptr(&rcu_data, cpu); if (rdp->cpu_started) return; @@ -4384,7 +4434,6 @@ void rcu_cpu_starting(unsigned int cpu) rnp = rdp->mynode; mask = rdp->grpmask; - local_irq_save(flags); arch_spin_lock(&rcu_state.ofl_lock); rcu_dynticks_eqs_online(); raw_spin_lock(&rcu_state.barrier_lock); @@ -4403,17 +4452,17 @@ void rcu_cpu_starting(unsigned int cpu) /* An incoming CPU should never be blocking a grace period. */ if (WARN_ON_ONCE(rnp->qsmask & mask)) { /* RCU waiting on incoming CPU? */ /* rcu_report_qs_rnp() *really* wants some flags to restore */ - unsigned long flags2; + unsigned long flags; - local_irq_save(flags2); + local_irq_save(flags); rcu_disable_urgency_upon_qs(rdp); /* Report QS -after- changing ->qsmaskinitnext! */ - rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags2); + rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); } else { raw_spin_unlock_rcu_node(rnp); } arch_spin_unlock(&rcu_state.ofl_lock); - local_irq_restore(flags); + smp_store_release(&rdp->beenonline, true); smp_mb(); /* Ensure RCU read-side usage follows above initialization. */ } diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 3b7abb58157d..8239b39d945b 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -643,7 +643,7 @@ static void synchronize_rcu_expedited_wait(void) "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rnp->expmaskinit)], "N."[!!(rdp->grpmask & rnp->expmaskinitnext)], - "D."[!!(rdp->cpu_no_qs.b.exp)]); + "D."[!!data_race(rdp->cpu_no_qs.b.exp)]); } } pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index f2280616f9d5..5598212d1f27 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -77,9 +77,9 @@ __setup("rcu_nocbs", rcu_nocb_setup); static int __init parse_rcu_nocb_poll(char *arg) { rcu_nocb_poll = true; - return 0; + return 1; } -early_param("rcu_nocb_poll", parse_rcu_nocb_poll); +__setup("rcu_nocb_poll", parse_rcu_nocb_poll); /* * Don't bother bypassing ->cblist if the call_rcu() rate is low. @@ -1319,13 +1319,22 @@ lazy_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) int cpu; unsigned long count = 0; + if (WARN_ON_ONCE(!cpumask_available(rcu_nocb_mask))) + return 0; + + /* Protect rcu_nocb_mask against concurrent (de-)offloading. */ + if (!mutex_trylock(&rcu_state.barrier_mutex)) + return 0; + /* Snapshot count of all CPUs */ - for_each_possible_cpu(cpu) { + for_each_cpu(cpu, rcu_nocb_mask) { struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); count += READ_ONCE(rdp->lazy_len); } + mutex_unlock(&rcu_state.barrier_mutex); + return count ? count : SHRINK_EMPTY; } @@ -1336,15 +1345,45 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) unsigned long flags; unsigned long count = 0; + if (WARN_ON_ONCE(!cpumask_available(rcu_nocb_mask))) + return 0; + /* + * Protect against concurrent (de-)offloading. Otherwise nocb locking + * may be ignored or imbalanced. + */ + if (!mutex_trylock(&rcu_state.barrier_mutex)) { + /* + * But really don't insist if barrier_mutex is contended since we + * can't guarantee that it will never engage in a dependency + * chain involving memory allocation. The lock is seldom contended + * anyway. + */ + return 0; + } + /* Snapshot count of all CPUs */ - for_each_possible_cpu(cpu) { + for_each_cpu(cpu, rcu_nocb_mask) { struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - int _count = READ_ONCE(rdp->lazy_len); + int _count; + + if (WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp))) + continue; - if (_count == 0) + if (!READ_ONCE(rdp->lazy_len)) continue; + rcu_nocb_lock_irqsave(rdp, flags); - WRITE_ONCE(rdp->lazy_len, 0); + /* + * Recheck under the nocb lock. Since we are not holding the bypass + * lock we may still race with increments from the enqueuer but still + * we know for sure if there is at least one lazy callback. + */ + _count = READ_ONCE(rdp->lazy_len); + if (!_count) { + rcu_nocb_unlock_irqrestore(rdp, flags); + continue; + } + WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false)); rcu_nocb_unlock_irqrestore(rdp, flags); wake_nocb_gp(rdp, false); sc->nr_to_scan -= _count; @@ -1352,6 +1391,9 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) if (sc->nr_to_scan <= 0) break; } + + mutex_unlock(&rcu_state.barrier_mutex); + return count ? count : SHRINK_STOP; } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7b0fe741a088..41021080ad25 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -257,6 +257,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) * GP should not be able to end until we report, so there should be * no need to check for a subsequent expedited GP. (Though we are * still in a quiescent state in any case.) + * + * Interrupts are disabled, so ->cpu_no_qs.b.exp cannot change. */ if (blkd_state & RCU_EXP_BLKD && rdp->cpu_no_qs.b.exp) rcu_report_exp_rdp(rdp); @@ -941,7 +943,7 @@ notrace void rcu_preempt_deferred_qs(struct task_struct *t) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - if (rdp->cpu_no_qs.b.exp) + if (READ_ONCE(rdp->cpu_no_qs.b.exp)) rcu_report_exp_rdp(rdp); } diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index b10b8349bb2a..6f06dc12904a 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -1035,7 +1035,7 @@ static bool sysrq_rcu; module_param(sysrq_rcu, bool, 0444); /* Dump grace-period-request information due to commandeered sysrq. */ -static void sysrq_show_rcu(int key) +static void sysrq_show_rcu(u8 key) { show_rcu_gp_kthreads(); } |