From 5c889690aa089cc0f36f5cf4abb4d4f0ed81b4da Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 19 Jul 2013 15:09:25 -0700 Subject: [PATCH 01/30] mm: Place preemption point in do_mlockall() loop There is a loop in do_mlockall() that lacks a preemption point, which means that the following can happen on non-preemptible builds of the kernel: > My fuzz tester keeps hitting this. Every instance shows the non-irq stack > came in from mlockall. I'm only seeing this on one box, but that has more > ram (8gb) than my other machines, which might explain it. > > Dave > > INFO: rcu_preempt self-detected stall on CPU { 3} (t=6500 jiffies g=470344 c=470343 q=0) > sending NMI to all CPUs: > NMI backtrace for cpu 3 > CPU: 3 PID: 29664 Comm: trinity-child2 Not tainted 3.11.0-rc1+ #32 > task: ffff88023e743fc0 ti: ffff88022f6f2000 task.ti: ffff88022f6f2000 > RIP: 0010:[] [] trace_hardirqs_off_caller+0x21/0xb0 > RSP: 0018:ffff880244e03c30 EFLAGS: 00000046 > RAX: ffff88023e743fc0 RBX: 0000000000000001 RCX: 000000000000003c > RDX: 000000000000000f RSI: 0000000000000004 RDI: ffffffff81033cab > RBP: ffff880244e03c38 R08: ffff880243288a80 R09: 0000000000000001 > R10: 0000000000000000 R11: 0000000000000001 R12: ffff880243288a80 > R13: ffff8802437eda40 R14: 0000000000080000 R15: 000000000000d010 > FS: 00007f50ae33b740(0000) GS:ffff880244e00000(0000) knlGS:0000000000000000 > CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > CR2: 000000000097f000 CR3: 0000000240fa0000 CR4: 00000000001407e0 > DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 > DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000600 > Stack: > ffffffff810bf86d ffff880244e03c98 ffffffff81033cab 0000000000000096 > 000000000000d008 0000000300000002 0000000000000004 0000000000000003 > 0000000000002710 ffffffff81c50d00 ffffffff81c50d00 ffff880244fcde00 > Call Trace: > > [] ? trace_hardirqs_off+0xd/0x10 > [] __x2apic_send_IPI_mask+0x1ab/0x1c0 > [] x2apic_send_IPI_all+0x1c/0x20 > [] arch_trigger_all_cpu_backtrace+0x65/0xa0 > [] rcu_check_callbacks+0x331/0x8e0 > [] ? hrtimer_run_queues+0x20/0x180 > [] ? sched_clock_cpu+0xb5/0x100 > [] update_process_times+0x47/0x80 > [] tick_sched_handle.isra.16+0x25/0x60 > [] tick_sched_timer+0x41/0x60 > [] __run_hrtimer+0x81/0x4e0 > [] ? tick_sched_do_timer+0x60/0x60 > [] hrtimer_interrupt+0xff/0x240 > [] local_apic_timer_interrupt+0x34/0x60 > [] smp_apic_timer_interrupt+0x3f/0x60 > [] apic_timer_interrupt+0x6f/0x80 > [] ? retint_restore_args+0xe/0xe > [] ? __do_softirq+0xb1/0x440 > [] irq_exit+0xcd/0xe0 > [] smp_apic_timer_interrupt+0x45/0x60 > [] apic_timer_interrupt+0x6f/0x80 > > [] ? retint_restore_args+0xe/0xe > [] ? wait_for_completion_killable+0x170/0x170 > [] ? preempt_schedule_irq+0x53/0x90 > [] retint_kernel+0x26/0x30 > [] ? queue_work_on+0x43/0x90 > [] schedule_on_each_cpu+0xc9/0x1a0 > [] ? lru_add_drain+0x50/0x50 > [] lru_add_drain_all+0x15/0x20 > [] SyS_mlockall+0xa5/0x1a0 > [] tracesys+0xdd/0xe2 This commit addresses this problem by inserting the required preemption point. Reported-by: Dave Jones Signed-off-by: Paul E. 
McKenney Cc: KOSAKI Motohiro Cc: Michel Lespinasse Cc: Andrew Morton Cc: Linus Torvalds --- mm/mlock.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/mlock.c b/mm/mlock.c index d63802663242..67ba6da7d0e3 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -736,6 +736,7 @@ static int do_mlockall(int flags) /* Ignore errors */ mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); + cond_resched(); } out: return 0; From b3f2d02598fcf16933f72a57bbba7edb22ad8eda Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 8 Aug 2013 14:37:47 -0700 Subject: [PATCH 02/30] rcu: Use proper cpp macro for ->gp_flags One of the ->gp_flags assignments used a raw number rather than the cpp macro that was intended for this purpose, which this commit fixes. Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 32618b3fe4e6..e0fa1920cd67 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1452,7 +1452,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rdp = this_cpu_ptr(rsp->rda); rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ if (cpu_needs_another_gp(rsp, rdp)) - rsp->gp_flags = 1; + rsp->gp_flags = RCU_GP_FLAG_INIT; raw_spin_unlock_irq(&rnp->lock); } From 01896f7e0a122e8f20082e24f6f9a340034b9c01 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 18 Aug 2013 12:14:32 -0700 Subject: [PATCH 03/30] rcu: Convert local functions to static The rcu_cpu_stall_timeout kernel parameter, the rcu_dynticks per-CPU variable, and the rcu_gp_fqs() function are used only locally. This commit therefore marks them as static. Reported-by: kbuild test robot Signed-off-by: Paul E. McKenney --- kernel/rcupdate.c | 2 +- kernel/rcutree.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index b02a339836b4..3260a1074b48 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -298,7 +298,7 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); #endif int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ -int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; +static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; module_param(rcu_cpu_stall_suppress, int, 0644); module_param(rcu_cpu_stall_timeout, int, 0644); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e0fa1920cd67..2712b8991143 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -222,7 +222,7 @@ void rcu_note_context_switch(int cpu) } EXPORT_SYMBOL_GPL(rcu_note_context_switch); -DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { +static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, .dynticks = ATOMIC_INIT(1), #ifdef CONFIG_NO_HZ_FULL_SYSIDLE @@ -1366,7 +1366,7 @@ static int rcu_gp_init(struct rcu_state *rsp) /* * Do one round of quiescent-state forcing. */ -int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) +static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) { int fqs_state = fqs_state_in; bool isidle = false; From 829511d8aa7a2179bba57ab4ab277d6f9c77ae5b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 18 Aug 2013 12:21:57 -0700 Subject: [PATCH 04/30] rcu: Fix dubious "if" condition in __call_rcu_nocb_enqueue() This commit replaces an incorrect (but fortunately functional) bitwise OR ("|") operator with the correct logical OR ("||"). Reported-by: kbuild test robot Signed-off-by: Paul E. 
McKenney --- kernel/rcutree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 130c97b027f2..6f9aecef8ab6 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2108,7 +2108,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, /* If we are not being polled and there is a kthread, awaken it ... */ t = ACCESS_ONCE(rdp->nocb_kthread); - if (rcu_nocb_poll | !t) + if (rcu_nocb_poll || !t) return; len = atomic_long_read(&rdp->nocb_q_count); if (old_rhpp == &rdp->nocb_head) { From 2a855b644c310d5db5a80b8816c0c7748c167977 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 23 Aug 2013 09:40:42 -0700 Subject: [PATCH 05/30] rcu: Make list_splice_init_rcu() account for RCU readers The list_splice_init_rcu() function allows a list visible to RCU readers to be spliced into another list visible to RCU readers. This is OK, except for the use of INIT_LIST_HEAD(), which does pointer updates without doing anything to make those updates safe for concurrent readers. Of course, most of the time INIT_LIST_HEAD() is being used in reader-free contexts, such as initialization or cleanup, so it is OK for it to update pointers in an unsafe-for-RCU-readers manner. This commit therefore creates an INIT_LIST_HEAD_RCU() that uses ACCESS_ONCE() to make the updates reader-safe. The reason that we can use ACCESS_ONCE() instead of the more typical rcu_assign_pointer() is that list_splice_init_rcu() is updating the pointers to reference something that is already visible to readers, so that readers cannot see pre-initialization values. Signed-off-by: Paul E. McKenney --- include/linux/rculist.h | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 4106721c4e5e..45a0a9e81478 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -18,6 +18,21 @@ * be used anywhere you would want to use a list_empty_rcu(). */ +/* + * INIT_LIST_HEAD_RCU - Initialize a list_head visible to RCU readers + * @list: list to be initialized + * + * You should instead use INIT_LIST_HEAD() for normal initialization and + * cleanup tasks, when readers have no access to the list being initialized. + * However, if the list being initialized is visible to readers, you + * need to keep the compiler from being too mischievous. + */ +static inline void INIT_LIST_HEAD_RCU(struct list_head *list) +{ + ACCESS_ONCE(list->next) = list; + ACCESS_ONCE(list->prev) = list; +} + /* * return the ->next pointer of a list_head in an rcu safe * way, we must not access it directly @@ -191,9 +206,13 @@ static inline void list_splice_init_rcu(struct list_head *list, if (list_empty(list)) return; - /* "first" and "last" tracking list, so initialize it. */ + /* + * "first" and "last" tracking list, so initialize it. RCU readers + * have access to this list, so we must use INIT_LIST_HEAD_RCU() + * instead of INIT_LIST_HEAD(). + */ - INIT_LIST_HEAD(list); + INIT_LIST_HEAD_RCU(list); /* * At this point, the list body still points to the source list.
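[Editor's note: a minimal usage sketch of this primitive; src_list, dst_list, and list_mutex are illustrative names, not identifiers from the patch. synchronize_rcu() is passed as the sync callback that list_splice_init_rcu() invokes between detaching the source list and splicing it; since that callback blocks, a mutex rather than a spinlock guards the update side.]

```c
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>

/* Hypothetical lists; readers traverse them under rcu_read_lock(). */
static LIST_HEAD(src_list);
static LIST_HEAD(dst_list);
static DEFINE_MUTEX(list_mutex);	/* serializes updaters */

/* Move all entries from src_list onto dst_list, RCU-reader safe. */
static void splice_example(void)
{
	/*
	 * list_splice_init_rcu() invokes the sync callback after
	 * detaching src_list's contents, then reinitializes src_list
	 * via INIT_LIST_HEAD_RCU() so concurrent readers stay safe.
	 */
	mutex_lock(&list_mutex);
	list_splice_init_rcu(&src_list, &dst_list, synchronize_rcu);
	mutex_unlock(&list_mutex);
}
```

From c9d4b0af9e0609cc525c55de18229fde7c926d61 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sat, 31 Aug 2013 13:34:10 -0700 Subject: [PATCH 06/30] rcu: Replace __get_cpu_var() uses __get_cpu_var() is used for multiple purposes in the kernel source. One of them is address calculation via the form &__get_cpu_var(x).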
This calculates the address for the instance of the percpu variable of the current processor based on an offset. Other use cases are storing and retrieving data from the current processor's percpu area. __get_cpu_var() can be used as an lvalue when writing data or on the right side of an assignment. At bottom, __get_cpu_var() always does only an address determination. However, store and retrieve operations could use a segment prefix (or a global register on other platforms) to avoid the address calculation. this_cpu_write() and this_cpu_read() can directly take an offset into a percpu area and use optimized assembly code to read and write per-cpu variables. This patch converts __get_cpu_var() into either an explicit address calculation using this_cpu_ptr() or into a use of this_cpu operations that use the offset. Thereby address calculations are avoided and fewer registers are used when code is generated. At the end of the patchset all uses of __get_cpu_var() have been removed, so the macro is removed too. The patchset includes passes over all arches as well. Once these operations are used throughout, specialized macros can be defined in non-x86 arches as well in order to optimize per-cpu access by, for example, using a global register that may be set to the per-cpu base. Transformations done to __get_cpu_var() (a consolidated sketch follows the list): 1. Determine the address of the percpu instance of the current processor. DEFINE_PER_CPU(int, y); int *x = &__get_cpu_var(y); Converts to int *x = this_cpu_ptr(&y); 2. Same as #1, but this time an array structure is involved. DEFINE_PER_CPU(int, y[20]); int *x = __get_cpu_var(y); Converts to int *x = this_cpu_ptr(y); 3. Retrieve the content of the current processor's instance of a per-cpu variable. DEFINE_PER_CPU(int, y); int x = __get_cpu_var(y); Converts to int x = __this_cpu_read(y); 4. Retrieve the content of a percpu struct. DEFINE_PER_CPU(struct mystruct, y); struct mystruct x = __get_cpu_var(y); Converts to memcpy(&x, this_cpu_ptr(&y), sizeof(x)); 5. Assignment to a per-cpu variable. DEFINE_PER_CPU(int, y); __get_cpu_var(y) = x; Converts to this_cpu_write(y, x); 6. Increment/decrement etc. of a per-cpu variable. DEFINE_PER_CPU(int, y); __get_cpu_var(y)++; Converts to this_cpu_inc(y);
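[Editor's note: the consolidated sketch promised above, showing rules 1-6 side by side. struct mystruct and the per-CPU variables y, z, and s are names invented for this illustration, not identifiers from the patch; the __-prefixed read assumes preemption is already disabled.]

```c
#include <linux/percpu.h>
#include <linux/string.h>

struct mystruct { int a; };			/* illustrative type */

static DEFINE_PER_CPU(int, y);
static DEFINE_PER_CPU(int, z[20]);
static DEFINE_PER_CPU(struct mystruct, s);

static void conversion_examples(void)
{
	int *p;
	int v;
	struct mystruct copy;

	p = this_cpu_ptr(&y);		/* 1: was p = &__get_cpu_var(y); */
	p = this_cpu_ptr(z);		/* 2: was p = __get_cpu_var(z); (array) */
	v = __this_cpu_read(y);		/* 3: was v = __get_cpu_var(y); */
	memcpy(&copy, this_cpu_ptr(&s), sizeof(copy));
					/* 4: was copy = __get_cpu_var(s); */
	this_cpu_write(y, v);		/* 5: was __get_cpu_var(y) = v; */
	this_cpu_inc(y);		/* 6: was __get_cpu_var(y)++; */
	(void)p;			/* quiet unused-value warnings */
}
```

Signed-off-by: Christoph Lameter [ paulmck: Address conflicts. ] Signed-off-by: Paul E.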
McKenney --- kernel/rcutree.c | 22 +++++++++++----------- kernel/rcutree_plugin.h | 14 +++++++------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 2712b8991143..8eb9cfd9e2b1 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -407,7 +407,7 @@ static void rcu_eqs_enter(bool user) long long oldval; struct rcu_dynticks *rdtp; - rdtp = &__get_cpu_var(rcu_dynticks); + rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) @@ -435,7 +435,7 @@ void rcu_idle_enter(void) local_irq_save(flags); rcu_eqs_enter(false); - rcu_sysidle_enter(&__get_cpu_var(rcu_dynticks), 0); + rcu_sysidle_enter(this_cpu_ptr(&rcu_dynticks), 0); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_enter); @@ -478,7 +478,7 @@ void rcu_irq_exit(void) struct rcu_dynticks *rdtp; local_irq_save(flags); - rdtp = &__get_cpu_var(rcu_dynticks); + rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting--; WARN_ON_ONCE(rdtp->dynticks_nesting < 0); @@ -528,7 +528,7 @@ static void rcu_eqs_exit(bool user) struct rcu_dynticks *rdtp; long long oldval; - rdtp = &__get_cpu_var(rcu_dynticks); + rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; WARN_ON_ONCE(oldval < 0); if (oldval & DYNTICK_TASK_NEST_MASK) @@ -555,7 +555,7 @@ void rcu_idle_exit(void) local_irq_save(flags); rcu_eqs_exit(false); - rcu_sysidle_exit(&__get_cpu_var(rcu_dynticks), 0); + rcu_sysidle_exit(this_cpu_ptr(&rcu_dynticks), 0); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_exit); @@ -599,7 +599,7 @@ void rcu_irq_enter(void) long long oldval; local_irq_save(flags); - rdtp = &__get_cpu_var(rcu_dynticks); + rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting++; WARN_ON_ONCE(rdtp->dynticks_nesting == 0); @@ -620,7 +620,7 @@ void rcu_irq_enter(void) */ void rcu_nmi_enter(void) { - struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); if (rdtp->dynticks_nmi_nesting == 0 && (atomic_read(&rdtp->dynticks) & 0x1)) @@ -642,7 +642,7 @@ void rcu_nmi_enter(void) */ void rcu_nmi_exit(void) { - struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); if (rdtp->dynticks_nmi_nesting == 0 || --rdtp->dynticks_nmi_nesting != 0) @@ -665,7 +665,7 @@ int rcu_is_cpu_idle(void) int ret; preempt_disable(); - ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0; + ret = (atomic_read(this_cpu_ptr(&rcu_dynticks.dynticks)) & 0x1) == 0; preempt_enable(); return ret; } @@ -703,7 +703,7 @@ bool rcu_lockdep_current_cpu_online(void) if (in_nmi()) return 1; preempt_disable(); - rdp = &__get_cpu_var(rcu_sched_data); + rdp = this_cpu_ptr(&rcu_sched_data); rnp = rdp->mynode; ret = (rdp->grpmask & rnp->qsmaskinit) || !rcu_scheduler_fully_active; @@ -723,7 +723,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); */ static int rcu_is_cpu_rrupt_from_idle(void) { - return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; + return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 1; } /* diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 6f9aecef8ab6..c684f7ab37fa 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -660,7 +660,7 @@ static void rcu_preempt_check_callbacks(int cpu) static void rcu_preempt_do_callbacks(void) { - 
rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data)); + rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data)); } #endif /* #ifdef CONFIG_RCU_BOOST */ @@ -1332,7 +1332,7 @@ static void invoke_rcu_callbacks_kthread(void) */ static bool rcu_is_callbacks_kthread(void) { - return __get_cpu_var(rcu_cpu_kthread_task) == current; + return __this_cpu_read(rcu_cpu_kthread_task) == current; } #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) @@ -1382,8 +1382,8 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, static void rcu_kthread_do_work(void) { - rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); - rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); + rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data)); + rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data)); rcu_preempt_do_callbacks(); } @@ -1402,7 +1402,7 @@ static void rcu_cpu_kthread_park(unsigned int cpu) static int rcu_cpu_kthread_should_run(unsigned int cpu) { - return __get_cpu_var(rcu_cpu_has_work); + return __this_cpu_read(rcu_cpu_has_work); } /* @@ -1412,8 +1412,8 @@ static int rcu_cpu_kthread_should_run(unsigned int cpu) */ static void rcu_cpu_kthread(unsigned int cpu) { - unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status); - char work, *workp = &__get_cpu_var(rcu_cpu_has_work); + unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status); + char work, *workp = this_cpu_ptr(&rcu_cpu_has_work); int spincnt; for (spincnt = 0; spincnt < 10; spincnt++) { From 289828e62de0334a0d01c0f65df91cd47d3a9e05 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 31 Aug 2013 19:23:29 -0700 Subject: [PATCH 07/30] rcu: Silence unused-variable warnings The "idle" variable in both rcu_eqs_enter_common() and rcu_eqs_exit_common() is only used in a WARN_ON_ONCE(). If the kernel is built disabling WARN_ON_ONCE(), the compiler will complain (rightly) that "idle" is unused. This commit therefore adds a __maybe_unused to the declaration of both variables. Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 8eb9cfd9e2b1..e6f2e8f14140 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -371,7 +371,8 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, { trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); if (!user && !is_idle_task(current)) { - struct task_struct *idle = idle_task(smp_processor_id()); + struct task_struct *idle __maybe_unused = + idle_task(smp_processor_id()); trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0); ftrace_dump(DUMP_ORIG); @@ -508,7 +509,8 @@ static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, rcu_cleanup_after_idle(smp_processor_id()); trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); if (!user && !is_idle_task(current)) { - struct task_struct *idle = idle_task(smp_processor_id()); + struct task_struct *idle __maybe_unused = + idle_task(smp_processor_id()); trace_rcu_dyntick(TPS("Error on exit: not idle task"), oldval, rdtp->dynticks_nesting); From 69c8d28c96445e28f081fcd987e34ea2afa65039 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 3 Sep 2013 09:52:20 -0700 Subject: [PATCH 08/30] rcu: Micro-optimize rcu_cpu_has_callbacks() The for_each_rcu_flavor() loop unconditionally scans all flavors, even when the first flavor might have some non-lazy callbacks. 
Once the loop has seen a non-lazy callback, further passes through the loop cannot change the state. This is not a huge problem, given that there can be at most three RCU flavors (RCU-bh, RCU-preempt, and RCU-sched), but this code is on the path to idle, so speeding it up even a small amount would have some benefit. This commit therefore does two things: 1. Rearranges the order of the list of RCU flavors in order to place the most active flavor first in the list. The most active RCU flavor is RCU-preempt, or, if there is no RCU-preempt, RCU-sched. 2. Reworks the for_each_rcu_flavor() to exit early when the first non-lazy callback is seen, or, in the case where the caller does not care about non-lazy callbacks (RCU_FAST_NO_HZ=n), when the first callback is seen. Reported-by: Chen Gang Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e6f2e8f14140..49464aded7f7 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2727,10 +2727,13 @@ static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy) for_each_rcu_flavor(rsp) { rdp = per_cpu_ptr(rsp->rda, cpu); - if (rdp->qlen != rdp->qlen_lazy) + if (!rdp->nxtlist) + continue; + hc = true; + if (rdp->qlen != rdp->qlen_lazy || !all_lazy) { al = false; - if (rdp->nxtlist) - hc = true; + break; + } } if (all_lazy) *all_lazy = al; @@ -3297,8 +3300,8 @@ void __init rcu_init(void) rcu_bootup_announce(); rcu_init_geometry(); - rcu_init_one(&rcu_sched_state, &rcu_sched_data); rcu_init_one(&rcu_bh_state, &rcu_bh_data); + rcu_init_one(&rcu_sched_state, &rcu_sched_data); __rcu_init_preempt(); open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); From 26cdfedf6a902345f8604ea8e0b7dd2566b37a46 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 4 Sep 2013 10:51:13 -0700 Subject: [PATCH 09/30] rcu: Reject memory-order-induced stall-warning false positives If a system is idle from an RCU perspective for longer than specified by CONFIG_RCU_CPU_STALL_TIMEOUT, and if one CPU starts a grace period just as a second checks for CPU stalls, and if this second CPU happens to see the old value of rsp->jiffies_stall, it will incorrectly report a CPU stall. This is quite rare, but apparently occurs deterministically on systems with about 6TB of memory. This commit therefore orders accesses to the data used to determine whether or not a CPU stall is in progress. Grace-period initialization and cleanup first increments rsp->completed to mark the end of the previous grace period, then records the current jiffies in rsp->gp_start, then records the jiffies at which a stall can be expected to occur in rsp->jiffies_stall, and finally increments rsp->gpnum to mark the start of the new grace period. Now, this ordering by itself does not prevent false positives. For example, if grace-period initialization was delayed between recording rsp->gp_start and rsp->jiffies_stall, the CPU stall warning code might still see an old value of rsp->jiffies_stall. Therefore, this commit also orders the CPU stall warning accesses as well, loading rsp->gpnum and jiffies, then rsp->jiffies_stall, then rsp->gp_start, and finally rsp->completed. This ordering means that the false-positive scenario in the previous paragraph would result in rsp->completed being greater than or equal to rsp->gpnum, which is never valid for a CPU stall, allowing the false positive to be rejected. 
Furthermore, any fetch that gets an old value of rsp->jiffies_stall must also get an old value of rsp->gpnum, which will again be rejected by the comparison of rsp->gpnum and rsp->completed. Situations where rsp->gp_start is later than rsp->jiffies_stall are also rejected, as are situations where jiffies is less than rsp->jiffies_stall. Although use of unsynchronized accesses means that there are likely still some false-positive scenarios (synchronization has proven to be a very bad idea on large systems), this should get rid of a large class of these scenarios. Reported-by: Fabian Herschel Reported-by: Michal Hocko Signed-off-by: Paul E. McKenney Reviewed-by: Michal Hocko Tested-by: Jochen Striepe --- kernel/rcutree.c | 45 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 49464aded7f7..b618d72bd8ec 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -804,8 +804,11 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, static void record_gp_stall_check_time(struct rcu_state *rsp) { - rsp->gp_start = jiffies; - rsp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); + unsigned long j = ACCESS_ONCE(jiffies); + + rsp->gp_start = j; + smp_wmb(); /* Record start time before stall time. */ + rsp->jiffies_stall = j + rcu_jiffies_till_stall_check(); } /* @@ -934,17 +937,48 @@ static void print_cpu_stall(struct rcu_state *rsp) static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) { + unsigned long completed; + unsigned long gpnum; + unsigned long gps; unsigned long j; unsigned long js; struct rcu_node *rnp; - if (rcu_cpu_stall_suppress) + if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp)) return; j = ACCESS_ONCE(jiffies); + + /* + * Lots of memory barriers to reject false positives. + * + * The idea is to pick up rsp->gpnum, then rsp->jiffies_stall, + * then rsp->gp_start, and finally rsp->completed. These values + * are updated in the opposite order with memory barriers (or + * equivalent) during grace-period initialization and cleanup. + * Now, a false positive can occur if we get an new value of + * rsp->gp_start and a old value of rsp->jiffies_stall. But given + * the memory barriers, the only way that this can happen is if one + * grace period ends and another starts between these two fetches. + * Detect this by comparing rsp->completed with the previous fetch + * from rsp->gpnum. + * + * Given this check, comparisons of jiffies, rsp->jiffies_stall, + * and rsp->gp_start suffice to forestall false positives. + */ + gpnum = ACCESS_ONCE(rsp->gpnum); + smp_rmb(); /* Pick up ->gpnum first... */ js = ACCESS_ONCE(rsp->jiffies_stall); + smp_rmb(); /* ...then ->jiffies_stall before the rest... */ + gps = ACCESS_ONCE(rsp->gp_start); + smp_rmb(); /* ...and finally ->gp_start before ->completed. */ + completed = ACCESS_ONCE(rsp->completed); + if (ULONG_CMP_GE(completed, gpnum) || + ULONG_CMP_LT(j, js) || + ULONG_CMP_GE(gps, js)) + return; /* No stall or GP completed since entering function. */ rnp = rdp->mynode; if (rcu_gp_in_progress(rsp) && - (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { + (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask)) { /* We haven't checked in, so go dump stack. */ print_cpu_stall(rsp); @@ -1317,9 +1351,10 @@ static int rcu_gp_init(struct rcu_state *rsp) } /* Advance to a new grace period and initialize state. */ + record_gp_stall_check_time(rsp); + smp_wmb(); /* Record GP times before starting GP. 
*/ rsp->gpnum++; trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); - record_gp_stall_check_time(rsp); raw_spin_unlock_irq(&rnp->lock); /* Exclude any concurrent CPU-hotplug operations. */ From 0d75292467b0c8554d70c751a35af6514202ac28 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 17 Aug 2013 18:08:37 -0700 Subject: [PATCH 10/30] rcu: Have rcutiny tracepoints use tracepoint_string() This commit extends the work done in f7f7bac9 (rcu: Have the RCU tracepoints use the tracepoint_string infrastructure) to cover rcutiny. Signed-off-by: Paul E. McKenney Cc: Steven Rostedt --- kernel/rcu.h | 7 +++++++ kernel/rcutiny.c | 17 ++++++++++------- kernel/rcutree.c | 7 ------- 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/kernel/rcu.h b/kernel/rcu.h index 77131966c4ad..7859a0a3951e 100644 --- a/kernel/rcu.h +++ b/kernel/rcu.h @@ -122,4 +122,11 @@ int rcu_jiffies_till_stall_check(void); #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ +/* + * Strings used in tracepoints need to be exported via the + * tracing system such that tools like perf and trace-cmd can + * translate the string address pointers to actual text. + */ +#define TPS(x) tracepoint_string(x) + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 9ed6075dc562..e99eb5fb10af 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -35,6 +35,7 @@ #include #include #include +#include #ifdef CONFIG_RCU_TRACE #include @@ -58,16 +59,17 @@ static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; static void rcu_idle_enter_common(long long newval) { if (newval) { - RCU_TRACE(trace_rcu_dyntick("--=", + RCU_TRACE(trace_rcu_dyntick(TPS("--="), rcu_dynticks_nesting, newval)); rcu_dynticks_nesting = newval; return; } - RCU_TRACE(trace_rcu_dyntick("Start", rcu_dynticks_nesting, newval)); + RCU_TRACE(trace_rcu_dyntick(TPS("Start"), + rcu_dynticks_nesting, newval)); if (!is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); - RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", + RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"), rcu_dynticks_nesting, newval)); ftrace_dump(DUMP_ALL); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", @@ -120,15 +122,15 @@ EXPORT_SYMBOL_GPL(rcu_irq_exit); static void rcu_idle_exit_common(long long oldval) { if (oldval) { - RCU_TRACE(trace_rcu_dyntick("++=", + RCU_TRACE(trace_rcu_dyntick(TPS("++="), oldval, rcu_dynticks_nesting)); return; } - RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting)); + RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting)); if (!is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); - RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task", + RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"), oldval, rcu_dynticks_nesting)); ftrace_dump(DUMP_ALL); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", @@ -304,7 +306,8 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) RCU_TRACE(cb_count++); } RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); - RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(), + RCU_TRACE(trace_rcu_batch_end(rcp->name, + cb_count, 0, need_resched(), is_idle_task(current), false)); } diff --git a/kernel/rcutree.c b/kernel/rcutree.c index b618d72bd8ec..62aab5ceefe9 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -61,13 +61,6 @@ #include "rcu.h" -/* - * Strings used in tracepoints need to be exported via the - * tracing system such that 
tools like perf and trace-cmd can - * translate the string address pointers to actual text. - */ -#define TPS(x) tracepoint_string(x) - /* Data structures. */ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; From f7be82093952ee4a74ffc8c729b2811f908cd9a4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 8 Aug 2013 18:27:52 -0700 Subject: [PATCH 11/30] rcu: Improve grace-period start logic This commit improves grace-period start logic by checking ->gp_flags under the lock and by issuing a warning if a grace period is already in progress. Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 32618b3fe4e6..d679a522c0a2 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1297,7 +1297,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) } /* - * Initialize a new grace period. + * Initialize a new grace period. Return 0 if no grace period required. */ static int rcu_gp_init(struct rcu_state *rsp) { @@ -1306,10 +1306,18 @@ static int rcu_gp_init(struct rcu_state *rsp) rcu_bind_gp_kthread(); raw_spin_lock_irq(&rnp->lock); + if (rsp->gp_flags == 0) { + /* Spurious wakeup, tell caller to go back to sleep. */ + raw_spin_unlock_irq(&rnp->lock); + return 0; + } rsp->gp_flags = 0; /* Clear all flags: New grace period. */ - if (rcu_gp_in_progress(rsp)) { - /* Grace period already in progress, don't start another. */ + if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) { + /* + * Grace period already in progress, don't start another. + * Not supposed to be able to happen. + */ raw_spin_unlock_irq(&rnp->lock); return 0; } @@ -1474,8 +1482,7 @@ static int __noreturn rcu_gp_kthread(void *arg) wait_event_interruptible(rsp->gp_wq, rsp->gp_flags & RCU_GP_FLAG_INIT); - if ((rsp->gp_flags & RCU_GP_FLAG_INIT) && - rcu_gp_init(rsp)) + if (rcu_gp_init(rsp)) break; cond_resched(); flush_signals(current); From 88d6df612cc3c99f56cc18461fcc531c3a145544 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 8 Aug 2013 21:44:31 -0700 Subject: [PATCH 12/30] rcu: Prevent spurious-wakeup DoS attack on rcu_gp_kthread() Spurious wakeups in the force-quiescent-state loop in rcu_gp_kthread() cause the timeout to be recalculated, which would prevent rcu_gp_fqs() from ever being called. This would in turn would prevent the grace period from ever ending for as long as there was at least one CPU in an extended quiescent state that had not yet passed through a quiescent state. This commit therefore avoids recalculating the timeout unless the previous pass's call to wait_event_interruptible_timeout() actually did time out, thus preventing the above scenario. Signed-off-by: Paul E. 
McKenney --- kernel/rcutree.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d679a522c0a2..62b67b78b661 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1470,6 +1470,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) static int __noreturn rcu_gp_kthread(void *arg) { int fqs_state; + int gf; unsigned long j; int ret; struct rcu_state *rsp = arg; @@ -1495,10 +1496,13 @@ static int __noreturn rcu_gp_kthread(void *arg) j = HZ; jiffies_till_first_fqs = HZ; } + ret = 0; for (;;) { - rsp->jiffies_force_qs = jiffies + j; + if (!ret) + rsp->jiffies_force_qs = jiffies + j; ret = wait_event_interruptible_timeout(rsp->gp_wq, - (rsp->gp_flags & RCU_GP_FLAG_FQS) || + ((gf = ACCESS_ONCE(rsp->gp_flags)) & + RCU_GP_FLAG_FQS) || (!ACCESS_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)), j); @@ -1507,7 +1511,8 @@ static int __noreturn rcu_gp_kthread(void *arg) !rcu_preempt_blocked_readers_cgp(rnp)) break; /* If time for quiescent-state forcing, do it. */ - if (ret == 0 || (rsp->gp_flags & RCU_GP_FLAG_FQS)) { + if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) || + (gf & RCU_GP_FLAG_FQS)) { fqs_state = rcu_gp_fqs(rsp, fqs_state); cond_resched(); } else { From 591c6d1710cd73824057d08eda302cf2a7cfd18a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 8 Aug 2013 22:26:23 -0700 Subject: [PATCH 13/30] rcu: Flag lockless access to ->gp_flags with ACCESS_ONCE() This commit applies ACCESS_ONCE() to an outside-of-lock access to ->gp_flags. Although it is hard to imagine any sane compiler messing this particular case up, the documentation benefits are substantial. Plus the definition of "sane compiler" grows ever looser. Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 62b67b78b661..6d028fdbf86c 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1481,7 +1481,7 @@ static int __noreturn rcu_gp_kthread(void *arg) /* Handle grace-period start. */ for (;;) { wait_event_interruptible(rsp->gp_wq, - rsp->gp_flags & + ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_INIT); if (rcu_gp_init(rsp)) break; From 63c4db78e80407976e47bccaa2a4d8251b5a10bc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 9 Aug 2013 12:19:29 -0700 Subject: [PATCH 14/30] rcu: Add tracing to rcu_gp_kthread() This commit adds tracing to the rcu_gp_kthread() function in order to help trace down hangs potentially involving this kthread. Reported-by: Clark Williams Reported-by: Carsten Emde Signed-off-by: Paul E. 
McKenney --- include/trace/events/rcu.h | 28 +++++++++++++++++++--------- kernel/rcutree.c | 18 ++++++++++++++++++ 2 files changed, 37 insertions(+), 9 deletions(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index ee2376cfaab3..60077e12093c 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -39,15 +39,25 @@ TRACE_EVENT(rcu_utilization, #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) /* - * Tracepoint for grace-period events: starting and ending a grace - * period ("start" and "end", respectively), a CPU noting the start - * of a new grace period or the end of an old grace period ("cpustart" - * and "cpuend", respectively), a CPU passing through a quiescent - * state ("cpuqs"), a CPU coming online or going offline ("cpuonl" - * and "cpuofl", respectively), a CPU being kicked for being too - * long in dyntick-idle mode ("kick"), a CPU accelerating its new - * callbacks to RCU_NEXT_READY_TAIL ("AccReadyCB"), and a CPU - * accelerating its new callbacks to RCU_WAIT_TAIL ("AccWaitCB"). + * Tracepoint for grace-period events. Takes a string identifying the + * RCU flavor, the grace-period number, and a string identifying the + * grace-period-related event as follows: + * + * "AccReadyCB": CPU acclerates new callbacks to RCU_NEXT_READY_TAIL. + * "AccWaitCB": CPU accelerates new callbacks to RCU_WAIT_TAIL. + * "start": Start a grace period. + * "cpustart": CPU first notices a grace-period start. + * "cpuqs": CPU passes through a quiescent state. + * "cpuonl": CPU comes online. + * "cpuofl": CPU goes offline. + * "reqwait": GP kthread sleeps waiting for grace-period request. + * "reqwaitsig": GP kthread awakened by signal from reqwait state. + * "fqswait": GP kthread waiting until time to force quiescent states. + * "fqsstart": GP kthread starts forcing quiescent states. + * "fqsend": GP kthread done forcing quiescent states. + * "fqswaitsig": GP kthread awakened by signal from fqswait state. + * "end": End a grace period. + * "cpuend": CPU first notices a grace-period end. */ TRACE_EVENT(rcu_grace_period, diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 6d028fdbf86c..78d371526667 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1480,6 +1480,9 @@ static int __noreturn rcu_gp_kthread(void *arg) /* Handle grace-period start. */ for (;;) { + trace_rcu_grace_period(rsp->name, + ACCESS_ONCE(rsp->gpnum), + TPS("reqwait")); wait_event_interruptible(rsp->gp_wq, ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_INIT); @@ -1487,6 +1490,9 @@ static int __noreturn rcu_gp_kthread(void *arg) break; cond_resched(); flush_signals(current); + trace_rcu_grace_period(rsp->name, + ACCESS_ONCE(rsp->gpnum), + TPS("reqwaitsig")); } /* Handle quiescent-state forcing. */ @@ -1500,6 +1506,9 @@ static int __noreturn rcu_gp_kthread(void *arg) for (;;) { if (!ret) rsp->jiffies_force_qs = jiffies + j; + trace_rcu_grace_period(rsp->name, + ACCESS_ONCE(rsp->gpnum), + TPS("fqswait")); ret = wait_event_interruptible_timeout(rsp->gp_wq, ((gf = ACCESS_ONCE(rsp->gp_flags)) & RCU_GP_FLAG_FQS) || @@ -1513,12 +1522,21 @@ static int __noreturn rcu_gp_kthread(void *arg) /* If time for quiescent-state forcing, do it. */ if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) || (gf & RCU_GP_FLAG_FQS)) { + trace_rcu_grace_period(rsp->name, + ACCESS_ONCE(rsp->gpnum), + TPS("fqsstart")); fqs_state = rcu_gp_fqs(rsp, fqs_state); + trace_rcu_grace_period(rsp->name, + ACCESS_ONCE(rsp->gpnum), + TPS("fqsend")); cond_resched(); } else { /* Deal with stray signal. 
*/ cond_resched(); flush_signals(current); + trace_rcu_grace_period(rsp->name, + ACCESS_ONCE(rsp->gpnum), + TPS("fqswaitsig")); } j = jiffies_till_next_fqs; if (j > HZ) { From bb311eccbdab974639263060b8452bf304af0b0c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 9 Aug 2013 16:02:09 -0700 Subject: [PATCH 15/30] rcu: Add tracing of normal (non-NOCB) grace-period requests This commit adds tracing to the normal grace-period request points. These are rcu_gp_cleanup(), which checks for the need for another grace period at the end of the previous grace period, and rcu_start_gp_advanced(), which restarts RCU's state machine after an idle period. These trace events are intended to help track down bugs where RCU remains idle despite there being work for it to do. Reported-by: Clark Williams Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 1 + kernel/rcutree.c | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 60077e12093c..98466c618ebc 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -45,6 +45,7 @@ TRACE_EVENT(rcu_utilization, * * "AccReadyCB": CPU acclerates new callbacks to RCU_NEXT_READY_TAIL. * "AccWaitCB": CPU accelerates new callbacks to RCU_WAIT_TAIL. + * "newreq": Request a new grace period. * "start": Start a grace period. * "cpustart": CPU first notices a grace-period start. * "cpuqs": CPU passes through a quiescent state. diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 78d371526667..54dd6d03dbb5 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1459,8 +1459,12 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rsp->fqs_state = RCU_GP_IDLE; rdp = this_cpu_ptr(rsp->rda); rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ - if (cpu_needs_another_gp(rsp, rdp)) + if (cpu_needs_another_gp(rsp, rdp)) { rsp->gp_flags = 1; + trace_rcu_grace_period(rsp->name, + ACCESS_ONCE(rsp->gpnum), + TPS("newreq")); + } raw_spin_unlock_irq(&rnp->lock); } @@ -1584,6 +1588,8 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, return; } rsp->gp_flags = RCU_GP_FLAG_INIT; + trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), + TPS("newreq")); /* * We can't do wakeups while holding the rnp->lock, as that From 9261dd0da6c6432f08670719069449c6efe4f7a9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 14 Aug 2013 16:24:26 -0700 Subject: [PATCH 16/30] rcu: Add tracing for rcuo no-CBs CPU wakeup handshake Lost wakeups from call_rcu() to the rcuo kthreads can result in hangs that are difficult to diagnose. This commit therefore adds tracing to help pin down the cause of these hangs. Reported-by: Clark Williams Reported-by: Carsten Emde Signed-off-by: Paul E. McKenney [ paulmck: Add const per kbuild test robot's advice. ] --- include/trace/events/rcu.h | 37 +++++++++++++++++++++++++++++++++++++ kernel/rcutree_plugin.h | 14 +++++++++++++- 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 98466c618ebc..4301cd9e3ee5 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -171,6 +171,42 @@ TRACE_EVENT(rcu_grace_period_init, __entry->grplo, __entry->grphi, __entry->qsmask) ); +/* + * Tracepoint for RCU no-CBs CPU callback handoffs. This event is intended + * to assist debugging of these handoffs. + * + * The first argument is the name of the RCU flavor, and the second is + * the number of the offloaded CPU are extracted. 
The third and final + * argument is a string as follows: + * + * "WakeEmpty": Wake rcuo kthread, first CB to empty list. + * "WakeOvf": Wake rcuo kthread, CB list is huge. + * "WakeNot": Don't wake rcuo kthread. + * "WakeNotPoll": Don't wake rcuo kthread because it is polling. + * "WokeEmpty": rcuo kthread woke to find empty list. + * "WokeNonEmpty": rcuo kthread woke to find non-empty list. + */ +TRACE_EVENT(rcu_nocb_wake, + + TP_PROTO(const char *rcuname, int cpu, const char *reason), + + TP_ARGS(rcuname, cpu, reason), + + TP_STRUCT__entry( + __field(const char *, rcuname) + __field(int, cpu) + __field(const char *, reason) + ), + + TP_fast_assign( + __entry->rcuname = rcuname; + __entry->cpu = cpu; + __entry->reason = reason; + ), + + TP_printk("%s %d %s", __entry->rcuname, __entry->cpu, __entry->reason) +); + /* * Tracepoint for tasks blocking within preemptible-RCU read-side * critical sections. Track the type of RCU (which one day might @@ -667,6 +703,7 @@ TRACE_EVENT(rcu_barrier, #define trace_rcu_future_grace_period(rcuname, gpnum, completed, c, \ level, grplo, grphi, event) \ do { } while (0) +#define trace_rcu_nocb_wake(rcuname, cpu, reason) do { } while (0) #define trace_rcu_preempt_task(rcuname, pid, gpnum) do { } while (0) #define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0) #define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, \ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 130c97b027f2..f4ed24b18e77 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2108,15 +2108,22 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, /* If we are not being polled and there is a kthread, awaken it ... */ t = ACCESS_ONCE(rdp->nocb_kthread); - if (rcu_nocb_poll | !t) + if (rcu_nocb_poll | !t) { + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + TPS("WakeNotPoll")); return; + } len = atomic_long_read(&rdp->nocb_q_count); if (old_rhpp == &rdp->nocb_head) { wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */ rdp->qlen_last_fqs_check = 0; + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty")); } else if (len > rdp->qlen_last_fqs_check + qhimark) { wake_up_process(t); /* ... or if many callbacks queued. */ rdp->qlen_last_fqs_check = LONG_MAX / 2; + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); + } else { + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot")); } return; } @@ -2233,10 +2240,15 @@ static int rcu_nocb_kthread(void *arg) wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); list = ACCESS_ONCE(rdp->nocb_head); if (!list) { + if (!rcu_nocb_poll) + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + TPS("WokeEmpty")); schedule_timeout_interruptible(1); flush_signals(current); continue; } + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + TPS("WokeNonEmpty")); /* * Extract queued callbacks, update counts, and wait From 756cbf6befe6f59b0b3e0967d92a66c11e2566ed Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 15 Aug 2013 10:12:12 -0700 Subject: [PATCH 17/30] rcu: Distinguish between NOCB and non-NOCB rcu_callback trace events One way to distinguish between NOCB and non-NOCB rcu_callback trace events is that the former always print zero for the lazy and non-lazy queue lengths. Unfortunately, this also means that we cannot see the NOCB queue lengths. This commit therefore accesses the NOCB queue lengths, but negates them. NOCB rcu_callback trace events should therefore have negative queue lengths. Signed-off-by: Paul E. 
McKenney [ paulmck: Match operand size per kbuild test robot's advice. ] --- kernel/rcutree_plugin.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index f4ed24b18e77..24b01b69be92 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2147,10 +2147,12 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, if (__is_kfree_rcu_offset((unsigned long)rhp->func)) trace_rcu_kfree_callback(rdp->rsp->name, rhp, (unsigned long)rhp->func, - rdp->qlen_lazy, rdp->qlen); + -atomic_long_read(&rdp->nocb_q_count_lazy), + -atomic_long_read(&rdp->nocb_q_count)); else trace_rcu_callback(rdp->rsp->name, rhp, - rdp->qlen_lazy, rdp->qlen); + -atomic_long_read(&rdp->nocb_q_count_lazy), + -atomic_long_read(&rdp->nocb_q_count)); return 1; } From 69a79bb12a81024d718e73c52e886907a3777b34 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 15 Aug 2013 13:23:23 -0700 Subject: [PATCH 18/30] rcu: Track rcu_nocb_kthread()'s sleeping and awakening This commit adds event traces to track all of rcu_nocb_kthread()'s blocking and awakening. Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 4 ++++ kernel/rcutree_plugin.h | 15 ++++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 4301cd9e3ee5..a087d82ed431 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -183,8 +183,12 @@ TRACE_EVENT(rcu_grace_period_init, * "WakeOvf": Wake rcuo kthread, CB list is huge. * "WakeNot": Don't wake rcuo kthread. * "WakeNotPoll": Don't wake rcuo kthread because it is polling. + * "Poll": Start of new polling cycle for rcu_nocb_poll. + * "Sleep": Sleep waiting for CBs for !rcu_nocb_poll. * "WokeEmpty": rcuo kthread woke to find empty list. * "WokeNonEmpty": rcuo kthread woke to find non-empty list. + * "WaitQueue": Enqueue partially done, timed wait for it to complete. + * "WokeQueue": Partial enqueue now complete. */ TRACE_EVENT(rcu_nocb_wake, diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 24b01b69be92..21205b185340 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2230,6 +2230,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) static int rcu_nocb_kthread(void *arg) { int c, cl; + bool firsttime = 1; struct rcu_head *list; struct rcu_head *next; struct rcu_head **tail; @@ -2238,8 +2239,15 @@ static int rcu_nocb_kthread(void *arg) /* Each pass through this loop invokes one batch of callbacks */ for (;;) { /* If not polling, wait for next batch of callbacks. */ - if (!rcu_nocb_poll) + if (!rcu_nocb_poll) { + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + TPS("Sleep")); wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); + } else if (firsttime) { + firsttime = 0; + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + TPS("Poll")); + } list = ACCESS_ONCE(rdp->nocb_head); if (!list) { if (!rcu_nocb_poll) @@ -2249,6 +2257,7 @@ static int rcu_nocb_kthread(void *arg) flush_signals(current); continue; } + firsttime = 1; trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeNonEmpty")); @@ -2271,7 +2280,11 @@ static int rcu_nocb_kthread(void *arg) next = list->next; /* Wait for enqueuing to complete, if needed. 
*/ while (next == NULL && &list->next != tail) { + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + TPS("WaitQueue")); schedule_timeout_interruptible(1); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + TPS("WokeQueue")); next = list->next; } debug_rcu_head_unqueue(list); From 15f5191b6acbbb38029b06284e8fd20275e7cfe8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 18 Aug 2013 11:59:25 -0700 Subject: [PATCH 19/30] rcu: Avoid sparse warnings in rcu_nocb_wake trace event The event-tracing macros do not like bool tracing arguments, so this commit makes them of type char. This change has the knock-on effect of making it illegal to pass a pointer into one of these arguments, so also change rcutiny's first call to trace_rcu_batch_end() to convert from pointer to boolean, prefixing with "!!". Reported-by: kbuild test robot Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 10 +++++----- kernel/rcutiny.c | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index a087d82ed431..aca382266411 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -591,17 +591,17 @@ TRACE_EVENT(rcu_invoke_kfree_callback, TRACE_EVENT(rcu_batch_end, TP_PROTO(const char *rcuname, int callbacks_invoked, - bool cb, bool nr, bool iit, bool risk), + char cb, char nr, char iit, char risk), TP_ARGS(rcuname, callbacks_invoked, cb, nr, iit, risk), TP_STRUCT__entry( __field(const char *, rcuname) __field(int, callbacks_invoked) - __field(bool, cb) - __field(bool, nr) - __field(bool, iit) - __field(bool, risk) + __field(char, cb) + __field(char, nr) + __field(char, iit) + __field(char, risk) ), TP_fast_assign( diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 9ed6075dc562..80b6e273f1c5 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -273,7 +273,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) if (&rcp->rcucblist == rcp->donetail) { RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1)); RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, - ACCESS_ONCE(rcp->rcucblist), + !!ACCESS_ONCE(rcp->rcucblist), need_resched(), is_idle_task(current), false)); From 5d5a08003d3e678372e375d99c65a24e0d33d2f5 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Sun, 15 Sep 2013 17:29:17 +0400 Subject: [PATCH 20/30] rcu: Fix CONFIG_RCU_NOCB_CPU_ALL panic on machines with sparse CPU mask Some architectures have sparse CPU masks; UltraSparc's cpuinfo, for example: CPU0: online CPU2: online Therefore, set only the possible CPUs when CONFIG_RCU_NOCB_CPU_ALL is enabled. Also check that the user passes a valid 'rcu_nocbs=' option. Signed-off-by: Kirill Tkhai CC: Dipankar Sarma [ paulmck: Fix pr_info() issue noted by scripts/checkpatch.pl. ] Signed-off-by: Paul E.
McKenney --- kernel/rcutree_plugin.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c684f7ab37fa..1855d66bf705 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -96,10 +96,15 @@ static void __init rcu_bootup_announce_oddness(void) #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ #ifdef CONFIG_RCU_NOCB_CPU_ALL pr_info("\tOffload RCU callbacks from all CPUs\n"); - cpumask_setall(rcu_nocb_mask); + cpumask_copy(rcu_nocb_mask, cpu_possible_mask); #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ #endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ if (have_rcu_nocb_mask) { + if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { + pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n"); + cpumask_and(rcu_nocb_mask, cpu_possible_mask, + rcu_nocb_mask); + } cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); if (rcu_nocb_poll) From 7a497c963eceac42677ce1f5d7bb470abedd15f4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 22 Aug 2013 18:16:16 -0700 Subject: [PATCH 21/30] rcu: Remove redundant code from rcu_cleanup_after_idle() The rcu_try_advance_all_cbs() function returns a bool saying whether or not there are callbacks ready to invoke, but rcu_cleanup_after_idle() rechecks this regardless. This commit therefore uses the value returned by rcu_try_advance_all_cbs() instead of making rcu_cleanup_after_idle() do this recheck. Reported-by: Tibor Billes Signed-off-by: Paul E. McKenney Tested-by: Tibor Billes Reviewed-by: Josh Triplett --- kernel/rcutree_plugin.h | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 130c97b027f2..18d9c91f25d1 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1768,17 +1768,11 @@ static void rcu_prepare_for_idle(int cpu) */ static void rcu_cleanup_after_idle(int cpu) { - struct rcu_data *rdp; - struct rcu_state *rsp; if (rcu_is_nocb_cpu(cpu)) return; - rcu_try_advance_all_cbs(); - for_each_rcu_flavor(rsp) { - rdp = per_cpu_ptr(rsp->rda, cpu); - if (cpu_has_callbacks_ready_to_invoke(rdp)) - invoke_rcu_core(); - } + if (rcu_try_advance_all_cbs()) + invoke_rcu_core(); } /* From c229828ca6bc62d6c654f64b1d1b8a9ebd8a56f3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 25 Aug 2013 21:20:47 -0700 Subject: [PATCH 22/30] rcu: Throttle rcu_try_advance_all_cbs() execution The rcu_try_advance_all_cbs() function is invoked on each attempted entry to and every exit from idle. If this function determines that there are callbacks ready to invoke, the caller will invoke the RCU core, which in turn will result in a pair of context switches. If a CPU enters and exits idle extremely frequently, this can result in an excessive number of context switches and high CPU overhead. This commit therefore causes rcu_try_advance_all_cbs() to throttle itself, refusing to do work more than once per jiffy. Reported-by: Tibor Billes Signed-off-by: Paul E. McKenney Tested-by: Tibor Billes Reviewed-by: Josh Triplett --- kernel/rcutree.h | 2 ++ kernel/rcutree_plugin.h | 12 +++++++++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 5f97eab602cd..52be957c9fe2 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -104,6 +104,8 @@ struct rcu_dynticks { /* idle-period nonlazy_posted snapshot. 
*/ unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */ + unsigned long last_advance_all; + /* Last jiffy CBs were all advanced. */ int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ }; diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 18d9c91f25d1..d81e3856fa91 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1630,17 +1630,23 @@ module_param(rcu_idle_lazy_gp_delay, int, 0644); extern int tick_nohz_enabled; /* - * Try to advance callbacks for all flavors of RCU on the current CPU. - * Afterwards, if there are any callbacks ready for immediate invocation, - * return true. + * Try to advance callbacks for all flavors of RCU on the current CPU, but + * only if it has been awhile since the last time we did so. Afterwards, + * if there are any callbacks ready for immediate invocation, return true. */ static bool rcu_try_advance_all_cbs(void) { bool cbs_ready = false; struct rcu_data *rdp; + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); struct rcu_node *rnp; struct rcu_state *rsp; + /* Exit early if we advanced recently. */ + if (jiffies == rdtp->last_advance_all) + return 0; + rdtp->last_advance_all = jiffies; + for_each_rcu_flavor(rsp) { rdp = this_cpu_ptr(rsp->rda); rnp = rdp->mynode; From c337f8f58ed7cf150651d232af8222421a71463d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 5 Sep 2013 17:02:11 -0700 Subject: [PATCH 23/30] rcu: Throttle invoke_rcu_core() invocations due to non-lazy callbacks If a non-lazy callback arrives on a CPU that has previously gone idle with no non-lazy callbacks, invoke_rcu_core() forces the RCU core to run. However, it does not update the conditions, which could result in several closely spaced invocations of the RCU core, which in turn could result in an excessively high context-switch rate and resulting high overhead. This commit therefore updates the ->all_lazy and ->nonlazy_posted_snap fields to prevent closely spaced invocations. Reported-by: Tibor Billes Signed-off-by: Paul E. McKenney Tested-by: Tibor Billes Reviewed-by: Josh Triplett --- kernel/rcutree_plugin.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index d81e3856fa91..2c15d7c10684 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1745,6 +1745,8 @@ static void rcu_prepare_for_idle(int cpu) */ if (rdtp->all_lazy && rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) { + rdtp->all_lazy = false; + rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; invoke_rcu_core(); return; } From cc6783f788d8fe8b23ec6fc2762f5e8c9a418eee Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 6 Sep 2013 17:39:49 -0700 Subject: [PATCH 24/30] rcu: Is it safe to enter an RCU read-side critical section? There is currently no way for kernel code to determine whether it is safe to enter an RCU read-side critical section, in other words, whether or not RCU is paying attention to the currently running CPU. Given the large and increasing quantity of code shared by the idle loop and non-idle code, this shortcoming is becoming increasingly painful. This commit therefore adds __rcu_is_watching(), which returns true if it is safe to enter an RCU read-side critical section on the currently running CPU. This function is quite fast, using only a __this_cpu_read(). However, the caller must disable preemption.
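[Editor's note: an illustrative caller sketch; the function name is invented, and the preempt_disable()/preempt_enable() pair reflects the caller's obligation just described.]

```c
#include <linux/preempt.h>
#include <linux/rcupdate.h>

/* Hypothetical caller: guard RCU usage in code shared with the idle loop. */
static void maybe_do_rcu_protected_work(void)
{
	preempt_disable();			/* caller's obligation */
	if (__rcu_is_watching()) {		/* is RCU watching this CPU? */
		rcu_read_lock();
		/* ... safely access RCU-protected data here ... */
		rcu_read_unlock();
	}
	preempt_enable();
}
```

Reported-by: Steven Rostedt Signed-off-by: Paul E.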
McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 8 ++++---- include/linux/rcutiny.h | 9 +++++++++ include/linux/rcutree.h | 2 ++ kernel/rcutiny.c | 4 ++-- kernel/rcutree.c | 13 +++++++++++++ 5 files changed, 30 insertions(+), 6 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index f1f1bc39346b..a53a21a2808c 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -261,6 +261,10 @@ static inline void rcu_user_hooks_switch(struct task_struct *prev, rcu_irq_exit(); \ } while (0) +#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP) +extern int rcu_is_cpu_idle(void); +#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP) */ + /* * Infrastructure to implement the synchronize_() primitives in * TREE_RCU and rcu_barrier_() primitives in TINY_RCU. @@ -297,10 +301,6 @@ static inline void destroy_rcu_head_on_stack(struct rcu_head *head) } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ -#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SMP) -extern int rcu_is_cpu_idle(void); -#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SMP) */ - #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) bool rcu_lockdep_current_cpu_online(void); #else /* #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */ diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index e31005ee339e..bee665964878 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -132,4 +132,13 @@ static inline void rcu_scheduler_starting(void) } #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +#ifdef CONFIG_RCU_TRACE + +static inline bool __rcu_is_watching(void) +{ + return !rcu_is_cpu_idle(); +} + +#endif /* #ifdef CONFIG_RCU_TRACE */ + #endif /* __LINUX_RCUTINY_H */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 226169d1bd2b..293613dfd2a5 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -90,4 +90,6 @@ extern void exit_rcu(void); extern void rcu_scheduler_starting(void); extern int rcu_scheduler_active __read_mostly; +extern bool __rcu_is_watching(void); + #endif /* __LINUX_RCUTREE_H */ diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 9ed6075dc562..b4bc61874d77 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -174,7 +174,7 @@ void rcu_irq_enter(void) } EXPORT_SYMBOL_GPL(rcu_irq_enter); -#ifdef CONFIG_DEBUG_LOCK_ALLOC +#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) /* * Test whether RCU thinks that the current CPU is idle. @@ -185,7 +185,7 @@ int rcu_is_cpu_idle(void) } EXPORT_SYMBOL(rcu_is_cpu_idle); -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ /* * Test whether the current CPU was interrupted from idle. Nested diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 32618b3fe4e6..910d868808dc 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -671,6 +671,19 @@ int rcu_is_cpu_idle(void) } EXPORT_SYMBOL(rcu_is_cpu_idle); +/** + * __rcu_is_watching - are RCU read-side critical sections safe? + * + * Return true if RCU is watching the running CPU, which means that + * this CPU can safely enter RCU read-side critical sections. Unlike + * rcu_is_cpu_idle(), the caller of __rcu_is_watching() must have at + * least disabled preemption. 
+ */ bool __rcu_is_watching(void) { + return !!(atomic_read(this_cpu_ptr(&rcu_dynticks.dynticks)) & 0x1); +} + #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) /* From f9ffc31ebd38d2d74dbfe9f0b67274e99ad668f5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 8 Sep 2013 11:51:06 -0700 Subject: [PATCH 25/30] rcu: Change EXPORT_SYMBOL() to EXPORT_SYMBOL_GPL() Commit e6b80a3b (rcu: Detect illegal rcu dereference in extended quiescent state) exported the pre-existing rcu_is_cpu_idle() function using EXPORT_SYMBOL(). However, this is inconsistent with the remaining exports from RCU, which are all EXPORT_SYMBOL_GPL(). The current state of affairs means that a non-GPL module could use rcu_is_cpu_idle(), but in a CONFIG_TREE_PREEMPT_RCU=y kernel would be unable to invoke rcu_read_lock() and rcu_read_unlock(). This commit therefore makes rcu_is_cpu_idle()'s export consistent with the rest of RCU, namely EXPORT_SYMBOL_GPL(). Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker Reviewed-by: Josh Triplett --- kernel/rcutree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 910d868808dc..1b123e179d71 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -669,7 +669,7 @@ int rcu_is_cpu_idle(void) preempt_enable(); return ret; } -EXPORT_SYMBOL(rcu_is_cpu_idle); +EXPORT_SYMBOL_GPL(rcu_is_cpu_idle); /** * __rcu_is_watching - are RCU read-side critical sections safe? From 5c173eb8bcb9c1aa888bd6d14a4cb746f3dd2420 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 13 Sep 2013 17:20:11 -0700 Subject: [PATCH 26/30] rcu: Consistent rcu_is_watching() naming The old rcu_is_cpu_idle() function is just the negation of __rcu_is_watching() with preemption disabled. This commit therefore renames rcu_is_cpu_idle() to rcu_is_watching(), inverting the sense of its return value at each call site. Signed-off-by: Paul E.
McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 18 +++++++++--------- include/linux/rcutiny.h | 16 ++++++++++++---- include/linux/rcutree.h | 2 +- kernel/lockdep.c | 4 ++-- kernel/rcupdate.c | 2 +- kernel/rcutiny.c | 6 +++--- kernel/rcutree.c | 40 ++++++++++++++++++++-------------------- 7 files changed, 48 insertions(+), 40 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index a53a21a2808c..39cbb889e20d 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -262,7 +262,7 @@ static inline void rcu_user_hooks_switch(struct task_struct *prev, } while (0) #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP) -extern int rcu_is_cpu_idle(void); +extern bool __rcu_is_watching(void); #endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP) */ /* @@ -351,7 +351,7 @@ static inline int rcu_read_lock_held(void) { if (!debug_lockdep_rcu_enabled()) return 1; - if (rcu_is_cpu_idle()) + if (!rcu_is_watching()) return 0; if (!rcu_lockdep_current_cpu_online()) return 0; @@ -402,7 +402,7 @@ static inline int rcu_read_lock_sched_held(void) if (!debug_lockdep_rcu_enabled()) return 1; - if (rcu_is_cpu_idle()) + if (!rcu_is_watching()) return 0; if (!rcu_lockdep_current_cpu_online()) return 0; @@ -771,7 +771,7 @@ static inline void rcu_read_lock(void) __rcu_read_lock(); __acquire(RCU); rcu_lock_acquire(&rcu_lock_map); - rcu_lockdep_assert(!rcu_is_cpu_idle(), + rcu_lockdep_assert(rcu_is_watching(), "rcu_read_lock() used illegally while idle"); } @@ -792,7 +792,7 @@ static inline void rcu_read_lock(void) */ static inline void rcu_read_unlock(void) { - rcu_lockdep_assert(!rcu_is_cpu_idle(), + rcu_lockdep_assert(rcu_is_watching(), "rcu_read_unlock() used illegally while idle"); rcu_lock_release(&rcu_lock_map); __release(RCU); @@ -821,7 +821,7 @@ static inline void rcu_read_lock_bh(void) local_bh_disable(); __acquire(RCU_BH); rcu_lock_acquire(&rcu_bh_lock_map); - rcu_lockdep_assert(!rcu_is_cpu_idle(), + rcu_lockdep_assert(rcu_is_watching(), "rcu_read_lock_bh() used illegally while idle"); } @@ -832,7 +832,7 @@ static inline void rcu_read_lock_bh(void) */ static inline void rcu_read_unlock_bh(void) { - rcu_lockdep_assert(!rcu_is_cpu_idle(), + rcu_lockdep_assert(rcu_is_watching(), "rcu_read_unlock_bh() used illegally while idle"); rcu_lock_release(&rcu_bh_lock_map); __release(RCU_BH); @@ -857,7 +857,7 @@ static inline void rcu_read_lock_sched(void) preempt_disable(); __acquire(RCU_SCHED); rcu_lock_acquire(&rcu_sched_lock_map); - rcu_lockdep_assert(!rcu_is_cpu_idle(), + rcu_lockdep_assert(rcu_is_watching(), "rcu_read_lock_sched() used illegally while idle"); } @@ -875,7 +875,7 @@ static inline notrace void rcu_read_lock_sched_notrace(void) */ static inline void rcu_read_unlock_sched(void) { - rcu_lockdep_assert(!rcu_is_cpu_idle(), + rcu_lockdep_assert(rcu_is_watching(), "rcu_read_unlock_sched() used illegally while idle"); rcu_lock_release(&rcu_sched_lock_map); __release(RCU_SCHED); diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index bee665964878..09ebcbe9fd78 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -132,13 +132,21 @@ static inline void rcu_scheduler_starting(void) } #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -#ifdef CONFIG_RCU_TRACE +#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) -static inline bool __rcu_is_watching(void) +static inline bool rcu_is_watching(void) { - return !rcu_is_cpu_idle(); + return 
__rcu_is_watching(); } -#endif /* #ifdef CONFIG_RCU_TRACE */ +#else /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ + +static inline bool rcu_is_watching(void) +{ + return true; +} + + +#endif /* #else defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ #endif /* __LINUX_RCUTINY_H */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 293613dfd2a5..4b9c81548742 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -90,6 +90,6 @@ extern void exit_rcu(void); extern void rcu_scheduler_starting(void); extern int rcu_scheduler_active __read_mostly; -extern bool __rcu_is_watching(void); +extern bool rcu_is_watching(void); #endif /* __LINUX_RCUTREE_H */ diff --git a/kernel/lockdep.c b/kernel/lockdep.c index e16c45b9ee77..4e8e14c34e42 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -4224,7 +4224,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", !rcu_lockdep_current_cpu_online() ? "RCU used illegally from offline CPU!\n" - : rcu_is_cpu_idle() + : !rcu_is_watching() ? "RCU used illegally from idle CPU!\n" : "", rcu_scheduler_active, debug_locks); @@ -4247,7 +4247,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) * So complain bitterly if someone does call rcu_read_lock(), * rcu_read_lock_bh() and so on from extended quiescent states. */ - if (rcu_is_cpu_idle()) + if (!rcu_is_watching()) printk("RCU used illegally from extended quiescent state!\n"); lockdep_print_held_locks(curr); diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index b02a339836b4..3b3c0464d1eb 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -148,7 +148,7 @@ int rcu_read_lock_bh_held(void) { if (!debug_lockdep_rcu_enabled()) return 1; - if (rcu_is_cpu_idle()) + if (!rcu_is_watching()) return 0; if (!rcu_lockdep_current_cpu_online()) return 0; diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index b4bc61874d77..0fa061dfa55d 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -179,11 +179,11 @@ EXPORT_SYMBOL_GPL(rcu_irq_enter); /* * Test whether RCU thinks that the current CPU is idle. */ -int rcu_is_cpu_idle(void) +bool __rcu_is_watching(void) { - return !rcu_dynticks_nesting; + return rcu_dynticks_nesting; } -EXPORT_SYMBOL(rcu_is_cpu_idle); +EXPORT_SYMBOL(__rcu_is_watching); #endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 1b123e179d71..981d0c15a389 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -654,36 +654,36 @@ void rcu_nmi_exit(void) WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); } -/** - * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle - * - * If the current CPU is in its idle loop and is neither in an interrupt - * or NMI handler, return true. - */ -int rcu_is_cpu_idle(void) -{ - int ret; - - preempt_disable(); - ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0; - preempt_enable(); - return ret; -} -EXPORT_SYMBOL_GPL(rcu_is_cpu_idle); - /** * __rcu_is_watching - are RCU read-side critical sections safe? * * Return true if RCU is watching the running CPU, which means that * this CPU can safely enter RCU read-side critical sections. Unlike - * rcu_is_cpu_idle(), the caller of __rcu_is_watching() must have at + * rcu_is_watching(), the caller of __rcu_is_watching() must have at * least disabled preemption. 
*/ bool __rcu_is_watching(void) { - return !!(atomic_read(this_cpu_ptr(&rcu_dynticks.dynticks)) & 0x1); + return atomic_read(this_cpu_ptr(&rcu_dynticks.dynticks)) & 0x1; } +/** + * rcu_is_watching - see if RCU thinks that the current CPU is not idle + * + * Return false only if the current CPU is in its idle loop and is + * neither in an interrupt nor an NMI handler; otherwise return true. + */ +bool rcu_is_watching(void) +{ + bool ret; + + preempt_disable(); + ret = __rcu_is_watching(); + preempt_enable(); + return ret; +} +EXPORT_SYMBOL_GPL(rcu_is_watching); + #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) /* @@ -2268,7 +2268,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, * If called from an extended quiescent state, invoke the RCU * core in order to force a re-evaluation of RCU's idleness. */ - if (rcu_is_cpu_idle() && cpu_online(smp_processor_id())) + if (!rcu_is_watching() && cpu_online(smp_processor_id())) invoke_rcu_core(); /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ From 64d3b7a1d5289486df2d8bce36e23ed5ebc80a3d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 19 Aug 2013 11:59:43 -0700 Subject: [PATCH 27/30] rcu: Update stall-warning documentation Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- Documentation/RCU/stallwarn.txt | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index 8e9359de1d28..6f3a0057548e 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -12,12 +12,12 @@ CONFIG_RCU_CPU_STALL_TIMEOUT This kernel configuration parameter defines the period of time that RCU will wait from the beginning of a grace period until it issues an RCU CPU stall warning. This time period is normally - sixty seconds. + 21 seconds. This configuration parameter may be changed at runtime via /sys/module/rcutree/parameters/rcu_cpu_stall_timeout; however, this parameter is checked only at the beginning of a cycle. - So if you are 30 seconds into a 70-second stall, setting this + So if you are 10 seconds into a 40-second stall, setting this sysfs parameter to (say) five will shorten the timeout for the -next- stall, or the following warning for the current stall (assuming the stall lasts long enough). It will not affect the @@ -32,7 +32,7 @@ CONFIG_RCU_CPU_STALL_VERBOSE also dump the stacks of any tasks that are blocking the current RCU-preempt grace period. -RCU_CPU_STALL_INFO +CONFIG_RCU_CPU_STALL_INFO This kernel configuration parameter causes the stall warning to print out additional per-CPU diagnostic information, including @@ -43,7 +43,8 @@ RCU_STALL_DELAY_DELTA Although the lockdep facility is extremely useful, it does add some overhead. Therefore, under CONFIG_PROVE_RCU, the RCU_STALL_DELAY_DELTA macro allows five extra seconds before - giving an RCU CPU stall warning message. + giving an RCU CPU stall warning message. (This is a cpp + macro, not a kernel configuration parameter.) RCU_STALL_RAT_DELAY @@ -52,7 +53,8 @@ RCU_STALL_RAT_DELAY However, if the offending CPU does not detect its own stall in the number of jiffies specified by RCU_STALL_RAT_DELAY, then some other CPU will complain. This delay is normally set to - two jiffies. + two jiffies. (This is a cpp macro, not a kernel configuration + parameter.)
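To make the preceding timeout discussion concrete, here is the sort of code these warnings catch. This is an illustrative sketch only, not code from this series, and poll_device() is a hypothetical helper: on a CONFIG_PREEMPT=n kernel, a CPU-bound loop in kernel context that never passes through a quiescent state will be flagged once rcu_cpu_stall_timeout expires.

	#include <linux/kthread.h>
	#include <linux/sched.h>

	static int poll_thread(void *arg)
	{
		while (!kthread_should_stop()) {
			poll_device(arg);	/* hypothetical CPU-bound work */
			/*
			 * This is the quiescent state. Remove it, and a
			 * non-preemptible kernel eventually prints
			 * "INFO: rcu_sched self-detected stall on CPU"
			 * once the 21-second default timeout elapses.
			 */
			cond_resched();
		}
		return 0;
	}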
When a CPU detects that it is stalling, it will print a message similar to the following: @@ -86,7 +88,12 @@ printing, there will be a spurious stall-warning message: INFO: rcu_bh_state detected stalls on CPUs/tasks: { } (detected by 4, 2502 jiffies) -This is rare, but does happen from time to time in real life. +This is rare, but does happen from time to time in real life. It is also +possible for a zero-jiffy stall to be flagged in this case, depending +on how the stall warning and the grace-period initialization happen to +interact. Please note that it is not possible to entirely eliminate this +sort of false positive without resorting to things like stop_machine(), +which is overkill for this sort of problem. If the CONFIG_RCU_CPU_STALL_INFO kernel configuration parameter is set, more information is printed with the stall-warning message, for example: @@ -216,4 +223,5 @@ that portion of the stack which remains the same from trace to trace. If you can reliably trigger the stall, ftrace can be quite helpful. RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE -and with RCU's event tracing. +and with RCU's event tracing. For information on RCU's event tracing, +see include/trace/events/rcu.h. From 64f26e5c86af9ce8615721340c8282f2b148c9aa Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 10 Sep 2013 08:26:09 -0700 Subject: [PATCH 28/30] kthread: Add pointer to vmstat-avoidance patch Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- Documentation/kernel-per-CPU-kthreads.txt | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/Documentation/kernel-per-CPU-kthreads.txt b/Documentation/kernel-per-CPU-kthreads.txt index 32351bfabf20..827104fb9364 100644 --- a/Documentation/kernel-per-CPU-kthreads.txt +++ b/Documentation/kernel-per-CPU-kthreads.txt @@ -181,12 +181,17 @@ To reduce its OS jitter, do any of the following: make sure that this is safe on your particular system. d. It is not possible to entirely get rid of OS jitter from vmstat_update() on CONFIG_SMP=y systems, but you - can decrease its frequency by writing a large value to - /proc/sys/vm/stat_interval. The default value is HZ, - for an interval of one second. Of course, larger values - will make your virtual-memory statistics update more - slowly. Of course, you can also run your workload at - a real-time priority, thus preempting vmstat_update(). + can decrease its frequency by writing a large value + to /proc/sys/vm/stat_interval. The default value is + HZ, for an interval of one second. Of course, larger + values will make your virtual-memory statistics update + more slowly. Of course, you can also run your workload + at a real-time priority, thus preempting vmstat_update(), + but if your workload is CPU-bound, this is a bad idea. + However, there is an RFC patch from Christoph Lameter + (based on an earlier one from Gilad Ben-Yossef) that + reduces or even eliminates vmstat overhead for some + workloads at https://lkml.org/lkml/2013/9/4/379. e. If running on high-end powerpc servers, build with CONFIG_PPC_RTAS_DAEMON=n. This prevents the RTAS daemon from running on each CPU every second or so. From 4b0d3f0fde41a3c4454adb4d474618c23cfd4131 Mon Sep 17 00:00:00 2001 From: Michael Opdenacker Date: Mon, 23 Sep 2013 12:40:41 -0700 Subject: [PATCH 29/30] rcu: Fix occurrence of "the the" in checklist.txt Signed-off-by: Michael Opdenacker Signed-off-by: Paul E. McKenney [ paulmck: Add "then" as suggested by Josh Triplett. 
] Reviewed-by: Josh Triplett --- Documentation/RCU/checklist.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt index 7703ec73a9bb..91266193b8f4 100644 --- a/Documentation/RCU/checklist.txt +++ b/Documentation/RCU/checklist.txt @@ -202,8 +202,8 @@ over a rather long period of time, but improvements are always welcome! updater uses call_rcu_sched() or synchronize_sched(), then the corresponding readers must disable preemption, possibly by calling rcu_read_lock_sched() and rcu_read_unlock_sched(). - If the updater uses synchronize_srcu() or call_srcu(), - the the corresponding readers must use srcu_read_lock() and + If the updater uses synchronize_srcu() or call_srcu(), then + the corresponding readers must use srcu_read_lock() and srcu_read_unlock(), and with the same srcu_struct. The rules for the expedited primitives are the same as for their non-expedited counterparts. Mixing things up will result in confusion and From 4102adab9189c8ea2f0cdd2f88345fd25d2790f1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 8 Oct 2013 20:23:47 -0700 Subject: [PATCH 30/30] rcu: Move RCU-related source code to kernel/rcu directory Signed-off-by: Paul E. McKenney Reviewed-by: Ingo Molnar --- Documentation/DocBook/device-drivers.tmpl | 5 +- Documentation/kernel-parameters.txt | 95 +++++++++++-------- MAINTAINERS | 11 ++- kernel/Makefile | 11 +-- kernel/rcu/Makefile | 6 ++ kernel/{ => rcu}/rcu.h | 0 kernel/{ => rcu}/srcu.c | 0 kernel/{rcutiny.c => rcu/tiny.c} | 8 +- .../{rcutiny_plugin.h => rcu/tiny_plugin.h} | 0 kernel/{rcutorture.c => rcu/torture.c} | 6 ++ kernel/{rcutree.c => rcu/tree.c} | 13 ++- kernel/{rcutree.h => rcu/tree.h} | 0 .../{rcutree_plugin.h => rcu/tree_plugin.h} | 4 +- kernel/{rcutree_trace.c => rcu/tree_trace.c} | 2 +- kernel/{rcupdate.c => rcu/update.c} | 6 ++ 15 files changed, 105 insertions(+), 62 deletions(-) create mode 100644 kernel/rcu/Makefile rename kernel/{ => rcu}/rcu.h (100%) rename kernel/{ => rcu}/srcu.c (100%) rename kernel/{rcutiny.c => rcu/tiny.c} (97%) rename kernel/{rcutiny_plugin.h => rcu/tiny_plugin.h} (100%) rename kernel/{rcutorture.c => rcu/torture.c} (99%) rename kernel/{rcutree.c => rcu/tree.c} (99%) rename kernel/{rcutree.h => rcu/tree.h} (100%) rename kernel/{rcutree_plugin.h => rcu/tree_plugin.h} (99%) rename kernel/{rcutree_trace.c => rcu/tree_trace.c} (99%) rename kernel/{rcupdate.c => rcu/update.c} (98%) diff --git a/Documentation/DocBook/device-drivers.tmpl b/Documentation/DocBook/device-drivers.tmpl index fe397f90a34f..6c9d9d37c83a 100644 --- a/Documentation/DocBook/device-drivers.tmpl +++ b/Documentation/DocBook/device-drivers.tmpl @@ -87,7 +87,10 @@ X!Iinclude/linux/kobject.h !Ekernel/printk/printk.c !Ekernel/panic.c !Ekernel/sys.c -!Ekernel/rcupdate.c +!Ekernel/rcu/srcu.c +!Ekernel/rcu/tree.c +!Ekernel/rcu/tree_plugin.h +!Ekernel/rcu/update.c Device Resource Management diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 1a036cd972fb..c3dc13e90a40 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2595,7 +2595,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. ramdisk_size= [RAM] Sizes of RAM disks in kilobytes See Documentation/blockdev/ramdisk.txt. - rcu_nocbs= [KNL,BOOT] + rcu_nocbs= [KNL] In kernels built with CONFIG_RCU_NOCB_CPU=y, set the specified list of CPUs to be no-callback CPUs. 
Invocation of these CPUs' RCU callbacks will @@ -2608,7 +2608,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. real-time workloads. It can also improve energy efficiency for asymmetric multiprocessors. - rcu_nocb_poll [KNL,BOOT] + rcu_nocb_poll [KNL] Rather than requiring that offloaded CPUs (specified by rcu_nocbs= above) explicitly awaken the corresponding "rcuoN" kthreads, @@ -2619,126 +2619,145 @@ bytes respectively. Such letter suffixes can also be entirely omitted. energy efficiency by requiring that the kthreads periodically wake up to do the polling. - rcutree.blimit= [KNL,BOOT] + rcutree.blimit= [KNL] Set maximum number of finished RCU callbacks to process in one batch. - rcutree.fanout_leaf= [KNL,BOOT] + rcutree.rcu_fanout_leaf= [KNL] Increase the number of CPUs assigned to each leaf rcu_node structure. Useful for very large systems. - rcutree.jiffies_till_first_fqs= [KNL,BOOT] + rcutree.jiffies_till_first_fqs= [KNL] Set delay from grace-period initialization to first attempt to force quiescent states. Units are jiffies, minimum value is zero, and maximum value is HZ. - rcutree.jiffies_till_next_fqs= [KNL,BOOT] + rcutree.jiffies_till_next_fqs= [KNL] Set delay between subsequent attempts to force quiescent states. Units are jiffies, minimum value is one, and maximum value is HZ. - rcutree.qhimark= [KNL,BOOT] + rcutree.qhimark= [KNL] Set threshold of queued RCU callbacks over which batch limiting is disabled. - rcutree.qlowmark= [KNL,BOOT] + rcutree.qlowmark= [KNL] Set threshold of queued RCU callbacks below which batch limiting is re-enabled. - rcutree.rcu_cpu_stall_suppress= [KNL,BOOT] - Suppress RCU CPU stall warning messages. - - rcutree.rcu_cpu_stall_timeout= [KNL,BOOT] - Set timeout for RCU CPU stall warning messages. - - rcutree.rcu_idle_gp_delay= [KNL,BOOT] + rcutree.rcu_idle_gp_delay= [KNL] Set wakeup interval for idle CPUs that have RCU callbacks (RCU_FAST_NO_HZ=y). - rcutree.rcu_idle_lazy_gp_delay= [KNL,BOOT] + rcutree.rcu_idle_lazy_gp_delay= [KNL] Set wakeup interval for idle CPUs that have only "lazy" RCU callbacks (RCU_FAST_NO_HZ=y). Lazy RCU callbacks are those which RCU can prove do nothing more than free memory. - rcutorture.fqs_duration= [KNL,BOOT] + rcutorture.fqs_duration= [KNL] Set duration of force_quiescent_state bursts. - rcutorture.fqs_holdoff= [KNL,BOOT] + rcutorture.fqs_holdoff= [KNL] Set holdoff time within force_quiescent_state bursts. - rcutorture.fqs_stutter= [KNL,BOOT] + rcutorture.fqs_stutter= [KNL] Set wait time between force_quiescent_state bursts. - rcutorture.irqreader= [KNL,BOOT] - Test RCU readers from irq handlers. + rcutorture.gp_exp= [KNL] + Use expedited update-side primitives. - rcutorture.n_barrier_cbs= [KNL,BOOT] + rcutorture.gp_normal= [KNL] + Use normal (non-expedited) update-side primitives. + If both gp_exp and gp_normal are set, do both. + If neither gp_exp nor gp_normal are set, still + do both. + + rcutorture.n_barrier_cbs= [KNL] Set callbacks/threads for rcu_barrier() testing. - rcutorture.nfakewriters= [KNL,BOOT] + rcutorture.nfakewriters= [KNL] Set number of concurrent RCU writers. These just stress RCU, they don't participate in the actual test, hence the "fake". - rcutorture.nreaders= [KNL,BOOT] + rcutorture.nreaders= [KNL] Set number of RCU readers. - rcutorture.onoff_holdoff= [KNL,BOOT] + rcutorture.object_debug= [KNL] + Enable debug-object double-call_rcu() testing. + + rcutorture.onoff_holdoff= [KNL] Set time (s) after boot for CPU-hotplug testing. 
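An aside on the rcutree. and rcutorture. prefixes being re-tagged throughout this hunk: for built-in code, module_param() derives the prefix from KBUILD_MODNAME, so the file renames later in this patch would silently turn rcutree.blimit into tree.blimit. The MODULE_PARAM_PREFIX overrides added further down prevent that. The sketch below shows the mechanism with a stand-in name, mymod, rather than any file from this series.

	#include <linux/moduleparam.h>

	/*
	 * Pin the user-visible parameter prefix so that renaming this
	 * (built-in) source file cannot rename its boot parameters.
	 */
	#ifdef MODULE_PARAM_PREFIX
	#undef MODULE_PARAM_PREFIX
	#endif
	#define MODULE_PARAM_PREFIX "mymod."

	static int debug;
	module_param(debug, int, 0444);	/* settable as "mymod.debug=" at boot */
	MODULE_PARM_DESC(debug, "Enable debug output");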
- rcutorture.onoff_interval= [KNL,BOOT] + rcutorture.onoff_interval= [KNL] Set time (s) between CPU-hotplug operations, or zero to disable CPU-hotplug testing. - rcutorture.shuffle_interval= [KNL,BOOT] + rcutorture.rcutorture_runnable= [BOOT] + Start rcutorture running at boot time. + + rcutorture.shuffle_interval= [KNL] Set task-shuffle interval (s). Shuffling tasks allows some CPUs to go into dyntick-idle mode during the rcutorture test. - rcutorture.shutdown_secs= [KNL,BOOT] + rcutorture.shutdown_secs= [KNL] Set time (s) after boot system shutdown. This is useful for hands-off automated testing. - rcutorture.stall_cpu= [KNL,BOOT] + rcutorture.stall_cpu= [KNL] Duration of CPU stall (s) to test RCU CPU stall warnings, zero to disable. - rcutorture.stall_cpu_holdoff= [KNL,BOOT] + rcutorture.stall_cpu_holdoff= [KNL] Time to wait (s) after boot before inducing stall. - rcutorture.stat_interval= [KNL,BOOT] + rcutorture.stat_interval= [KNL] Time (s) between statistics printk()s. - rcutorture.stutter= [KNL,BOOT] + rcutorture.stutter= [KNL] Time (s) to stutter testing, for example, specifying five seconds causes the test to run for five seconds, wait for five seconds, and so on. This tests RCU's ability to transition abruptly to and from idle. - rcutorture.test_boost= [KNL,BOOT] + rcutorture.test_boost= [KNL] Test RCU priority boosting? 0=no, 1=maybe, 2=yes. "Maybe" means test if the RCU implementation under test support RCU priority boosting. - rcutorture.test_boost_duration= [KNL,BOOT] + rcutorture.test_boost_duration= [KNL] Duration (s) of each individual boost test. - rcutorture.test_boost_interval= [KNL,BOOT] + rcutorture.test_boost_interval= [KNL] Interval (s) between each boost test. - rcutorture.test_no_idle_hz= [KNL,BOOT] + rcutorture.test_no_idle_hz= [KNL] Test RCU's dyntick-idle handling. See also the rcutorture.shuffle_interval parameter. - rcutorture.torture_type= [KNL,BOOT] + rcutorture.torture_type= [KNL] Specify the RCU implementation to test. - rcutorture.verbose= [KNL,BOOT] + rcutorture.verbose= [KNL] Enable additional printk() statements. + rcupdate.rcu_expedited= [KNL] + Use expedited grace-period primitives, for + example, synchronize_rcu_expedited() instead + of synchronize_rcu(). This reduces latency, + but can increase CPU utilization, degrade + real-time latency, and degrade energy efficiency. + + rcupdate.rcu_cpu_stall_suppress= [KNL] + Suppress RCU CPU stall warning messages. + + rcupdate.rcu_cpu_stall_timeout= [KNL] + Set timeout for RCU CPU stall warning messages. + rdinit= [KNL] Format: Run specified binary instead of /init from the ramdisk, diff --git a/MAINTAINERS b/MAINTAINERS index e61c2e83fc2b..28f2478b6794 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6903,7 +6903,7 @@ M: "Paul E. McKenney" S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git F: Documentation/RCU/torture.txt -F: kernel/rcutorture.c +F: kernel/rcu/torture.c RDC R-321X SoC M: Florian Fainelli @@ -6930,8 +6930,9 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git F: Documentation/RCU/ X: Documentation/RCU/torture.txt F: include/linux/rcu* -F: kernel/rcu* -X: kernel/rcutorture.c +X: include/linux/srcu.h +F: kernel/rcu/ +X: kernel/rcu/torture.c REAL TIME CLOCK (RTC) SUBSYSTEM M: Alessandro Zummo @@ -7618,8 +7619,8 @@ M: "Paul E. 
McKenney" W: http://www.rdrop.com/users/paulmck/RCU/ S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git -F: include/linux/srcu* -F: kernel/srcu* +F: include/linux/srcu.h +F: kernel/rcu/srcu.c SMACK SECURITY MODULE M: Casey Schaufler diff --git a/kernel/Makefile b/kernel/Makefile index 1ce47553fb02..f99d908b5550 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -6,9 +6,9 @@ obj-y = fork.o exec_domain.o panic.o \ cpu.o exit.o itimer.o time.o softirq.o resource.o \ sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ - rcupdate.o extable.o params.o posix-timers.o \ + extable.o params.o posix-timers.o \ kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \ - hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ + hrtimer.o rwsem.o nsproxy.o semaphore.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o groups.o lglock.o smpboot.o @@ -27,6 +27,7 @@ obj-y += power/ obj-y += printk/ obj-y += cpu/ obj-y += irq/ +obj-y += rcu/ obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o obj-$(CONFIG_FREEZER) += freezer.o @@ -81,12 +82,6 @@ obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o obj-$(CONFIG_SECCOMP) += seccomp.o -obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o -obj-$(CONFIG_TREE_RCU) += rcutree.o -obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o -obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o -obj-$(CONFIG_TINY_RCU) += rcutiny.o -obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile new file mode 100644 index 000000000000..01e9ec37a3e3 --- /dev/null +++ b/kernel/rcu/Makefile @@ -0,0 +1,6 @@ +obj-y += update.o srcu.o +obj-$(CONFIG_RCU_TORTURE_TEST) += torture.o +obj-$(CONFIG_TREE_RCU) += tree.o +obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o +obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o +obj-$(CONFIG_TINY_RCU) += tiny.o diff --git a/kernel/rcu.h b/kernel/rcu/rcu.h similarity index 100% rename from kernel/rcu.h rename to kernel/rcu/rcu.h diff --git a/kernel/srcu.c b/kernel/rcu/srcu.c similarity index 100% rename from kernel/srcu.c rename to kernel/rcu/srcu.c diff --git a/kernel/rcutiny.c b/kernel/rcu/tiny.c similarity index 97% rename from kernel/rcutiny.c rename to kernel/rcu/tiny.c index 312e9709713f..0c9a934cfec1 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcu/tiny.c @@ -43,7 +43,7 @@ #include "rcu.h" -/* Forward declarations for rcutiny_plugin.h. */ +/* Forward declarations for tiny_plugin.h. */ struct rcu_ctrlblk; static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); static void rcu_process_callbacks(struct softirq_action *unused); @@ -53,7 +53,7 @@ static void __call_rcu(struct rcu_head *head, static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; -#include "rcutiny_plugin.h" +#include "tiny_plugin.h" /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. 
*/ static void rcu_idle_enter_common(long long newval) @@ -67,7 +67,7 @@ static void rcu_idle_enter_common(long long newval) RCU_TRACE(trace_rcu_dyntick(TPS("Start"), rcu_dynticks_nesting, newval)); if (!is_idle_task(current)) { - struct task_struct *idle = idle_task(smp_processor_id()); + struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"), rcu_dynticks_nesting, newval)); @@ -128,7 +128,7 @@ static void rcu_idle_exit_common(long long oldval) } RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting)); if (!is_idle_task(current)) { - struct task_struct *idle = idle_task(smp_processor_id()); + struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"), oldval, rcu_dynticks_nesting)); diff --git a/kernel/rcutiny_plugin.h b/kernel/rcu/tiny_plugin.h similarity index 100% rename from kernel/rcutiny_plugin.h rename to kernel/rcu/tiny_plugin.h diff --git a/kernel/rcutorture.c b/kernel/rcu/torture.c similarity index 99% rename from kernel/rcutorture.c rename to kernel/rcu/torture.c index be63101c6175..3929cd451511 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcu/torture.c @@ -52,6 +52,12 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney and Josh Triplett "); +MODULE_ALIAS("rcutorture"); +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "rcutorture." + static int fqs_duration; module_param(fqs_duration, int, 0444); MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); diff --git a/kernel/rcutree.c b/kernel/rcu/tree.c similarity index 99% rename from kernel/rcutree.c rename to kernel/rcu/tree.c index 240604aa3f70..8a2c81e86dda 100644 --- a/kernel/rcutree.c +++ b/kernel/rcu/tree.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -56,11 +57,17 @@ #include #include -#include "rcutree.h" +#include "tree.h" #include #include "rcu.h" +MODULE_ALIAS("rcutree"); +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "rcutree." + /* Data structures. */ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; @@ -3298,7 +3305,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, /* * Compute the rcu_node tree geometry from kernel parameters. This cannot - * replace the definitions in rcutree.h because those are needed to size + * replace the definitions in tree.h because those are needed to size * the ->node array in the rcu_state structure. 
*/ static void __init rcu_init_geometry(void) @@ -3393,4 +3400,4 @@ void __init rcu_init(void) rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); } -#include "rcutree_plugin.h" +#include "tree_plugin.h" diff --git a/kernel/rcutree.h b/kernel/rcu/tree.h similarity index 100% rename from kernel/rcutree.h rename to kernel/rcu/tree.h diff --git a/kernel/rcutree_plugin.h b/kernel/rcu/tree_plugin.h similarity index 99% rename from kernel/rcutree_plugin.h rename to kernel/rcu/tree_plugin.h index 8d85a5ce093a..3822ac0c4b27 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -28,7 +28,7 @@ #include #include #include -#include "time/tick-internal.h" +#include "../time/tick-internal.h" #define RCU_KTHREAD_PRIO 1 @@ -1133,7 +1133,7 @@ void exit_rcu(void) #ifdef CONFIG_RCU_BOOST -#include "rtmutex_common.h" +#include "../rtmutex_common.h" #ifdef CONFIG_RCU_TRACE diff --git a/kernel/rcutree_trace.c b/kernel/rcu/tree_trace.c similarity index 99% rename from kernel/rcutree_trace.c rename to kernel/rcu/tree_trace.c index cf6c17412932..3596797b7e46 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -44,7 +44,7 @@ #include #define RCU_TREE_NONCORE -#include "rcutree.h" +#include "tree.h" static int r_open(struct inode *inode, struct file *file, const struct seq_operations *op) diff --git a/kernel/rcupdate.c b/kernel/rcu/update.c similarity index 98% rename from kernel/rcupdate.c rename to kernel/rcu/update.c index c07af1c4e1bb..6cb3dff89e2b 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcu/update.c @@ -53,6 +53,12 @@ #include "rcu.h" +MODULE_ALIAS("rcupdate"); +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "rcupdate." + module_param(rcu_expedited, int, 0); #ifdef CONFIG_PREEMPT_RCU
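The module_param(rcu_expedited, int, 0) line above is what backs the rcupdate.rcu_expedited entry added to kernel-parameters.txt earlier in this patch: when the flag is set, synchronize_rcu() takes its expedited path. The following is a sketch of the updater-side pattern where that trade-off matters; the cfg structure and cfg_lock are hypothetical, not taken from this series.

	#include <linux/errno.h>
	#include <linux/mutex.h>
	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct cfg {
		int threshold;
	};

	static struct cfg __rcu *cur_cfg;	/* RCU-protected configuration */
	static DEFINE_MUTEX(cfg_lock);		/* serializes updaters */

	static int set_threshold(int threshold)
	{
		struct cfg *newc, *oldc;

		newc = kmalloc(sizeof(*newc), GFP_KERNEL);
		if (!newc)
			return -ENOMEM;
		newc->threshold = threshold;

		mutex_lock(&cfg_lock);
		oldc = rcu_dereference_protected(cur_cfg,
						 lockdep_is_held(&cfg_lock));
		rcu_assign_pointer(cur_cfg, newc);
		mutex_unlock(&cfg_lock);

		/*
		 * Readers may still hold references to oldc; wait for them.
		 * With rcupdate.rcu_expedited=1 this wait is short but costs
		 * CPU time and IPIs; otherwise it is a normal grace period.
		 */
		synchronize_rcu();
		kfree(oldc);
		return 0;
	}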