From 7717c6be699975f6733d278b13b7c4295d73caf6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 13 Jan 2016 15:48:54 -0500 Subject: tracing: Fix stacktrace skip depth in trace_buffer_unlock_commit_regs() While cleaning the stacktrace code I unintentially changed the skip depth of trace_buffer_unlock_commit_regs() from 0 to 6. kprobes uses this function, and with skipping 6 call backs, it can easily produce no stack. Here's how I tested it: # echo 'p:ext4_sync_fs ext4_sync_fs ' > /sys/kernel/debug/tracing/kprobe_events # echo 1 > /sys/kernel/debug/tracing/events/kprobes/enable # cat /sys/kernel/debug/trace sync-2394 [005] 502.457060: ext4_sync_fs: (ffffffff81317650) sync-2394 [005] 502.457063: kernel_stack: sync-2394 [005] 502.457086: ext4_sync_fs: (ffffffff81317650) sync-2394 [005] 502.457087: kernel_stack: sync-2394 [005] 502.457091: ext4_sync_fs: (ffffffff81317650) After putting back the skip stack to zero, we have: sync-2270 [000] 748.052693: ext4_sync_fs: (ffffffff81317650) sync-2270 [000] 748.052695: kernel_stack: => iterate_supers (ffffffff8126412e) => sys_sync (ffffffff8129c4b6) => entry_SYSCALL_64_fastpath (ffffffff8181f0b2) sync-2270 [000] 748.053017: ext4_sync_fs: (ffffffff81317650) sync-2270 [000] 748.053019: kernel_stack: => iterate_supers (ffffffff8126412e) => sys_sync (ffffffff8129c4b6) => entry_SYSCALL_64_fastpath (ffffffff8181f0b2) sync-2270 [000] 748.053381: ext4_sync_fs: (ffffffff81317650) sync-2270 [000] 748.053383: kernel_stack: => iterate_supers (ffffffff8126412e) => sys_sync (ffffffff8129c4b6) => entry_SYSCALL_64_fastpath (ffffffff8181f0b2) Cc: stable@vger.kernel.org # v4.4+ Fixes: 73dddbb57bb0 "tracing: Only create stacktrace option when STACKTRACE is configured" Reported-by: Brendan Gregg Tested-by: Brendan Gregg Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 87fb9801bd9e..d9293402ee68 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1751,7 +1751,7 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, { __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(tr, buffer, flags, 6, pc, regs); + ftrace_trace_stack(tr, buffer, flags, 0, pc, regs); ftrace_trace_userstack(buffer, flags, pc); } EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs); -- cgit From 570540d50710ed192e98e2f7f74578c9486b6b05 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Jan 2016 14:07:25 +0100 Subject: genirq: Validate action before dereferencing it in handle_irq_event_percpu() commit 71f64340fc0e changed the handling of irq_desc->action from CPU 0 CPU 1 free_irq() lock(desc) lock(desc) handle_edge_irq() if (desc->action) { handle_irq_event() action = desc->action unlock(desc) desc->action = NULL handle_irq_event_percpu(desc, action) action->xxx to CPU 0 CPU 1 free_irq() lock(desc) lock(desc) handle_edge_irq() if (desc->action) { handle_irq_event() unlock(desc) desc->action = NULL handle_irq_event_percpu(desc, action) action = desc->action action->xxx So if free_irq manages to set the action to NULL between the unlock and before the readout, we happily dereference a null pointer. We could simply revert 71f64340fc0e, but we want to preserve the better code generation. A simple solution is to change the action loop from a do {} while to a while {} loop. This is safe because we either see a valid desc->action or NULL. If the action is about to be removed it is still valid as free_irq() is blocked on synchronize_irq(). CPU 0 CPU 1 free_irq() lock(desc) lock(desc) handle_edge_irq() handle_irq_event(desc) set(INPROGRESS) unlock(desc) handle_irq_event_percpu(desc) action = desc->action desc->action = NULL while (action) { action->xxx ... action = action->next; sychronize_irq() while(INPROGRESS); lock(desc) clr(INPROGRESS) free(action) That's basically the same mechanism as we have for shared interrupts. action->next can become NULL while handle_irq_event_percpu() runs. Either it sees the action or NULL. It does not matter, because action itself cannot go away before the interrupt in progress flag has been cleared. Fixes: commit 71f64340fc0e "genirq: Remove the second parameter from handle_irq_event_percpu()" Reported-by: zyjzyj2000@gmail.com Signed-off-by: Thomas Gleixner Cc: Huang Shijie Cc: Jiang Liu Cc: Peter Zijlstra Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1601131224190.3575@nanos --- kernel/irq/handle.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a302cf9a2126..57bff7857e87 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -138,7 +138,8 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) unsigned int flags = 0, irq = desc->irq_data.irq; struct irqaction *action = desc->action; - do { + /* action might have become NULL since we dropped the lock */ + while (action) { irqreturn_t res; trace_irq_handler_entry(irq, action); @@ -173,7 +174,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) retval |= res; action = action->next; - } while (action); + } add_interrupt_randomness(irq, flags); -- cgit From 203cbf77de59fc8f13502dcfd11350c6d4a5c95f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 14 Jan 2016 16:54:46 +0000 Subject: hrtimer: Handle remaining time proper for TIME_LOW_RES If CONFIG_TIME_LOW_RES is enabled we add a jiffie to the relative timeout to prevent short sleeps, but we do not account for that in interfaces which retrieve the remaining time. Helge observed that timerfd can return a remaining time larger than the relative timeout. That's not expected and breaks userland test programs. Store the information that the timer was armed relative and provide functions to adjust the remaining time. To avoid bloating the hrtimer struct make state a u8, which as a bonus results in better code on x86 at least. Reported-and-tested-by: Helge Deller Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Cc: linux-m68k@lists.linux-m68k.org Cc: dhowells@redhat.com Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20160114164159.273328486@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 34 +++++++++++++++++++++++++++--- kernel/time/hrtimer.c | 55 ++++++++++++++++++++++++++++++++---------------- kernel/time/timer_list.c | 2 +- 3 files changed, 69 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 76dd4f0da5ca..2ead22dd74a0 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -87,7 +87,8 @@ enum hrtimer_restart { * @function: timer expiry callback function * @base: pointer to the timer base (per cpu and per clock) * @state: state information (See bit values above) - * @start_pid: timer statistics field to store the pid of the task which + * @is_rel: Set if the timer was armed relative + * @start_pid: timer statistics field to store the pid of the task which * started the timer * @start_site: timer statistics field to store the site where the timer * was started @@ -101,7 +102,8 @@ struct hrtimer { ktime_t _softexpires; enum hrtimer_restart (*function)(struct hrtimer *); struct hrtimer_clock_base *base; - unsigned long state; + u8 state; + u8 is_rel; #ifdef CONFIG_TIMER_STATS int start_pid; void *start_site; @@ -321,6 +323,27 @@ static inline void clock_was_set_delayed(void) { } #endif +static inline ktime_t +__hrtimer_expires_remaining_adjusted(const struct hrtimer *timer, ktime_t now) +{ + ktime_t rem = ktime_sub(timer->node.expires, now); + + /* + * Adjust relative timers for the extra we added in + * hrtimer_start_range_ns() to prevent short timeouts. + */ + if (IS_ENABLED(CONFIG_TIME_LOW_RES) && timer->is_rel) + rem.tv64 -= hrtimer_resolution; + return rem; +} + +static inline ktime_t +hrtimer_expires_remaining_adjusted(const struct hrtimer *timer) +{ + return __hrtimer_expires_remaining_adjusted(timer, + timer->base->get_time()); +} + extern void clock_was_set(void); #ifdef CONFIG_TIMERFD extern void timerfd_clock_was_set(void); @@ -390,7 +413,12 @@ static inline void hrtimer_restart(struct hrtimer *timer) } /* Query timers: */ -extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer); +extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust); + +static inline ktime_t hrtimer_get_remaining(const struct hrtimer *timer) +{ + return __hrtimer_get_remaining(timer, false); +} extern u64 hrtimer_get_next_event(void); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 435b8850dd80..fa909f9fd559 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -897,10 +897,10 @@ static int enqueue_hrtimer(struct hrtimer *timer, */ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, - unsigned long newstate, int reprogram) + u8 newstate, int reprogram) { struct hrtimer_cpu_base *cpu_base = base->cpu_base; - unsigned int state = timer->state; + u8 state = timer->state; timer->state = newstate; if (!(state & HRTIMER_STATE_ENQUEUED)) @@ -930,7 +930,7 @@ static inline int remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart) { if (hrtimer_is_queued(timer)) { - unsigned long state = timer->state; + u8 state = timer->state; int reprogram; /* @@ -954,6 +954,22 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest return 0; } +static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, + const enum hrtimer_mode mode) +{ +#ifdef CONFIG_TIME_LOW_RES + /* + * CONFIG_TIME_LOW_RES indicates that the system has no way to return + * granular time values. For relative timers we add hrtimer_resolution + * (i.e. one jiffie) to prevent short timeouts. + */ + timer->is_rel = mode & HRTIMER_MODE_REL; + if (timer->is_rel) + tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution)); +#endif + return tim; +} + /** * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU * @timer: the timer to be added @@ -974,19 +990,10 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, /* Remove an active timer from the queue: */ remove_hrtimer(timer, base, true); - if (mode & HRTIMER_MODE_REL) { + if (mode & HRTIMER_MODE_REL) tim = ktime_add_safe(tim, base->get_time()); - /* - * CONFIG_TIME_LOW_RES is a temporary way for architectures - * to signal that they simply return xtime in - * do_gettimeoffset(). In this case we want to round up by - * resolution when starting a relative timer, to avoid short - * timeouts. This will go away with the GTOD framework. - */ -#ifdef CONFIG_TIME_LOW_RES - tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution)); -#endif - } + + tim = hrtimer_update_lowres(timer, tim, mode); hrtimer_set_expires_range_ns(timer, tim, delta_ns); @@ -1074,19 +1081,23 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel); /** * hrtimer_get_remaining - get remaining time for the timer * @timer: the timer to read + * @adjust: adjust relative timers when CONFIG_TIME_LOW_RES=y */ -ktime_t hrtimer_get_remaining(const struct hrtimer *timer) +ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust) { unsigned long flags; ktime_t rem; lock_hrtimer_base(timer, &flags); - rem = hrtimer_expires_remaining(timer); + if (IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust) + rem = hrtimer_expires_remaining_adjusted(timer); + else + rem = hrtimer_expires_remaining(timer); unlock_hrtimer_base(timer, &flags); return rem; } -EXPORT_SYMBOL_GPL(hrtimer_get_remaining); +EXPORT_SYMBOL_GPL(__hrtimer_get_remaining); #ifdef CONFIG_NO_HZ_COMMON /** @@ -1219,6 +1230,14 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, timer_stats_account_hrtimer(timer); fn = timer->function; + /* + * Clear the 'is relative' flag for the TIME_LOW_RES case. If the + * timer is restarted with a period then it becomes an absolute + * timer. If its not restarted it does not matter. + */ + if (IS_ENABLED(CONFIG_TIME_LOW_RES)) + timer->is_rel = false; + /* * Because we run timers from hardirq context, there is no chance * they get migrated to another cpu, therefore its safe to unlock diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index f75e35b60149..ba7d8b288bb3 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -69,7 +69,7 @@ print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, print_name_offset(m, taddr); SEQ_printf(m, ", "); print_name_offset(m, timer->function); - SEQ_printf(m, ", S:%02lx", timer->state); + SEQ_printf(m, ", S:%02x", timer->state); #ifdef CONFIG_TIMER_STATS SEQ_printf(m, ", "); print_name_offset(m, timer->start_site); -- cgit From 572c39172684c3711e4a03c9a7380067e2b0661c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 14 Jan 2016 16:54:47 +0000 Subject: posix-timers: Handle relative timers with CONFIG_TIME_LOW_RES proper As Helge reported for timerfd we have the same issue in posix timers. We return remaining time larger than the programmed relative time to user space in case of CONFIG_TIME_LOW_RES=y. Use the proper function to adjust the extra time added in hrtimer_start_range_ns(). Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Helge Deller Cc: John Stultz Cc: linux-m68k@lists.linux-m68k.org Cc: dhowells@redhat.com Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20160114164159.450510905@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/posix-timers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 31d11ac9fa47..f2826c35e918 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -760,7 +760,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv); - remaining = ktime_sub(hrtimer_get_expires(timer), now); + remaining = __hrtimer_expires_remaining_adjusted(timer, now); /* Return 0 only, when the timer is expired and not pending */ if (remaining.tv64 <= 0) { /* -- cgit From 51cbb5242a41700a3f250ecfb48dcfb7e4375ea4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 14 Jan 2016 16:54:48 +0000 Subject: itimers: Handle relative timers with CONFIG_TIME_LOW_RES proper As Helge reported for timerfd we have the same issue in itimers. We return remaining time larger than the programmed relative time to user space in case of CONFIG_TIME_LOW_RES=y. Use the proper function to adjust the extra time added in hrtimer_start_range_ns(). Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Helge Deller Cc: John Stultz Cc: linux-m68k@lists.linux-m68k.org Cc: dhowells@redhat.com Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20160114164159.528222587@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/itimer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 8d262b467573..1d5c7204ddc9 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -26,7 +26,7 @@ */ static struct timeval itimer_get_remtime(struct hrtimer *timer) { - ktime_t rem = hrtimer_get_remaining(timer); + ktime_t rem = __hrtimer_get_remaining(timer, true); /* * Racy but safe: if the itimer expires after the above -- cgit From 9c03ee147193645be4c186d3688232fa438c57c7 Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Sat, 16 Jan 2016 00:31:23 +0530 Subject: sched: Fix crash in sched_init_numa() The following PowerPC commit: c118baf80256 ("arch/powerpc/mm/numa.c: do not allocate bootmem memory for non existing nodes") avoids allocating bootmem memory for non existent nodes. But when DEBUG_PER_CPU_MAPS=y is enabled, my powerNV system failed to boot because in sched_init_numa(), cpumask_or() operation was done on unallocated nodes. Fix that by making cpumask_or() operation only on existing nodes. [ Tested with and w/o DEBUG_PER_CPU_MAPS=y on x86 and PowerPC. ] Reported-by: Jan Stancek Tested-by: Jan Stancek Signed-off-by: Raghavendra K T Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Link: http://lkml.kernel.org/r/1452884483-11676-1-git-send-email-raghavendra.kt@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 44253adb3c36..474658b51fe6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6840,7 +6840,7 @@ static void sched_init_numa(void) sched_domains_numa_masks[i][j] = mask; - for (k = 0; k < nr_node_ids; k++) { + for_each_node(k) { if (node_distance(j, k) > sched_domains_numa_distance[i]) continue; -- cgit From c994d6136738fd8b24a79f5ad8df40a6a79e2cf7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Jan 2016 09:20:23 +0100 Subject: perf: Add lockdep assertions Make various bugs easier to see. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index bf8244190d0f..c77b05d9a37d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1246,6 +1246,8 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) static void list_add_event(struct perf_event *event, struct perf_event_context *ctx) { + lockdep_assert_held(&ctx->lock); + WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); event->attach_state |= PERF_ATTACH_CONTEXT; @@ -2342,8 +2344,10 @@ static void ctx_sched_out(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type) { - struct perf_event *event; int is_active = ctx->is_active; + struct perf_event *event; + + lockdep_assert_held(&ctx->lock); ctx->is_active &= ~event_type; if (likely(!ctx->nr_events)) @@ -2725,8 +2729,10 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type, struct task_struct *task) { - u64 now; int is_active = ctx->is_active; + u64 now; + + lockdep_assert_held(&ctx->lock); ctx->is_active |= event_type; if (likely(!ctx->nr_events)) -- cgit From 7e41d17753e6e0da55d343997454dd4fbe8d28a8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Jan 2016 09:21:40 +0100 Subject: perf: Fix cgroup event scheduling There appears to be a problem in __perf_event_task_sched_in() wrt cgroup event scheduling. The normal event scheduling order is: CPU pinned Task pinned CPU flexible Task flexible And since perf_cgroup_sched*() only schedules the cpu context, we must call this _before_ adding the task events. Note: double check what happens on the ctx switch optimization where the task ctx isn't scheduled. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index c77b05d9a37d..9d1195af819c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2806,6 +2806,16 @@ void __perf_event_task_sched_in(struct task_struct *prev, struct perf_event_context *ctx; int ctxn; + /* + * If cgroup events exist on this CPU, then we need to check if we have + * to switch in PMU state; cgroup event are system-wide mode only. + * + * Since cgroup events are CPU events, we must schedule these in before + * we schedule in the task events. + */ + if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) + perf_cgroup_sched_in(prev, task); + for_each_task_context_nr(ctxn) { ctx = task->perf_event_ctxp[ctxn]; if (likely(!ctx)) @@ -2813,13 +2823,6 @@ void __perf_event_task_sched_in(struct task_struct *prev, perf_event_context_sched_in(ctx, task); } - /* - * if cgroup events exist on this CPU, then we need - * to check if we have to switch in PMU state. - * cgroup event are system-wide mode only - */ - if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) - perf_cgroup_sched_in(prev, task); if (atomic_read(&nr_switch_events)) perf_event_switch(task, prev, true); -- cgit From 70a0165752944e0be0b1de4a9020473079962c18 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Jan 2016 09:29:16 +0100 Subject: perf: Fix cgroup scheduling in perf_enable_on_exec() There is a comment that states that perf_event_context_sched_in() will also switch in the cgroup events, I cannot find it does so. Therefore all the resulting logic goes out the window too. Clean that up. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 31 +++++++------------------------ 1 file changed, 7 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 9d1195af819c..e7bda0ed8d40 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -579,13 +579,7 @@ static inline void perf_cgroup_sched_out(struct task_struct *task, * we are holding the rcu lock */ cgrp1 = perf_cgroup_from_task(task, NULL); - - /* - * next is NULL when called from perf_event_enable_on_exec() - * that will systematically cause a cgroup_switch() - */ - if (next) - cgrp2 = perf_cgroup_from_task(next, NULL); + cgrp2 = perf_cgroup_from_task(next, NULL); /* * only schedule out current cgroup events if we know @@ -611,8 +605,6 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev, * we are holding the rcu lock */ cgrp1 = perf_cgroup_from_task(task, NULL); - - /* prev can never be NULL */ cgrp2 = perf_cgroup_from_task(prev, NULL); /* @@ -1450,11 +1442,14 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) if (is_cgroup_event(event)) { ctx->nr_cgroups--; + /* + * Because cgroup events are always per-cpu events, this will + * always be called from the right CPU. + */ cpuctx = __get_cpu_context(ctx); /* - * if there are no more cgroup events - * then cler cgrp to avoid stale pointer - * in update_cgrp_time_from_cpuctx() + * If there are no more cgroup events then clear cgrp to avoid + * stale pointer in update_cgrp_time_from_cpuctx(). */ if (!ctx->nr_cgroups) cpuctx->cgrp = NULL; @@ -3118,15 +3113,6 @@ static void perf_event_enable_on_exec(int ctxn) if (!ctx || !ctx->nr_events) goto out; - /* - * We must ctxsw out cgroup events to avoid conflict - * when invoking perf_task_event_sched_in() later on - * in this function. Otherwise we end up trying to - * ctxswin cgroup events which are already scheduled - * in. - */ - perf_cgroup_sched_out(current, NULL); - raw_spin_lock(&ctx->lock); task_ctx_sched_out(ctx); @@ -3144,9 +3130,6 @@ static void perf_event_enable_on_exec(int ctxn) raw_spin_unlock(&ctx->lock); - /* - * Also calls ctxswin for cgroup events, if any: - */ perf_event_context_sched_in(ctx, ctx->task); out: local_irq_restore(flags); -- cgit From 5947f6576e2edee1189b00bf5b2a3f2c653caa6b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Jan 2016 09:43:38 +0100 Subject: perf: Remove stale comment The comment here is horribly out of date, remove it. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index e7bda0ed8d40..751538ce19a7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2130,13 +2130,6 @@ static int __perf_install_in_context(void *info) /* * Attach a performance event to a context - * - * First we add the event to the list with the hardware enable bit - * in event->hw_config cleared. - * - * If the event is attached to a task which is on a CPU we use a smp - * call to enable it in the task context. The task might have been - * scheduled away, but we check this in the smp call again. */ static void perf_install_in_context(struct perf_event_context *ctx, -- cgit From 3e349507d12de93b08b0aa814fc2aa0dee91c5ba Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Jan 2016 10:01:18 +0100 Subject: perf: Fix perf_enable_on_exec() event scheduling There are two problems with the current perf_enable_on_exec() event scheduling: - the newly enabled events will be immediately scheduled irrespective of their ctx event list order. - there's a hole in the ctx->lock between scheduling the events out and putting them back on. Esp. the latter issue is a real problem because a hole in event scheduling leaves the thing in an observable inconsistent state, confusing things. Fix both issues by first doing the enable iteration and at the end, when there are newly enabled events, reschedule the ctx in one go. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 47 +++++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 751538ce19a7..0679e73f5f63 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2036,7 +2036,8 @@ static void add_event_to_ctx(struct perf_event *event, event->tstamp_stopped = tstamp; } -static void task_ctx_sched_out(struct perf_event_context *ctx); +static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx); static void ctx_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, @@ -2067,6 +2068,17 @@ static void ___perf_install_in_context(void *info) add_event_to_ctx(event, ctx); } +static void ctx_resched(struct perf_cpu_context *cpuctx, + struct perf_event_context *task_ctx) +{ + perf_pmu_disable(cpuctx->ctx.pmu); + if (task_ctx) + task_ctx_sched_out(cpuctx, task_ctx); + cpu_ctx_sched_out(cpuctx, EVENT_ALL); + perf_event_sched_in(cpuctx, task_ctx, current); + perf_pmu_enable(cpuctx->ctx.pmu); +} + /* * Cross CPU call to install and enable a performance event * @@ -2087,7 +2099,7 @@ static int __perf_install_in_context(void *info) * If there was an active task_ctx schedule it out. */ if (task_ctx) - task_ctx_sched_out(task_ctx); + task_ctx_sched_out(cpuctx, task_ctx); /* * If the context we're installing events in is not the @@ -2629,10 +2641,9 @@ void __perf_event_task_sched_out(struct task_struct *task, perf_cgroup_sched_out(task, next); } -static void task_ctx_sched_out(struct perf_event_context *ctx) +static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) { - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - if (!cpuctx->task_ctx) return; @@ -3096,34 +3107,30 @@ static int event_enable_on_exec(struct perf_event *event, static void perf_event_enable_on_exec(int ctxn) { struct perf_event_context *ctx, *clone_ctx = NULL; + struct perf_cpu_context *cpuctx; struct perf_event *event; unsigned long flags; int enabled = 0; - int ret; local_irq_save(flags); ctx = current->perf_event_ctxp[ctxn]; if (!ctx || !ctx->nr_events) goto out; - raw_spin_lock(&ctx->lock); - task_ctx_sched_out(ctx); - - list_for_each_entry(event, &ctx->event_list, event_entry) { - ret = event_enable_on_exec(event, ctx); - if (ret) - enabled = 1; - } + cpuctx = __get_cpu_context(ctx); + perf_ctx_lock(cpuctx, ctx); + list_for_each_entry(event, &ctx->event_list, event_entry) + enabled |= event_enable_on_exec(event, ctx); /* - * Unclone this context if we enabled any event. + * Unclone and reschedule this context if we enabled any event. */ - if (enabled) + if (enabled) { clone_ctx = unclone_ctx(ctx); + ctx_resched(cpuctx, ctx); + } + perf_ctx_unlock(cpuctx, ctx); - raw_spin_unlock(&ctx->lock); - - perf_event_context_sched_in(ctx, ctx->task); out: local_irq_restore(flags); @@ -8737,7 +8744,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * incremented the context's refcount before we do put_ctx below. */ raw_spin_lock(&child_ctx->lock); - task_ctx_sched_out(child_ctx); + task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx); child->perf_event_ctxp[ctxn] = NULL; /* -- cgit From 8833d0e286c12fd4456089a7a553faf4921e4b08 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Jan 2016 10:02:37 +0100 Subject: perf: Use task_ctx_sched_out() We have a function that does exactly what we want here, use it. This reduces the amount of cpuctx->task_ctx muckery. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 0679e73f5f63..12f1d4a52da9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2545,8 +2545,7 @@ unlock: if (do_switch) { raw_spin_lock(&ctx->lock); - ctx_sched_out(ctx, cpuctx, EVENT_ALL); - cpuctx->task_ctx = NULL; + task_ctx_sched_out(cpuctx, ctx); raw_spin_unlock(&ctx->lock); } } -- cgit From aee7dbc45f8aa976913de9b352fa6da816f1f3cd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Jan 2016 10:45:11 +0100 Subject: perf: Simplify/fix perf_event_enable() event scheduling Like perf_enable_on_exec(), perf_event_enable() event scheduling has problems respecting the context hierarchy when trying to schedule events (for example, it will try and add a pinned event without first removing existing flexible events). So simplify it by using the new ctx_resched() call which will DTRT. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 31 +++++-------------------------- 1 file changed, 5 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 12f1d4a52da9..079eb9fcaaa8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2188,7 +2188,7 @@ static int __perf_event_enable(void *info) struct perf_event_context *ctx = event->ctx; struct perf_event *leader = event->group_leader; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - int err; + struct perf_event_context *task_ctx = cpuctx->task_ctx; /* * There's a time window between 'ctx->is_active' check @@ -2202,7 +2202,8 @@ static int __perf_event_enable(void *info) if (!ctx->is_active) return -EINVAL; - raw_spin_lock(&ctx->lock); + perf_ctx_lock(cpuctx, task_ctx); + WARN_ON_ONCE(&cpuctx->ctx != ctx && task_ctx != ctx); update_context_time(ctx); if (event->state >= PERF_EVENT_STATE_INACTIVE) @@ -2228,32 +2229,10 @@ static int __perf_event_enable(void *info) if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) goto unlock; - if (!group_can_go_on(event, cpuctx, 1)) { - err = -EEXIST; - } else { - if (event == leader) - err = group_sched_in(event, cpuctx, ctx); - else - err = event_sched_in(event, cpuctx, ctx); - } - - if (err) { - /* - * If this event can't go on and it's part of a - * group, then the whole group has to come off. - */ - if (leader != event) { - group_sched_out(leader, cpuctx, ctx); - perf_mux_hrtimer_restart(cpuctx); - } - if (leader->attr.pinned) { - update_group_times(leader); - leader->state = PERF_EVENT_STATE_ERROR; - } - } + ctx_resched(cpuctx, task_ctx); unlock: - raw_spin_unlock(&ctx->lock); + perf_ctx_unlock(cpuctx, task_ctx); return 0; } -- cgit From 25432ae96a9889774a05bf5f0f6fd8dbcdec5e72 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Jan 2016 11:05:09 +0100 Subject: perf: Optimize perf_sched_events() usage It doesn't make sense to take up-to _4_ references on perf_sched_events() per event, avoid doing this. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 079eb9fcaaa8..935aefd16354 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3491,11 +3491,13 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu) static void unaccount_event(struct perf_event *event) { + bool dec = false; + if (event->parent) return; if (event->attach_state & PERF_ATTACH_TASK) - static_key_slow_dec_deferred(&perf_sched_events); + dec = true; if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.comm) @@ -3505,12 +3507,15 @@ static void unaccount_event(struct perf_event *event) if (event->attr.freq) atomic_dec(&nr_freq_events); if (event->attr.context_switch) { - static_key_slow_dec_deferred(&perf_sched_events); + dec = true; atomic_dec(&nr_switch_events); } if (is_cgroup_event(event)) - static_key_slow_dec_deferred(&perf_sched_events); + dec = true; if (has_branch_stack(event)) + dec = true; + + if (dec) static_key_slow_dec_deferred(&perf_sched_events); unaccount_event_cpu(event, event->cpu); @@ -7723,11 +7728,13 @@ static void account_event_cpu(struct perf_event *event, int cpu) static void account_event(struct perf_event *event) { + bool inc = false; + if (event->parent) return; if (event->attach_state & PERF_ATTACH_TASK) - static_key_slow_inc(&perf_sched_events.key); + inc = true; if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.comm) @@ -7740,11 +7747,14 @@ static void account_event(struct perf_event *event) } if (event->attr.context_switch) { atomic_inc(&nr_switch_events); - static_key_slow_inc(&perf_sched_events.key); + inc = true; } if (has_branch_stack(event)) - static_key_slow_inc(&perf_sched_events.key); + inc = true; if (is_cgroup_event(event)) + inc = true; + + if (inc) static_key_slow_inc(&perf_sched_events.key); account_event_cpu(event, event->cpu); -- cgit From 63e30d3e52d4d85854ce6c761ffc6ab55209a630 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 Jan 2016 11:39:10 +0100 Subject: perf: Make ctx->is_active and cpuctx->task_ctx consistent For no apparent reason and to great confusion the rules for ctx->is_active and cpuctx->task_ctx are different. This means that its not always possible to find all active (task) contexts. Fix this such that if ctx->is_active gets set, we also set (or verify) cpuctx->task_ctx. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 935aefd16354..89b47050a2e8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2329,6 +2329,12 @@ static void ctx_sched_out(struct perf_event_context *ctx, lockdep_assert_held(&ctx->lock); ctx->is_active &= ~event_type; + if (ctx->task) { + WARN_ON_ONCE(cpuctx->task_ctx != ctx); + if (!ctx->is_active) + cpuctx->task_ctx = NULL; + } + if (likely(!ctx->nr_events)) return; @@ -2629,7 +2635,6 @@ static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, return; ctx_sched_out(ctx, cpuctx, EVENT_ALL); - cpuctx->task_ctx = NULL; } /* @@ -2712,6 +2717,13 @@ ctx_sched_in(struct perf_event_context *ctx, lockdep_assert_held(&ctx->lock); ctx->is_active |= event_type; + if (ctx->task) { + if (!is_active) + cpuctx->task_ctx = ctx; + else + WARN_ON_ONCE(cpuctx->task_ctx != ctx); + } + if (likely(!ctx->nr_events)) return; @@ -2756,12 +2768,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, * cpu flexible, task flexible. */ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - - if (ctx->nr_events) - cpuctx->task_ctx = ctx; - - perf_event_sched_in(cpuctx, cpuctx->task_ctx, task); - + perf_event_sched_in(cpuctx, ctx, task); perf_pmu_enable(ctx->pmu); perf_ctx_unlock(cpuctx, ctx); } -- cgit From 39a4364076921511e212bc42f94fbf062c989576 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 11 Jan 2016 12:46:35 +0100 Subject: perf: Fix task context scheduling There is a very nasty problem wrt disabling the perf task scheduling hooks. Currently we {set,clear} ctx->is_active on every __perf_event_task_sched_{in,out}, _however_ this means that if we disable these calls we'll have task contexts with ->is_active set that are not active and 'active' task contexts without ->is_active set. This can result in event_function_call() looping on the ctx->is_active condition basically indefinitely. Resolve this by changing things such that contexts without events do not set ->is_active like we used to. From this invariant it trivially follows that if there are no (task) events, every task ctx is inactive and disabling the context switch hooks is harmless. This leaves two places that need attention (and already had accumulated weird and wonderful hacks to work around, without recognising this actual problem). Namely: - perf_install_in_context() will need to deal with installing events in an inactive context, meaning it cannot rely on ctx-is_active for its IPIs. - perf_remove_from_context() will have to mark a context as inactive when it removes the last event. For specific detail, see the patch/comments. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Dmitry Vyukov Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 155 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 91 insertions(+), 64 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 89b47050a2e8..c27e04655d86 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -126,6 +126,38 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info) return data.ret; } +/* + * On task ctx scheduling... + * + * When !ctx->nr_events a task context will not be scheduled. This means + * we can disable the scheduler hooks (for performance) without leaving + * pending task ctx state. + * + * This however results in two special cases: + * + * - removing the last event from a task ctx; this is relatively straight + * forward and is done in __perf_remove_from_context. + * + * - adding the first event to a task ctx; this is tricky because we cannot + * rely on ctx->is_active and therefore cannot use event_function_call(). + * See perf_install_in_context(). + * + * This is because we need a ctx->lock serialized variable (ctx->is_active) + * to reliably determine if a particular task/context is scheduled in. The + * task_curr() use in task_function_call() is racy in that a remote context + * switch is not a single atomic operation. + * + * As is, the situation is 'safe' because we set rq->curr before we do the + * actual context switch. This means that task_curr() will fail early, but + * we'll continue spinning on ctx->is_active until we've passed + * perf_event_task_sched_out(). + * + * Without this ctx->lock serialized variable we could have race where we find + * the task (and hence the context) would not be active while in fact they are. + * + * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set. + */ + static void event_function_call(struct perf_event *event, int (*active)(void *), void (*inactive)(void *), @@ -1686,9 +1718,13 @@ static int __perf_remove_from_context(void *info) if (re->detach_group) perf_group_detach(event); list_del_event(event, ctx); - if (!ctx->nr_events && cpuctx->task_ctx == ctx) { + + if (!ctx->nr_events && ctx->is_active) { ctx->is_active = 0; - cpuctx->task_ctx = NULL; + if (ctx->task) { + WARN_ON_ONCE(cpuctx->task_ctx != ctx); + cpuctx->task_ctx = NULL; + } } raw_spin_unlock(&ctx->lock); @@ -2056,18 +2092,6 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx, ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); } -static void ___perf_install_in_context(void *info) -{ - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; - - /* - * Since the task isn't running, its safe to add the event, us holding - * the ctx->lock ensures the task won't get scheduled in. - */ - add_event_to_ctx(event, ctx); -} - static void ctx_resched(struct perf_cpu_context *cpuctx, struct perf_event_context *task_ctx) { @@ -2086,55 +2110,27 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, */ static int __perf_install_in_context(void *info) { - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; + struct perf_event_context *ctx = info; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); struct perf_event_context *task_ctx = cpuctx->task_ctx; - struct task_struct *task = current; - - perf_ctx_lock(cpuctx, task_ctx); - perf_pmu_disable(cpuctx->ctx.pmu); - /* - * If there was an active task_ctx schedule it out. - */ - if (task_ctx) - task_ctx_sched_out(cpuctx, task_ctx); + if (ctx->task) { + /* + * If we hit the 'wrong' task, we've since scheduled and + * everything should be sorted, nothing to do! + */ + if (ctx->task != current) + return 0; - /* - * If the context we're installing events in is not the - * active task_ctx, flip them. - */ - if (ctx->task && task_ctx != ctx) { - if (task_ctx) - raw_spin_unlock(&task_ctx->lock); - raw_spin_lock(&ctx->lock); + /* + * If task_ctx is set, it had better be to us. + */ + WARN_ON_ONCE(cpuctx->task_ctx != ctx && cpuctx->task_ctx); task_ctx = ctx; } - if (task_ctx) { - cpuctx->task_ctx = task_ctx; - task = task_ctx->task; - } - - cpu_ctx_sched_out(cpuctx, EVENT_ALL); - - update_context_time(ctx); - /* - * update cgrp time only if current cgrp - * matches event->cgrp. Must be done before - * calling add_event_to_ctx() - */ - update_cgrp_time_from_event(event); - - add_event_to_ctx(event, ctx); - - /* - * Schedule everything back in - */ - perf_event_sched_in(cpuctx, task_ctx, task); - - perf_pmu_enable(cpuctx->ctx.pmu); + perf_ctx_lock(cpuctx, task_ctx); + ctx_resched(cpuctx, task_ctx); perf_ctx_unlock(cpuctx, task_ctx); return 0; @@ -2148,14 +2144,38 @@ perf_install_in_context(struct perf_event_context *ctx, struct perf_event *event, int cpu) { + struct task_struct *task = NULL; + lockdep_assert_held(&ctx->mutex); event->ctx = ctx; if (event->cpu != -1) event->cpu = cpu; - event_function_call(event, __perf_install_in_context, - ___perf_install_in_context, event); + /* + * Installing events is tricky because we cannot rely on ctx->is_active + * to be set in case this is the nr_events 0 -> 1 transition. + * + * So what we do is we add the event to the list here, which will allow + * a future context switch to DTRT and then send a racy IPI. If the IPI + * fails to hit the right task, this means a context switch must have + * happened and that will have taken care of business. + */ + raw_spin_lock_irq(&ctx->lock); + update_context_time(ctx); + /* + * Update cgrp time only if current cgrp matches event->cgrp. + * Must be done before calling add_event_to_ctx(). + */ + update_cgrp_time_from_event(event); + add_event_to_ctx(event, ctx); + task = ctx->task; + raw_spin_unlock_irq(&ctx->lock); + + if (task) + task_function_call(task, __perf_install_in_context, ctx); + else + cpu_function_call(cpu, __perf_install_in_context, ctx); } /* @@ -2328,6 +2348,16 @@ static void ctx_sched_out(struct perf_event_context *ctx, lockdep_assert_held(&ctx->lock); + if (likely(!ctx->nr_events)) { + /* + * See __perf_remove_from_context(). + */ + WARN_ON_ONCE(ctx->is_active); + if (ctx->task) + WARN_ON_ONCE(cpuctx->task_ctx); + return; + } + ctx->is_active &= ~event_type; if (ctx->task) { WARN_ON_ONCE(cpuctx->task_ctx != ctx); @@ -2335,9 +2365,6 @@ static void ctx_sched_out(struct perf_event_context *ctx, cpuctx->task_ctx = NULL; } - if (likely(!ctx->nr_events)) - return; - update_context_time(ctx); update_cgrp_time_from_cpuctx(cpuctx); if (!ctx->nr_active) @@ -2716,6 +2743,9 @@ ctx_sched_in(struct perf_event_context *ctx, lockdep_assert_held(&ctx->lock); + if (likely(!ctx->nr_events)) + return; + ctx->is_active |= event_type; if (ctx->task) { if (!is_active) @@ -2724,9 +2754,6 @@ ctx_sched_in(struct perf_event_context *ctx, WARN_ON_ONCE(cpuctx->task_ctx != ctx); } - if (likely(!ctx->nr_events)) - return; - now = perf_clock(); ctx->timestamp = now; perf_cgroup_set_timestamp(task, ctx); -- cgit From 32132a3d0d5d6f127388be3e3fd7759f798c2eb4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 11 Jan 2016 15:40:59 +0100 Subject: perf: Specialize perf_event_exit_task() The perf_remove_from_context() usage in __perf_event_exit_task() is different from the other usages in that this site has already detached and scheduled out the task context. This will stand in the way of stronger assertions checking the (task) context scheduling invariants. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index c27e04655d86..66c9ad4f8707 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8726,7 +8726,13 @@ __perf_event_exit_task(struct perf_event *child_event, * Do destroy all inherited groups, we don't care about those * and being thorough is better. */ - perf_remove_from_context(child_event, !!child_event->parent); + raw_spin_lock_irq(&child_ctx->lock); + WARN_ON_ONCE(child_ctx->is_active); + + if (!!child_event->parent) + perf_group_detach(child_event); + list_del_event(child_event, child_ctx); + raw_spin_unlock_irq(&child_ctx->lock); /* * It can happen that the parent exits first, and has events @@ -8746,17 +8752,15 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) { struct perf_event *child_event, *next; struct perf_event_context *child_ctx, *clone_ctx = NULL; - unsigned long flags; if (likely(!child->perf_event_ctxp[ctxn])) return; - local_irq_save(flags); + local_irq_disable(); + WARN_ON_ONCE(child != current); /* * We can't reschedule here because interrupts are disabled, - * and either child is current or it is a task that can't be - * scheduled, so we are now safe from rescheduling changing - * our context. + * and child must be current. */ child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); @@ -8776,7 +8780,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) */ clone_ctx = unclone_ctx(child_ctx); update_context_time(child_ctx); - raw_spin_unlock_irqrestore(&child_ctx->lock, flags); + raw_spin_unlock_irq(&child_ctx->lock); if (clone_ctx) put_ctx(clone_ctx); -- cgit From fae3fde65138b6071b1b0e0b567d4058a8b6a88c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 11 Jan 2016 15:00:50 +0100 Subject: perf: Collapse and fix event_function_call() users There is one common bug left in all the event_function_call() users, between loading ctx->task and getting to the remote_function(), ctx->task can already have been changed. Therefore we need to double check and retry if ctx->task != current. Insert another trampoline specific to event_function_call() that checks for this and further validates state. This also allows getting rid of the active/inactive functions. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 2 +- kernel/events/core.c | 365 +++++++++++++++++++----------------------- kernel/events/hw_breakpoint.c | 2 +- 3 files changed, 168 insertions(+), 201 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f9828a48f16a..6612732d8fd0 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1044,7 +1044,7 @@ extern void perf_swevent_put_recursion_context(int rctx); extern u64 perf_swevent_set_period(struct perf_event *event); extern void perf_event_enable(struct perf_event *event); extern void perf_event_disable(struct perf_event *event); -extern int __perf_event_disable(void *info); +extern void perf_event_disable_local(struct perf_event *event); extern void perf_event_task_tick(void); #else /* !CONFIG_PERF_EVENTS: */ static inline void * diff --git a/kernel/events/core.c b/kernel/events/core.c index 66c9ad4f8707..6620432491f6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -126,6 +126,28 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info) return data.ret; } +static inline struct perf_cpu_context * +__get_cpu_context(struct perf_event_context *ctx) +{ + return this_cpu_ptr(ctx->pmu->pmu_cpu_context); +} + +static void perf_ctx_lock(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + raw_spin_lock(&cpuctx->ctx.lock); + if (ctx) + raw_spin_lock(&ctx->lock); +} + +static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + if (ctx) + raw_spin_unlock(&ctx->lock); + raw_spin_unlock(&cpuctx->ctx.lock); +} + /* * On task ctx scheduling... * @@ -158,21 +180,96 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info) * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set. */ -static void event_function_call(struct perf_event *event, - int (*active)(void *), - void (*inactive)(void *), - void *data) +typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *, + struct perf_event_context *, void *); + +struct event_function_struct { + struct perf_event *event; + event_f func; + void *data; +}; + +static int event_function(void *info) +{ + struct event_function_struct *efs = info; + struct perf_event *event = efs->event; + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_event_context *task_ctx = cpuctx->task_ctx; + + WARN_ON_ONCE(!irqs_disabled()); + + /* + * Since we do the IPI call without holding ctx->lock things can have + * changed, double check we hit the task we set out to hit. + * + * If ctx->task == current, we know things must remain valid because + * we have IRQs disabled so we cannot schedule. + */ + if (ctx->task) { + if (ctx->task != current) + return -EAGAIN; + + WARN_ON_ONCE(task_ctx != ctx); + } else { + WARN_ON_ONCE(&cpuctx->ctx != ctx); + } + + perf_ctx_lock(cpuctx, task_ctx); + /* + * Now that we hold locks, double check state. Paranoia pays. + */ + if (task_ctx) { + WARN_ON_ONCE(task_ctx->task != current); + /* + * We only use event_function_call() on established contexts, + * and event_function() is only ever called when active (or + * rather, we'll have bailed in task_function_call() or the + * above ctx->task != current test), therefore we must have + * ctx->is_active here. + */ + WARN_ON_ONCE(!ctx->is_active); + /* + * And since we have ctx->is_active, cpuctx->task_ctx must + * match. + */ + WARN_ON_ONCE(cpuctx->task_ctx != task_ctx); + } + efs->func(event, cpuctx, ctx, efs->data); + perf_ctx_unlock(cpuctx, task_ctx); + + return 0; +} + +static void event_function_local(struct perf_event *event, event_f func, void *data) +{ + struct event_function_struct efs = { + .event = event, + .func = func, + .data = data, + }; + + int ret = event_function(&efs); + WARN_ON_ONCE(ret); +} + +static void event_function_call(struct perf_event *event, event_f func, void *data) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; + struct event_function_struct efs = { + .event = event, + .func = func, + .data = data, + }; if (!task) { - cpu_function_call(event->cpu, active, data); + cpu_function_call(event->cpu, event_function, &efs); return; } again: - if (!task_function_call(task, active, data)) + if (!task_function_call(task, event_function, &efs)) return; raw_spin_lock_irq(&ctx->lock); @@ -185,7 +282,7 @@ again: raw_spin_unlock_irq(&ctx->lock); goto again; } - inactive(data); + func(event, NULL, ctx, data); raw_spin_unlock_irq(&ctx->lock); } @@ -400,28 +497,6 @@ static inline u64 perf_event_clock(struct perf_event *event) return event->clock(); } -static inline struct perf_cpu_context * -__get_cpu_context(struct perf_event_context *ctx) -{ - return this_cpu_ptr(ctx->pmu->pmu_cpu_context); -} - -static void perf_ctx_lock(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - raw_spin_lock(&cpuctx->ctx.lock); - if (ctx) - raw_spin_lock(&ctx->lock); -} - -static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - if (ctx) - raw_spin_unlock(&ctx->lock); - raw_spin_unlock(&cpuctx->ctx.lock); -} - #ifdef CONFIG_CGROUP_PERF static inline bool @@ -1684,38 +1759,22 @@ group_sched_out(struct perf_event *group_event, cpuctx->exclusive = 0; } -struct remove_event { - struct perf_event *event; - bool detach_group; -}; - -static void ___perf_remove_from_context(void *info) -{ - struct remove_event *re = info; - struct perf_event *event = re->event; - struct perf_event_context *ctx = event->ctx; - - if (re->detach_group) - perf_group_detach(event); - list_del_event(event, ctx); -} - /* * Cross CPU call to remove a performance event * * We disable the event on the hardware level first. After that we * remove it from the context list. */ -static int __perf_remove_from_context(void *info) +static void +__perf_remove_from_context(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + void *info) { - struct remove_event *re = info; - struct perf_event *event = re->event; - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + bool detach_group = (unsigned long)info; - raw_spin_lock(&ctx->lock); event_sched_out(event, cpuctx, ctx); - if (re->detach_group) + if (detach_group) perf_group_detach(event); list_del_event(event, ctx); @@ -1726,17 +1785,11 @@ static int __perf_remove_from_context(void *info) cpuctx->task_ctx = NULL; } } - raw_spin_unlock(&ctx->lock); - - return 0; } /* * Remove the event from a task's (or a CPU's) list of events. * - * CPU events are removed with a smp call. For task events we only - * call when the task is on a CPU. - * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to * remains valid. This is OK when called from perf_release since @@ -1746,71 +1799,31 @@ static int __perf_remove_from_context(void *info) */ static void perf_remove_from_context(struct perf_event *event, bool detach_group) { - struct perf_event_context *ctx = event->ctx; - struct remove_event re = { - .event = event, - .detach_group = detach_group, - }; - - lockdep_assert_held(&ctx->mutex); + lockdep_assert_held(&event->ctx->mutex); event_function_call(event, __perf_remove_from_context, - ___perf_remove_from_context, &re); + (void *)(unsigned long)detach_group); } /* * Cross CPU call to disable a performance event */ -int __perf_event_disable(void *info) -{ - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - - /* - * If this is a per-task event, need to check whether this - * event's task is the current task on this cpu. - * - * Can trigger due to concurrent perf_event_context_sched_out() - * flipping contexts around. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return -EINVAL; - - raw_spin_lock(&ctx->lock); - - /* - * If the event is on, turn it off. - * If it is in error state, leave it in error state. - */ - if (event->state >= PERF_EVENT_STATE_INACTIVE) { - update_context_time(ctx); - update_cgrp_time_from_event(event); - update_group_times(event); - if (event == event->group_leader) - group_sched_out(event, cpuctx, ctx); - else - event_sched_out(event, cpuctx, ctx); - event->state = PERF_EVENT_STATE_OFF; - } - - raw_spin_unlock(&ctx->lock); - - return 0; -} - -void ___perf_event_disable(void *info) +static void __perf_event_disable(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + void *info) { - struct perf_event *event = info; + if (event->state < PERF_EVENT_STATE_INACTIVE) + return; - /* - * Since we have the lock this context can't be scheduled - * in, so we can change the state safely. - */ - if (event->state == PERF_EVENT_STATE_INACTIVE) { - update_group_times(event); - event->state = PERF_EVENT_STATE_OFF; - } + update_context_time(ctx); + update_cgrp_time_from_event(event); + update_group_times(event); + if (event == event->group_leader) + group_sched_out(event, cpuctx, ctx); + else + event_sched_out(event, cpuctx, ctx); + event->state = PERF_EVENT_STATE_OFF; } /* @@ -1837,8 +1850,12 @@ static void _perf_event_disable(struct perf_event *event) } raw_spin_unlock_irq(&ctx->lock); - event_function_call(event, __perf_event_disable, - ___perf_event_disable, event); + event_function_call(event, __perf_event_disable, NULL); +} + +void perf_event_disable_local(struct perf_event *event) +{ + event_function_local(event, __perf_event_disable, NULL); } /* @@ -2202,44 +2219,29 @@ static void __perf_event_mark_enabled(struct perf_event *event) /* * Cross CPU call to enable a performance event */ -static int __perf_event_enable(void *info) +static void __perf_event_enable(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + void *info) { - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; struct perf_event *leader = event->group_leader; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - struct perf_event_context *task_ctx = cpuctx->task_ctx; - - /* - * There's a time window between 'ctx->is_active' check - * in perf_event_enable function and this place having: - * - IRQs on - * - ctx->lock unlocked - * - * where the task could be killed and 'ctx' deactivated - * by perf_event_exit_task. - */ - if (!ctx->is_active) - return -EINVAL; - - perf_ctx_lock(cpuctx, task_ctx); - WARN_ON_ONCE(&cpuctx->ctx != ctx && task_ctx != ctx); - update_context_time(ctx); + struct perf_event_context *task_ctx; if (event->state >= PERF_EVENT_STATE_INACTIVE) - goto unlock; - - /* - * set current task's cgroup time reference point - */ - perf_cgroup_set_timestamp(current, ctx); + return; + update_context_time(ctx); __perf_event_mark_enabled(event); + if (!ctx->is_active) + return; + if (!event_filter_match(event)) { - if (is_cgroup_event(event)) + if (is_cgroup_event(event)) { + perf_cgroup_set_timestamp(current, ctx); // XXX ? perf_cgroup_defer_enabled(event); - goto unlock; + } + return; } /* @@ -2247,19 +2249,13 @@ static int __perf_event_enable(void *info) * then don't put it on unless the group is on. */ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) - goto unlock; + return; - ctx_resched(cpuctx, task_ctx); + task_ctx = cpuctx->task_ctx; + if (ctx->task) + WARN_ON_ONCE(task_ctx != ctx); -unlock: - perf_ctx_unlock(cpuctx, task_ctx); - - return 0; -} - -void ___perf_event_enable(void *info) -{ - __perf_event_mark_enabled((struct perf_event *)info); + ctx_resched(cpuctx, task_ctx); } /* @@ -2292,8 +2288,7 @@ static void _perf_event_enable(struct perf_event *event) event->state = PERF_EVENT_STATE_OFF; raw_spin_unlock_irq(&ctx->lock); - event_function_call(event, __perf_event_enable, - ___perf_event_enable, event); + event_function_call(event, __perf_event_enable, NULL); } /* @@ -4095,36 +4090,14 @@ static void perf_event_for_each(struct perf_event *event, perf_event_for_each_child(sibling, func); } -struct period_event { - struct perf_event *event; - u64 value; -}; - -static void ___perf_event_period(void *info) -{ - struct period_event *pe = info; - struct perf_event *event = pe->event; - u64 value = pe->value; - - if (event->attr.freq) { - event->attr.sample_freq = value; - } else { - event->attr.sample_period = value; - event->hw.sample_period = value; - } - - local64_set(&event->hw.period_left, 0); -} - -static int __perf_event_period(void *info) +static void __perf_event_period(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + void *info) { - struct period_event *pe = info; - struct perf_event *event = pe->event; - struct perf_event_context *ctx = event->ctx; - u64 value = pe->value; + u64 value = *((u64 *)info); bool active; - raw_spin_lock(&ctx->lock); if (event->attr.freq) { event->attr.sample_freq = value; } else { @@ -4144,14 +4117,10 @@ static int __perf_event_period(void *info) event->pmu->start(event, PERF_EF_RELOAD); perf_pmu_enable(ctx->pmu); } - raw_spin_unlock(&ctx->lock); - - return 0; } static int perf_event_period(struct perf_event *event, u64 __user *arg) { - struct period_event pe = { .event = event, }; u64 value; if (!is_sampling_event(event)) @@ -4166,10 +4135,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) if (event->attr.freq && value > sysctl_perf_event_sample_rate) return -EINVAL; - pe.value = value; - - event_function_call(event, __perf_event_period, - ___perf_event_period, &pe); + event_function_call(event, __perf_event_period, &value); return 0; } @@ -4941,7 +4907,7 @@ static void perf_pending_event(struct irq_work *entry) if (event->pending_disable) { event->pending_disable = 0; - __perf_event_disable(event); + perf_event_disable_local(event); } if (event->pending_wakeup) { @@ -9239,13 +9205,14 @@ static void perf_event_init_cpu(int cpu) #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE static void __perf_event_exit_context(void *__info) { - struct remove_event re = { .detach_group = true }; struct perf_event_context *ctx = __info; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_event *event; - rcu_read_lock(); - list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) - __perf_remove_from_context(&re); - rcu_read_unlock(); + raw_spin_lock(&ctx->lock); + list_for_each_entry(event, &ctx->event_list, event_entry) + __perf_remove_from_context(event, cpuctx, ctx, (void *)(unsigned long)true); + raw_spin_unlock(&ctx->lock); } static void perf_event_exit_cpu_context(int cpu) diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 92ce5f4ccc26..3f8cb1e14588 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -444,7 +444,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att * current task. */ if (irqs_disabled() && bp->ctx && bp->ctx->task == current) - __perf_event_disable(bp); + perf_event_disable_local(bp); else perf_event_disable(bp); -- cgit From c97f473643a9d3e618c0f0426b926bc3a3e23944 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 Jan 2016 10:51:03 +0100 Subject: perf: Add more assertions Try to trigger warnings before races do damage. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 6620432491f6..3eaf91b104e9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -263,6 +263,15 @@ static void event_function_call(struct perf_event *event, event_f func, void *da .data = data, }; + if (!event->parent) { + /* + * If this is a !child event, we must hold ctx::mutex to + * stabilize the the event->ctx relation. See + * perf_event_ctx_lock(). + */ + lockdep_assert_held(&ctx->mutex); + } + if (!task) { cpu_function_call(event->cpu, event_function, &efs); return; -- cgit From 63b6da39bb38e8f1a1ef3180d32a39d6baf9da84 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 Jan 2016 16:05:37 +0100 Subject: perf: Fix perf_event_exit_task() race There is a race against perf_event_exit_task() vs event_function_call(),find_get_context(),perf_install_in_context() (iow, everyone). Since there is no permanent marker on a context that its dead, it is quite possible that we access (and even modify) a context after its passed through perf_event_exit_task(). For instance, find_get_context() might find the context still installed, but by the time we get to perf_install_in_context() it might already have passed through perf_event_exit_task() and be considered dead, we will however still add the event to it. Solve this by marking a ctx dead by setting its ctx->task value to -1, it must be !0 so we still know its a (former) task context. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 151 +++++++++++++++++++++++++++++---------------------- 1 file changed, 85 insertions(+), 66 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 3eaf91b104e9..9de4d352ba8c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -148,6 +148,13 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, raw_spin_unlock(&cpuctx->ctx.lock); } +#define TASK_TOMBSTONE ((void *)-1L) + +static bool is_kernel_event(struct perf_event *event) +{ + return event->owner == TASK_TOMBSTONE; +} + /* * On task ctx scheduling... * @@ -196,31 +203,21 @@ static int event_function(void *info) struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); struct perf_event_context *task_ctx = cpuctx->task_ctx; + int ret = 0; WARN_ON_ONCE(!irqs_disabled()); + perf_ctx_lock(cpuctx, task_ctx); /* * Since we do the IPI call without holding ctx->lock things can have * changed, double check we hit the task we set out to hit. - * - * If ctx->task == current, we know things must remain valid because - * we have IRQs disabled so we cannot schedule. */ if (ctx->task) { - if (ctx->task != current) - return -EAGAIN; - - WARN_ON_ONCE(task_ctx != ctx); - } else { - WARN_ON_ONCE(&cpuctx->ctx != ctx); - } + if (ctx->task != current) { + ret = -EAGAIN; + goto unlock; + } - perf_ctx_lock(cpuctx, task_ctx); - /* - * Now that we hold locks, double check state. Paranoia pays. - */ - if (task_ctx) { - WARN_ON_ONCE(task_ctx->task != current); /* * We only use event_function_call() on established contexts, * and event_function() is only ever called when active (or @@ -233,12 +230,16 @@ static int event_function(void *info) * And since we have ctx->is_active, cpuctx->task_ctx must * match. */ - WARN_ON_ONCE(cpuctx->task_ctx != task_ctx); + WARN_ON_ONCE(task_ctx != ctx); + } else { + WARN_ON_ONCE(&cpuctx->ctx != ctx); } + efs->func(event, cpuctx, ctx, efs->data); +unlock: perf_ctx_unlock(cpuctx, task_ctx); - return 0; + return ret; } static void event_function_local(struct perf_event *event, event_f func, void *data) @@ -256,7 +257,7 @@ static void event_function_local(struct perf_event *event, event_f func, void *d static void event_function_call(struct perf_event *event, event_f func, void *data) { struct perf_event_context *ctx = event->ctx; - struct task_struct *task = ctx->task; + struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */ struct event_function_struct efs = { .event = event, .func = func, @@ -278,30 +279,28 @@ static void event_function_call(struct perf_event *event, event_f func, void *da } again: + if (task == TASK_TOMBSTONE) + return; + if (!task_function_call(task, event_function, &efs)) return; raw_spin_lock_irq(&ctx->lock); - if (ctx->is_active) { - /* - * Reload the task pointer, it might have been changed by - * a concurrent perf_event_context_sched_out(). - */ - task = ctx->task; - raw_spin_unlock_irq(&ctx->lock); - goto again; + /* + * Reload the task pointer, it might have been changed by + * a concurrent perf_event_context_sched_out(). + */ + task = ctx->task; + if (task != TASK_TOMBSTONE) { + if (ctx->is_active) { + raw_spin_unlock_irq(&ctx->lock); + goto again; + } + func(event, NULL, ctx, data); } - func(event, NULL, ctx, data); raw_spin_unlock_irq(&ctx->lock); } -#define EVENT_OWNER_KERNEL ((void *) -1) - -static bool is_kernel_event(struct perf_event *event) -{ - return event->owner == EVENT_OWNER_KERNEL; -} - #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ PERF_FLAG_FD_OUTPUT |\ PERF_FLAG_PID_CGROUP |\ @@ -1025,7 +1024,7 @@ static void put_ctx(struct perf_event_context *ctx) if (atomic_dec_and_test(&ctx->refcount)) { if (ctx->parent_ctx) put_ctx(ctx->parent_ctx); - if (ctx->task) + if (ctx->task && ctx->task != TASK_TOMBSTONE) put_task_struct(ctx->task); call_rcu(&ctx->rcu_head, free_ctx); } @@ -1186,6 +1185,7 @@ static u64 primary_event_id(struct perf_event *event) /* * Get the perf_event_context for a task and lock it. + * * This has to cope with with the fact that until it is locked, * the context could get moved to another task. */ @@ -1226,10 +1226,13 @@ retry: goto retry; } - if (!atomic_inc_not_zero(&ctx->refcount)) { + if (ctx->task == TASK_TOMBSTONE || + !atomic_inc_not_zero(&ctx->refcount)) { raw_spin_unlock(&ctx->lock); ctx = NULL; } + + WARN_ON_ONCE(ctx->task != task); } rcu_read_unlock(); if (!ctx) @@ -2140,23 +2143,27 @@ static int __perf_install_in_context(void *info) struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); struct perf_event_context *task_ctx = cpuctx->task_ctx; + raw_spin_lock(&cpuctx->ctx.lock); if (ctx->task) { + raw_spin_lock(&ctx->lock); /* * If we hit the 'wrong' task, we've since scheduled and * everything should be sorted, nothing to do! */ + task_ctx = ctx; if (ctx->task != current) - return 0; + goto unlock; /* * If task_ctx is set, it had better be to us. */ WARN_ON_ONCE(cpuctx->task_ctx != ctx && cpuctx->task_ctx); - task_ctx = ctx; + } else if (task_ctx) { + raw_spin_lock(&task_ctx->lock); } - perf_ctx_lock(cpuctx, task_ctx); ctx_resched(cpuctx, task_ctx); +unlock: perf_ctx_unlock(cpuctx, task_ctx); return 0; @@ -2188,6 +2195,17 @@ perf_install_in_context(struct perf_event_context *ctx, * happened and that will have taken care of business. */ raw_spin_lock_irq(&ctx->lock); + task = ctx->task; + /* + * Worse, we cannot even rely on the ctx actually existing anymore. If + * between find_get_context() and perf_install_in_context() the task + * went through perf_event_exit_task() its dead and we should not be + * adding new events. + */ + if (task == TASK_TOMBSTONE) { + raw_spin_unlock_irq(&ctx->lock); + return; + } update_context_time(ctx); /* * Update cgrp time only if current cgrp matches event->cgrp. @@ -2195,7 +2213,6 @@ perf_install_in_context(struct perf_event_context *ctx, */ update_cgrp_time_from_event(event); add_event_to_ctx(event, ctx); - task = ctx->task; raw_spin_unlock_irq(&ctx->lock); if (task) @@ -2538,17 +2555,21 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, raw_spin_lock(&ctx->lock); raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { - /* - * XXX do we need a memory barrier of sorts - * wrt to rcu_dereference() of perf_event_ctxp - */ - task->perf_event_ctxp[ctxn] = next_ctx; - next->perf_event_ctxp[ctxn] = ctx; - ctx->task = next; - next_ctx->task = task; + WRITE_ONCE(ctx->task, next); + WRITE_ONCE(next_ctx->task, task); swap(ctx->task_ctx_data, next_ctx->task_ctx_data); + /* + * RCU_INIT_POINTER here is safe because we've not + * modified the ctx and the above modification of + * ctx->task and ctx->task_ctx_data are immaterial + * since those values are always verified under + * ctx->lock which we're now holding. + */ + RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx); + RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx); + do_switch = 0; perf_event_sync_stat(ctx, next_ctx); @@ -8545,7 +8566,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, } /* Mark owner so we could distinguish it from user events. */ - event->owner = EVENT_OWNER_KERNEL; + event->owner = TASK_TOMBSTONE; account_event(event); @@ -8725,28 +8746,26 @@ __perf_event_exit_task(struct perf_event *child_event, static void perf_event_exit_task_context(struct task_struct *child, int ctxn) { - struct perf_event *child_event, *next; struct perf_event_context *child_ctx, *clone_ctx = NULL; + struct perf_event *child_event, *next; + unsigned long flags; - if (likely(!child->perf_event_ctxp[ctxn])) + WARN_ON_ONCE(child != current); + + child_ctx = perf_lock_task_context(child, ctxn, &flags); + if (!child_ctx) return; - local_irq_disable(); - WARN_ON_ONCE(child != current); - /* - * We can't reschedule here because interrupts are disabled, - * and child must be current. - */ - child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); + task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx); /* - * Take the context lock here so that if find_get_context is - * reading child->perf_event_ctxp, we wait until it has - * incremented the context's refcount before we do put_ctx below. + * Now that the context is inactive, destroy the task <-> ctx relation + * and mark the context dead. */ - raw_spin_lock(&child_ctx->lock); - task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx); - child->perf_event_ctxp[ctxn] = NULL; + RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL); + put_ctx(child_ctx); /* cannot be last */ + WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE); + put_task_struct(current); /* cannot be last */ /* * If this context is a clone; unclone it so it can't get @@ -8755,7 +8774,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) */ clone_ctx = unclone_ctx(child_ctx); update_context_time(child_ctx); - raw_spin_unlock_irq(&child_ctx->lock); + raw_spin_unlock_irqrestore(&child_ctx->lock, flags); if (clone_ctx) put_ctx(clone_ctx); -- cgit From 45c815f06b80031659c63d7b93e580015d6024dd Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 19 Jan 2016 17:14:29 +0200 Subject: perf: Synchronously free aux pages in case of allocation failure We are currently using asynchronous deallocation in the error path in AUX mmap code, which is unnecessary and also presents a problem for users that wish to probe for the biggest possible buffer size they can get: they'll get -EINVAL on all subsequent attemts to allocate a smaller buffer before the asynchronous deallocation callback frees up the pages from the previous unsuccessful attempt. Currently, gdb does that for allocating AUX buffers for Intel PT traces. More specifically, overwrite mode of AUX pmus that don't support hardware sg (some implementations of Intel PT, for instance) is limited to only one contiguous high order allocation for its buffer and there is no way of knowing its size without trying. This patch changes error path freeing to be synchronous as there won't be any contenders for the AUX pages at that point. Reported-by: Markus Metzger Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: vince@deater.net Link: http://lkml.kernel.org/r/1453216469-9509-1-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index adfdc0536117..1faad2cfdb9e 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -459,6 +459,25 @@ static void rb_free_aux_page(struct ring_buffer *rb, int idx) __free_page(page); } +static void __rb_free_aux(struct ring_buffer *rb) +{ + int pg; + + if (rb->aux_priv) { + rb->free_aux(rb->aux_priv); + rb->free_aux = NULL; + rb->aux_priv = NULL; + } + + if (rb->aux_nr_pages) { + for (pg = 0; pg < rb->aux_nr_pages; pg++) + rb_free_aux_page(rb, pg); + + kfree(rb->aux_pages); + rb->aux_nr_pages = 0; + } +} + int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, pgoff_t pgoff, int nr_pages, long watermark, int flags) { @@ -547,30 +566,11 @@ out: if (!ret) rb->aux_pgoff = pgoff; else - rb_free_aux(rb); + __rb_free_aux(rb); return ret; } -static void __rb_free_aux(struct ring_buffer *rb) -{ - int pg; - - if (rb->aux_priv) { - rb->free_aux(rb->aux_priv); - rb->free_aux = NULL; - rb->aux_priv = NULL; - } - - if (rb->aux_nr_pages) { - for (pg = 0; pg < rb->aux_nr_pages; pg++) - rb_free_aux_page(rb, pg); - - kfree(rb->aux_pages); - rb->aux_nr_pages = 0; - } -} - void rb_free_aux(struct ring_buffer *rb) { if (atomic_dec_and_test(&rb->aux_refcount)) -- cgit From 6f16886b7c050c934305b1f285c3458ff1b6e4e4 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Thu, 21 Jan 2016 11:19:29 +0000 Subject: cpuidle: fix fallback mechanism for suspend to idle in absence of enter_freeze Commit 51164251f5c3 "sched / idle: Drop default_idle_call() fallback from call_cpuidle()" made find_deepest_state() return non-negative value and check all the states with index > 0. Also as a result, find_deepest_state() returns 0 even when enter_freeze callbacks are not implemented and enter_freeze_proper() is called which ends up crashing the kernel. This patch updates the check for index > 0 in cpuidle_enter_freeze and cpuidle_idle_call(when idle_should_freeze is true) to restore the suspend-to-idle functionality in absence of enter_freeze callback. Fixes: 51164251f5c3 "sched / idle: Drop default_idle_call() fallback from call_cpuidle()" Signed-off-by: Sudeep Holla Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/cpuidle.c | 2 +- kernel/sched/idle.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 046423b0c5ca..f996efc56605 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -153,7 +153,7 @@ int cpuidle_enter_freeze(struct cpuidle_driver *drv, struct cpuidle_device *dev) * be frozen safely. */ index = find_deepest_state(drv, dev, UINT_MAX, 0, true); - if (index >= 0) + if (index > 0) enter_freeze_proper(drv, dev, index); return index; diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 34852ee70d54..30f7b6ad3920 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -162,7 +162,7 @@ static void cpuidle_idle_call(void) */ if (idle_should_freeze()) { entered_state = cpuidle_enter_freeze(drv, dev); - if (entered_state >= 0) { + if (entered_state > 0) { local_irq_enable(); goto exit_idle; } -- cgit From dd4e17ab704269bce71402285f5e8b9ac24b1eff Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 21 Jan 2016 15:03:34 -0800 Subject: ntp: Fix ADJ_SETOFFSET being used w/ ADJ_NANO Recently, in commit 37cf4dc3370f I forgot to check if the timeval being passed was actually a timespec (as is signaled with ADJ_NANO). This resulted in that patch breaking ADJ_SETOFFSET users who set ADJ_NANO, by rejecting valid timespecs that were compared with valid timeval ranges. This patch addresses this by checking for the ADJ_NANO flag and using the timepsec check instead in that case. Reported-by: Harald Hoyer Reported-by: Kay Sievers Fixes: 37cf4dc3370f "time: Verify time values in adjtimex ADJ_SETOFFSET to avoid overflow" Signed-off-by: John Stultz Cc: Sasha Levin Cc: Richard Cochran Cc: Prarit Bhargava Cc: David Herrmann Link: http://lkml.kernel.org/r/1453417415-19110-2-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/ntp.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 36f2ca09aa5e..6df8927c58a5 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -685,8 +685,18 @@ int ntp_validate_timex(struct timex *txc) if (!capable(CAP_SYS_TIME)) return -EPERM; - if (!timeval_inject_offset_valid(&txc->time)) - return -EINVAL; + if (txc->modes & ADJ_NANO) { + struct timespec ts; + + ts.tv_sec = txc->time.tv_sec; + ts.tv_nsec = txc->time.tv_usec; + if (!timespec_inject_offset_valid(&ts)) + return -EINVAL; + + } else { + if (!timeval_inject_offset_valid(&txc->time)) + return -EINVAL; + } } /* -- cgit From 1dff76b92f69051e579bdc131e01500da9fa2a91 Mon Sep 17 00:00:00 2001 From: Gavin Guo Date: Wed, 20 Jan 2016 12:36:58 +0800 Subject: sched/numa: Fix use-after-free bug in the task_numa_compare The following message can be observed on the Ubuntu v3.13.0-65 with KASan backported: ================================================================== BUG: KASan: use after free in task_numa_find_cpu+0x64c/0x890 at addr ffff880dd393ecd8 Read of size 8 by task qemu-system-x86/3998900 ============================================================================= BUG kmalloc-128 (Tainted: G B ): kasan: bad access detected ----------------------------------------------------------------------------- INFO: Allocated in task_numa_fault+0xc1b/0xed0 age=41980 cpu=18 pid=3998890 __slab_alloc+0x4f8/0x560 __kmalloc+0x1eb/0x280 task_numa_fault+0xc1b/0xed0 do_numa_page+0x192/0x200 handle_mm_fault+0x808/0x1160 __do_page_fault+0x218/0x750 do_page_fault+0x1a/0x70 page_fault+0x28/0x30 SyS_poll+0x66/0x1a0 system_call_fastpath+0x1a/0x1f INFO: Freed in task_numa_free+0x1d2/0x200 age=62 cpu=18 pid=0 __slab_free+0x2ab/0x3f0 kfree+0x161/0x170 task_numa_free+0x1d2/0x200 finish_task_switch+0x1d2/0x210 __schedule+0x5d4/0xc60 schedule_preempt_disabled+0x40/0xc0 cpu_startup_entry+0x2da/0x340 start_secondary+0x28f/0x360 Call Trace: [] dump_stack+0x45/0x56 [] print_trailer+0xfd/0x170 [] object_err+0x36/0x40 [] kasan_report_error+0x1e9/0x3a0 [] kasan_report+0x40/0x50 [] ? task_numa_find_cpu+0x64c/0x890 [] __asan_load8+0x69/0xa0 [] ? find_next_bit+0xd8/0x120 [] task_numa_find_cpu+0x64c/0x890 [] task_numa_migrate+0x4ac/0x7b0 [] numa_migrate_preferred+0xb3/0xc0 [] task_numa_fault+0xb88/0xed0 [] do_numa_page+0x192/0x200 [] handle_mm_fault+0x808/0x1160 [] ? sched_clock_cpu+0x10d/0x160 [] ? native_load_tls+0x82/0xa0 [] __do_page_fault+0x218/0x750 [] ? hrtimer_try_to_cancel+0x76/0x160 [] ? schedule_hrtimeout_range_clock.part.24+0xf7/0x1c0 [] do_page_fault+0x1a/0x70 [] page_fault+0x28/0x30 [] ? do_sys_poll+0x1c4/0x6d0 [] ? enqueue_task_fair+0x4b6/0xaa0 [] ? sched_clock+0x9/0x10 [] ? resched_task+0x7a/0xc0 [] ? check_preempt_curr+0xb3/0x130 [] ? poll_select_copy_remaining+0x170/0x170 [] ? wake_up_state+0x10/0x20 [] ? drop_futex_key_refs.isra.14+0x1f/0x90 [] ? futex_requeue+0x3de/0xba0 [] ? do_futex+0xbe/0x8f0 [] ? read_tsc+0x9/0x20 [] ? ktime_get_ts+0x12d/0x170 [] ? timespec_add_safe+0x59/0xe0 [] SyS_poll+0x66/0x1a0 [] system_call_fastpath+0x1a/0x1f As commit 1effd9f19324 ("sched/numa: Fix unsafe get_task_struct() in task_numa_assign()") points out, the rcu_read_lock() cannot protect the task_struct from being freed in the finish_task_switch(). And the bug happens in the process of calculation of imp which requires the access of p->numa_faults being freed in the following path: do_exit() current->flags |= PF_EXITING; release_task() ~~delayed_put_task_struct()~~ schedule() ... ... rq->curr = next; context_switch() finish_task_switch() put_task_struct() __put_task_struct() task_numa_free() The fix here to get_task_struct() early before end of dst_rq->lock to protect the calculation process and also put_task_struct() in the corresponding point if finally the dst_rq->curr somehow cannot be assigned. Additional credit to Liang Chen who helped fix the error logic and add the put_task_struct() to the place it missed. Signed-off-by: Gavin Guo Signed-off-by: Peter Zijlstra (Intel) Cc: Andrea Arcangeli Cc: Andrew Morton Cc: Hugh Dickins Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: jay.vosburgh@canonical.com Cc: liang.chen@canonical.com Link: http://lkml.kernel.org/r/1453264618-17645-1-git-send-email-gavin.guo@canonical.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1926606ece80..56b7d4b83947 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1220,8 +1220,6 @@ static void task_numa_assign(struct task_numa_env *env, { if (env->best_task) put_task_struct(env->best_task); - if (p) - get_task_struct(p); env->best_task = p; env->best_imp = imp; @@ -1289,20 +1287,30 @@ static void task_numa_compare(struct task_numa_env *env, long imp = env->p->numa_group ? groupimp : taskimp; long moveimp = imp; int dist = env->dist; + bool assigned = false; rcu_read_lock(); raw_spin_lock_irq(&dst_rq->lock); cur = dst_rq->curr; /* - * No need to move the exiting task, and this ensures that ->curr - * wasn't reaped and thus get_task_struct() in task_numa_assign() - * is safe under RCU read lock. - * Note that rcu_read_lock() itself can't protect from the final - * put_task_struct() after the last schedule(). + * No need to move the exiting task or idle task. */ if ((cur->flags & PF_EXITING) || is_idle_task(cur)) cur = NULL; + else { + /* + * The task_struct must be protected here to protect the + * p->numa_faults access in the task_weight since the + * numa_faults could already be freed in the following path: + * finish_task_switch() + * --> put_task_struct() + * --> __put_task_struct() + * --> task_numa_free() + */ + get_task_struct(cur); + } + raw_spin_unlock_irq(&dst_rq->lock); /* @@ -1386,6 +1394,7 @@ balance: */ if (!load_too_imbalanced(src_load, dst_load, env)) { imp = moveimp - 1; + put_task_struct(cur); cur = NULL; goto assign; } @@ -1411,9 +1420,16 @@ balance: env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); assign: + assigned = true; task_numa_assign(env, cur, imp); unlock: rcu_read_unlock(); + /* + * The dst_rq->curr isn't assigned. The protection for task_struct is + * finished. + */ + if (cur && !assigned) + put_task_struct(cur); } static void task_numa_find_cpu(struct task_numa_env *env, -- cgit From e93ad19d05648397ef3bcb838d26aec06c245dc0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 19 Jan 2016 12:18:41 -0500 Subject: cpuset: make mm migration asynchronous If "cpuset.memory_migrate" is set, when a process is moved from one cpuset to another with a different memory node mask, pages in used by the process are migrated to the new set of nodes. This was performed synchronously in the ->attach() callback, which is synchronized against process management. Recently, the synchronization was changed from per-process rwsem to global percpu rwsem for simplicity and optimization. Combined with the synchronous mm migration, this led to deadlocks because mm migration could schedule a work item which may in turn try to create a new worker blocking on the process management lock held from cgroup process migration path. This heavy an operation shouldn't be performed synchronously from that deep inside cgroup migration in the first place. This patch punts the actual migration to an ordered workqueue and updates cgroup process migration and cpuset config update paths to flush the workqueue after all locks are released. This way, the operations still seem synchronous to userland without entangling mm migration with process management synchronization. CPU hotplug can also invoke mm migration but there's no reason for it to wait for mm migrations and thus doesn't synchronize against their completions. Signed-off-by: Tejun Heo Reported-and-tested-by: Christian Borntraeger Cc: stable@vger.kernel.org # v4.4+ --- include/linux/cpuset.h | 6 +++++ kernel/cgroup.c | 2 ++ kernel/cpuset.c | 71 ++++++++++++++++++++++++++++++++++---------------- 3 files changed, 57 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 85a868ccb493..fea160ee5803 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -137,6 +137,8 @@ static inline void set_mems_allowed(nodemask_t nodemask) task_unlock(current); } +extern void cpuset_post_attach_flush(void); + #else /* !CONFIG_CPUSETS */ static inline bool cpusets_enabled(void) { return false; } @@ -243,6 +245,10 @@ static inline bool read_mems_allowed_retry(unsigned int seq) return false; } +static inline void cpuset_post_attach_flush(void) +{ +} + #endif /* !CONFIG_CPUSETS */ #endif /* _LINUX_CPUSET_H */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c03a640ef6da..88abd4d076d8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -58,6 +58,7 @@ #include #include #include +#include #include /* @@ -2739,6 +2740,7 @@ out_unlock_rcu: out_unlock_threadgroup: percpu_up_write(&cgroup_threadgroup_rwsem); cgroup_kn_unlock(of->kn); + cpuset_post_attach_flush(); return ret ?: nbytes; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3e945fcd8179..41989ab4db57 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -287,6 +287,8 @@ static struct cpuset top_cpuset = { static DEFINE_MUTEX(cpuset_mutex); static DEFINE_SPINLOCK(callback_lock); +static struct workqueue_struct *cpuset_migrate_mm_wq; + /* * CPU / memory hotplug is handled asynchronously. */ @@ -972,31 +974,51 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, } /* - * cpuset_migrate_mm - * - * Migrate memory region from one set of nodes to another. - * - * Temporarilly set tasks mems_allowed to target nodes of migration, - * so that the migration code can allocate pages on these nodes. - * - * While the mm_struct we are migrating is typically from some - * other task, the task_struct mems_allowed that we are hacking - * is for our current task, which must allocate new pages for that - * migrating memory region. + * Migrate memory region from one set of nodes to another. This is + * performed asynchronously as it can be called from process migration path + * holding locks involved in process management. All mm migrations are + * performed in the queued order and can be waited for by flushing + * cpuset_migrate_mm_wq. */ +struct cpuset_migrate_mm_work { + struct work_struct work; + struct mm_struct *mm; + nodemask_t from; + nodemask_t to; +}; + +static void cpuset_migrate_mm_workfn(struct work_struct *work) +{ + struct cpuset_migrate_mm_work *mwork = + container_of(work, struct cpuset_migrate_mm_work, work); + + /* on a wq worker, no need to worry about %current's mems_allowed */ + do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL); + mmput(mwork->mm); + kfree(mwork); +} + static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to) { - struct task_struct *tsk = current; - - tsk->mems_allowed = *to; + struct cpuset_migrate_mm_work *mwork; - do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); + mwork = kzalloc(sizeof(*mwork), GFP_KERNEL); + if (mwork) { + mwork->mm = mm; + mwork->from = *from; + mwork->to = *to; + INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn); + queue_work(cpuset_migrate_mm_wq, &mwork->work); + } else { + mmput(mm); + } +} - rcu_read_lock(); - guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed); - rcu_read_unlock(); +void cpuset_post_attach_flush(void) +{ + flush_workqueue(cpuset_migrate_mm_wq); } /* @@ -1097,7 +1119,8 @@ static void update_tasks_nodemask(struct cpuset *cs) mpol_rebind_mm(mm, &cs->mems_allowed); if (migrate) cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems); - mmput(mm); + else + mmput(mm); } css_task_iter_end(&it); @@ -1545,11 +1568,11 @@ static void cpuset_attach(struct cgroup_taskset *tset) * @old_mems_allowed is the right nodesets that we * migrate mm from. */ - if (is_memory_migrate(cs)) { + if (is_memory_migrate(cs)) cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, &cpuset_attach_nodemask_to); - } - mmput(mm); + else + mmput(mm); } } @@ -1714,6 +1737,7 @@ out_unlock: mutex_unlock(&cpuset_mutex); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); + flush_workqueue(cpuset_migrate_mm_wq); return retval ?: nbytes; } @@ -2359,6 +2383,9 @@ void __init cpuset_init_smp(void) top_cpuset.effective_mems = node_states[N_MEMORY]; register_hotmemory_notifier(&cpuset_track_online_nodes_nb); + + cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0); + BUG_ON(!cpuset_migrate_mm_wq); } /** -- cgit From aa226ff4a1ce79f229c6b7a4c0a14e17fececd01 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 21 Jan 2016 15:31:11 -0500 Subject: cgroup: make sure a parent css isn't offlined before its children There are three subsystem callbacks in css shutdown path - css_offline(), css_released() and css_free(). Except for css_released(), cgroup core didn't guarantee the order of invocation. css_offline() or css_free() could be called on a parent css before its children. This behavior is unexpected and led to bugs in cpu and memory controller. This patch updates offline path so that a parent css is never offlined before its children. Each css keeps online_cnt which reaches zero iff itself and all its children are offline and offline_css() is invoked only after online_cnt reaches zero. This fixes the memory controller bug and allows the fix for cpu controller. Signed-off-by: Tejun Heo Reported-and-tested-by: Christian Borntraeger Reported-by: Brian Christiansen Link: http://lkml.kernel.org/g/5698A023.9070703@de.ibm.com Link: http://lkml.kernel.org/g/CAKB58ikDkzc8REt31WBkD99+hxNzjK4+FBmhkgS+NVrC9vjMSg@mail.gmail.com Cc: Heiko Carstens Cc: Peter Zijlstra Cc: stable@vger.kernel.org --- include/linux/cgroup-defs.h | 6 ++++++ kernel/cgroup.c | 22 +++++++++++++++++----- 2 files changed, 23 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 7f540f7f588d..789471dba6fb 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -127,6 +127,12 @@ struct cgroup_subsys_state { */ u64 serial_nr; + /* + * Incremented by online self and children. Used to guarantee that + * parents are not offlined before their children. + */ + atomic_t online_cnt; + /* percpu_ref killing and RCU release */ struct rcu_head rcu_head; struct work_struct destroy_work; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 88abd4d076d8..d01587793865 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4760,6 +4760,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); css->serial_nr = css_serial_nr_next++; + atomic_set(&css->online_cnt, 0); if (cgroup_parent(cgrp)) { css->parent = cgroup_css(cgroup_parent(cgrp), ss); @@ -4782,6 +4783,10 @@ static int online_css(struct cgroup_subsys_state *css) if (!ret) { css->flags |= CSS_ONLINE; rcu_assign_pointer(css->cgroup->subsys[ss->id], css); + + atomic_inc(&css->online_cnt); + if (css->parent) + atomic_inc(&css->parent->online_cnt); } return ret; } @@ -5019,10 +5024,15 @@ static void css_killed_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); mutex_lock(&cgroup_mutex); - offline_css(css); - mutex_unlock(&cgroup_mutex); - css_put(css); + do { + offline_css(css); + css_put(css); + /* @css can't go away while we're holding cgroup_mutex */ + css = css->parent; + } while (css && atomic_dec_and_test(&css->online_cnt)); + + mutex_unlock(&cgroup_mutex); } /* css kill confirmation processing requires process context, bounce */ @@ -5031,8 +5041,10 @@ static void css_killed_ref_fn(struct percpu_ref *ref) struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); - INIT_WORK(&css->destroy_work, css_killed_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); + if (atomic_dec_and_test(&css->online_cnt)) { + INIT_WORK(&css->destroy_work, css_killed_work_fn); + queue_work(cgroup_destroy_wq, &css->destroy_work); + } } /** -- cgit From 8bb5ef79bc0f4016ecf79e8dce6096a3c63603e4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 21 Jan 2016 15:32:15 -0500 Subject: cgroup: make sure a parent css isn't freed before its children There are three subsystem callbacks in css shutdown path - css_offline(), css_released() and css_free(). Except for css_released(), cgroup core didn't guarantee the order of invocation. css_offline() or css_free() could be called on a parent css before its children. This behavior is unexpected and led to bugs in cpu and memory controller. The previous patch updated ordering for css_offline() which fixes the cpu controller issue. While there currently isn't a known bug caused by misordering of css_free() invocations, let's fix it too for consistency. css_free() ordering can be trivially fixed by moving putting of the parent css below css_free() invocation. Signed-off-by: Tejun Heo Cc: Peter Zijlstra --- kernel/cgroup.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d01587793865..d27904c193da 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4657,14 +4657,15 @@ static void css_free_work_fn(struct work_struct *work) if (ss) { /* css free path */ + struct cgroup_subsys_state *parent = css->parent; int id = css->id; - if (css->parent) - css_put(css->parent); - ss->css_free(css); cgroup_idr_remove(&ss->css_idr, id); cgroup_put(cgrp); + + if (parent) + css_put(parent); } else { /* cgroup free path */ atomic_dec(&cgrp->root->nr_cgrps); -- cgit From 133e1e5acd4a63c4a0dcc413e90d5decdbce9c4a Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Mon, 25 Jan 2016 18:04:15 -0500 Subject: audit: stop an old auditd being starved out by a new auditd Nothing prevents a new auditd starting up and replacing a valid audit_pid when an old auditd is still running, effectively starving out the old auditd since audit_pid no longer points to the old valid auditd. If no message to auditd has been attempted since auditd died unnaturally or got killed, audit_pid will still indicate it is alive. There isn't an easy way to detect if an old auditd is still running on the existing audit_pid other than attempting to send a message to see if it fails. An -ECONNREFUSED almost certainly means it disappeared and can be replaced. Other errors are not so straightforward and may indicate transient problems that will resolve themselves and the old auditd will recover. Yet others will likely need manual intervention for which a new auditd will not solve the problem. Send a new message type (AUDIT_REPLACE) to the old auditd containing a u32 with the PID of the new auditd. If the audit replace message succeeds (or doesn't fail with certainty), fail to register the new auditd and return an error (-EEXIST). This is expected to make the patch preventing an old auditd orphaning a new auditd redundant. V3: Switch audit message type from 1000 to 1300 block. Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- include/uapi/linux/audit.h | 1 + kernel/audit.c | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 843540c398eb..d820aa979620 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -110,6 +110,7 @@ #define AUDIT_SECCOMP 1326 /* Secure Computing event */ #define AUDIT_PROCTITLE 1327 /* Proctitle emit event */ #define AUDIT_FEATURE_CHANGE 1328 /* audit log listing feature changes */ +#define AUDIT_REPLACE 1329 /* Replace auditd if this packet unanswerd */ #define AUDIT_AVC 1400 /* SE Linux avc denial or grant */ #define AUDIT_SELINUX_ERR 1401 /* Internal SE Linux Errors */ diff --git a/kernel/audit.c b/kernel/audit.c index d6dd95cc59e6..2fd63d6879c5 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -809,6 +809,16 @@ static int audit_set_feature(struct sk_buff *skb) return 0; } +static int audit_replace(pid_t pid) +{ + struct sk_buff *skb = audit_make_reply(0, 0, AUDIT_REPLACE, 0, 0, + &pid, sizeof(pid)); + + if (!skb) + return -ENOMEM; + return netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); +} + static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { u32 seq; @@ -870,9 +880,13 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } if (s.mask & AUDIT_STATUS_PID) { int new_pid = s.pid; + pid_t requesting_pid = task_tgid_vnr(current); - if ((!new_pid) && (task_tgid_vnr(current) != audit_pid)) + if ((!new_pid) && (requesting_pid != audit_pid)) return -EACCES; + if (audit_pid && new_pid && + audit_replace(requesting_pid) != -ECONNREFUSED) + return -EEXIST; if (audit_enabled != AUDIT_OFF) audit_log_config_change("audit_pid", new_pid, audit_pid, 1); audit_pid = new_pid; -- cgit From 935c9e7ff06abf12c45155f75ec2f712d3768095 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Mon, 25 Jan 2016 18:04:15 -0500 Subject: audit: log failed attempts to change audit_pid configuration Failed attempts to change the audit_pid configuration are not presently logged. One case is an attempt to starve an old auditd by starting up a new auditd when the old one is still alive and active. The other case is an attempt to orphan a new auditd when an old auditd shuts down. Log both as AUDIT_CONFIG_CHANGE messages with failure result. Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 2fd63d6879c5..8fa7533bf106 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -882,11 +882,15 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) int new_pid = s.pid; pid_t requesting_pid = task_tgid_vnr(current); - if ((!new_pid) && (requesting_pid != audit_pid)) + if ((!new_pid) && (requesting_pid != audit_pid)) { + audit_log_config_change("audit_pid", new_pid, audit_pid, 0); return -EACCES; + } if (audit_pid && new_pid && - audit_replace(requesting_pid) != -ECONNREFUSED) + audit_replace(requesting_pid) != -ECONNREFUSED) { + audit_log_config_change("audit_pid", new_pid, audit_pid, 0); return -EEXIST; + } if (audit_enabled != AUDIT_OFF) audit_log_config_change("audit_pid", new_pid, audit_pid, 1); audit_pid = new_pid; -- cgit From b4abf91047cf054f203dcfac97e1038388826937 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Jan 2016 11:25:38 +0100 Subject: rtmutex: Make wait_lock irq safe Sasha reported a lockdep splat about a potential deadlock between RCU boosting rtmutex and the posix timer it_lock. CPU0 CPU1 rtmutex_lock(&rcu->rt_mutex) spin_lock(&rcu->rt_mutex.wait_lock) local_irq_disable() spin_lock(&timer->it_lock) spin_lock(&rcu->mutex.wait_lock) --> Interrupt spin_lock(&timer->it_lock) This is caused by the following code sequence on CPU1 rcu_read_lock() x = lookup(); if (x) spin_lock_irqsave(&x->it_lock); rcu_read_unlock(); return x; We could fix that in the posix timer code by keeping rcu read locked across the spinlocked and irq disabled section, but the above sequence is common and there is no reason not to support it. Taking rt_mutex.wait_lock irq safe prevents the deadlock. Reported-by: Sasha Levin Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Paul McKenney --- kernel/futex.c | 18 +++---- kernel/locking/rtmutex.c | 135 +++++++++++++++++++++++++---------------------- 2 files changed, 81 insertions(+), 72 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 0773f2b23b10..5d6ce6413ef1 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1191,7 +1191,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, if (pi_state->owner != current) return -EINVAL; - raw_spin_lock(&pi_state->pi_mutex.wait_lock); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); /* @@ -1217,22 +1217,22 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, else if (curval != uval) ret = -EINVAL; if (ret) { - raw_spin_unlock(&pi_state->pi_mutex.wait_lock); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); return ret; } - raw_spin_lock_irq(&pi_state->owner->pi_lock); + raw_spin_lock(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); - raw_spin_unlock_irq(&pi_state->owner->pi_lock); + raw_spin_unlock(&pi_state->owner->pi_lock); - raw_spin_lock_irq(&new_owner->pi_lock); + raw_spin_lock(&new_owner->pi_lock); WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &new_owner->pi_state_list); pi_state->owner = new_owner; - raw_spin_unlock_irq(&new_owner->pi_lock); + raw_spin_unlock(&new_owner->pi_lock); - raw_spin_unlock(&pi_state->pi_mutex.wait_lock); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); @@ -2127,11 +2127,11 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) * we returned due to timeout or signal without taking the * rt_mutex. Too late. */ - raw_spin_lock(&q->pi_state->pi_mutex.wait_lock); + raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock); owner = rt_mutex_owner(&q->pi_state->pi_mutex); if (!owner) owner = rt_mutex_next_owner(&q->pi_state->pi_mutex); - raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock); + raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock); ret = fixup_pi_state_owner(uaddr, q, owner); goto out; } diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 8251e75dd9c0..3e746607abe5 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -99,13 +99,14 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) * 2) Drop lock->wait_lock * 3) Try to unlock the lock with cmpxchg */ -static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock) +static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, + unsigned long flags) __releases(lock->wait_lock) { struct task_struct *owner = rt_mutex_owner(lock); clear_rt_mutex_waiters(lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); /* * If a new waiter comes in between the unlock and the cmpxchg * we have two situations: @@ -147,11 +148,12 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) /* * Simple slow path only version: lock->owner is protected by lock->wait_lock. */ -static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock) +static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, + unsigned long flags) __releases(lock->wait_lock) { lock->owner = NULL; - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return true; } #endif @@ -433,7 +435,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, int ret = 0, depth = 0; struct rt_mutex *lock; bool detect_deadlock; - unsigned long flags; bool requeue = true; detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk); @@ -476,7 +477,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* * [1] Task cannot go away as we did a get_task() before ! */ - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock_irq(&task->pi_lock); /* * [2] Get the waiter on which @task is blocked on. @@ -560,7 +561,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * operations. */ if (!raw_spin_trylock(&lock->wait_lock)) { - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irq(&task->pi_lock); cpu_relax(); goto retry; } @@ -591,7 +592,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* * No requeue[7] here. Just release @task [8] */ - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); put_task_struct(task); /* @@ -599,14 +600,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * If there is no owner of the lock, end of chain. */ if (!rt_mutex_owner(lock)) { - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); return 0; } /* [10] Grab the next task, i.e. owner of @lock */ task = rt_mutex_owner(lock); get_task_struct(task); - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock(&task->pi_lock); /* * No requeue [11] here. We just do deadlock detection. @@ -621,8 +622,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, top_waiter = rt_mutex_top_waiter(lock); /* [13] Drop locks */ - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock(&task->pi_lock); + raw_spin_unlock_irq(&lock->wait_lock); /* If owner is not blocked, end of chain. */ if (!next_lock) @@ -643,7 +644,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, rt_mutex_enqueue(lock, waiter); /* [8] Release the task */ - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); put_task_struct(task); /* @@ -661,14 +662,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, */ if (prerequeue_top_waiter != rt_mutex_top_waiter(lock)) wake_up_process(rt_mutex_top_waiter(lock)->task); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); return 0; } /* [10] Grab the next task, i.e. the owner of @lock */ task = rt_mutex_owner(lock); get_task_struct(task); - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock(&task->pi_lock); /* [11] requeue the pi waiters if necessary */ if (waiter == rt_mutex_top_waiter(lock)) { @@ -722,8 +723,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, top_waiter = rt_mutex_top_waiter(lock); /* [13] Drop the locks */ - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock(&task->pi_lock); + raw_spin_unlock_irq(&lock->wait_lock); /* * Make the actual exit decisions [12], based on the stored @@ -746,7 +747,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, goto again; out_unlock_pi: - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irq(&task->pi_lock); out_put_task: put_task_struct(task); @@ -756,7 +757,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* * Try to take an rt-mutex * - * Must be called with lock->wait_lock held. + * Must be called with lock->wait_lock held and interrupts disabled * * @lock: The lock to be acquired. * @task: The task which wants to acquire the lock @@ -766,8 +767,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, struct rt_mutex_waiter *waiter) { - unsigned long flags; - /* * Before testing whether we can acquire @lock, we set the * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all @@ -852,7 +851,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, * case, but conditionals are more expensive than a redundant * store. */ - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock(&task->pi_lock); task->pi_blocked_on = NULL; /* * Finish the lock acquisition. @task is the new owner. If @@ -861,7 +860,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, */ if (rt_mutex_has_waiters(lock)) rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock)); - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); takeit: /* We got the lock. */ @@ -883,7 +882,7 @@ takeit: * * Prepare waiter and propagate pi chain * - * This must be called with lock->wait_lock held. + * This must be called with lock->wait_lock held and interrupts disabled */ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, @@ -894,7 +893,6 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex_waiter *top_waiter = waiter; struct rt_mutex *next_lock; int chain_walk = 0, res; - unsigned long flags; /* * Early deadlock detection. We really don't want the task to @@ -908,7 +906,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, if (owner == task) return -EDEADLK; - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock(&task->pi_lock); __rt_mutex_adjust_prio(task); waiter->task = task; waiter->lock = lock; @@ -921,12 +919,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, task->pi_blocked_on = waiter; - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); if (!owner) return 0; - raw_spin_lock_irqsave(&owner->pi_lock, flags); + raw_spin_lock(&owner->pi_lock); if (waiter == rt_mutex_top_waiter(lock)) { rt_mutex_dequeue_pi(owner, top_waiter); rt_mutex_enqueue_pi(owner, waiter); @@ -941,7 +939,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, /* Store the lock on which owner is blocked or NULL */ next_lock = task_blocked_on_lock(owner); - raw_spin_unlock_irqrestore(&owner->pi_lock, flags); + raw_spin_unlock(&owner->pi_lock); /* * Even if full deadlock detection is on, if the owner is not * blocked itself, we can avoid finding this out in the chain @@ -957,12 +955,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, */ get_task_struct(owner); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); res = rt_mutex_adjust_prio_chain(owner, chwalk, lock, next_lock, waiter, task); - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); return res; } @@ -971,15 +969,14 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, * Remove the top waiter from the current tasks pi waiter tree and * queue it up. * - * Called with lock->wait_lock held. + * Called with lock->wait_lock held and interrupts disabled. */ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, struct rt_mutex *lock) { struct rt_mutex_waiter *waiter; - unsigned long flags; - raw_spin_lock_irqsave(¤t->pi_lock, flags); + raw_spin_lock(¤t->pi_lock); waiter = rt_mutex_top_waiter(lock); @@ -1001,7 +998,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, */ lock->owner = (void *) RT_MUTEX_HAS_WAITERS; - raw_spin_unlock_irqrestore(¤t->pi_lock, flags); + raw_spin_unlock(¤t->pi_lock); wake_q_add(wake_q, waiter->task); } @@ -1009,7 +1006,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, /* * Remove a waiter from a lock and give up * - * Must be called with lock->wait_lock held and + * Must be called with lock->wait_lock held and interrupts disabled. I must * have just failed to try_to_take_rt_mutex(). */ static void remove_waiter(struct rt_mutex *lock, @@ -1018,12 +1015,11 @@ static void remove_waiter(struct rt_mutex *lock, bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex *next_lock; - unsigned long flags; - raw_spin_lock_irqsave(¤t->pi_lock, flags); + raw_spin_lock(¤t->pi_lock); rt_mutex_dequeue(lock, waiter); current->pi_blocked_on = NULL; - raw_spin_unlock_irqrestore(¤t->pi_lock, flags); + raw_spin_unlock(¤t->pi_lock); /* * Only update priority if the waiter was the highest priority @@ -1032,7 +1028,7 @@ static void remove_waiter(struct rt_mutex *lock, if (!owner || !is_top_waiter) return; - raw_spin_lock_irqsave(&owner->pi_lock, flags); + raw_spin_lock(&owner->pi_lock); rt_mutex_dequeue_pi(owner, waiter); @@ -1044,7 +1040,7 @@ static void remove_waiter(struct rt_mutex *lock, /* Store the lock on which owner is blocked or NULL */ next_lock = task_blocked_on_lock(owner); - raw_spin_unlock_irqrestore(&owner->pi_lock, flags); + raw_spin_unlock(&owner->pi_lock); /* * Don't walk the chain, if the owner task is not blocked @@ -1056,12 +1052,12 @@ static void remove_waiter(struct rt_mutex *lock, /* gets dropped in rt_mutex_adjust_prio_chain()! */ get_task_struct(owner); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock, next_lock, NULL, current); - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); } /* @@ -1097,11 +1093,11 @@ void rt_mutex_adjust_pi(struct task_struct *task) * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop * @lock: the rt_mutex to take * @state: the state the task should block in (TASK_INTERRUPTIBLE - * or TASK_UNINTERRUPTIBLE) + * or TASK_UNINTERRUPTIBLE) * @timeout: the pre-initialized and started timer, or NULL for none * @waiter: the pre-initialized rt_mutex_waiter * - * lock->wait_lock must be held by the caller. + * Must be called with lock->wait_lock held and interrupts disabled */ static int __sched __rt_mutex_slowlock(struct rt_mutex *lock, int state, @@ -1129,13 +1125,13 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, break; } - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); schedule(); - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); set_current_state(state); } @@ -1172,17 +1168,26 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, enum rtmutex_chainwalk chwalk) { struct rt_mutex_waiter waiter; + unsigned long flags; int ret = 0; debug_rt_mutex_init_waiter(&waiter); RB_CLEAR_NODE(&waiter.pi_tree_entry); RB_CLEAR_NODE(&waiter.tree_entry); - raw_spin_lock(&lock->wait_lock); + /* + * Technically we could use raw_spin_[un]lock_irq() here, but this can + * be called in early boot if the cmpxchg() fast path is disabled + * (debug, no architecture support). In this case we will acquire the + * rtmutex with lock->wait_lock held. But we cannot unconditionally + * enable interrupts in that early boot case. So we need to use the + * irqsave/restore variants. + */ + raw_spin_lock_irqsave(&lock->wait_lock, flags); /* Try to acquire the lock again: */ if (try_to_take_rt_mutex(lock, current, NULL)) { - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return 0; } @@ -1211,7 +1216,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, */ fixup_rt_mutex_waiters(lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); /* Remove pending timer: */ if (unlikely(timeout)) @@ -1227,6 +1232,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, */ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) { + unsigned long flags; int ret; /* @@ -1238,10 +1244,10 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) return 0; /* - * The mutex has currently no owner. Lock the wait lock and - * try to acquire the lock. + * The mutex has currently no owner. Lock the wait lock and try to + * acquire the lock. We use irqsave here to support early boot calls. */ - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); ret = try_to_take_rt_mutex(lock, current, NULL); @@ -1251,7 +1257,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) */ fixup_rt_mutex_waiters(lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return ret; } @@ -1263,7 +1269,10 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, struct wake_q_head *wake_q) { - raw_spin_lock(&lock->wait_lock); + unsigned long flags; + + /* irqsave required to support early boot calls */ + raw_spin_lock_irqsave(&lock->wait_lock, flags); debug_rt_mutex_unlock(lock); @@ -1302,10 +1311,10 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, */ while (!rt_mutex_has_waiters(lock)) { /* Drops lock->wait_lock ! */ - if (unlock_rt_mutex_safe(lock) == true) + if (unlock_rt_mutex_safe(lock, flags) == true) return false; /* Relock the rtmutex and try again */ - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); } /* @@ -1316,7 +1325,7 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, */ mark_wakeup_next_waiter(wake_q, lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); /* check PI boosting */ return true; @@ -1596,10 +1605,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, { int ret; - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); if (try_to_take_rt_mutex(lock, task, NULL)) { - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); return 1; } @@ -1620,7 +1629,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, if (unlikely(ret)) remove_waiter(lock, waiter); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); @@ -1668,7 +1677,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, { int ret; - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); set_current_state(TASK_INTERRUPTIBLE); @@ -1684,7 +1693,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, */ fixup_rt_mutex_waiters(lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); return ret; } -- cgit From 530cbe100ef7587aa5b5ac3a4b670cda4d50e598 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 26 Jan 2016 13:52:25 +0000 Subject: irqdomain: Allow domain lookup with DOMAIN_BUS_WIRED token Let's take the (outlandish) example of an interrupt controller capable of handling both wired interrupts and PCI MSIs. With the current code, the PCI MSI domain is going to be tagged with DOMAIN_BUS_PCI_MSI, and the wired domain with DOMAIN_BUS_ANY. Things get hairy when we start looking up the domain for a wired interrupt (typically when creating it based on some firmware information - DT or ACPI). In irq_create_fwspec_mapping(), we perform the lookup using DOMAIN_BUS_ANY, which is actually used as a wildcard. This gives us one chance out of two to end up with the wrong domain, and we try to configure a wired interrupt with the MSI domain. Everything grinds to a halt pretty quickly. What we really need to do is to start looking for a domain that would uniquely identify a wired interrupt domain, and only use DOMAIN_BUS_ANY as a fallback. In order to solve this, let's introduce a new DOMAIN_BUS_WIRED token, which is going to be used exactly as described above. Of course, this depends on the irqchip to setup the domain bus_token, and nobody had to implement this so far. Only so far. Signed-off-by: Marc Zyngier Cc: Greg Kroah-Hartman Cc: Rob Herring Cc: Frank Rowand Cc: Grant Likely Cc: Thomas Petazzoni Cc: Jiang Liu Link: http://lkml.kernel.org/r/1453816347-32720-2-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- include/linux/irqdomain.h | 1 + kernel/irq/irqdomain.c | 11 ++++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index f64622ad02c1..04579d9fbce4 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -70,6 +70,7 @@ struct irq_fwspec { */ enum irq_domain_bus_token { DOMAIN_BUS_ANY = 0, + DOMAIN_BUS_WIRED, DOMAIN_BUS_PCI_MSI, DOMAIN_BUS_PLATFORM_MSI, DOMAIN_BUS_NEXUS, diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 8cf95de1ab3f..d75179735a28 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -575,10 +575,15 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) unsigned int type = IRQ_TYPE_NONE; int virq; - if (fwspec->fwnode) - domain = irq_find_matching_fwnode(fwspec->fwnode, DOMAIN_BUS_ANY); - else + if (fwspec->fwnode) { + domain = irq_find_matching_fwnode(fwspec->fwnode, + DOMAIN_BUS_WIRED); + if (!domain) + domain = irq_find_matching_fwnode(fwspec->fwnode, + DOMAIN_BUS_ANY); + } else { domain = irq_default_domain; + } if (!domain) { pr_warn("no irq domain found for %s !\n", -- cgit From 7809998ab1af22602a8463845108edc49dfb9ef0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 25 Jan 2016 16:41:49 +0100 Subject: tick/sched: Hide unused oneshot timer code A couple of functions in kernel/time/tick-sched.c are only relevant for oneshot timer mode, i.e. when hires-timers or nohz mode are enabled. If both are disabled, we get gcc warnings about them: kernel/time/tick-sched.c:98:16: warning: 'tick_init_jiffy_update' defined but not used [-Wunused-function] static ktime_t tick_init_jiffy_update(void) ^ kernel/time/tick-sched.c:112:13: warning: 'tick_sched_do_timer' defined but not used [-Wunused-function] static void tick_sched_do_timer(ktime_t now) ^ kernel/time/tick-sched.c:134:13: warning: 'tick_sched_handle' defined but not used [-Wunused-function] static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) ^ This encloses the whole set of functions in an appropriate ifdef to avoid the warning and to make it clearer when they are used. Signed-off-by: Arnd Bergmann Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1453736525-1959191-1-git-send-email-arnd@arndb.de Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 7ea28ed3109d..cbe5d8dcf15a 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -36,16 +36,17 @@ */ static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); -/* - * The time, when the last jiffy update happened. Protected by jiffies_lock. - */ -static ktime_t last_jiffies_update; - struct tick_sched *tick_get_tick_sched(int cpu) { return &per_cpu(tick_cpu_sched, cpu); } +#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) +/* + * The time, when the last jiffy update happened. Protected by jiffies_lock. + */ +static ktime_t last_jiffies_update; + /* * Must be called with interrupts disabled ! */ @@ -151,6 +152,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); } +#endif #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; -- cgit From 9c808765e88efb6fa6af7e2206ef89512f1840a7 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 15 Jan 2016 17:41:08 +0000 Subject: hrtimer: Add support for CLOCK_MONOTONIC_RAW The KVM/ARM timer implementation arms a hrtimer when a vcpu is blocked (usually because it is waiting for an interrupt) while its timer is going to kick in the future. It is essential that this timer doesn't get adjusted, or the guest will end up being woken-up at the wrong time (NTP running on the host seems to confuse the hell out of some guests). In order to allow this, let's add CLOCK_MONOTONIC_RAW support to hrtimer (it is so far only supported for posix timers). It also has the (limited) benefit of fixing de0421d53bfb ("mac80211_hwsim: shuffle code to prepare for dynamic radios"), which already uses this functionnality without realizing wasn't implemented (just being lucky...). Signed-off-by: Marc Zyngier Cc: Tomasz Nowicki Cc: Christoffer Dall Link: http://lkml.kernel.org/r/1452879670-16133-2-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 1 + kernel/time/hrtimer.c | 11 ++++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 76dd4f0da5ca..a6d64af5e73f 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -151,6 +151,7 @@ enum hrtimer_base_type { HRTIMER_BASE_REALTIME, HRTIMER_BASE_BOOTTIME, HRTIMER_BASE_TAI, + HRTIMER_BASE_MONOTONIC_RAW, HRTIMER_MAX_CLOCK_BASES, }; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 435b8850dd80..a125f222fee2 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -90,12 +90,18 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, }, + { + .index = HRTIMER_BASE_MONOTONIC_RAW, + .clockid = CLOCK_MONOTONIC_RAW, + .get_time = &ktime_get_raw, + }, } }; static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, + [CLOCK_MONOTONIC_RAW] = HRTIMER_BASE_MONOTONIC_RAW, [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, [CLOCK_TAI] = HRTIMER_BASE_TAI, }; @@ -1268,7 +1274,10 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now) if (!(active & 0x01)) continue; - basenow = ktime_add(now, base->offset); + if (unlikely(base->index == HRTIMER_BASE_MONOTONIC_RAW)) + basenow = ktime_get_raw(); + else + basenow = ktime_add(now, base->offset); while ((node = timerqueue_getnext(&base->active))) { struct hrtimer *timer; -- cgit From 9006a01829a50cfd6bbd4980910ed46e895e93d7 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 15 Jan 2016 17:41:09 +0000 Subject: hrtimer: Catch illegal clockids It is way too easy to take any random clockid and feed it to the hrtimer subsystem. At best, it gets mapped to a monotonic base, but it would be better to just catch illegal values as early as possible. This patch does exactly that, mapping illegal clockids to an illegal base index, and panicing when we detect the illegal condition. Signed-off-by: Marc Zyngier Cc: Tomasz Nowicki Cc: Christoffer Dall Link: http://lkml.kernel.org/r/1452879670-16133-3-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index a125f222fee2..cb0fe70f3c51 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -99,6 +99,9 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = }; static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { + /* Make sure we catch unsupported clockids */ + [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES, + [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, [CLOCK_MONOTONIC_RAW] = HRTIMER_BASE_MONOTONIC_RAW, @@ -108,7 +111,9 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { static inline int hrtimer_clockid_to_base(clockid_t clock_id) { - return hrtimer_clock_to_base_table[clock_id]; + int base = hrtimer_clock_to_base_table[clock_id]; + BUG_ON(base == HRTIMER_MAX_CLOCK_BASES); + return base; } /* -- cgit From bbf66d897adf2bb0c310db96c97e8db6369f39e1 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 22 Jan 2016 18:31:53 +0100 Subject: clocksource: Allow unregistering the watchdog Hyper-V vmbus module registers TSC page clocksource when loaded. This is the clocksource with the highest rating and thus it becomes the watchdog making unloading of the vmbus module impossible. Separate clocksource_select_watchdog() from clocksource_enqueue_watchdog() and use it on clocksource register/rating change/unregister. After all, lobotomized monkeys may need some love too. Signed-off-by: Vitaly Kuznetsov Cc: John Stultz Cc: Dexuan Cui Cc: K. Y. Srinivasan Link: http://lkml.kernel.org/r/1453483913-25672-1-git-send-email-vkuznets@redhat.com Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 52 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 664de539299b..56ece145a814 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -323,13 +323,42 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) /* cs is a watchdog. */ if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; + } + spin_unlock_irqrestore(&watchdog_lock, flags); +} + +static void clocksource_select_watchdog(bool fallback) +{ + struct clocksource *cs, *old_wd; + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + /* save current watchdog */ + old_wd = watchdog; + if (fallback) + watchdog = NULL; + + list_for_each_entry(cs, &clocksource_list, list) { + /* cs is a clocksource to be watched. */ + if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) + continue; + + /* Skip current if we were requested for a fallback. */ + if (fallback && cs == old_wd) + continue; + /* Pick the best watchdog. */ - if (!watchdog || cs->rating > watchdog->rating) { + if (!watchdog || cs->rating > watchdog->rating) watchdog = cs; - /* Reset watchdog cycles */ - clocksource_reset_watchdog(); - } } + /* If we failed to find a fallback restore the old one. */ + if (!watchdog) + watchdog = old_wd; + + /* If we changed the watchdog we need to reset cycles. */ + if (watchdog != old_wd) + clocksource_reset_watchdog(); + /* Check if the watchdog timer needs to be started. */ clocksource_start_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); @@ -404,6 +433,7 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; } +static void clocksource_select_watchdog(bool fallback) { } static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } static inline int __clocksource_watchdog_kthread(void) { return 0; } @@ -736,6 +766,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) clocksource_enqueue(cs); clocksource_enqueue_watchdog(cs); clocksource_select(); + clocksource_select_watchdog(false); mutex_unlock(&clocksource_mutex); return 0; } @@ -758,6 +789,7 @@ void clocksource_change_rating(struct clocksource *cs, int rating) mutex_lock(&clocksource_mutex); __clocksource_change_rating(cs, rating); clocksource_select(); + clocksource_select_watchdog(false); mutex_unlock(&clocksource_mutex); } EXPORT_SYMBOL(clocksource_change_rating); @@ -767,12 +799,12 @@ EXPORT_SYMBOL(clocksource_change_rating); */ static int clocksource_unbind(struct clocksource *cs) { - /* - * I really can't convince myself to support this on hardware - * designed by lobotomized monkeys. - */ - if (clocksource_is_watchdog(cs)) - return -EBUSY; + if (clocksource_is_watchdog(cs)) { + /* Select and try to install a replacement watchdog. */ + clocksource_select_watchdog(true); + if (clocksource_is_watchdog(cs)) + return -EBUSY; + } if (cs == curr_clocksource) { /* Select and try to install a replacement clock source */ -- cgit From 1ca8ec532fc2d986f1f4a319857bb18e0c9739b4 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 27 Jan 2016 19:26:07 +0800 Subject: tick/nohz: Set the correct expiry when switching to nohz/lowres mode commit 0ff53d096422 sets the next tick interrupt to the last jiffies update, i.e. in the past, because the forward operation is invoked before the set operation. There is no resulting damage (yet), but we get an extra pointless tick interrupt. Revert the order so we get the next tick interrupt in the future. Fixes: commit 0ff53d096422 "tick: sched: Force tick interrupt and get rid of softirq magic" Signed-off-by: Wanpeng Li Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1453893967-3458-1-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index cbe5d8dcf15a..de2d9fef6ea6 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -995,9 +995,9 @@ static void tick_nohz_switch_to_nohz(void) /* Get the next period */ next = tick_init_jiffy_update(); - hrtimer_forward_now(&ts->sched_timer, tick_period); hrtimer_set_expires(&ts->sched_timer, next); - tick_program_event(next, 1); + hrtimer_forward_now(&ts->sched_timer, tick_period); + tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); tick_nohz_activate(ts, NOHZ_MODE_LOWRES); } -- cgit From 103502a35cfce0710909da874f092cb44823ca03 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Sat, 26 Dec 2015 06:00:48 +0100 Subject: seccomp: always propagate NO_NEW_PRIVS on tsync Before this patch, a process with some permissive seccomp filter that was applied by root without NO_NEW_PRIVS was able to add more filters to itself without setting NO_NEW_PRIVS by setting the new filter from a throwaway thread with NO_NEW_PRIVS. Signed-off-by: Jann Horn Cc: stable@vger.kernel.org Signed-off-by: Kees Cook --- kernel/seccomp.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 580ac2d4024f..15a1795bbba1 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -316,24 +316,24 @@ static inline void seccomp_sync_threads(void) put_seccomp_filter(thread); smp_store_release(&thread->seccomp.filter, caller->seccomp.filter); + + /* + * Don't let an unprivileged task work around + * the no_new_privs restriction by creating + * a thread that sets it up, enters seccomp, + * then dies. + */ + if (task_no_new_privs(caller)) + task_set_no_new_privs(thread); + /* * Opt the other thread into seccomp if needed. * As threads are considered to be trust-realm * equivalent (see ptrace_may_access), it is safe to * allow one thread to transition the other. */ - if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) { - /* - * Don't let an unprivileged task work around - * the no_new_privs restriction by creating - * a thread that sets it up, enters seccomp, - * then dies. - */ - if (task_no_new_privs(caller)) - task_set_no_new_privs(thread); - + if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) seccomp_assign_mode(thread, SECCOMP_MODE_FILTER); - } } } -- cgit From 993e9fe1e70aedbb6e9670211f1ad56beea8373d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 25 Jan 2016 16:48:28 +0100 Subject: PM: APM_EMULATION does not depend on PM The APM emulation code does multiple things, and some of them depend on PM_SLEEP, while the battery management does not. However, selecting the symbol like SHARPSL_PM does causes a Kconfig warning: warning: (SHARPSL_PM && PMAC_APM_EMU) selects APM_EMULATION which has unmet direct dependencies (PM && SYS_SUPPORTS_APM_EMULATION) From all I can tell, this is completely harmless, and we can simply allow APM_EMULATION to be enabled here, even if PM is not. Signed-off-by: Arnd Bergmann Signed-off-by: Rafael J. Wysocki --- kernel/power/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 02e8dfaa1ce2..68d3ebc12601 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -235,7 +235,7 @@ config PM_TRACE_RTC config APM_EMULATION tristate "Advanced Power Management Emulation" - depends on PM && SYS_SUPPORTS_APM_EMULATION + depends on SYS_SUPPORTS_APM_EMULATION help APM is a BIOS specification for saving power using several different techniques. This is mostly useful for battery powered laptops with -- cgit From 37282a77954aa2dbb339d15902290f25b868d2e8 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Sat, 9 Jan 2016 22:55:31 -0800 Subject: tty: audit: Combine push functions tty_audit_push() and tty_audit_push_current() perform identical tasks; eliminate the tty_audit_push() implementation and the tty_audit_push_current() name. Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- drivers/tty/tty_audit.c | 35 +++-------------------------------- include/linux/tty.h | 8 ++------ kernel/audit.c | 2 +- 3 files changed, 6 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/drivers/tty/tty_audit.c b/drivers/tty/tty_audit.c index 5ae48396e265..6b82c3ce321f 100644 --- a/drivers/tty/tty_audit.c +++ b/drivers/tty/tty_audit.c @@ -172,12 +172,11 @@ void tty_audit_tiocsti(struct tty_struct *tty, char ch) } /** - * tty_audit_push_current - Flush current's pending audit data + * tty_audit_push - Flush current's pending audit data * - * Try to lock sighand and get a reference to the tty audit buffer if available. - * Flush the buffer or return an appropriate error code. + * Returns 0 if success, -EPERM if tty audit is disabled */ -int tty_audit_push_current(void) +int tty_audit_push(void) { struct tty_audit_buf *buf = ERR_PTR(-EPERM); unsigned long flags; @@ -309,31 +308,3 @@ void tty_audit_add_data(struct tty_struct *tty, const void *data, size_t size) mutex_unlock(&buf->mutex); tty_audit_buf_put(buf); } - -/** - * tty_audit_push - Push buffered data out - * - * Make sure no audit data is pending on the current process. - */ -void tty_audit_push(void) -{ - struct tty_audit_buf *buf; - unsigned long flags; - - spin_lock_irqsave(¤t->sighand->siglock, flags); - if (likely(!current->signal->audit_tty)) { - spin_unlock_irqrestore(¤t->sighand->siglock, flags); - return; - } - buf = current->signal->tty_audit_buf; - if (buf) - atomic_inc(&buf->count); - spin_unlock_irqrestore(¤t->sighand->siglock, flags); - - if (buf) { - mutex_lock(&buf->mutex); - tty_audit_buf_push(buf); - mutex_unlock(&buf->mutex); - tty_audit_buf_put(buf); - } -} diff --git a/include/linux/tty.h b/include/linux/tty.h index 83d74dcfb3c8..dea7d54d00d4 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -597,8 +597,7 @@ extern void tty_audit_add_data(struct tty_struct *tty, const void *data, extern void tty_audit_exit(void); extern void tty_audit_fork(struct signal_struct *sig); extern void tty_audit_tiocsti(struct tty_struct *tty, char ch); -extern void tty_audit_push(void); -extern int tty_audit_push_current(void); +extern int tty_audit_push(void); #else static inline void tty_audit_add_data(struct tty_struct *tty, const void *data, size_t size) @@ -613,10 +612,7 @@ static inline void tty_audit_exit(void) static inline void tty_audit_fork(struct signal_struct *sig) { } -static inline void tty_audit_push(void) -{ -} -static inline int tty_audit_push_current(void) +static inline int tty_audit_push(void) { return 0; } diff --git a/kernel/audit.c b/kernel/audit.c index 3a3e5deeda8d..610f221df069 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -920,7 +920,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (err == 1) { /* match or error */ err = 0; if (msg_type == AUDIT_USER_TTY) { - err = tty_audit_push_current(); + err = tty_audit_push(); if (err) break; } -- cgit From 2e28d38ae1d9ced6ac2deb4001aca3f267304cdb Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Sat, 9 Jan 2016 22:55:33 -0800 Subject: tty: audit: Handle tty audit enable atomically The audit_tty and audit_tty_log_passwd fields are actually bool values, so merge into single memory location to access atomically. NB: audit log operations may still occur after tty audit is disabled which is consistent with the existing functionality Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- drivers/tty/tty_audit.c | 53 ++++++++++++++++++++----------------------------- include/linux/audit.h | 4 ++++ include/linux/sched.h | 1 - kernel/audit.c | 25 +++++++++++------------ 4 files changed, 38 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/drivers/tty/tty_audit.c b/drivers/tty/tty_audit.c index 50380d87061d..3d90f88c5ff9 100644 --- a/drivers/tty/tty_audit.c +++ b/drivers/tty/tty_audit.c @@ -131,7 +131,6 @@ void tty_audit_exit(void) void tty_audit_fork(struct signal_struct *sig) { sig->audit_tty = current->signal->audit_tty; - sig->audit_tty_log_passwd = current->signal->audit_tty_log_passwd; } /** @@ -141,11 +140,9 @@ void tty_audit_tiocsti(struct tty_struct *tty, char ch) { struct tty_audit_buf *buf; dev_t dev; - int should_audit; unsigned long flags; spin_lock_irqsave(¤t->sighand->siglock, flags); - should_audit = current->signal->audit_tty; buf = current->signal->tty_audit_buf; if (buf) atomic_inc(&buf->count); @@ -160,7 +157,7 @@ void tty_audit_tiocsti(struct tty_struct *tty, char ch) tty_audit_buf_put(buf); } - if (should_audit && audit_enabled) { + if (audit_enabled && (current->signal->audit_tty & AUDIT_TTY_ENABLE)) { kuid_t auid; unsigned int sessionid; @@ -177,29 +174,25 @@ void tty_audit_tiocsti(struct tty_struct *tty, char ch) */ int tty_audit_push(void) { - struct tty_audit_buf *buf = ERR_PTR(-EPERM); + struct tty_audit_buf *buf; unsigned long flags; + if (~current->signal->audit_tty & AUDIT_TTY_ENABLE) + return -EPERM; + spin_lock_irqsave(¤t->sighand->siglock, flags); - if (current->signal->audit_tty) { - buf = current->signal->tty_audit_buf; - if (buf) - atomic_inc(&buf->count); - } + buf = current->signal->tty_audit_buf; + if (buf) + atomic_inc(&buf->count); spin_unlock_irqrestore(¤t->sighand->siglock, flags); - /* - * Return 0 when signal->audit_tty set - * but current->signal->tty_audit_buf == NULL. - */ - if (!buf || IS_ERR(buf)) - return PTR_ERR(buf); - - mutex_lock(&buf->mutex); - tty_audit_buf_push(buf); - mutex_unlock(&buf->mutex); + if (buf) { + mutex_lock(&buf->mutex); + tty_audit_buf_push(buf); + mutex_unlock(&buf->mutex); - tty_audit_buf_put(buf); + tty_audit_buf_put(buf); + } return 0; } @@ -218,8 +211,6 @@ static struct tty_audit_buf *tty_audit_buf_get(void) buf = NULL; buf2 = NULL; spin_lock_irqsave(¤t->sighand->siglock, flags); - if (likely(!current->signal->audit_tty)) - goto out; buf = current->signal->tty_audit_buf; if (buf) { atomic_inc(&buf->count); @@ -233,9 +224,10 @@ static struct tty_audit_buf *tty_audit_buf_get(void) return NULL; } - spin_lock_irqsave(¤t->sighand->siglock, flags); - if (!current->signal->audit_tty) + if (~current->signal->audit_tty & AUDIT_TTY_ENABLE) goto out; + + spin_lock_irqsave(¤t->sighand->siglock, flags); buf = current->signal->tty_audit_buf; if (!buf) { current->signal->tty_audit_buf = buf2; @@ -259,9 +251,8 @@ static struct tty_audit_buf *tty_audit_buf_get(void) void tty_audit_add_data(struct tty_struct *tty, const void *data, size_t size) { struct tty_audit_buf *buf; - int audit_log_tty_passwd; - unsigned long flags; unsigned int icanon = !!L_ICANON(tty); + unsigned int audit_tty; dev_t dev; if (unlikely(size == 0)) @@ -271,10 +262,10 @@ void tty_audit_add_data(struct tty_struct *tty, const void *data, size_t size) && tty->driver->subtype == PTY_TYPE_MASTER) return; - spin_lock_irqsave(¤t->sighand->siglock, flags); - audit_log_tty_passwd = current->signal->audit_tty_log_passwd; - spin_unlock_irqrestore(¤t->sighand->siglock, flags); - if (!audit_log_tty_passwd && icanon && !L_ECHO(tty)) + audit_tty = READ_ONCE(current->signal->audit_tty); + if (~audit_tty & AUDIT_TTY_ENABLE) + return; + if ((~audit_tty & AUDIT_TTY_LOG_PASSWD) && icanon && !L_ECHO(tty)) return; buf = tty_audit_buf_get(); diff --git a/include/linux/audit.h b/include/linux/audit.h index b40ed5df5542..e38e3fc13ea8 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -109,6 +109,10 @@ extern int audit_classify_compat_syscall(int abi, unsigned syscall); /* maximized args number that audit_socketcall can process */ #define AUDITSC_ARGS 6 +/* bit values for ->signal->audit_tty */ +#define AUDIT_TTY_ENABLE BIT(0) +#define AUDIT_TTY_LOG_PASSWD BIT(1) + struct filename; extern void audit_log_session_info(struct audit_buffer *ab); diff --git a/include/linux/sched.h b/include/linux/sched.h index a10494a94cc3..a389222e9703 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -775,7 +775,6 @@ struct signal_struct { #endif #ifdef CONFIG_AUDIT unsigned audit_tty; - unsigned audit_tty_log_passwd; struct tty_audit_buf *tty_audit_buf; #endif diff --git a/kernel/audit.c b/kernel/audit.c index 610f221df069..2651e423b2dc 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1030,20 +1030,19 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) break; case AUDIT_TTY_GET: { struct audit_tty_status s; - struct task_struct *tsk = current; + unsigned int t; - spin_lock(&tsk->sighand->siglock); - s.enabled = tsk->signal->audit_tty; - s.log_passwd = tsk->signal->audit_tty_log_passwd; - spin_unlock(&tsk->sighand->siglock); + t = READ_ONCE(current->signal->audit_tty); + s.enabled = t & AUDIT_TTY_ENABLE; + s.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD); audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_TTY_SET: { struct audit_tty_status s, old; - struct task_struct *tsk = current; struct audit_buffer *ab; + unsigned int t; memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ @@ -1053,14 +1052,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) (s.log_passwd != 0 && s.log_passwd != 1)) err = -EINVAL; - spin_lock(&tsk->sighand->siglock); - old.enabled = tsk->signal->audit_tty; - old.log_passwd = tsk->signal->audit_tty_log_passwd; - if (!err) { - tsk->signal->audit_tty = s.enabled; - tsk->signal->audit_tty_log_passwd = s.log_passwd; + if (err) + t = READ_ONCE(current->signal->audit_tty); + else { + t = s.enabled | (-s.log_passwd & AUDIT_TTY_LOG_PASSWD); + t = xchg(¤t->signal->audit_tty, t); } - spin_unlock(&tsk->sighand->siglock); + old.enabled = t & AUDIT_TTY_ENABLE; + old.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD); audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d" -- cgit From 78cd2c748f459739ff864dd9308c0f6caf7f6e41 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 Jan 2016 14:08:45 +0100 Subject: perf: Fix orphan hole We should set event->owner before we install the event, otherwise there is a hole where the target task can fork() and we'll not inherit the event because it thinks the event is orphaned. Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 9de4d352ba8c..6759f2a332d7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8489,6 +8489,8 @@ SYSCALL_DEFINE5(perf_event_open, perf_event__header_size(event); perf_event__id_header_size(event); + event->owner = current; + perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); @@ -8498,8 +8500,6 @@ SYSCALL_DEFINE5(perf_event_open, put_online_cpus(); - event->owner = current; - mutex_lock(¤t->perf_event_mutex); list_add_tail(&event->owner_entry, ¤t->perf_event_list); mutex_unlock(¤t->perf_event_mutex); -- cgit From 6a3351b612b72c558910c88a43e2ef6d7d68bc97 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 Jan 2016 14:09:54 +0100 Subject: perf: Fix race in perf_event_exit_task_context() There is a race between perf_event_exit_task_context() and orphans_remove_work() which results in a use-after-free. We mark ctx->task with TASK_TOMBSTONE to indicate a context is 'dead', under ctx->lock. After which point event_function_call() on any event of that context will NOP A concurrent orphans_remove_work() will only hold ctx->mutex for the list iteration and not serialize against this. Therefore its possible that orphans_remove_work()'s perf_remove_from_context() call will fail, but we'll continue to free the event, with the result of free'd memory still being on lists and everything. Once perf_event_exit_task_context() gets around to acquiring ctx->mutex it too will iterate the event list, encounter the already free'd event and proceed to free it _again_. This fails with the WARN in free_event(). Plug the race by having perf_event_exit_task_context() hold ctx::mutex over the whole tear-down, thereby 'naturally' serializing against all other sites, including the orphan work. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: alexander.shishkin@linux.intel.com Cc: dsahern@gmail.com Cc: namhyung@kernel.org Link: http://lkml.kernel.org/r/20160125130954.GY6357@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 50 +++++++++++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 6759f2a332d7..1d243fadfd12 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8748,14 +8748,40 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) { struct perf_event_context *child_ctx, *clone_ctx = NULL; struct perf_event *child_event, *next; - unsigned long flags; WARN_ON_ONCE(child != current); - child_ctx = perf_lock_task_context(child, ctxn, &flags); + child_ctx = perf_pin_task_context(child, ctxn); if (!child_ctx) return; + /* + * In order to reduce the amount of tricky in ctx tear-down, we hold + * ctx::mutex over the entire thing. This serializes against almost + * everything that wants to access the ctx. + * + * The exception is sys_perf_event_open() / + * perf_event_create_kernel_count() which does find_get_context() + * without ctx::mutex (it cannot because of the move_group double mutex + * lock thing). See the comments in perf_install_in_context(). + * + * We can recurse on the same lock type through: + * + * __perf_event_exit_task() + * sync_child_event() + * put_event() + * mutex_lock(&ctx->mutex) + * + * But since its the parent context it won't be the same instance. + */ + mutex_lock(&child_ctx->mutex); + + /* + * In a single ctx::lock section, de-schedule the events and detach the + * context from the task such that we cannot ever get it scheduled back + * in. + */ + raw_spin_lock_irq(&child_ctx->lock); task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx); /* @@ -8767,14 +8793,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE); put_task_struct(current); /* cannot be last */ - /* - * If this context is a clone; unclone it so it can't get - * swapped to another process while we're removing all - * the events from it. - */ clone_ctx = unclone_ctx(child_ctx); - update_context_time(child_ctx); - raw_spin_unlock_irqrestore(&child_ctx->lock, flags); + raw_spin_unlock_irq(&child_ctx->lock); if (clone_ctx) put_ctx(clone_ctx); @@ -8786,18 +8806,6 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) */ perf_event_task(child, child_ctx, 0); - /* - * We can recurse on the same lock type through: - * - * __perf_event_exit_task() - * sync_child_event() - * put_event() - * mutex_lock(&ctx->mutex) - * - * But since its the parent context it won't be the same instance. - */ - mutex_lock(&child_ctx->mutex); - list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry) __perf_event_exit_task(child_event, child_ctx, child); -- cgit From 828b6f0e26170938d617e99a17177453be4d77a3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 Jan 2016 21:59:04 +0100 Subject: perf: Fix NULL deref Dan reported: 1229 if (ctx->task == TASK_TOMBSTONE || 1230 !atomic_inc_not_zero(&ctx->refcount)) { 1231 raw_spin_unlock(&ctx->lock); 1232 ctx = NULL; ^^^^^^^^^^ ctx is NULL. 1233 } 1234 1235 WARN_ON_ONCE(ctx->task != task); ^^^^^^^^^^^^^^^^^ The patch adds a NULL dereference. Reported-by: Dan Carpenter Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Fixes: 63b6da39bb38 ("perf: Fix perf_event_exit_task() race") Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 1d243fadfd12..fe97f95f204e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1230,9 +1230,9 @@ retry: !atomic_inc_not_zero(&ctx->refcount)) { raw_spin_unlock(&ctx->lock); ctx = NULL; + } else { + WARN_ON_ONCE(ctx->task != task); } - - WARN_ON_ONCE(ctx->task != task); } rcu_read_unlock(); if (!ctx) -- cgit From e03e7ee34fdd1c3ef494949a75cb8c61c7265fa9 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 25 Jan 2016 20:59:49 -0800 Subject: perf/bpf: Convert perf_event_array to use struct file Robustify refcounting. Signed-off-by: Alexei Starovoitov Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: David Ahern Cc: Jiri Olsa Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Wang Nan Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160126045947.GA40151@ast-mbp.thefacebook.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 4 ++-- kernel/bpf/arraymap.c | 21 +++++++++++---------- kernel/events/core.c | 21 ++++++++++----------- kernel/trace/bpf_trace.c | 14 ++++++++++---- 4 files changed, 33 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 6612732d8fd0..4f90434b8d64 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -729,7 +729,7 @@ extern int perf_event_init_task(struct task_struct *child); extern void perf_event_exit_task(struct task_struct *child); extern void perf_event_free_task(struct task_struct *task); extern void perf_event_delayed_put(struct task_struct *task); -extern struct perf_event *perf_event_get(unsigned int fd); +extern struct file *perf_event_get(unsigned int fd); extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event); extern void perf_event_print_debug(void); extern void perf_pmu_disable(struct pmu *pmu); @@ -1070,7 +1070,7 @@ static inline int perf_event_init_task(struct task_struct *child) { return 0; } static inline void perf_event_exit_task(struct task_struct *child) { } static inline void perf_event_free_task(struct task_struct *task) { } static inline void perf_event_delayed_put(struct task_struct *task) { } -static inline struct perf_event *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); } +static inline struct file *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); } static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event) { return ERR_PTR(-EINVAL); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index b0799bced518..89ebbc4d1164 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -291,10 +291,13 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) { struct perf_event *event; const struct perf_event_attr *attr; + struct file *file; - event = perf_event_get(fd); - if (IS_ERR(event)) - return event; + file = perf_event_get(fd); + if (IS_ERR(file)) + return file; + + event = file->private_data; attr = perf_event_attrs(event); if (IS_ERR(attr)) @@ -304,24 +307,22 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) goto err; if (attr->type == PERF_TYPE_RAW) - return event; + return file; if (attr->type == PERF_TYPE_HARDWARE) - return event; + return file; if (attr->type == PERF_TYPE_SOFTWARE && attr->config == PERF_COUNT_SW_BPF_OUTPUT) - return event; + return file; err: - perf_event_release_kernel(event); + fput(file); return ERR_PTR(-EINVAL); } static void perf_event_fd_array_put_ptr(void *ptr) { - struct perf_event *event = ptr; - - perf_event_release_kernel(event); + fput((struct file *)ptr); } static const struct bpf_map_ops perf_event_array_ops = { diff --git a/kernel/events/core.c b/kernel/events/core.c index fe97f95f204e..eb44730afea5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8916,21 +8916,20 @@ void perf_event_delayed_put(struct task_struct *task) WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); } -struct perf_event *perf_event_get(unsigned int fd) +struct file *perf_event_get(unsigned int fd) { - int err; - struct fd f; - struct perf_event *event; + struct file *file; - err = perf_fget_light(fd, &f); - if (err) - return ERR_PTR(err); + file = fget_raw(fd); + if (!file) + return ERR_PTR(-EBADF); - event = f.file->private_data; - atomic_long_inc(&event->refcount); - fdput(f); + if (file->f_op != &perf_fops) { + fput(file); + return ERR_PTR(-EBADF); + } - return event; + return file; } const struct perf_event_attr *perf_event_attrs(struct perf_event *event) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 45dd798bcd37..326a75e884db 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -191,14 +191,17 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; struct bpf_array *array = container_of(map, struct bpf_array, map); struct perf_event *event; + struct file *file; if (unlikely(index >= array->map.max_entries)) return -E2BIG; - event = (struct perf_event *)array->ptrs[index]; - if (!event) + file = (struct file *)array->ptrs[index]; + if (unlikely(!file)) return -ENOENT; + event = file->private_data; + /* make sure event is local and doesn't have pmu::count */ if (event->oncpu != smp_processor_id() || event->pmu->count) @@ -228,6 +231,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size) void *data = (void *) (long) r4; struct perf_sample_data sample_data; struct perf_event *event; + struct file *file; struct perf_raw_record raw = { .size = size, .data = data, @@ -236,10 +240,12 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size) if (unlikely(index >= array->map.max_entries)) return -E2BIG; - event = (struct perf_event *)array->ptrs[index]; - if (unlikely(!event)) + file = (struct file *)array->ptrs[index]; + if (unlikely(!file)) return -ENOENT; + event = file->private_data; + if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE || event->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) return -EINVAL; -- cgit From a0733e695b83a9c31f779e41dcaec8ef924716b5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Jan 2016 12:14:40 +0100 Subject: perf: Remove __free_event() There is but a single caller, remove the function - we already have _free_event(), the extra indirection is nonsensical.. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 45 ++++++++++++++++++++------------------------- 1 file changed, 20 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index eb44730afea5..024adf0e34eb 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3590,7 +3590,7 @@ static void unaccount_event(struct perf_event *event) * 3) two matching events on the same context. * * The former two cases are handled in the allocation path (perf_event_alloc(), - * __free_event()), the latter -- before the first perf_install_in_context(). + * _free_event()), the latter -- before the first perf_install_in_context(). */ static int exclusive_event_init(struct perf_event *event) { @@ -3665,29 +3665,6 @@ static bool exclusive_event_installable(struct perf_event *event, return true; } -static void __free_event(struct perf_event *event) -{ - if (!event->parent) { - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - put_callchain_buffers(); - } - - perf_event_free_bpf_prog(event); - - if (event->destroy) - event->destroy(event); - - if (event->ctx) - put_ctx(event->ctx); - - if (event->pmu) { - exclusive_event_destroy(event); - module_put(event->pmu->module); - } - - call_rcu(&event->rcu_head, free_event_rcu); -} - static void _free_event(struct perf_event *event) { irq_work_sync(&event->pending); @@ -3709,7 +3686,25 @@ static void _free_event(struct perf_event *event) if (is_cgroup_event(event)) perf_detach_cgroup(event); - __free_event(event); + if (!event->parent) { + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + put_callchain_buffers(); + } + + perf_event_free_bpf_prog(event); + + if (event->destroy) + event->destroy(event); + + if (event->ctx) + put_ctx(event->ctx); + + if (event->pmu) { + exclusive_event_destroy(event); + module_put(event->pmu->module); + } + + call_rcu(&event->rcu_head, free_event_rcu); } /* -- cgit From 07c4a776135ea6d808ea19aabeb51de6b8648402 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Jan 2016 12:15:37 +0100 Subject: perf: Update locking order Update the locking order to note that ctx::lock nests inside of child_mutex, as per: perf_ioctl(): ctx::mutex -> perf_event_for_each(): event::child_mutex -> _perf_event_enable(): ctx::lock Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 024adf0e34eb..d345964c2bd6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1086,8 +1086,8 @@ static void put_ctx(struct perf_event_context *ctx) * Lock order: * task_struct::perf_event_mutex * perf_event_context::mutex - * perf_event_context::lock * perf_event::child_mutex; + * perf_event_context::lock * perf_event::mmap_mutex * mmap_sem */ -- cgit From 6e801e016917989ab8a7ddfc4229a15a5621622a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Jan 2016 12:17:08 +0100 Subject: perf: Fix STATE_EXIT usage We should never attempt to enable a STATE_EXIT event. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d345964c2bd6..d84374fa44e5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2253,7 +2253,8 @@ static void __perf_event_enable(struct perf_event *event, struct perf_event *leader = event->group_leader; struct perf_event_context *task_ctx; - if (event->state >= PERF_EVENT_STATE_INACTIVE) + if (event->state >= PERF_EVENT_STATE_INACTIVE || + event->state <= PERF_EVENT_STATE_ERROR) return; update_context_time(ctx); @@ -2298,7 +2299,8 @@ static void _perf_event_enable(struct perf_event *event) struct perf_event_context *ctx = event->ctx; raw_spin_lock_irq(&ctx->lock); - if (event->state >= PERF_EVENT_STATE_INACTIVE) { + if (event->state >= PERF_EVENT_STATE_INACTIVE || + event->state < PERF_EVENT_STATE_ERROR) { raw_spin_unlock_irq(&ctx->lock); return; } -- cgit From f47c02c0c8403963fbb8c3484e285727305d0f73 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Jan 2016 12:30:14 +0100 Subject: perf: Robustify event->owner usage and SMP ordering Use smp_store_release() to clear event->owner and lockless_dereference() to observe it. Further use READ_ONCE() for all lockless reads. This changes perf_remove_from_owner() to leave event->owner cleared. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d84374fa44e5..5f055de90c6d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -152,7 +152,7 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, static bool is_kernel_event(struct perf_event *event) { - return event->owner == TASK_TOMBSTONE; + return READ_ONCE(event->owner) == TASK_TOMBSTONE; } /* @@ -1651,7 +1651,7 @@ out: */ static bool is_orphaned_event(struct perf_event *event) { - return event && !is_kernel_event(event) && !event->owner; + return event && !is_kernel_event(event) && !READ_ONCE(event->owner); } /* @@ -3733,14 +3733,13 @@ static void perf_remove_from_owner(struct perf_event *event) struct task_struct *owner; rcu_read_lock(); - owner = ACCESS_ONCE(event->owner); /* - * Matches the smp_wmb() in perf_event_exit_task(). If we observe - * !owner it means the list deletion is complete and we can indeed - * free this event, otherwise we need to serialize on + * Matches the smp_store_release() in perf_event_exit_task(). If we + * observe !owner it means the list deletion is complete and we can + * indeed free this event, otherwise we need to serialize on * owner->perf_event_mutex. */ - smp_read_barrier_depends(); + owner = lockless_dereference(event->owner); if (owner) { /* * Since delayed_put_task_struct() also drops the last @@ -3768,8 +3767,10 @@ static void perf_remove_from_owner(struct perf_event *event) * ensured they're done, and we can proceed with freeing the * event. */ - if (event->owner) + if (event->owner) { list_del_init(&event->owner_entry); + smp_store_release(&event->owner, NULL); + } mutex_unlock(&owner->perf_event_mutex); put_task_struct(owner); } @@ -8829,8 +8830,7 @@ void perf_event_exit_task(struct task_struct *child) * the owner, closes a race against perf_release() where * we need to serialize on the owner->perf_event_mutex. */ - smp_wmb(); - event->owner = NULL; + smp_store_release(&event->owner, NULL); } mutex_unlock(&child->perf_event_mutex); -- cgit From 8ba289b8d4e4dbd1f971fbf0d2085e4776a4ba25 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Jan 2016 13:06:56 +0100 Subject: perf: Clean up sync_child_event() sync_child_event() has outgrown its purpose, it does far too much. Bring it back to its named purpose. Rename __perf_event_exit_task() to perf_event_exit_event() to better reflect what it does and move the event->state assignment under the ctx->lock, like state changes ought to be. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 81 +++++++++++++++++++++++++--------------------------- 1 file changed, 39 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 5f055de90c6d..8c3d95195f05 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1041,9 +1041,8 @@ static void put_ctx(struct perf_event_context *ctx) * perf_event_context::mutex nests and those are: * * - perf_event_exit_task_context() [ child , 0 ] - * __perf_event_exit_task() - * sync_child_event() - * put_event() [ parent, 1 ] + * perf_event_exit_event() + * put_event() [ parent, 1 ] * * - perf_event_init_context() [ parent, 0 ] * inherit_task_group() @@ -1846,7 +1845,8 @@ static void __perf_event_disable(struct perf_event *event, * remains valid. This condition is satisifed when called through * perf_event_for_each_child or perf_event_for_each because they * hold the top-level event's child_mutex, so any descendant that - * goes to exit will block in sync_child_event. + * goes to exit will block in perf_event_exit_event(). + * * When called from perf_pending_event it's OK because event->ctx * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. @@ -4086,7 +4086,7 @@ static void _perf_event_reset(struct perf_event *event) /* * Holding the top-level event's child_mutex means that any * descendant process that has inherited this event will block - * in sync_child_event if it goes to exit, thus satisfying the + * in perf_event_exit_event() if it goes to exit, thus satisfying the * task existence requirements of perf_event_enable/disable. */ static void perf_event_for_each_child(struct perf_event *event, @@ -8681,33 +8681,15 @@ static void sync_child_event(struct perf_event *child_event, &parent_event->child_total_time_enabled); atomic64_add(child_event->total_time_running, &parent_event->child_total_time_running); - - /* - * Remove this event from the parent's list - */ - WARN_ON_ONCE(parent_event->ctx->parent_ctx); - mutex_lock(&parent_event->child_mutex); - list_del_init(&child_event->child_list); - mutex_unlock(&parent_event->child_mutex); - - /* - * Make sure user/parent get notified, that we just - * lost one event. - */ - perf_event_wakeup(parent_event); - - /* - * Release the parent event, if this was the last - * reference to it. - */ - put_event(parent_event); } static void -__perf_event_exit_task(struct perf_event *child_event, - struct perf_event_context *child_ctx, - struct task_struct *child) +perf_event_exit_event(struct perf_event *child_event, + struct perf_event_context *child_ctx, + struct task_struct *child) { + struct perf_event *parent_event = child_event->parent; + /* * Do not destroy the 'original' grouping; because of the context * switch optimization the original events could've ended up in a @@ -8723,23 +8705,39 @@ __perf_event_exit_task(struct perf_event *child_event, raw_spin_lock_irq(&child_ctx->lock); WARN_ON_ONCE(child_ctx->is_active); - if (!!child_event->parent) + if (parent_event) perf_group_detach(child_event); list_del_event(child_event, child_ctx); + child_event->state = PERF_EVENT_STATE_EXIT; raw_spin_unlock_irq(&child_ctx->lock); /* - * It can happen that the parent exits first, and has events - * that are still around due to the child reference. These - * events need to be zapped. + * Parent events are governed by their filedesc, retain them. */ - if (child_event->parent) { - sync_child_event(child_event, child); - free_event(child_event); - } else { - child_event->state = PERF_EVENT_STATE_EXIT; + if (!parent_event) { perf_event_wakeup(child_event); + return; } + /* + * Child events can be cleaned up. + */ + + sync_child_event(child_event, child); + + /* + * Remove this event from the parent's list + */ + WARN_ON_ONCE(parent_event->ctx->parent_ctx); + mutex_lock(&parent_event->child_mutex); + list_del_init(&child_event->child_list); + mutex_unlock(&parent_event->child_mutex); + + /* + * Kick perf_poll() for is_event_hup(). + */ + perf_event_wakeup(parent_event); + free_event(child_event); + put_event(parent_event); } static void perf_event_exit_task_context(struct task_struct *child, int ctxn) @@ -8765,10 +8763,9 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * * We can recurse on the same lock type through: * - * __perf_event_exit_task() - * sync_child_event() - * put_event() - * mutex_lock(&ctx->mutex) + * perf_event_exit_event() + * put_event() + * mutex_lock(&ctx->mutex) * * But since its the parent context it won't be the same instance. */ @@ -8805,7 +8802,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) perf_event_task(child, child_ctx, 0); list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry) - __perf_event_exit_task(child_event, child_ctx, child); + perf_event_exit_event(child_event, child_ctx, child); mutex_unlock(&child_ctx->mutex); -- cgit From 45a0e07abf4933490a2d2f81b1a31fe267bd3561 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Jan 2016 13:09:48 +0100 Subject: perf: Add flags argument to perf_remove_from_context() In preparation to adding more options, convert the boolean argument into a flags word. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 8c3d95195f05..4291a4d27664 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1770,6 +1770,8 @@ group_sched_out(struct perf_event *group_event, cpuctx->exclusive = 0; } +#define DETACH_GROUP 0x01UL + /* * Cross CPU call to remove a performance event * @@ -1782,10 +1784,10 @@ __perf_remove_from_context(struct perf_event *event, struct perf_event_context *ctx, void *info) { - bool detach_group = (unsigned long)info; + unsigned long flags = (unsigned long)info; event_sched_out(event, cpuctx, ctx); - if (detach_group) + if (flags & DETACH_GROUP) perf_group_detach(event); list_del_event(event, ctx); @@ -1808,12 +1810,11 @@ __perf_remove_from_context(struct perf_event *event, * When called from perf_event_exit_task, it's OK because the * context has been detached from its task. */ -static void perf_remove_from_context(struct perf_event *event, bool detach_group) +static void perf_remove_from_context(struct perf_event *event, unsigned long flags) { lockdep_assert_held(&event->ctx->mutex); - event_function_call(event, __perf_remove_from_context, - (void *)(unsigned long)detach_group); + event_function_call(event, __perf_remove_from_context, (void *)flags); } /* @@ -3800,7 +3801,7 @@ static void put_event(struct perf_event *event) */ ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING); WARN_ON_ONCE(ctx->parent_ctx); - perf_remove_from_context(event, true); + perf_remove_from_context(event, DETACH_GROUP); perf_event_ctx_unlock(event, ctx); _free_event(event); @@ -3840,7 +3841,7 @@ static void orphans_remove_work(struct work_struct *work) if (!is_orphaned_child(event)) continue; - perf_remove_from_context(event, true); + perf_remove_from_context(event, DETACH_GROUP); mutex_lock(&parent_event->child_mutex); list_del_init(&event->child_list); @@ -8430,11 +8431,11 @@ SYSCALL_DEFINE5(perf_event_open, * See perf_event_ctx_lock() for comments on the details * of swizzling perf_event::ctx. */ - perf_remove_from_context(group_leader, false); + perf_remove_from_context(group_leader, 0); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { - perf_remove_from_context(sibling, false); + perf_remove_from_context(sibling, 0); put_ctx(gctx); } @@ -8614,7 +8615,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); list_for_each_entry_safe(event, tmp, &src_ctx->event_list, event_entry) { - perf_remove_from_context(event, false); + perf_remove_from_context(event, 0); unaccount_event_cpu(event, src_cpu); put_ctx(src_ctx); list_add(&event->migrate_entry, &events); @@ -9240,7 +9241,7 @@ static void __perf_event_exit_context(void *__info) raw_spin_lock(&ctx->lock); list_for_each_entry(event, &ctx->event_list, event_entry) - __perf_remove_from_context(event, cpuctx, ctx, (void *)(unsigned long)true); + __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP); raw_spin_unlock(&ctx->lock); } -- cgit From 60beda849343494b2a598b927630bbe293c1cc6e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Jan 2016 14:55:02 +0100 Subject: perf: Untangle 'owner' confusion There are two concepts of owner wrt an event and they are conflated: - event::owner / event::owner_list, used by prctl(.option = PR_TASK_PERF_EVENTS_{EN,DIS}ABLE). - the 'owner' of the event object, typically the file descriptor. Currently these two concepts are conflated, which gives trouble with scm_rights passing of file descriptors. Passing the event and then closing the creating task would render the event 'orphan' and would have it cleared out. Unlikely what is expectd. This patch untangles these two concepts by using PERF_EVENT_STATE_EXIT to denote the second type. Reported-by: Alexei Starovoitov Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 4291a4d27664..e549cf2accdd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1650,7 +1650,7 @@ out: */ static bool is_orphaned_event(struct perf_event *event) { - return event && !is_kernel_event(event) && !READ_ONCE(event->owner); + return event && event->state == PERF_EVENT_STATE_EXIT; } /* @@ -1771,6 +1771,7 @@ group_sched_out(struct perf_event *group_event, } #define DETACH_GROUP 0x01UL +#define DETACH_STATE 0x02UL /* * Cross CPU call to remove a performance event @@ -1790,6 +1791,8 @@ __perf_remove_from_context(struct perf_event *event, if (flags & DETACH_GROUP) perf_group_detach(event); list_del_event(event, ctx); + if (flags & DETACH_STATE) + event->state = PERF_EVENT_STATE_EXIT; if (!ctx->nr_events && ctx->is_active) { ctx->is_active = 0; @@ -3801,9 +3804,16 @@ static void put_event(struct perf_event *event) */ ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING); WARN_ON_ONCE(ctx->parent_ctx); - perf_remove_from_context(event, DETACH_GROUP); + perf_remove_from_context(event, DETACH_GROUP | DETACH_STATE); perf_event_ctx_unlock(event, ctx); + /* + * At this point we must have event->state == PERF_EVENT_STATE_EXIT, + * either from the above perf_remove_from_context() or through + * perf_event_exit_event(). + */ + WARN_ON_ONCE(event->state != PERF_EVENT_STATE_EXIT); + _free_event(event); } -- cgit From c6e5b73242d2d9172ea880483bc4ba7ffca0cfb2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Jan 2016 16:07:41 +0200 Subject: perf: Synchronously clean up child events The orphan cleanup workqueue doesn't always catch orphans, for example, if they never schedule after they are orphaned. IOW, the event leak is still very real. It also wouldn't work for kernel counters. Doing it synchonously is a little hairy due to lock inversion issues, but is made to work. Patch based on work by Alexander Shishkin. Suggested-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: vince@deater.net Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 3 - kernel/events/core.c | 174 ++++++++++++++++++++++----------------------- 2 files changed, 84 insertions(+), 93 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 4f90434b8d64..b35a61a481fa 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -634,9 +634,6 @@ struct perf_event_context { int nr_cgroups; /* cgroup evts */ void *task_ctx_data; /* pmu specific data */ struct rcu_head rcu_head; - - struct delayed_work orphans_remove; - bool orphans_remove_sched; }; /* diff --git a/kernel/events/core.c b/kernel/events/core.c index e549cf2accdd..98c862aff8fa 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -49,8 +49,6 @@ #include -static struct workqueue_struct *perf_wq; - typedef int (*remote_function_f)(void *); struct remote_function_call { @@ -1645,45 +1643,11 @@ out: perf_event__header_size(tmp); } -/* - * User event without the task. - */ static bool is_orphaned_event(struct perf_event *event) { - return event && event->state == PERF_EVENT_STATE_EXIT; -} - -/* - * Event has a parent but parent's task finished and it's - * alive only because of children holding refference. - */ -static bool is_orphaned_child(struct perf_event *event) -{ - return is_orphaned_event(event->parent); -} - -static void orphans_remove_work(struct work_struct *work); - -static void schedule_orphans_remove(struct perf_event_context *ctx) -{ - if (!ctx->task || ctx->orphans_remove_sched || !perf_wq) - return; - - if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) { - get_ctx(ctx); - ctx->orphans_remove_sched = true; - } -} - -static int __init perf_workqueue_init(void) -{ - perf_wq = create_singlethread_workqueue("perf"); - WARN(!perf_wq, "failed to create perf workqueue\n"); - return perf_wq ? 0 : -1; + return event->state == PERF_EVENT_STATE_EXIT; } -core_initcall(perf_workqueue_init); - static inline int pmu_filter_match(struct perf_event *event) { struct pmu *pmu = event->pmu; @@ -1744,9 +1708,6 @@ event_sched_out(struct perf_event *event, if (event->attr.exclusive || !cpuctx->active_oncpu) cpuctx->exclusive = 0; - if (is_orphaned_child(event)) - schedule_orphans_remove(ctx); - perf_pmu_enable(event->pmu); } @@ -1984,9 +1945,6 @@ event_sched_in(struct perf_event *event, if (event->attr.exclusive) cpuctx->exclusive = 1; - if (is_orphaned_child(event)) - schedule_orphans_remove(ctx); - out: perf_pmu_enable(event->pmu); @@ -3369,7 +3327,6 @@ static void __perf_event_init_context(struct perf_event_context *ctx) INIT_LIST_HEAD(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); atomic_set(&ctx->refcount, 1); - INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work); } static struct perf_event_context * @@ -3782,11 +3739,22 @@ static void perf_remove_from_owner(struct perf_event *event) static void put_event(struct perf_event *event) { - struct perf_event_context *ctx; - if (!atomic_long_dec_and_test(&event->refcount)) return; + _free_event(event); +} + +/* + * Kill an event dead; while event:refcount will preserve the event + * object, it will not preserve its functionality. Once the last 'user' + * gives up the object, we'll destroy the thing. + */ +int perf_event_release_kernel(struct perf_event *event) +{ + struct perf_event_context *ctx; + struct perf_event *child, *tmp; + if (!is_kernel_event(event)) perf_remove_from_owner(event); @@ -3811,14 +3779,70 @@ static void put_event(struct perf_event *event) * At this point we must have event->state == PERF_EVENT_STATE_EXIT, * either from the above perf_remove_from_context() or through * perf_event_exit_event(). + * + * Therefore, anybody acquiring event->child_mutex after the below + * loop _must_ also see this, most importantly inherit_event() which + * will avoid placing more children on the list. + * + * Thus this guarantees that we will in fact observe and kill _ALL_ + * child events. */ WARN_ON_ONCE(event->state != PERF_EVENT_STATE_EXIT); - _free_event(event); -} +again: + mutex_lock(&event->child_mutex); + list_for_each_entry(child, &event->child_list, child_list) { -int perf_event_release_kernel(struct perf_event *event) -{ + /* + * Cannot change, child events are not migrated, see the + * comment with perf_event_ctx_lock_nested(). + */ + ctx = lockless_dereference(child->ctx); + /* + * Since child_mutex nests inside ctx::mutex, we must jump + * through hoops. We start by grabbing a reference on the ctx. + * + * Since the event cannot get freed while we hold the + * child_mutex, the context must also exist and have a !0 + * reference count. + */ + get_ctx(ctx); + + /* + * Now that we have a ctx ref, we can drop child_mutex, and + * acquire ctx::mutex without fear of it going away. Then we + * can re-acquire child_mutex. + */ + mutex_unlock(&event->child_mutex); + mutex_lock(&ctx->mutex); + mutex_lock(&event->child_mutex); + + /* + * Now that we hold ctx::mutex and child_mutex, revalidate our + * state, if child is still the first entry, it didn't get freed + * and we can continue doing so. + */ + tmp = list_first_entry_or_null(&event->child_list, + struct perf_event, child_list); + if (tmp == child) { + perf_remove_from_context(child, DETACH_GROUP); + list_del(&child->child_list); + free_event(child); + /* + * This matches the refcount bump in inherit_event(); + * this can't be the last reference. + */ + put_event(event); + } + + mutex_unlock(&event->child_mutex); + mutex_unlock(&ctx->mutex); + put_ctx(ctx); + goto again; + } + mutex_unlock(&event->child_mutex); + + /* Must be the last reference */ put_event(event); return 0; } @@ -3829,46 +3853,10 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel); */ static int perf_release(struct inode *inode, struct file *file) { - put_event(file->private_data); + perf_event_release_kernel(file->private_data); return 0; } -/* - * Remove all orphanes events from the context. - */ -static void orphans_remove_work(struct work_struct *work) -{ - struct perf_event_context *ctx; - struct perf_event *event, *tmp; - - ctx = container_of(work, struct perf_event_context, - orphans_remove.work); - - mutex_lock(&ctx->mutex); - list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) { - struct perf_event *parent_event = event->parent; - - if (!is_orphaned_child(event)) - continue; - - perf_remove_from_context(event, DETACH_GROUP); - - mutex_lock(&parent_event->child_mutex); - list_del_init(&event->child_list); - mutex_unlock(&parent_event->child_mutex); - - free_event(event); - put_event(parent_event); - } - - raw_spin_lock_irq(&ctx->lock); - ctx->orphans_remove_sched = false; - raw_spin_unlock_irq(&ctx->lock); - mutex_unlock(&ctx->mutex); - - put_ctx(ctx); -} - u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) { struct perf_event *child; @@ -8719,7 +8707,7 @@ perf_event_exit_event(struct perf_event *child_event, if (parent_event) perf_group_detach(child_event); list_del_event(child_event, child_ctx); - child_event->state = PERF_EVENT_STATE_EXIT; + child_event->state = PERF_EVENT_STATE_EXIT; /* see perf_event_release_kernel() */ raw_spin_unlock_irq(&child_ctx->lock); /* @@ -8977,8 +8965,16 @@ inherit_event(struct perf_event *parent_event, if (IS_ERR(child_event)) return child_event; + /* + * is_orphaned_event() and list_add_tail(&parent_event->child_list) + * must be under the same lock in order to serialize against + * perf_event_release_kernel(), such that either we must observe + * is_orphaned_event() or they will observe us on the child_list. + */ + mutex_lock(&parent_event->child_mutex); if (is_orphaned_event(parent_event) || !atomic_long_inc_not_zero(&parent_event->refcount)) { + mutex_unlock(&parent_event->child_mutex); free_event(child_event); return NULL; } @@ -9026,8 +9022,6 @@ inherit_event(struct perf_event *parent_event, /* * Link this into the parent event's child list */ - WARN_ON_ONCE(parent_event->ctx->parent_ctx); - mutex_lock(&parent_event->child_mutex); list_add_tail(&child_event->child_list, &parent_event->child_list); mutex_unlock(&parent_event->child_mutex); -- cgit From 5fa7c8ec57f70a7b5c6fe269fa9c51b9e465989c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Jan 2016 15:25:15 +0100 Subject: perf: Remove/simplify lockdep annotation Now that the perf_event_ctx_lock_nested() call has moved from put_event() into perf_event_release_kernel() the first reason is no longer valid as that can no longer happen. The second reason seems to have been invalidated when Al Viro made fput() unconditionally async in the following commit: 4a9d4b024a31 ("switch fput to task_work_add") such that munmap()->fput()->release()->perf_release() would no longer happen. Therefore, remove the annotation. This should increase the efficiency of lockdep coverage of perf locking. Suggested-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 98c862aff8fa..f1e53e8d4ae2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3758,19 +3758,7 @@ int perf_event_release_kernel(struct perf_event *event) if (!is_kernel_event(event)) perf_remove_from_owner(event); - /* - * There are two ways this annotation is useful: - * - * 1) there is a lock recursion from perf_event_exit_task - * see the comment there. - * - * 2) there is a lock-inversion with mmap_sem through - * perf_read_group(), which takes faults while - * holding ctx->mutex, however this is called after - * the last filedesc died, so there is no possibility - * to trigger the AB-BA case. - */ - ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING); + ctx = perf_event_ctx_lock(event); WARN_ON_ONCE(ctx->parent_ctx); perf_remove_from_context(event, DETACH_GROUP | DETACH_STATE); perf_event_ctx_unlock(event, ctx); @@ -8759,14 +8747,6 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * perf_event_create_kernel_count() which does find_get_context() * without ctx::mutex (it cannot because of the move_group double mutex * lock thing). See the comments in perf_install_in_context(). - * - * We can recurse on the same lock type through: - * - * perf_event_exit_event() - * put_event() - * mutex_lock(&ctx->mutex) - * - * But since its the parent context it won't be the same instance. */ mutex_lock(&child_ctx->mutex); -- cgit From 6ccd83714a009ee301b50c15f6c3a5dc1f30164c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 29 Jan 2016 10:22:41 -0500 Subject: tracing/stacktrace: Show entire trace if passed in function not found When a max stack trace is discovered, the stack dump is saved. In order to not record the overhead of the stack tracer, the ip of the traced function is looked for within the dump. The trace is started from the location of that function. But if for some reason the ip is not found, the entire stack trace is then truncated. That's not very useful. Instead, print everything if the ip of the traced function is not found within the trace. This issue showed up on s390. Link: http://lkml.kernel.org/r/20160129102241.1b3c9c04@gandalf.local.home Fixes: 72ac426a5bb0 ("tracing: Clean up stack tracing and fix fentry updates") Cc: stable@vger.kernel.org # v4.3+ Reported-by: Heiko Carstens Tested-by: Heiko Carstens Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index dda9e6742950..202df6cffcca 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -125,6 +125,13 @@ check_stack(unsigned long ip, unsigned long *stack) break; } + /* + * Some archs may not have the passed in ip in the dump. + * If that happens, we need to show everything. + */ + if (i == stack_trace_max.nr_entries) + i = 0; + /* * Now find where in the stack these are. */ -- cgit From 23d11a58a9a60dcb52c8fc6494efce908b24c295 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 29 Jan 2016 05:59:46 -0500 Subject: workqueue: skip flush dependency checks for legacy workqueues fca839c00a12 ("workqueue: warn if memory reclaim tries to flush !WQ_MEM_RECLAIM workqueue") implemented flush dependency warning which triggers if a PF_MEMALLOC task or WQ_MEM_RECLAIM workqueue tries to flush a !WQ_MEM_RECLAIM workquee. This assumes that workqueues marked with WQ_MEM_RECLAIM sit in memory reclaim path and making it depend on something which may need more memory to make forward progress can lead to deadlocks. Unfortunately, workqueues created with the legacy create*_workqueue() interface always have WQ_MEM_RECLAIM regardless of whether they are depended upon memory reclaim or not. These spurious WQ_MEM_RECLAIM markings cause spurious triggering of the flush dependency checks. WARNING: CPU: 0 PID: 6 at kernel/workqueue.c:2361 check_flush_dependency+0x138/0x144() workqueue: WQ_MEM_RECLAIM deferwq:deferred_probe_work_func is flushing !WQ_MEM_RECLAIM events:lru_add_drain_per_cpu ... Workqueue: deferwq deferred_probe_work_func [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [] (show_stack) from [] (dump_stack+0x94/0xd4) [] (dump_stack) from [] (warn_slowpath_common+0x80/0xb0) [] (warn_slowpath_common) from [] (warn_slowpath_fmt+0x30/0x40) [] (warn_slowpath_fmt) from [] (check_flush_dependency+0x138/0x144) [] (check_flush_dependency) from [] (flush_work+0x50/0x15c) [] (flush_work) from [] (lru_add_drain_all+0x130/0x180) [] (lru_add_drain_all) from [] (migrate_prep+0x8/0x10) [] (migrate_prep) from [] (alloc_contig_range+0xd8/0x338) [] (alloc_contig_range) from [] (cma_alloc+0xe0/0x1ac) [] (cma_alloc) from [] (__alloc_from_contiguous+0x38/0xd8) [] (__alloc_from_contiguous) from [] (__dma_alloc+0x240/0x278) [] (__dma_alloc) from [] (arm_dma_alloc+0x54/0x5c) [] (arm_dma_alloc) from [] (dmam_alloc_coherent+0xc0/0xec) [] (dmam_alloc_coherent) from [] (ahci_port_start+0x150/0x1dc) [] (ahci_port_start) from [] (ata_host_start.part.3+0xc8/0x1c8) [] (ata_host_start.part.3) from [] (ata_host_activate+0x50/0x148) [] (ata_host_activate) from [] (ahci_host_activate+0x44/0x114) [] (ahci_host_activate) from [] (ahci_platform_init_host+0x1d8/0x3c8) [] (ahci_platform_init_host) from [] (tegra_ahci_probe+0x448/0x4e8) [] (tegra_ahci_probe) from [] (platform_drv_probe+0x50/0xac) [] (platform_drv_probe) from [] (driver_probe_device+0x214/0x2c0) [] (driver_probe_device) from [] (bus_for_each_drv+0x60/0x94) [] (bus_for_each_drv) from [] (__device_attach+0xb0/0x114) [] (__device_attach) from [] (bus_probe_device+0x84/0x8c) [] (bus_probe_device) from [] (deferred_probe_work_func+0x68/0x98) [] (deferred_probe_work_func) from [] (process_one_work+0x120/0x3f8) [] (process_one_work) from [] (worker_thread+0x38/0x55c) [] (worker_thread) from [] (kthread+0xdc/0xf4) [] (kthread) from [] (ret_from_fork+0x14/0x3c) Fix it by marking workqueues created via create*_workqueue() with __WQ_LEGACY and disabling flush dependency checks on them. Signed-off-by: Tejun Heo Reported-and-tested-by: Thierry Reding Link: http://lkml.kernel.org/g/20160126173843.GA11115@ulmo.nvidia.com Fixes: fca839c00a12 ("workqueue: warn if memory reclaim tries to flush !WQ_MEM_RECLAIM workqueue") --- include/linux/workqueue.h | 9 +++++---- kernel/workqueue.c | 3 ++- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 0e32bc71245e..ca73c503b92a 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -311,6 +311,7 @@ enum { __WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */ __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ + __WQ_LEGACY = 1 << 18, /* internal: create*_workqueue() */ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_MAX_UNBOUND_PER_CPU = 4, /* 4 * #cpus for unbound wq */ @@ -411,12 +412,12 @@ __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args) #define create_workqueue(name) \ - alloc_workqueue("%s", WQ_MEM_RECLAIM, 1, (name)) + alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name)) #define create_freezable_workqueue(name) \ - alloc_workqueue("%s", WQ_FREEZABLE | WQ_UNBOUND | WQ_MEM_RECLAIM, \ - 1, (name)) + alloc_workqueue("%s", __WQ_LEGACY | WQ_FREEZABLE | WQ_UNBOUND | \ + WQ_MEM_RECLAIM, 1, (name)) #define create_singlethread_workqueue(name) \ - alloc_ordered_workqueue("%s", WQ_MEM_RECLAIM, name) + alloc_ordered_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, name) extern void destroy_workqueue(struct workqueue_struct *wq); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 61a0264e28f9..dc7faad63646 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2355,7 +2355,8 @@ static void check_flush_dependency(struct workqueue_struct *target_wq, WARN_ONCE(current->flags & PF_MEMALLOC, "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%pf", current->pid, current->comm, target_wq->name, target_func); - WARN_ONCE(worker && (worker->current_pwq->wq->flags & WQ_MEM_RECLAIM), + WARN_ONCE(worker && ((worker->current_pwq->wq->flags & + (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM), "workqueue: WQ_MEM_RECLAIM %s:%pf is flushing !WQ_MEM_RECLAIM %s:%pf", worker->current_pwq->wq->name, worker->current_func, target_wq->name, target_func); -- cgit From eb7d78c9e7f6418932bd5fbee45eb46d5ab05002 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 29 Jan 2016 21:48:34 -0800 Subject: devm_memremap_pages: fix vmem_altmap lifetime + alignment handling to_vmem_altmap() needs to return valid results until arch_remove_memory() completes. It also needs to be valid for any pfn in a section regardless of whether that pfn maps to data. This escape was a result of a bug in the unit test. The signature of this bug is that free_pagetable() fails to retrieve a vmem_altmap and goes off into the weeds: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] get_pfnblock_flags_mask+0x49/0x60 [..] Call Trace: [] free_hot_cold_page+0x97/0x1d0 [] __free_pages+0x2a/0x40 [] free_pagetable+0x8c/0xd4 [] remove_pagetable+0x37a/0x808 [] vmemmap_free+0x10/0x20 Fixes: 4b94ffdc4163 ("x86, mm: introduce vmem_altmap to augment vmemmap_populate()") Cc: Andrew Morton Reported-by: Jeff Moyer Signed-off-by: Dan Williams --- kernel/memremap.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index e517a16cb426..cbc3e97e2bb4 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -183,7 +183,11 @@ EXPORT_SYMBOL(put_zone_device_page); static void pgmap_radix_release(struct resource *res) { - resource_size_t key; + resource_size_t key, align_start, align_size, align_end; + + align_start = res->start & ~(SECTION_SIZE - 1); + align_size = ALIGN(resource_size(res), SECTION_SIZE); + align_end = align_start + align_size - 1; mutex_lock(&pgmap_lock); for (key = res->start; key <= res->end; key += SECTION_SIZE) @@ -226,12 +230,11 @@ static void devm_memremap_pages_release(struct device *dev, void *data) percpu_ref_put(pgmap->ref); } - pgmap_radix_release(res); - /* pages are dead and unused, undo the arch mapping */ align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(resource_size(res), SECTION_SIZE); arch_remove_memory(align_start, align_size); + pgmap_radix_release(res); dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, "%s: failed to free all reserved pages\n", __func__); } @@ -267,7 +270,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, { int is_ram = region_intersects(res->start, resource_size(res), "System RAM"); - resource_size_t key, align_start, align_size; + resource_size_t key, align_start, align_size, align_end; struct dev_pagemap *pgmap; struct page_map *page_map; unsigned long pfn; @@ -309,7 +312,10 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, mutex_lock(&pgmap_lock); error = 0; - for (key = res->start; key <= res->end; key += SECTION_SIZE) { + align_start = res->start & ~(SECTION_SIZE - 1); + align_size = ALIGN(resource_size(res), SECTION_SIZE); + align_end = align_start + align_size - 1; + for (key = align_start; key <= align_end; key += SECTION_SIZE) { struct dev_pagemap *dup; rcu_read_lock(); @@ -336,8 +342,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (nid < 0) nid = numa_mem_id(); - align_start = res->start & ~(SECTION_SIZE - 1); - align_size = ALIGN(resource_size(res), SECTION_SIZE); error = arch_add_memory(nid, align_start, align_size, true); if (error) goto err_add_memory; -- cgit From 840d6fe7425ffb6a62d53b2759e01ae6daf90e4e Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Sat, 30 Jan 2016 10:04:17 +0800 Subject: pid: Fix spelling in comments Accidentally discovered this typo when I studied this module. Signed-off-by: Zhen Lei Cc: Hanjun Guo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tianhong Ding Cc: Xinwei Hu Cc: Zefan Li Link: http://lkml.kernel.org/r/1454119457-11272-1-git-send-email-thunder.leizhen@huawei.com Signed-off-by: Ingo Molnar --- kernel/pid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 78b3d9f80d44..e793d091f680 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -588,7 +588,7 @@ void __init pidhash_init(void) void __init pidmap_init(void) { - /* Veryify no one has done anything silly */ + /* Verify no one has done anything silly: */ BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING); /* bump default and minimum pid_max based on number of cpus */ -- cgit From a3650d53ba16ec412185abb98f231e9ba6bcdc65 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 Jan 2016 21:57:18 +0100 Subject: resource: Handle resource flags properly I/O resource flags consist of I/O resource types and modifier bits. Therefore, checking an I/O resource type in 'flags' must be performed with a bitwise operation. Fix find_next_iomem_res() and region_intersects() that simply compare 'flags' against a given value. Also change __request_region() to set 'res->flags' from resource_type() and resource_ext_type() of the parent, so that children nodes will inherit the extended I/O resource type. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dan Williams Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jakub Sitnicki Cc: Jiang Liu Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Thomas Gleixner Cc: Toshi Kani Cc: Vinod Koul Cc: linux-arch@vger.kernel.org Cc: linux-mm Link: http://lkml.kernel.org/r/1453841853-11383-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- kernel/resource.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 09c0597840b0..96afc8027487 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -358,7 +358,7 @@ static int find_next_iomem_res(struct resource *res, char *name, read_lock(&resource_lock); for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) { - if (p->flags != res->flags) + if ((p->flags & res->flags) != res->flags) continue; if (name && strcmp(p->name, name)) continue; @@ -519,7 +519,8 @@ int region_intersects(resource_size_t start, size_t size, const char *name) read_lock(&resource_lock); for (p = iomem_resource.child; p ; p = p->sibling) { - bool is_type = strcmp(p->name, name) == 0 && p->flags == flags; + bool is_type = strcmp(p->name, name) == 0 && + ((p->flags & flags) == flags); if (start >= p->start && start <= p->end) is_type ? type++ : other++; @@ -1071,7 +1072,7 @@ struct resource * __request_region(struct resource *parent, res->name = name; res->start = start; res->end = start + n - 1; - res->flags = resource_type(parent); + res->flags = resource_type(parent) | resource_ext_type(parent); res->flags |= IORESOURCE_BUSY | flags; write_lock(&resource_lock); -- cgit From 43ee493bde78da00deaf5737925365c691a036ad Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 Jan 2016 21:57:19 +0100 Subject: resource: Add I/O resource descriptor walk_iomem_res() and region_intersects() still need to use strcmp() for searching a resource entry by @name in the iomem table. This patch introduces I/O resource descriptor 'desc' in struct resource for the iomem search interfaces. Drivers can assign their unique descriptor to a range when they support the search interfaces. Otherwise, 'desc' is set to IORES_DESC_NONE (0). This avoids changing most of the drivers as they typically allocate resource entries statically, or by calling alloc_resource(), kzalloc(), or alloc_bootmem_low(), which set the field to zero by default. A later patch will address some drivers that use kmalloc() without zero'ing the field. Also change release_mem_region_adjustable() to set 'desc' when its resource entry gets separated. Other resource interfaces are also changed to initialize 'desc' explicitly although alloc_resource() sets it to 0. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dan Williams Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jakub Sitnicki Cc: Jiang Liu Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-arch@vger.kernel.org Cc: linux-mm Link: http://lkml.kernel.org/r/1453841853-11383-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- include/linux/ioport.h | 18 ++++++++++++++++++ kernel/resource.c | 5 +++++ 2 files changed, 23 insertions(+) (limited to 'kernel') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 4b65d944717f..983bea05d69c 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -20,6 +20,7 @@ struct resource { resource_size_t end; const char *name; unsigned long flags; + unsigned long desc; struct resource *parent, *sibling, *child; }; @@ -112,6 +113,22 @@ struct resource { /* PCI control bits. Shares IORESOURCE_BITS with above PCI ROM. */ #define IORESOURCE_PCI_FIXED (1<<4) /* Do not move resource */ +/* + * I/O Resource Descriptors + * + * Descriptors are used by walk_iomem_res_desc() and region_intersects() + * for searching a specific resource range in the iomem table. Assign + * a new descriptor when a resource range supports the search interfaces. + * Otherwise, resource.desc must be set to IORES_DESC_NONE (0). + */ +enum { + IORES_DESC_NONE = 0, + IORES_DESC_CRASH_KERNEL = 1, + IORES_DESC_ACPI_TABLES = 2, + IORES_DESC_ACPI_NV_STORAGE = 3, + IORES_DESC_PERSISTENT_MEMORY = 4, + IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5, +}; /* helpers to define resources */ #define DEFINE_RES_NAMED(_start, _size, _name, _flags) \ @@ -120,6 +137,7 @@ struct resource { .end = (_start) + (_size) - 1, \ .name = (_name), \ .flags = (_flags), \ + .desc = IORES_DESC_NONE, \ } #define DEFINE_RES_IO_NAMED(_start, _size, _name) \ diff --git a/kernel/resource.c b/kernel/resource.c index 96afc8027487..61512e972ece 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -949,6 +949,7 @@ static void __init __reserve_region_with_split(struct resource *root, res->start = start; res->end = end; res->flags = IORESOURCE_BUSY; + res->desc = IORES_DESC_NONE; while (1) { @@ -983,6 +984,7 @@ static void __init __reserve_region_with_split(struct resource *root, next_res->start = conflict->end + 1; next_res->end = end; next_res->flags = IORESOURCE_BUSY; + next_res->desc = IORES_DESC_NONE; } } else { res->start = conflict->end + 1; @@ -1074,6 +1076,7 @@ struct resource * __request_region(struct resource *parent, res->end = start + n - 1; res->flags = resource_type(parent) | resource_ext_type(parent); res->flags |= IORESOURCE_BUSY | flags; + res->desc = IORES_DESC_NONE; write_lock(&resource_lock); @@ -1238,6 +1241,7 @@ int release_mem_region_adjustable(struct resource *parent, new_res->start = end + 1; new_res->end = res->end; new_res->flags = res->flags; + new_res->desc = res->desc; new_res->parent = res->parent; new_res->sibling = res->sibling; new_res->child = NULL; @@ -1413,6 +1417,7 @@ static int __init reserve_setup(char *str) res->start = io_start; res->end = io_start + io_num - 1; res->flags = IORESOURCE_BUSY; + res->desc = IORES_DESC_NONE; res->child = NULL; if (request_resource(res->start >= 0x10000 ? &iomem_resource : &ioport_resource, res) == 0) reserved = x+1; -- cgit From 1a085d0727afaedb9506f04798516298b1676e11 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 Jan 2016 21:57:23 +0100 Subject: kexec: Set IORESOURCE_SYSTEM_RAM for System RAM Set proper ioresource flags and types for crash kernel reservation areas. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Reviewed-by: Dave Young Cc: Andrew Morton Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: HATAYAMA Daisuke Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Minfei Huang Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: Vivek Goyal Cc: kexec@lists.infradead.org Cc: linux-arch@vger.kernel.org Cc: linux-mm Link: http://lkml.kernel.org/r/1453841853-11383-8-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- kernel/kexec_core.c | 8 +++++--- kernel/kexec_file.c | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 8dc659144869..8d34308ea449 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -66,13 +66,15 @@ struct resource crashk_res = { .name = "Crash kernel", .start = 0, .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, + .desc = IORES_DESC_CRASH_KERNEL }; struct resource crashk_low_res = { .name = "Crash kernel", .start = 0, .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, + .desc = IORES_DESC_CRASH_KERNEL }; int kexec_should_crash(struct task_struct *p) @@ -959,7 +961,7 @@ int crash_shrink_memory(unsigned long new_size) ram_res->start = end; ram_res->end = crashk_res.end; - ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; + ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; ram_res->name = "System RAM"; crashk_res.end = end - 1; diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 007b791f676d..2bfcdc064116 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -525,7 +525,7 @@ int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, /* Walk the RAM ranges and allocate a suitable range for the buffer */ if (image->type == KEXEC_TYPE_CRASH) ret = walk_iomem_res("Crash kernel", - IORESOURCE_MEM | IORESOURCE_BUSY, + IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY, crashk_res.start, crashk_res.end, kbuf, locate_mem_hole_callback); else -- cgit From bd7e6cb30ced147292d854a54d4a1f5c5a05d927 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 Jan 2016 21:57:26 +0100 Subject: resource: Change walk_system_ram() to use System RAM type Now that all System RAM resource entries have been initialized to IORESOURCE_SYSTEM_RAM type, change walk_system_ram_res() and walk_system_ram_range() to call find_next_iomem_res() by setting @res.flags to IORESOURCE_SYSTEM_RAM and @name to NULL. With this change, they walk through the iomem table to find System RAM ranges without the need to do strcmp() on the resource names. No functional change is made to the interfaces. Signed-off-by: Toshi Kani [ Boris: fixup comments. ] Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dan Williams Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jakub Sitnicki Cc: Jiang Liu Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-arch@vger.kernel.org Cc: linux-mm Link: http://lkml.kernel.org/r/1453841853-11383-11-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- kernel/resource.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 61512e972ece..994f1e41269b 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -415,11 +415,11 @@ int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, } /* - * This function calls callback against all memory range of "System RAM" - * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. - * Now, this function is only for "System RAM". This function deals with - * full ranges and not pfn. If resources are not pfn aligned, dealing - * with pfn can truncate ranges. + * This function calls the @func callback against all memory ranges of type + * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY. + * Now, this function is only for System RAM, it deals with full ranges and + * not PFNs. If resources are not PFN-aligned, dealing with PFNs can truncate + * ranges. */ int walk_system_ram_res(u64 start, u64 end, void *arg, int (*func)(u64, u64, void *)) @@ -430,10 +430,10 @@ int walk_system_ram_res(u64 start, u64 end, void *arg, res.start = start; res.end = end; - res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; orig_end = res.end; while ((res.start < res.end) && - (!find_next_iomem_res(&res, "System RAM", true))) { + (!find_next_iomem_res(&res, NULL, true))) { ret = (*func)(res.start, res.end, arg); if (ret) break; @@ -446,9 +446,9 @@ int walk_system_ram_res(u64 start, u64 end, void *arg, #if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) /* - * This function calls callback against all memory range of "System RAM" - * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. - * Now, this function is only for "System RAM". + * This function calls the @func callback against all memory ranges of type + * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY. + * It is to be used only for System RAM. */ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, void *arg, int (*func)(unsigned long, unsigned long, void *)) @@ -460,10 +460,10 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, res.start = (u64) start_pfn << PAGE_SHIFT; res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; - res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; orig_end = res.end; while ((res.start < res.end) && - (find_next_iomem_res(&res, "System RAM", true) >= 0)) { + (find_next_iomem_res(&res, NULL, true) >= 0)) { pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; end_pfn = (res.end + 1) >> PAGE_SHIFT; if (end_pfn > pfn) @@ -484,7 +484,7 @@ static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg) } /* * This generic page_is_ram() returns true if specified address is - * registered as "System RAM" in iomem_resource list. + * registered as System RAM in iomem_resource list. */ int __weak page_is_ram(unsigned long pfn) { -- cgit From 1c29f25bf5d6c557017f619b638c619cbbf798c4 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 Jan 2016 21:57:28 +0100 Subject: memremap: Change region_intersects() to take @flags and @desc Change region_intersects() to identify a target with @flags and @desc, instead of @name with strcmp(). Change the callers of region_intersects(), memremap() and devm_memremap(), to set IORESOURCE_SYSTEM_RAM in @flags and IORES_DESC_NONE in @desc when searching System RAM. Also, export region_intersects() so that the ACPI EINJ error injection driver can call this function in a later patch. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Acked-by: Dan Williams Cc: Andrew Morton Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jakub Sitnicki Cc: Jan Kara Cc: Jiang Liu Cc: Kees Cook Cc: Kirill A. Shutemov Cc: Konstantin Khlebnikov Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Peter Zijlstra Cc: Tejun Heo Cc: Thomas Gleixner Cc: Toshi Kani Cc: Vlastimil Babka Cc: linux-arch@vger.kernel.org Cc: linux-mm Link: http://lkml.kernel.org/r/1453841853-11383-13-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- include/linux/mm.h | 3 ++- kernel/memremap.c | 13 +++++++------ kernel/resource.c | 26 +++++++++++++++----------- 3 files changed, 24 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/mm.h b/include/linux/mm.h index f1cd22f2df1a..cd5a300d3397 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -385,7 +385,8 @@ enum { REGION_MIXED, }; -int region_intersects(resource_size_t offset, size_t size, const char *type); +int region_intersects(resource_size_t offset, size_t size, unsigned long flags, + unsigned long desc); /* Support for virtually mapped pages */ struct page *vmalloc_to_page(const void *addr); diff --git a/kernel/memremap.c b/kernel/memremap.c index e517a16cb426..293309cac061 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -47,7 +47,7 @@ static void *try_ram_remap(resource_size_t offset, size_t size) * being mapped does not have i/o side effects and the __iomem * annotation is not applicable. * - * MEMREMAP_WB - matches the default mapping for "System RAM" on + * MEMREMAP_WB - matches the default mapping for System RAM on * the architecture. This is usually a read-allocate write-back cache. * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM * memremap() will bypass establishing a new mapping and instead return @@ -56,11 +56,12 @@ static void *try_ram_remap(resource_size_t offset, size_t size) * MEMREMAP_WT - establish a mapping whereby writes either bypass the * cache or are written through to memory and never exist in a * cache-dirty state with respect to program visibility. Attempts to - * map "System RAM" with this mapping type will fail. + * map System RAM with this mapping type will fail. */ void *memremap(resource_size_t offset, size_t size, unsigned long flags) { - int is_ram = region_intersects(offset, size, "System RAM"); + int is_ram = region_intersects(offset, size, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); void *addr = NULL; if (is_ram == REGION_MIXED) { @@ -76,7 +77,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) * MEMREMAP_WB is special in that it can be satisifed * from the direct map. Some archs depend on the * capability of memremap() to autodetect cases where - * the requested range is potentially in "System RAM" + * the requested range is potentially in System RAM. */ if (is_ram == REGION_INTERSECTS) addr = try_ram_remap(offset, size); @@ -88,7 +89,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) * If we don't have a mapping yet and more request flags are * pending then we will be attempting to establish a new virtual * address mapping. Enforce that this mapping is not aliasing - * "System RAM" + * System RAM. */ if (!addr && is_ram == REGION_INTERSECTS && flags) { WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n", @@ -266,7 +267,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, struct percpu_ref *ref, struct vmem_altmap *altmap) { int is_ram = region_intersects(res->start, resource_size(res), - "System RAM"); + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); resource_size_t key, align_start, align_size; struct dev_pagemap *pgmap; struct page_map *page_map; diff --git a/kernel/resource.c b/kernel/resource.c index 994f1e41269b..0041cedc47d6 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -496,31 +496,34 @@ EXPORT_SYMBOL_GPL(page_is_ram); * region_intersects() - determine intersection of region with known resources * @start: region start address * @size: size of region - * @name: name of resource (in iomem_resource) + * @flags: flags of resource (in iomem_resource) + * @desc: descriptor of resource (in iomem_resource) or IORES_DESC_NONE * * Check if the specified region partially overlaps or fully eclipses a - * resource identified by @name. Return REGION_DISJOINT if the region - * does not overlap @name, return REGION_MIXED if the region overlaps - * @type and another resource, and return REGION_INTERSECTS if the - * region overlaps @type and no other defined resource. Note, that - * REGION_INTERSECTS is also returned in the case when the specified - * region overlaps RAM and undefined memory holes. + * resource identified by @flags and @desc (optional with IORES_DESC_NONE). + * Return REGION_DISJOINT if the region does not overlap @flags/@desc, + * return REGION_MIXED if the region overlaps @flags/@desc and another + * resource, and return REGION_INTERSECTS if the region overlaps @flags/@desc + * and no other defined resource. Note that REGION_INTERSECTS is also + * returned in the case when the specified region overlaps RAM and undefined + * memory holes. * * region_intersect() is used by memory remapping functions to ensure * the user is not remapping RAM and is a vast speed up over walking * through the resource table page by page. */ -int region_intersects(resource_size_t start, size_t size, const char *name) +int region_intersects(resource_size_t start, size_t size, unsigned long flags, + unsigned long desc) { - unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY; resource_size_t end = start + size - 1; int type = 0; int other = 0; struct resource *p; read_lock(&resource_lock); for (p = iomem_resource.child; p ; p = p->sibling) { - bool is_type = strcmp(p->name, name) == 0 && - ((p->flags & flags) == flags); + bool is_type = (((p->flags & flags) == flags) && + ((desc == IORES_DESC_NONE) || + (desc == p->desc))); if (start >= p->start && start <= p->end) is_type ? type++ : other++; @@ -539,6 +542,7 @@ int region_intersects(resource_size_t start, size_t size, const char *name) return REGION_DISJOINT; } +EXPORT_SYMBOL_GPL(region_intersects); void __weak arch_remove_reservations(struct resource *avail) { -- cgit From 3f33647c41962401272bb60dce67e6094d14dbf2 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 Jan 2016 21:57:29 +0100 Subject: resource: Add walk_iomem_res_desc() Add a new interface, walk_iomem_res_desc(), which walks through the iomem table by identifying a target with @flags and @desc. This interface provides the same functionality as walk_iomem_res(), but does not use strcmp() to @name for better efficiency. walk_iomem_res() is deprecated and will be removed in a later patch. Requested-by: Borislav Petkov Signed-off-by: Toshi Kani [ Fixup comments. ] Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dan Williams Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Hanjun Guo Cc: Jakub Sitnicki Cc: Jiang Liu Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-arch@vger.kernel.org Cc: linux-mm Link: http://lkml.kernel.org/r/1453841853-11383-14-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- include/linux/ioport.h | 3 +++ kernel/resource.c | 66 ++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 59 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 983bea05d69c..2a4a5e839965 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -268,6 +268,9 @@ extern int walk_system_ram_res(u64 start, u64 end, void *arg, int (*func)(u64, u64, void *)); extern int +walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end, + void *arg, int (*func)(u64, u64, void *)); +extern int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, void *arg, int (*func)(u64, u64, void *)); diff --git a/kernel/resource.c b/kernel/resource.c index 0041cedc47d6..37ed2fcb8246 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -333,14 +333,15 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); /* - * Finds the lowest iomem reosurce exists with-in [res->start.res->end) - * the caller must specify res->start, res->end, res->flags and "name". - * If found, returns 0, res is overwritten, if not found, returns -1. - * This walks through whole tree and not just first level children - * until and unless first_level_children_only is true. + * Finds the lowest iomem resource existing within [res->start.res->end). + * The caller must specify res->start, res->end, res->flags, and optionally + * desc and "name". If found, returns 0, res is overwritten, if not found, + * returns -1. + * This function walks the whole tree and not just first level children until + * and unless first_level_children_only is true. */ -static int find_next_iomem_res(struct resource *res, char *name, - bool first_level_children_only) +static int find_next_iomem_res(struct resource *res, unsigned long desc, + char *name, bool first_level_children_only) { resource_size_t start, end; struct resource *p; @@ -360,6 +361,8 @@ static int find_next_iomem_res(struct resource *res, char *name, for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) { if ((p->flags & res->flags) != res->flags) continue; + if ((desc != IORES_DESC_NONE) && (desc != p->desc)) + continue; if (name && strcmp(p->name, name)) continue; if (p->start > end) { @@ -385,12 +388,55 @@ static int find_next_iomem_res(struct resource *res, char *name, * Walks through iomem resources and calls func() with matching resource * ranges. This walks through whole tree and not just first level children. * All the memory ranges which overlap start,end and also match flags and + * desc are valid candidates. + * + * @desc: I/O resource descriptor. Use IORES_DESC_NONE to skip @desc check. + * @flags: I/O resource flags + * @start: start addr + * @end: end addr + * + * NOTE: For a new descriptor search, define a new IORES_DESC in + * and set it in 'desc' of a target resource entry. + */ +int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, + u64 end, void *arg, int (*func)(u64, u64, void *)) +{ + struct resource res; + u64 orig_end; + int ret = -1; + + res.start = start; + res.end = end; + res.flags = flags; + orig_end = res.end; + + while ((res.start < res.end) && + (!find_next_iomem_res(&res, desc, NULL, false))) { + + ret = (*func)(res.start, res.end, arg); + if (ret) + break; + + res.start = res.end + 1; + res.end = orig_end; + } + + return ret; +} + +/* + * Walks through iomem resources and calls @func with matching resource + * ranges. This walks the whole tree and not just first level children. + * All the memory ranges which overlap start,end and also match flags and * name are valid candidates. * * @name: name of resource * @flags: resource flags * @start: start addr * @end: end addr + * + * NOTE: This function is deprecated and should not be used in new code. + * Use walk_iomem_res_desc(), instead. */ int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, void *arg, int (*func)(u64, u64, void *)) @@ -404,7 +450,7 @@ int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, res.flags = flags; orig_end = res.end; while ((res.start < res.end) && - (!find_next_iomem_res(&res, name, false))) { + (!find_next_iomem_res(&res, IORES_DESC_NONE, name, false))) { ret = (*func)(res.start, res.end, arg); if (ret) break; @@ -433,7 +479,7 @@ int walk_system_ram_res(u64 start, u64 end, void *arg, res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; orig_end = res.end; while ((res.start < res.end) && - (!find_next_iomem_res(&res, NULL, true))) { + (!find_next_iomem_res(&res, IORES_DESC_NONE, NULL, true))) { ret = (*func)(res.start, res.end, arg); if (ret) break; @@ -463,7 +509,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; orig_end = res.end; while ((res.start < res.end) && - (find_next_iomem_res(&res, NULL, true) >= 0)) { + (find_next_iomem_res(&res, IORES_DESC_NONE, NULL, true) >= 0)) { pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; end_pfn = (res.end + 1) >> PAGE_SHIFT; if (end_pfn > pfn) -- cgit From f0f4711aa16b82016c0b6e59871934bbd71258da Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 Jan 2016 21:57:30 +0100 Subject: x86, kexec, nvdimm: Use walk_iomem_res_desc() for iomem search Change the callers of walk_iomem_res() scanning for the following resources by name to use walk_iomem_res_desc() instead. "ACPI Tables" "ACPI Non-volatile Storage" "Persistent Memory (legacy)" "Crash kernel" Note, the caller of walk_iomem_res() with "GART" will be removed in a later patch. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Reviewed-by: Dave Young Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Chun-Yi Cc: Dan Williams Cc: Denys Vlasenko Cc: Don Zickus Cc: H. Peter Anvin Cc: Lee, Chun-Yi Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Minfei Huang Cc: Peter Zijlstra (Intel) Cc: Ross Zwisler Cc: Stephen Rothwell Cc: Takao Indoh Cc: Thomas Gleixner Cc: Toshi Kani Cc: kexec@lists.infradead.org Cc: linux-arch@vger.kernel.org Cc: linux-mm Cc: linux-nvdimm@lists.01.org Link: http://lkml.kernel.org/r/1453841853-11383-15-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/crash.c | 4 ++-- arch/x86/kernel/pmem.c | 4 ++-- drivers/nvdimm/e820.c | 2 +- kernel/kexec_file.c | 8 ++++---- 4 files changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 58f34319b29a..35e152eeb6e0 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -599,12 +599,12 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) /* Add ACPI tables */ cmd.type = E820_ACPI; flags = IORESOURCE_MEM | IORESOURCE_BUSY; - walk_iomem_res("ACPI Tables", flags, 0, -1, &cmd, + walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, &cmd, memmap_entry_callback); /* Add ACPI Non-volatile Storage */ cmd.type = E820_NVS; - walk_iomem_res("ACPI Non-volatile Storage", flags, 0, -1, &cmd, + walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd, memmap_entry_callback); /* Add crashk_low_res region */ diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c index 14415aff1813..92f70147a9a6 100644 --- a/arch/x86/kernel/pmem.c +++ b/arch/x86/kernel/pmem.c @@ -13,11 +13,11 @@ static int found(u64 start, u64 end, void *data) static __init int register_e820_pmem(void) { - char *pmem = "Persistent Memory (legacy)"; struct platform_device *pdev; int rc; - rc = walk_iomem_res(pmem, IORESOURCE_MEM, 0, -1, NULL, found); + rc = walk_iomem_res_desc(IORES_DESC_PERSISTENT_MEMORY_LEGACY, + IORESOURCE_MEM, 0, -1, NULL, found); if (rc <= 0) return 0; diff --git a/drivers/nvdimm/e820.c b/drivers/nvdimm/e820.c index b0045a505dc8..95825b38559a 100644 --- a/drivers/nvdimm/e820.c +++ b/drivers/nvdimm/e820.c @@ -55,7 +55,7 @@ static int e820_pmem_probe(struct platform_device *pdev) for (p = iomem_resource.child; p ; p = p->sibling) { struct nd_region_desc ndr_desc; - if (strncmp(p->name, "Persistent Memory (legacy)", 26) != 0) + if (p->desc != IORES_DESC_PERSISTENT_MEMORY_LEGACY) continue; memset(&ndr_desc, 0, sizeof(ndr_desc)); diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 2bfcdc064116..56b18eb1f001 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -524,10 +524,10 @@ int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, /* Walk the RAM ranges and allocate a suitable range for the buffer */ if (image->type == KEXEC_TYPE_CRASH) - ret = walk_iomem_res("Crash kernel", - IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY, - crashk_res.start, crashk_res.end, kbuf, - locate_mem_hole_callback); + ret = walk_iomem_res_desc(crashk_res.desc, + IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY, + crashk_res.start, crashk_res.end, kbuf, + locate_mem_hole_callback); else ret = walk_system_ram_res(0, -1, kbuf, locate_mem_hole_callback); -- cgit From a8fc42530ddd19d7580fe8c9f2ea86220a97e94c Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 Jan 2016 21:57:32 +0100 Subject: resource: Kill walk_iomem_res() walk_iomem_res_desc() replaced walk_iomem_res() and there is no caller to walk_iomem_res() any more. Kill it. Also remove @name from find_next_iomem_res() as it is no longer used. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Acked-by: Dave Young Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dan Williams Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Hanjun Guo Cc: Jakub Sitnicki Cc: Jiang Liu Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: Vinod Koul Cc: linux-arch@vger.kernel.org Cc: linux-mm Link: http://lkml.kernel.org/r/1453841853-11383-17-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- include/linux/ioport.h | 3 --- kernel/resource.c | 49 +++++-------------------------------------------- 2 files changed, 5 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 2a4a5e839965..afb45597fb5f 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -270,9 +270,6 @@ walk_system_ram_res(u64 start, u64 end, void *arg, extern int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end, void *arg, int (*func)(u64, u64, void *)); -extern int -walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, void *arg, - int (*func)(u64, u64, void *)); /* True if any part of r1 overlaps r2 */ static inline bool resource_overlaps(struct resource *r1, struct resource *r2) diff --git a/kernel/resource.c b/kernel/resource.c index 37ed2fcb8246..49834309043c 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -335,13 +335,12 @@ EXPORT_SYMBOL(release_resource); /* * Finds the lowest iomem resource existing within [res->start.res->end). * The caller must specify res->start, res->end, res->flags, and optionally - * desc and "name". If found, returns 0, res is overwritten, if not found, - * returns -1. + * desc. If found, returns 0, res is overwritten, if not found, returns -1. * This function walks the whole tree and not just first level children until * and unless first_level_children_only is true. */ static int find_next_iomem_res(struct resource *res, unsigned long desc, - char *name, bool first_level_children_only) + bool first_level_children_only) { resource_size_t start, end; struct resource *p; @@ -363,8 +362,6 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc, continue; if ((desc != IORES_DESC_NONE) && (desc != p->desc)) continue; - if (name && strcmp(p->name, name)) - continue; if (p->start > end) { p = NULL; break; @@ -411,7 +408,7 @@ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, orig_end = res.end; while ((res.start < res.end) && - (!find_next_iomem_res(&res, desc, NULL, false))) { + (!find_next_iomem_res(&res, desc, false))) { ret = (*func)(res.start, res.end, arg); if (ret) @@ -424,42 +421,6 @@ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, return ret; } -/* - * Walks through iomem resources and calls @func with matching resource - * ranges. This walks the whole tree and not just first level children. - * All the memory ranges which overlap start,end and also match flags and - * name are valid candidates. - * - * @name: name of resource - * @flags: resource flags - * @start: start addr - * @end: end addr - * - * NOTE: This function is deprecated and should not be used in new code. - * Use walk_iomem_res_desc(), instead. - */ -int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, - void *arg, int (*func)(u64, u64, void *)) -{ - struct resource res; - u64 orig_end; - int ret = -1; - - res.start = start; - res.end = end; - res.flags = flags; - orig_end = res.end; - while ((res.start < res.end) && - (!find_next_iomem_res(&res, IORES_DESC_NONE, name, false))) { - ret = (*func)(res.start, res.end, arg); - if (ret) - break; - res.start = res.end + 1; - res.end = orig_end; - } - return ret; -} - /* * This function calls the @func callback against all memory ranges of type * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY. @@ -479,7 +440,7 @@ int walk_system_ram_res(u64 start, u64 end, void *arg, res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; orig_end = res.end; while ((res.start < res.end) && - (!find_next_iomem_res(&res, IORES_DESC_NONE, NULL, true))) { + (!find_next_iomem_res(&res, IORES_DESC_NONE, true))) { ret = (*func)(res.start, res.end, arg); if (ret) break; @@ -509,7 +470,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; orig_end = res.end; while ((res.start < res.end) && - (find_next_iomem_res(&res, IORES_DESC_NONE, NULL, true) >= 0)) { + (find_next_iomem_res(&res, IORES_DESC_NONE, true) >= 0)) { pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; end_pfn = (res.end + 1) >> PAGE_SHIFT; if (end_pfn > pfn) -- cgit From e11b956e9ebef098cc4f81964a1f57e40fe75cd4 Mon Sep 17 00:00:00 2001 From: Li Bin Date: Sat, 30 Jan 2016 11:54:03 +0800 Subject: kernel/Makefile: remove the useless CFLAGS_REMOVE_cgroup-debug.o The file cgroup-debug.c had been removed from commit fe6934354f8e (cgroups: move the cgroup debug subsys into cgroup.c to access internal state). Remain the CFLAGS_REMOVE_cgroup-debug.o = $(CC_FLAGS_FTRACE) useless in kernel/Makefile. Signed-off-by: Li Bin Acked-by: Zefan Li Signed-off-by: Tejun Heo --- kernel/Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 53abf008ecb3..baa55e50a315 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -14,8 +14,7 @@ obj-y = fork.o exec_domain.o panic.o \ obj-$(CONFIG_MULTIUSER) += groups.o ifdef CONFIG_FUNCTION_TRACER -# Do not trace debug files and internal ftrace files -CFLAGS_REMOVE_cgroup-debug.o = $(CC_FLAGS_FTRACE) +# Do not trace internal ftrace files CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE) endif -- cgit From 76e9f0ee52b0be5761e29847e0ef01f23f24f1df Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 22 Jan 2016 09:43:28 -0800 Subject: phys_to_pfn_t: use phys_addr_t A dma_addr_t is potentially smaller than a phys_addr_t on some archs. Don't truncate the address when doing the pfn conversion. Cc: Ross Zwisler Reported-by: Matthew Wilcox [willy: fix pfn_t_to_phys as well] Signed-off-by: Dan Williams --- include/linux/pfn_t.h | 4 ++-- kernel/memremap.c | 2 +- tools/testing/nvdimm/test/iomap.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index 0703b5360d31..37448ab5fb5c 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -29,7 +29,7 @@ static inline pfn_t pfn_to_pfn_t(unsigned long pfn) return __pfn_to_pfn_t(pfn, 0); } -extern pfn_t phys_to_pfn_t(dma_addr_t addr, unsigned long flags); +extern pfn_t phys_to_pfn_t(phys_addr_t addr, unsigned long flags); static inline bool pfn_t_has_page(pfn_t pfn) { @@ -48,7 +48,7 @@ static inline struct page *pfn_t_to_page(pfn_t pfn) return NULL; } -static inline dma_addr_t pfn_t_to_phys(pfn_t pfn) +static inline phys_addr_t pfn_t_to_phys(pfn_t pfn) { return PFN_PHYS(pfn_t_to_pfn(pfn)); } diff --git a/kernel/memremap.c b/kernel/memremap.c index cbc3e97e2bb4..70ee3775de24 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -150,7 +150,7 @@ void devm_memunmap(struct device *dev, void *addr) } EXPORT_SYMBOL(devm_memunmap); -pfn_t phys_to_pfn_t(dma_addr_t addr, unsigned long flags) +pfn_t phys_to_pfn_t(phys_addr_t addr, unsigned long flags) { return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags); } diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c index 7ec7df9e7fc7..0c1a7e65bb81 100644 --- a/tools/testing/nvdimm/test/iomap.c +++ b/tools/testing/nvdimm/test/iomap.c @@ -113,7 +113,7 @@ void *__wrap_devm_memremap_pages(struct device *dev, struct resource *res, } EXPORT_SYMBOL(__wrap_devm_memremap_pages); -pfn_t __wrap_phys_to_pfn_t(dma_addr_t addr, unsigned long flags) +pfn_t __wrap_phys_to_pfn_t(phys_addr_t addr, unsigned long flags) { struct nfit_test_resource *nfit_res = get_nfit_res(addr); -- cgit From 4355efbd80482a961cae849281a8ef866e53d55c Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 3 Feb 2016 16:55:26 +1030 Subject: modules: fix modparam async_probe request Commit f2411da746985 ("driver-core: add driver module asynchronous probe support") added async probe support, in two forms: * in-kernel driver specification annotation * generic async_probe module parameter (modprobe foo async_probe) To support the generic kernel parameter parse_args() was extended via commit ecc8617053e0 ("module: add extra argument for parse_params() callback") however commit failed to f2411da746985 failed to add the required argument. This causes a crash then whenever async_probe generic module parameter is used. This was overlooked when the form in which in-kernel async probe support was reworked a bit... Fix this as originally intended. Cc: Hannes Reinecke Cc: Dmitry Torokhov Cc: stable@vger.kernel.org (4.2+) Signed-off-by: Luis R. Rodriguez Signed-off-by: Rusty Russell [minimized] --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 8358f4697c0c..2149f7003e49 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3496,7 +3496,7 @@ static int load_module(struct load_info *info, const char __user *uargs, /* Module is ready to execute: parsing args may do that. */ after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, - -32768, 32767, NULL, + -32768, 32767, mod, unknown_module_param_cb); if (IS_ERR(after_dashes)) { err = PTR_ERR(after_dashes); -- cgit From 2e7bac536106236104e9e339531ff0fcdb7b8147 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 3 Feb 2016 16:55:26 +1030 Subject: module: wrapper for symbol name. This trivial wrapper adds clarity and makes the following patch smaller. Cc: stable@kernel.org Signed-off-by: Rusty Russell --- kernel/module.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 2149f7003e49..1e79d8157712 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3627,6 +3627,11 @@ static inline int is_arm_mapping_symbol(const char *str) && (str[2] == '\0' || str[2] == '.'); } +static const char *symname(struct module *mod, unsigned int symnum) +{ + return mod->strtab + mod->symtab[symnum].st_name; +} + static const char *get_ksymbol(struct module *mod, unsigned long addr, unsigned long *size, @@ -3649,15 +3654,15 @@ static const char *get_ksymbol(struct module *mod, /* We ignore unnamed symbols: they're uninformative * and inserted at a whim. */ + if (*symname(mod, i) == '\0' + || is_arm_mapping_symbol(symname(mod, i))) + continue; + if (mod->symtab[i].st_value <= addr - && mod->symtab[i].st_value > mod->symtab[best].st_value - && *(mod->strtab + mod->symtab[i].st_name) != '\0' - && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name)) + && mod->symtab[i].st_value > mod->symtab[best].st_value) best = i; if (mod->symtab[i].st_value > addr - && mod->symtab[i].st_value < nextval - && *(mod->strtab + mod->symtab[i].st_name) != '\0' - && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name)) + && mod->symtab[i].st_value < nextval) nextval = mod->symtab[i].st_value; } @@ -3668,7 +3673,7 @@ static const char *get_ksymbol(struct module *mod, *size = nextval - mod->symtab[best].st_value; if (offset) *offset = addr - mod->symtab[best].st_value; - return mod->strtab + mod->symtab[best].st_name; + return symname(mod, best); } /* For kallsyms to ask for address resolution. NULL means not found. Careful @@ -3763,8 +3768,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, if (symnum < mod->num_symtab) { *value = mod->symtab[symnum].st_value; *type = mod->symtab[symnum].st_info; - strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, - KSYM_NAME_LEN); + strlcpy(name, symname(mod, symnum), KSYM_NAME_LEN); strlcpy(module_name, mod->name, MODULE_NAME_LEN); *exported = is_exported(name, *value, mod); preempt_enable(); @@ -3781,7 +3785,7 @@ static unsigned long mod_find_symname(struct module *mod, const char *name) unsigned int i; for (i = 0; i < mod->num_symtab; i++) - if (strcmp(name, mod->strtab+mod->symtab[i].st_name) == 0 && + if (strcmp(name, symname(mod, i)) == 0 && mod->symtab[i].st_info != 'U') return mod->symtab[i].st_value; return 0; @@ -3825,7 +3829,7 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, if (mod->state == MODULE_STATE_UNFORMED) continue; for (i = 0; i < mod->num_symtab; i++) { - ret = fn(data, mod->strtab + mod->symtab[i].st_name, + ret = fn(data, symname(mod, i), mod, mod->symtab[i].st_value); if (ret != 0) return ret; -- cgit From 8244062ef1e54502ef55f54cced659913f244c3e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 3 Feb 2016 16:55:26 +1030 Subject: modules: fix longstanding /proc/kallsyms vs module insertion race. For CONFIG_KALLSYMS, we keep two symbol tables and two string tables. There's one full copy, marked SHF_ALLOC and laid out at the end of the module's init section. There's also a cut-down version that only contains core symbols and strings, and lives in the module's core section. After module init (and before we free the module memory), we switch the mod->symtab, mod->num_symtab and mod->strtab to point to the core versions. We do this under the module_mutex. However, kallsyms doesn't take the module_mutex: it uses preempt_disable() and rcu tricks to walk through the modules, because it's used in the oops path. It's also used in /proc/kallsyms. There's nothing atomic about the change of these variables, so we can get the old (larger!) num_symtab and the new symtab pointer; in fact this is what I saw when trying to reproduce. By grouping these variables together, we can use a carefully-dereferenced pointer to ensure we always get one or the other (the free of the module init section is already done in an RCU callback, so that's safe). We allocate the init one at the end of the module init section, and keep the core one inside the struct module itself (it could also have been allocated at the end of the module core, but that's probably overkill). Reported-by: Weilong Chen Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=111541 Cc: stable@kernel.org Signed-off-by: Rusty Russell --- include/linux/module.h | 19 +++++---- kernel/module.c | 112 ++++++++++++++++++++++++++++++------------------- 2 files changed, 79 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index 4560d8f1545d..2bb0c3085706 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -324,6 +324,12 @@ struct module_layout { #define __module_layout_align #endif +struct mod_kallsyms { + Elf_Sym *symtab; + unsigned int num_symtab; + char *strtab; +}; + struct module { enum module_state state; @@ -405,15 +411,10 @@ struct module { #endif #ifdef CONFIG_KALLSYMS - /* - * We keep the symbol and string tables for kallsyms. - * The core_* fields below are temporary, loader-only (they - * could really be discarded after module init). - */ - Elf_Sym *symtab, *core_symtab; - unsigned int num_symtab, core_num_syms; - char *strtab, *core_strtab; - + /* Protected by RCU and/or module_mutex: use rcu_dereference() */ + struct mod_kallsyms *kallsyms; + struct mod_kallsyms core_kallsyms; + /* Section attributes */ struct module_sect_attrs *sect_attrs; diff --git a/kernel/module.c b/kernel/module.c index 1e79d8157712..9537da37ce87 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -303,6 +303,9 @@ struct load_info { struct _ddebug *debug; unsigned int num_debug; bool sig_ok; +#ifdef CONFIG_KALLSYMS + unsigned long mod_kallsyms_init_off; +#endif struct { unsigned int sym, str, mod, vers, info, pcpu; } index; @@ -2480,10 +2483,21 @@ static void layout_symtab(struct module *mod, struct load_info *info) strsect->sh_flags |= SHF_ALLOC; strsect->sh_entsize = get_offset(mod, &mod->init_layout.size, strsect, info->index.str) | INIT_OFFSET_MASK; - mod->init_layout.size = debug_align(mod->init_layout.size); pr_debug("\t%s\n", info->secstrings + strsect->sh_name); + + /* We'll tack temporary mod_kallsyms on the end. */ + mod->init_layout.size = ALIGN(mod->init_layout.size, + __alignof__(struct mod_kallsyms)); + info->mod_kallsyms_init_off = mod->init_layout.size; + mod->init_layout.size += sizeof(struct mod_kallsyms); + mod->init_layout.size = debug_align(mod->init_layout.size); } +/* + * We use the full symtab and strtab which layout_symtab arranged to + * be appended to the init section. Later we switch to the cut-down + * core-only ones. + */ static void add_kallsyms(struct module *mod, const struct load_info *info) { unsigned int i, ndst; @@ -2492,29 +2506,34 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) char *s; Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; - mod->symtab = (void *)symsec->sh_addr; - mod->num_symtab = symsec->sh_size / sizeof(Elf_Sym); + /* Set up to point into init section. */ + mod->kallsyms = mod->init_layout.base + info->mod_kallsyms_init_off; + + mod->kallsyms->symtab = (void *)symsec->sh_addr; + mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym); /* Make sure we get permanent strtab: don't use info->strtab. */ - mod->strtab = (void *)info->sechdrs[info->index.str].sh_addr; + mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr; /* Set types up while we still have access to sections. */ - for (i = 0; i < mod->num_symtab; i++) - mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); - - mod->core_symtab = dst = mod->core_layout.base + info->symoffs; - mod->core_strtab = s = mod->core_layout.base + info->stroffs; - src = mod->symtab; - for (ndst = i = 0; i < mod->num_symtab; i++) { + for (i = 0; i < mod->kallsyms->num_symtab; i++) + mod->kallsyms->symtab[i].st_info + = elf_type(&mod->kallsyms->symtab[i], info); + + /* Now populate the cut down core kallsyms for after init. */ + mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs; + mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs; + src = mod->kallsyms->symtab; + for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) { if (i == 0 || is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, info->index.pcpu)) { dst[ndst] = src[i]; - dst[ndst++].st_name = s - mod->core_strtab; - s += strlcpy(s, &mod->strtab[src[i].st_name], + dst[ndst++].st_name = s - mod->core_kallsyms.strtab; + s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name], KSYM_NAME_LEN) + 1; } } - mod->core_num_syms = ndst; + mod->core_kallsyms.num_symtab = ndst; } #else static inline void layout_symtab(struct module *mod, struct load_info *info) @@ -3263,9 +3282,8 @@ static noinline int do_init_module(struct module *mod) module_put(mod); trim_init_extable(mod); #ifdef CONFIG_KALLSYMS - mod->num_symtab = mod->core_num_syms; - mod->symtab = mod->core_symtab; - mod->strtab = mod->core_strtab; + /* Switch to core kallsyms now init is done: kallsyms may be walking! */ + rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms); #endif mod_tree_remove_init(mod); disable_ro_nx(&mod->init_layout); @@ -3627,9 +3645,9 @@ static inline int is_arm_mapping_symbol(const char *str) && (str[2] == '\0' || str[2] == '.'); } -static const char *symname(struct module *mod, unsigned int symnum) +static const char *symname(struct mod_kallsyms *kallsyms, unsigned int symnum) { - return mod->strtab + mod->symtab[symnum].st_name; + return kallsyms->strtab + kallsyms->symtab[symnum].st_name; } static const char *get_ksymbol(struct module *mod, @@ -3639,6 +3657,7 @@ static const char *get_ksymbol(struct module *mod, { unsigned int i, best = 0; unsigned long nextval; + struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); /* At worse, next value is at end of module */ if (within_module_init(addr, mod)) @@ -3648,32 +3667,32 @@ static const char *get_ksymbol(struct module *mod, /* Scan for closest preceding symbol, and next symbol. (ELF starts real symbols at 1). */ - for (i = 1; i < mod->num_symtab; i++) { - if (mod->symtab[i].st_shndx == SHN_UNDEF) + for (i = 1; i < kallsyms->num_symtab; i++) { + if (kallsyms->symtab[i].st_shndx == SHN_UNDEF) continue; /* We ignore unnamed symbols: they're uninformative * and inserted at a whim. */ - if (*symname(mod, i) == '\0' - || is_arm_mapping_symbol(symname(mod, i))) + if (*symname(kallsyms, i) == '\0' + || is_arm_mapping_symbol(symname(kallsyms, i))) continue; - if (mod->symtab[i].st_value <= addr - && mod->symtab[i].st_value > mod->symtab[best].st_value) + if (kallsyms->symtab[i].st_value <= addr + && kallsyms->symtab[i].st_value > kallsyms->symtab[best].st_value) best = i; - if (mod->symtab[i].st_value > addr - && mod->symtab[i].st_value < nextval) - nextval = mod->symtab[i].st_value; + if (kallsyms->symtab[i].st_value > addr + && kallsyms->symtab[i].st_value < nextval) + nextval = kallsyms->symtab[i].st_value; } if (!best) return NULL; if (size) - *size = nextval - mod->symtab[best].st_value; + *size = nextval - kallsyms->symtab[best].st_value; if (offset) - *offset = addr - mod->symtab[best].st_value; - return symname(mod, best); + *offset = addr - kallsyms->symtab[best].st_value; + return symname(kallsyms, best); } /* For kallsyms to ask for address resolution. NULL means not found. Careful @@ -3763,18 +3782,21 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { + struct mod_kallsyms *kallsyms; + if (mod->state == MODULE_STATE_UNFORMED) continue; - if (symnum < mod->num_symtab) { - *value = mod->symtab[symnum].st_value; - *type = mod->symtab[symnum].st_info; - strlcpy(name, symname(mod, symnum), KSYM_NAME_LEN); + kallsyms = rcu_dereference_sched(mod->kallsyms); + if (symnum < kallsyms->num_symtab) { + *value = kallsyms->symtab[symnum].st_value; + *type = kallsyms->symtab[symnum].st_info; + strlcpy(name, symname(kallsyms, symnum), KSYM_NAME_LEN); strlcpy(module_name, mod->name, MODULE_NAME_LEN); *exported = is_exported(name, *value, mod); preempt_enable(); return 0; } - symnum -= mod->num_symtab; + symnum -= kallsyms->num_symtab; } preempt_enable(); return -ERANGE; @@ -3783,11 +3805,12 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, static unsigned long mod_find_symname(struct module *mod, const char *name) { unsigned int i; + struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); - for (i = 0; i < mod->num_symtab; i++) - if (strcmp(name, symname(mod, i)) == 0 && - mod->symtab[i].st_info != 'U') - return mod->symtab[i].st_value; + for (i = 0; i < kallsyms->num_symtab; i++) + if (strcmp(name, symname(kallsyms, i)) == 0 && + kallsyms->symtab[i].st_info != 'U') + return kallsyms->symtab[i].st_value; return 0; } @@ -3826,11 +3849,14 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, module_assert_mutex(); list_for_each_entry(mod, &modules, list) { + /* We hold module_mutex: no need for rcu_dereference_sched */ + struct mod_kallsyms *kallsyms = mod->kallsyms; + if (mod->state == MODULE_STATE_UNFORMED) continue; - for (i = 0; i < mod->num_symtab; i++) { - ret = fn(data, symname(mod, i), - mod, mod->symtab[i].st_value); + for (i = 0; i < kallsyms->num_symtab; i++) { + ret = fn(data, symname(kallsyms, i), + mod, kallsyms->symtab[i].st_value); if (ret != 0) return ret; } -- cgit From 0306e481d479a58eff17c27adf213fbb5822946b Mon Sep 17 00:00:00 2001 From: Shilpasri G Bhat Date: Wed, 3 Feb 2016 01:11:40 +0530 Subject: cpufreq: powernv/tracing: Add powernv_throttle tracepoint This patch adds the powernv_throttle tracepoint to trace the CPU frequency throttling event, which is used by the powernv-cpufreq driver in POWER8. Signed-off-by: Shilpasri G Bhat Reviewed-by: Gautham R. Shenoy Signed-off-by: Rafael J. Wysocki --- include/trace/events/power.h | 22 ++++++++++++++++++++++ kernel/trace/power-traces.c | 1 + 2 files changed, 23 insertions(+) (limited to 'kernel') diff --git a/include/trace/events/power.h b/include/trace/events/power.h index 284244ebfe8d..19e50300ce7d 100644 --- a/include/trace/events/power.h +++ b/include/trace/events/power.h @@ -38,6 +38,28 @@ DEFINE_EVENT(cpu, cpu_idle, TP_ARGS(state, cpu_id) ); +TRACE_EVENT(powernv_throttle, + + TP_PROTO(int chip_id, const char *reason, int pmax), + + TP_ARGS(chip_id, reason, pmax), + + TP_STRUCT__entry( + __field(int, chip_id) + __string(reason, reason) + __field(int, pmax) + ), + + TP_fast_assign( + __entry->chip_id = chip_id; + __assign_str(reason, reason); + __entry->pmax = pmax; + ), + + TP_printk("Chip %d Pmax %d %s", __entry->chip_id, + __entry->pmax, __get_str(reason)) +); + TRACE_EVENT(pstate_sample, TP_PROTO(u32 core_busy, diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index eb4220a132ec..81b87451c0ea 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -15,4 +15,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); +EXPORT_TRACEPOINT_SYMBOL_GPL(powernv_throttle); -- cgit From a6e4491c682a7b28574a62e6f311a0acec50b318 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Thu, 4 Feb 2016 09:38:00 -0500 Subject: sched/isolcpus: Output warning when the 'isolcpus=' kernel parameter is invalid The isolcpus= kernel boot parameter restricts userspace from scheduling on the specified CPUs. If a CPU is specified that is outside the range of 0 to nr_cpu_ids, cpulist_parse() will return -ERANGE, return an empty cpulist, and fail silently. This patch adds an error message to isolated_cpu_setup() to indicate to the user that something has gone awry, and returns 0 on error. Signed-off-by: Prarit Bhargava Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454596680-10367-1-git-send-email-prarit@redhat.com [ Twiddled some details. ] Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9503d590e5ef..24fcdbf28b18 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6173,11 +6173,16 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) { + int ret; + alloc_bootmem_cpumask_var(&cpu_isolated_map); - cpulist_parse(str, cpu_isolated_map); + ret = cpulist_parse(str, cpu_isolated_map); + if (ret) { + pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); + return 0; + } return 1; } - __setup("isolcpus=", isolated_cpu_setup); struct s_data { -- cgit From 823dd3224a07f618d652a7743c9603222d019de3 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Fri, 5 Feb 2016 15:36:05 -0800 Subject: signals: avoid random wakeups in sigsuspend() A random wakeup can get us out of sigsuspend() without TIF_SIGPENDING being set. Avoid that by making sure we were signaled, like sys_pause() does. Signed-off-by: Sasha Levin Acked-by: Oleg Nesterov Acked-by: Peter Zijlstra (Intel) Cc: Dmitry Vyukov Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index f3f1f7a972fd..0508544c8ced 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3508,8 +3508,10 @@ static int sigsuspend(sigset_t *set) current->saved_sigmask = current->blocked; set_current_blocked(set); - __set_current_state(TASK_INTERRUPTIBLE); - schedule(); + while (!signal_pending(current)) { + __set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } set_restore_sigmask(); return -ERESTARTNOHAND; } -- cgit From 824bd0ce6c7c43a9e1e210abf124958e54d88342 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Feb 2016 22:39:53 -0800 Subject: bpf: introduce BPF_MAP_TYPE_PERCPU_HASH map Introduce BPF_MAP_TYPE_PERCPU_HASH map type which is used to do accurate counters without need to use BPF_XADD instruction which turned out to be too costly for high-performance network monitoring. In the typical use case the 'key' is the flow tuple or other long living object that sees a lot of events per second. bpf_map_lookup_elem() returns per-cpu area. Example: struct { u32 packets; u32 bytes; } * ptr = bpf_map_lookup_elem(&map, &key); /* ptr points to this_cpu area of the value, so the following * increments will not collide with other cpus */ ptr->packets ++; ptr->bytes += skb->len; bpf_update_elem() atomically creates a new element where all per-cpu values are zero initialized and this_cpu value is populated with given 'value'. Note that non-per-cpu hash map always allocates new element and then deletes old after rcu grace period to maintain atomicity of update. Per-cpu hash map updates element values in-place. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + kernel/bpf/hashtab.c | 275 +++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 229 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index aa6f8571de13..43ae40c8763e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -81,6 +81,7 @@ enum bpf_map_type { BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PROG_ARRAY, BPF_MAP_TYPE_PERF_EVENT_ARRAY, + BPF_MAP_TYPE_PERCPU_HASH, }; enum bpf_prog_type { diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index c5b30fd8a315..2be5f6e8bb04 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -31,21 +31,27 @@ struct bpf_htab { struct htab_elem { struct hlist_node hash_node; struct rcu_head rcu; - u32 hash; + union { + u32 hash; + u32 key_size; + }; char key[0] __aligned(8); }; /* Called from syscall */ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { + bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_HASH; struct bpf_htab *htab; int err, i; + u64 cost; htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) return ERR_PTR(-ENOMEM); /* mandatory map attributes */ + htab->map.map_type = attr->map_type; htab->map.key_size = attr->key_size; htab->map.value_size = attr->value_size; htab->map.max_entries = attr->max_entries; @@ -77,24 +83,34 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) */ goto free_htab; + if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE) + /* make sure the size for pcpu_alloc() is reasonable */ + goto free_htab; + htab->elem_size = sizeof(struct htab_elem) + - round_up(htab->map.key_size, 8) + - htab->map.value_size; + round_up(htab->map.key_size, 8); + if (percpu) + htab->elem_size += sizeof(void *); + else + htab->elem_size += htab->map.value_size; /* prevent zero size kmalloc and check for u32 overflow */ if (htab->n_buckets == 0 || htab->n_buckets > U32_MAX / sizeof(struct bucket)) goto free_htab; - if ((u64) htab->n_buckets * sizeof(struct bucket) + - (u64) htab->elem_size * htab->map.max_entries >= - U32_MAX - PAGE_SIZE) + cost = (u64) htab->n_buckets * sizeof(struct bucket) + + (u64) htab->elem_size * htab->map.max_entries; + + if (percpu) + cost += (u64) round_up(htab->map.value_size, 8) * + num_possible_cpus() * htab->map.max_entries; + + if (cost >= U32_MAX - PAGE_SIZE) /* make sure page count doesn't overflow */ goto free_htab; - htab->map.pages = round_up(htab->n_buckets * sizeof(struct bucket) + - htab->elem_size * htab->map.max_entries, - PAGE_SIZE) >> PAGE_SHIFT; + htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; err = -ENOMEM; htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket), @@ -148,7 +164,7 @@ static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash, } /* Called from syscall or from eBPF program */ -static void *htab_map_lookup_elem(struct bpf_map *map, void *key) +static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_head *head; @@ -166,6 +182,13 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key) l = lookup_elem_raw(head, hash, key, key_size); + return l; +} + +static void *htab_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct htab_elem *l = __htab_map_lookup_elem(map, key); + if (l) return l->key + round_up(map->key_size, 8); @@ -230,65 +253,139 @@ find_first_elem: return -ENOENT; } + +static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, + void __percpu *pptr) +{ + *(void __percpu **)(l->key + key_size) = pptr; +} + +static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) +{ + return *(void __percpu **)(l->key + key_size); +} + +static void htab_percpu_elem_free(struct htab_elem *l) +{ + free_percpu(htab_elem_get_ptr(l, l->key_size)); + kfree(l); +} + +static void htab_percpu_elem_free_rcu(struct rcu_head *head) +{ + struct htab_elem *l = container_of(head, struct htab_elem, rcu); + + htab_percpu_elem_free(l); +} + +static void free_htab_elem(struct htab_elem *l, bool percpu, u32 key_size) +{ + if (percpu) { + l->key_size = key_size; + call_rcu(&l->rcu, htab_percpu_elem_free_rcu); + } else { + kfree_rcu(l, rcu); + } +} + +static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, + void *value, u32 key_size, u32 hash, + bool percpu) +{ + u32 size = htab->map.value_size; + struct htab_elem *l_new; + void __percpu *pptr; + + l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); + if (!l_new) + return NULL; + + memcpy(l_new->key, key, key_size); + if (percpu) { + /* round up value_size to 8 bytes */ + size = round_up(size, 8); + + /* alloc_percpu zero-fills */ + pptr = __alloc_percpu_gfp(size, 8, GFP_ATOMIC | __GFP_NOWARN); + if (!pptr) { + kfree(l_new); + return NULL; + } + + /* copy true value_size bytes */ + memcpy(this_cpu_ptr(pptr), value, htab->map.value_size); + htab_elem_set_ptr(l_new, key_size, pptr); + } else { + memcpy(l_new->key + round_up(key_size, 8), value, size); + } + + l_new->hash = hash; + return l_new; +} + +static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, + u64 map_flags) +{ + if (!l_old && unlikely(atomic_read(&htab->count) >= htab->map.max_entries)) + /* if elem with this 'key' doesn't exist and we've reached + * max_entries limit, fail insertion of new elem + */ + return -E2BIG; + + if (l_old && map_flags == BPF_NOEXIST) + /* elem already exists */ + return -EEXIST; + + if (!l_old && map_flags == BPF_EXIST) + /* elem doesn't exist, cannot update it */ + return -ENOENT; + + return 0; +} + /* Called from syscall or from eBPF program */ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct htab_elem *l_new, *l_old; + struct htab_elem *l_new = NULL, *l_old; struct hlist_head *head; - struct bucket *b; unsigned long flags; - u32 key_size; + struct bucket *b; + u32 key_size, hash; int ret; - if (map_flags > BPF_EXIST) + if (unlikely(map_flags > BPF_EXIST)) /* unknown flags */ return -EINVAL; WARN_ON_ONCE(!rcu_read_lock_held()); - /* allocate new element outside of lock */ - l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); - if (!l_new) - return -ENOMEM; - key_size = map->key_size; - memcpy(l_new->key, key, key_size); - memcpy(l_new->key + round_up(key_size, 8), value, map->value_size); + hash = htab_map_hash(key, key_size); + + /* allocate new element outside of the lock, since + * we're most likley going to insert it + */ + l_new = alloc_htab_elem(htab, key, value, key_size, hash, false); + if (!l_new) + return -ENOMEM; - l_new->hash = htab_map_hash(l_new->key, key_size); - b = __select_bucket(htab, l_new->hash); + b = __select_bucket(htab, hash); head = &b->head; /* bpf_map_update_elem() can be called in_irq() */ raw_spin_lock_irqsave(&b->lock, flags); - l_old = lookup_elem_raw(head, l_new->hash, key, key_size); + l_old = lookup_elem_raw(head, hash, key, key_size); - if (!l_old && unlikely(atomic_read(&htab->count) >= map->max_entries)) { - /* if elem with this 'key' doesn't exist and we've reached - * max_entries limit, fail insertion of new elem - */ - ret = -E2BIG; + ret = check_flags(htab, l_old, map_flags); + if (ret) goto err; - } - if (l_old && map_flags == BPF_NOEXIST) { - /* elem already exists */ - ret = -EEXIST; - goto err; - } - - if (!l_old && map_flags == BPF_EXIST) { - /* elem doesn't exist, cannot update it */ - ret = -ENOENT; - goto err; - } - - /* add new element to the head of the list, so that concurrent - * search will find it before old elem + /* add new element to the head of the list, so that + * concurrent search will find it before old elem */ hlist_add_head_rcu(&l_new->hash_node, head); if (l_old) { @@ -298,7 +395,6 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, atomic_inc(&htab->count); } raw_spin_unlock_irqrestore(&b->lock, flags); - return 0; err: raw_spin_unlock_irqrestore(&b->lock, flags); @@ -306,10 +402,64 @@ err: return ret; } +static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct htab_elem *l_new = NULL, *l_old; + struct hlist_head *head; + unsigned long flags; + struct bucket *b; + u32 key_size, hash; + int ret; + + if (unlikely(map_flags > BPF_EXIST)) + /* unknown flags */ + return -EINVAL; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + b = __select_bucket(htab, hash); + head = &b->head; + + /* bpf_map_update_elem() can be called in_irq() */ + raw_spin_lock_irqsave(&b->lock, flags); + + l_old = lookup_elem_raw(head, hash, key, key_size); + + ret = check_flags(htab, l_old, map_flags); + if (ret) + goto err; + + if (l_old) { + /* per-cpu hash map can update value in-place */ + memcpy(this_cpu_ptr(htab_elem_get_ptr(l_old, key_size)), + value, htab->map.value_size); + } else { + l_new = alloc_htab_elem(htab, key, value, key_size, + hash, true); + if (!l_new) { + ret = -ENOMEM; + goto err; + } + hlist_add_head_rcu(&l_new->hash_node, head); + atomic_inc(&htab->count); + } + ret = 0; +err: + raw_spin_unlock_irqrestore(&b->lock, flags); + return ret; +} + /* Called from syscall or from eBPF program */ static int htab_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + bool percpu = map->map_type == BPF_MAP_TYPE_PERCPU_HASH; struct hlist_head *head; struct bucket *b; struct htab_elem *l; @@ -332,7 +482,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) if (l) { hlist_del_rcu(&l->hash_node); atomic_dec(&htab->count); - kfree_rcu(l, rcu); + free_htab_elem(l, percpu, key_size); ret = 0; } @@ -352,7 +502,12 @@ static void delete_all_elements(struct bpf_htab *htab) hlist_for_each_entry_safe(l, n, head, hash_node) { hlist_del_rcu(&l->hash_node); atomic_dec(&htab->count); - kfree(l); + if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) { + l->key_size = htab->map.key_size; + htab_percpu_elem_free(l); + } else { + kfree(l); + } } } } @@ -391,9 +546,35 @@ static struct bpf_map_type_list htab_type __read_mostly = { .type = BPF_MAP_TYPE_HASH, }; +/* Called from eBPF program */ +static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct htab_elem *l = __htab_map_lookup_elem(map, key); + + if (l) + return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); + else + return NULL; +} + +static const struct bpf_map_ops htab_percpu_ops = { + .map_alloc = htab_map_alloc, + .map_free = htab_map_free, + .map_get_next_key = htab_map_get_next_key, + .map_lookup_elem = htab_percpu_map_lookup_elem, + .map_update_elem = htab_percpu_map_update_elem, + .map_delete_elem = htab_map_delete_elem, +}; + +static struct bpf_map_type_list htab_percpu_type __read_mostly = { + .ops = &htab_percpu_ops, + .type = BPF_MAP_TYPE_PERCPU_HASH, +}; + static int __init register_htab_map(void) { bpf_register_map_type(&htab_type); + bpf_register_map_type(&htab_percpu_type); return 0; } late_initcall(register_htab_map); -- cgit From a10423b87a7eae75da79ce80a8d9475047a674ee Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Feb 2016 22:39:54 -0800 Subject: bpf: introduce BPF_MAP_TYPE_PERCPU_ARRAY map Primary use case is a histogram array of latency where bpf program computes the latency of block requests or other events and stores histogram of latency into array of 64 elements. All cpus are constantly running, so normal increment is not accurate, bpf_xadd causes cache ping-pong and this per-cpu approach allows fastest collision-free counters. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/arraymap.c | 102 ++++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 93 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 83d1926c61e4..141fb0d45731 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -151,6 +151,7 @@ struct bpf_array { union { char value[0] __aligned(8); void *ptrs[0] __aligned(8); + void __percpu *pptrs[0] __aligned(8); }; }; #define MAX_TAIL_CALL_CNT 32 diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 43ae40c8763e..2ee0fde1bf96 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -82,6 +82,7 @@ enum bpf_map_type { BPF_MAP_TYPE_PROG_ARRAY, BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_MAP_TYPE_PERCPU_HASH, + BPF_MAP_TYPE_PERCPU_ARRAY, }; enum bpf_prog_type { diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 89ebbc4d1164..b9bf1d7949ca 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -17,11 +17,39 @@ #include #include +static void bpf_array_free_percpu(struct bpf_array *array) +{ + int i; + + for (i = 0; i < array->map.max_entries; i++) + free_percpu(array->pptrs[i]); +} + +static int bpf_array_alloc_percpu(struct bpf_array *array) +{ + void __percpu *ptr; + int i; + + for (i = 0; i < array->map.max_entries; i++) { + ptr = __alloc_percpu_gfp(array->elem_size, 8, + GFP_USER | __GFP_NOWARN); + if (!ptr) { + bpf_array_free_percpu(array); + return -ENOMEM; + } + array->pptrs[i] = ptr; + } + + return 0; +} + /* Called from syscall */ static struct bpf_map *array_map_alloc(union bpf_attr *attr) { + bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; struct bpf_array *array; - u32 elem_size, array_size; + u64 array_size; + u32 elem_size; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || @@ -36,12 +64,16 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) elem_size = round_up(attr->value_size, 8); - /* check round_up into zero and u32 overflow */ - if (elem_size == 0 || - attr->max_entries > (U32_MAX - PAGE_SIZE - sizeof(*array)) / elem_size) + array_size = sizeof(*array); + if (percpu) + array_size += (u64) attr->max_entries * sizeof(void *); + else + array_size += (u64) attr->max_entries * elem_size; + + /* make sure there is no u32 overflow later in round_up() */ + if (array_size >= U32_MAX - PAGE_SIZE) return ERR_PTR(-ENOMEM); - array_size = sizeof(*array) + attr->max_entries * elem_size; /* allocate all map elements and zero-initialize them */ array = kzalloc(array_size, GFP_USER | __GFP_NOWARN); @@ -52,12 +84,25 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) } /* copy mandatory map attributes */ + array->map.map_type = attr->map_type; array->map.key_size = attr->key_size; array->map.value_size = attr->value_size; array->map.max_entries = attr->max_entries; - array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT; array->elem_size = elem_size; + if (!percpu) + goto out; + + array_size += (u64) attr->max_entries * elem_size * num_possible_cpus(); + + if (array_size >= U32_MAX - PAGE_SIZE || + elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) { + kvfree(array); + return ERR_PTR(-ENOMEM); + } +out: + array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT; + return &array->map; } @@ -67,12 +112,24 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; - if (index >= array->map.max_entries) + if (unlikely(index >= array->map.max_entries)) return NULL; return array->value + array->elem_size * index; } +/* Called from eBPF program */ +static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + + if (unlikely(index >= array->map.max_entries)) + return NULL; + + return this_cpu_ptr(array->pptrs[index]); +} + /* Called from syscall */ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { @@ -99,19 +156,24 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; - if (map_flags > BPF_EXIST) + if (unlikely(map_flags > BPF_EXIST)) /* unknown flags */ return -EINVAL; - if (index >= array->map.max_entries) + if (unlikely(index >= array->map.max_entries)) /* all elements were pre-allocated, cannot insert a new one */ return -E2BIG; - if (map_flags == BPF_NOEXIST) + if (unlikely(map_flags == BPF_NOEXIST)) /* all elements already exist */ return -EEXIST; - memcpy(array->value + array->elem_size * index, value, map->value_size); + if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + memcpy(this_cpu_ptr(array->pptrs[index]), + value, map->value_size); + else + memcpy(array->value + array->elem_size * index, + value, map->value_size); return 0; } @@ -133,6 +195,9 @@ static void array_map_free(struct bpf_map *map) */ synchronize_rcu(); + if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + bpf_array_free_percpu(array); + kvfree(array); } @@ -150,9 +215,24 @@ static struct bpf_map_type_list array_type __read_mostly = { .type = BPF_MAP_TYPE_ARRAY, }; +static const struct bpf_map_ops percpu_array_ops = { + .map_alloc = array_map_alloc, + .map_free = array_map_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = percpu_array_map_lookup_elem, + .map_update_elem = array_map_update_elem, + .map_delete_elem = array_map_delete_elem, +}; + +static struct bpf_map_type_list percpu_array_type __read_mostly = { + .ops = &percpu_array_ops, + .type = BPF_MAP_TYPE_PERCPU_ARRAY, +}; + static int __init register_array_map(void) { bpf_register_map_type(&array_type); + bpf_register_map_type(&percpu_array_type); return 0; } late_initcall(register_array_map); -- cgit From 15a07b33814d14ca817887dbea8530728dc0fbe4 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Feb 2016 22:39:55 -0800 Subject: bpf: add lookup/update support for per-cpu hash and array maps The functions bpf_map_lookup_elem(map, key, value) and bpf_map_update_elem(map, key, value, flags) need to get/set values from all-cpus for per-cpu hash and array maps, so that user space can aggregate/update them as necessary. Example of single counter aggregation in user space: unsigned int nr_cpus = sysconf(_SC_NPROCESSORS_CONF); long values[nr_cpus]; long value = 0; bpf_lookup_elem(fd, key, values); for (i = 0; i < nr_cpus; i++) value += values[i]; The user space must provide round_up(value_size, 8) * nr_cpus array to get/set values, since kernel will use 'long' copy of per-cpu values to try to copy good counters atomically. It's a best-effort, since bpf programs and user space are racing to access the same memory. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 23 ++++++++++++++ kernel/bpf/arraymap.c | 64 +++++++++++++++++++++++++++++++++++++++ kernel/bpf/hashtab.c | 83 +++++++++++++++++++++++++++++++++++++++++++++------ kernel/bpf/syscall.c | 57 ++++++++++++++++++++++++----------- 4 files changed, 201 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 141fb0d45731..90ee6ab24bc5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -183,6 +183,29 @@ int bpf_prog_new_fd(struct bpf_prog *prog); int bpf_obj_pin_user(u32 ufd, const char __user *pathname); int bpf_obj_get_user(const char __user *pathname); +int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); +int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); +int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, + u64 flags); +int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, + u64 flags); + +/* memcpy that is used with 8-byte aligned pointers, power-of-8 size and + * forced to use 'long' read/writes to try to atomically copy long counters. + * Best-effort only. No barriers here, since it _will_ race with concurrent + * updates from BPF programs. Called from bpf syscall and mostly used with + * size 8 or 16 bytes, so ask compiler to inline it. + */ +static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) +{ + const long *lsrc = src; + long *ldst = dst; + + size /= sizeof(long); + while (size--) + *ldst++ = *lsrc++; +} + /* verify correctness of eBPF program */ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr); #else diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index b9bf1d7949ca..bd3bdf2486a7 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -130,6 +130,32 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) return this_cpu_ptr(array->pptrs[index]); } +int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + void __percpu *pptr; + int cpu, off = 0; + u32 size; + + if (unlikely(index >= array->map.max_entries)) + return -ENOENT; + + /* per_cpu areas are zero-filled and bpf programs can only + * access 'value_size' of them, so copying rounded areas + * will not leak any kernel data + */ + size = round_up(map->value_size, 8); + rcu_read_lock(); + pptr = array->pptrs[index]; + for_each_possible_cpu(cpu) { + bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size); + off += size; + } + rcu_read_unlock(); + return 0; +} + /* Called from syscall */ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { @@ -177,6 +203,44 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, return 0; } +int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + void __percpu *pptr; + int cpu, off = 0; + u32 size; + + if (unlikely(map_flags > BPF_EXIST)) + /* unknown flags */ + return -EINVAL; + + if (unlikely(index >= array->map.max_entries)) + /* all elements were pre-allocated, cannot insert a new one */ + return -E2BIG; + + if (unlikely(map_flags == BPF_NOEXIST)) + /* all elements already exist */ + return -EEXIST; + + /* the user space will provide round_up(value_size, 8) bytes that + * will be copied into per-cpu area. bpf programs can only access + * value_size of it. During lookup the same extra bytes will be + * returned or zeros which were zero-filled by percpu_alloc, + * so no kernel data leaks possible + */ + size = round_up(map->value_size, 8); + rcu_read_lock(); + pptr = array->pptrs[index]; + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size); + off += size; + } + rcu_read_unlock(); + return 0; +} + /* Called from syscall or from eBPF program */ static int array_map_delete_elem(struct bpf_map *map, void *key) { diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 2be5f6e8bb04..fd5db8fe9360 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -290,7 +290,7 @@ static void free_htab_elem(struct htab_elem *l, bool percpu, u32 key_size) static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash, - bool percpu) + bool percpu, bool onallcpus) { u32 size = htab->map.value_size; struct htab_elem *l_new; @@ -312,8 +312,18 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, return NULL; } - /* copy true value_size bytes */ - memcpy(this_cpu_ptr(pptr), value, htab->map.value_size); + if (!onallcpus) { + /* copy true value_size bytes */ + memcpy(this_cpu_ptr(pptr), value, htab->map.value_size); + } else { + int off = 0, cpu; + + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), + value + off, size); + off += size; + } + } htab_elem_set_ptr(l_new, key_size, pptr); } else { memcpy(l_new->key + round_up(key_size, 8), value, size); @@ -368,7 +378,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, /* allocate new element outside of the lock, since * we're most likley going to insert it */ - l_new = alloc_htab_elem(htab, key, value, key_size, hash, false); + l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false); if (!l_new) return -ENOMEM; @@ -402,8 +412,9 @@ err: return ret; } -static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags) +static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags, + bool onallcpus) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new = NULL, *l_old; @@ -436,12 +447,25 @@ static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, goto err; if (l_old) { + void __percpu *pptr = htab_elem_get_ptr(l_old, key_size); + u32 size = htab->map.value_size; + /* per-cpu hash map can update value in-place */ - memcpy(this_cpu_ptr(htab_elem_get_ptr(l_old, key_size)), - value, htab->map.value_size); + if (!onallcpus) { + memcpy(this_cpu_ptr(pptr), value, size); + } else { + int off = 0, cpu; + + size = round_up(size, 8); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), + value + off, size); + off += size; + } + } } else { l_new = alloc_htab_elem(htab, key, value, key_size, - hash, true); + hash, true, onallcpus); if (!l_new) { ret = -ENOMEM; goto err; @@ -455,6 +479,12 @@ err: return ret; } +static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + return __htab_percpu_map_update_elem(map, key, value, map_flags, false); +} + /* Called from syscall or from eBPF program */ static int htab_map_delete_elem(struct bpf_map *map, void *key) { @@ -557,6 +587,41 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) +{ + struct htab_elem *l; + void __percpu *pptr; + int ret = -ENOENT; + int cpu, off = 0; + u32 size; + + /* per_cpu areas are zero-filled and bpf programs can only + * access 'value_size' of them, so copying rounded areas + * will not leak any kernel data + */ + size = round_up(map->value_size, 8); + rcu_read_lock(); + l = __htab_map_lookup_elem(map, key); + if (!l) + goto out; + pptr = htab_elem_get_ptr(l, map->key_size); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(value + off, + per_cpu_ptr(pptr, cpu), size); + off += size; + } + ret = 0; +out: + rcu_read_unlock(); + return ret; +} + +int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + return __htab_percpu_map_update_elem(map, key, value, map_flags, true); +} + static const struct bpf_map_ops htab_percpu_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 637397059f76..c95a753c2007 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -239,6 +239,7 @@ static int map_lookup_elem(union bpf_attr *attr) int ufd = attr->map_fd; struct bpf_map *map; void *key, *value, *ptr; + u32 value_size; struct fd f; int err; @@ -259,23 +260,35 @@ static int map_lookup_elem(union bpf_attr *attr) if (copy_from_user(key, ukey, map->key_size) != 0) goto free_key; + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + value_size = round_up(map->value_size, 8) * num_possible_cpus(); + else + value_size = map->value_size; + err = -ENOMEM; - value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN); + value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; - rcu_read_lock(); - ptr = map->ops->map_lookup_elem(map, key); - if (ptr) - memcpy(value, ptr, map->value_size); - rcu_read_unlock(); + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { + err = bpf_percpu_hash_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { + err = bpf_percpu_array_copy(map, key, value); + } else { + rcu_read_lock(); + ptr = map->ops->map_lookup_elem(map, key); + if (ptr) + memcpy(value, ptr, value_size); + rcu_read_unlock(); + err = ptr ? 0 : -ENOENT; + } - err = -ENOENT; - if (!ptr) + if (err) goto free_value; err = -EFAULT; - if (copy_to_user(uvalue, value, map->value_size) != 0) + if (copy_to_user(uvalue, value, value_size) != 0) goto free_value; err = 0; @@ -298,6 +311,7 @@ static int map_update_elem(union bpf_attr *attr) int ufd = attr->map_fd; struct bpf_map *map; void *key, *value; + u32 value_size; struct fd f; int err; @@ -318,21 +332,30 @@ static int map_update_elem(union bpf_attr *attr) if (copy_from_user(key, ukey, map->key_size) != 0) goto free_key; + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + value_size = round_up(map->value_size, 8) * num_possible_cpus(); + else + value_size = map->value_size; + err = -ENOMEM; - value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN); + value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; err = -EFAULT; - if (copy_from_user(value, uvalue, map->value_size) != 0) + if (copy_from_user(value, uvalue, value_size) != 0) goto free_value; - /* eBPF program that use maps are running under rcu_read_lock(), - * therefore all map accessors rely on this fact, so do the same here - */ - rcu_read_lock(); - err = map->ops->map_update_elem(map, key, value, attr->flags); - rcu_read_unlock(); + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { + err = bpf_percpu_hash_update(map, key, value, attr->flags); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { + err = bpf_percpu_array_update(map, key, value, attr->flags); + } else { + rcu_read_lock(); + err = map->ops->map_update_elem(map, key, value, attr->flags); + rcu_read_unlock(); + } free_value: kfree(value); -- cgit From fbf198030e0b027538c290300cfaf9e2efdce122 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 3 Feb 2016 19:52:23 +0100 Subject: genirq: Add default affinity mask command line option If we isolate CPUs, then we don't want random device interrupts on them. Even w/o the user space irq balancer enabled we can end up with irqs on non boot cpus and chasing newly requested interrupts is a tedious task. Allow to restrict the default irq affinity mask. Signed-off-by: Thomas Gleixner Cc: Rik van Riel Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Chris Metcalf Cc: Christoph Lameter Cc: Sebastian Siewior Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1602031948190.25254@nanos Signed-off-by: Thomas Gleixner --- Documentation/kernel-parameters.txt | 9 +++++++++ kernel/irq/irqdesc.c | 21 +++++++++++++++++++-- 2 files changed, 28 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 551ecf09c8dd..87298f8592b7 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1687,6 +1687,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted. ip= [IP_PNP] See Documentation/filesystems/nfs/nfsroot.txt. + irqaffinity= [SMP] Set the default irq affinity mask + Format: + ,..., + or + - + (must be a positive range in ascending order) + or a mixture + ,...,- + irqfixup [HW] When an interrupt is not handled search all handlers for it. Intended to get systems with badly broken diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 0409da0bcc33..0ccd028817d7 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -24,10 +24,27 @@ static struct lock_class_key irq_desc_lock_class; #if defined(CONFIG_SMP) +static int __init irq_affinity_setup(char *str) +{ + zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); + cpulist_parse(str, irq_default_affinity); + /* + * Set at least the boot cpu. We don't want to end up with + * bugreports caused by random comandline masks + */ + cpumask_set_cpu(smp_processor_id(), irq_default_affinity); + return 1; +} +__setup("irqaffinity=", irq_affinity_setup); + static void __init init_irq_default_affinity(void) { - alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); - cpumask_setall(irq_default_affinity); +#ifdef CONFIG_CPUMASK_OFFSTACK + if (!irq_default_affinity) + zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); +#endif + if (cpumask_empty(irq_default_affinity)) + cpumask_setall(irq_default_affinity); } #else static void __init init_irq_default_affinity(void) -- cgit From fd97646b05957348e01be3d9de5c3d979b25c819 Mon Sep 17 00:00:00 2001 From: Wei Yuan Date: Sat, 6 Feb 2016 15:39:47 +0800 Subject: audit: Fix typo in comment Signed-off-by: Weiyuan Signed-off-by: Paul Moore --- kernel/audit_watch.c | 2 +- kernel/auditfilter.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 656c7e93ac0d..0348b12b5a4d 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -185,7 +185,7 @@ static struct audit_watch *audit_init_watch(char *path) return watch; } -/* Translate a watch string to kernel respresentation. */ +/* Translate a watch string to kernel representation. */ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op) { struct audit_watch *watch; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index b8ff9e193753..94ca7b1e5e7e 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -158,7 +158,7 @@ char *audit_unpack_string(void **bufp, size_t *remain, size_t len) return str; } -/* Translate an inode field to kernel respresentation. */ +/* Translate an inode field to kernel representation. */ static inline int audit_to_inode(struct audit_krule *krule, struct audit_field *f) { @@ -415,7 +415,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) return 0; } -/* Translate struct audit_rule_data to kernel's rule respresentation. */ +/* Translate struct audit_rule_data to kernel's rule representation. */ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, size_t datasz) { @@ -593,7 +593,7 @@ static inline size_t audit_pack_string(void **bufp, const char *str) return len; } -/* Translate kernel rule respresentation to struct audit_rule_data. */ +/* Translate kernel rule representation to struct audit_rule_data. */ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) { struct audit_rule_data *data; -- cgit From 8a5fd56431fe1682e870bd6ab0c276e74befbeb9 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Thu, 4 Feb 2016 14:40:40 +0100 Subject: locking/lockdep: Fix stack trace caching logic check_prev_add() caches saved stack trace in static trace variable to avoid duplicate save_trace() calls in dependencies involving trylocks. But that caching logic contains a bug. We may not save trace on first iteration due to early return from check_prev_add(). Then on the second iteration when we actually need the trace we don't save it because we think that we've already saved it. Let check_prev_add() itself control when stack is saved. There is another bug. Trace variable is protected by graph lock. But we can temporary release graph lock during printing. Fix this by invalidating cached stack trace when we release graph lock. Signed-off-by: Dmitry Vyukov Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: glider@google.com Cc: kcc@google.com Cc: peter@hurleysoftware.com Cc: sasha.levin@oracle.com Link: http://lkml.kernel.org/r/1454593240-121647-1-git-send-email-dvyukov@google.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 60ace56618f6..c7710e4092ef 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1822,7 +1822,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, */ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, int distance, int trylock_loop) + struct held_lock *next, int distance, int *stack_saved) { struct lock_list *entry; int ret; @@ -1883,8 +1883,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, } } - if (!trylock_loop && !save_trace(&trace)) - return 0; + if (!*stack_saved) { + if (!save_trace(&trace)) + return 0; + *stack_saved = 1; + } /* * Ok, all validations passed, add the new lock @@ -1907,6 +1910,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * Debugging printouts: */ if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) { + /* We drop graph lock, so another thread can overwrite trace. */ + *stack_saved = 0; graph_unlock(); printk("\n new dependency: "); print_lock_name(hlock_class(prev)); @@ -1929,7 +1934,7 @@ static int check_prevs_add(struct task_struct *curr, struct held_lock *next) { int depth = curr->lockdep_depth; - int trylock_loop = 0; + int stack_saved = 0; struct held_lock *hlock; /* @@ -1956,7 +1961,7 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) */ if (hlock->read != 2 && hlock->check) { if (!check_prev_add(curr, hlock, next, - distance, trylock_loop)) + distance, &stack_saved)) return 0; /* * Stop after the first non-trylock entry, @@ -1979,7 +1984,6 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) if (curr->held_locks[depth].irq_context != curr->held_locks[depth-1].irq_context) break; - trylock_loop = 1; } return 1; out_bug: -- cgit From a7636d9ecfa3ab7800a7c04c1f89378229eff609 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 3 Feb 2016 12:28:28 -0800 Subject: kprobes: Optimize hot path by using percpu counter to collect 'nhit' statistics When doing ebpf+kprobe on some hot TCP functions (e.g. tcp_rcv_established), the kprobe_dispatcher() function shows up in 'perf report'. In kprobe_dispatcher(), there is a lot of cache bouncing on 'tk->nhit++'. 'tk->nhit' and 'tk->tp.flags' also share the same cacheline. perf report (cycles:pp): 8.30% ipv4_dst_check 4.74% copy_user_enhanced_fast_string 3.93% dst_release 2.80% tcp_v4_rcv 2.31% queued_spin_lock_slowpath 2.30% _raw_spin_lock 1.88% mlx4_en_process_rx_cq 1.84% eth_get_headlen 1.81% ip_rcv_finish ~~~~ 1.71% kprobe_dispatcher ~~~~ 1.55% mlx4_en_xmit 1.09% __probe_kernel_read perf report after patch: 9.15% ipv4_dst_check 5.00% copy_user_enhanced_fast_string 4.12% dst_release 2.96% tcp_v4_rcv 2.50% _raw_spin_lock 2.39% queued_spin_lock_slowpath 2.11% eth_get_headlen 2.03% mlx4_en_process_rx_cq 1.69% mlx4_en_xmit 1.19% ip_rcv_finish 1.12% __probe_kernel_read 1.02% ehci_hcd_cleanup Signed-off-by: Martin KaFai Lau Acked-by: Masami Hiramatsu Acked-by: Steven Rostedt Cc: Alexei Starovoitov Cc: Josef Bacik Cc: Kernel Team Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454531308-2441898-1-git-send-email-kafai@fb.com Signed-off-by: Ingo Molnar --- kernel/trace/trace_kprobe.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c9956440d0e6..21b81a41dae5 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -30,7 +30,7 @@ struct trace_kprobe { struct list_head list; struct kretprobe rp; /* Use rp.kp for kprobe use */ - unsigned long nhit; + unsigned long __percpu *nhit; const char *symbol; /* symbol name */ struct trace_probe tp; }; @@ -274,6 +274,10 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, if (!tk) return ERR_PTR(ret); + tk->nhit = alloc_percpu(unsigned long); + if (!tk->nhit) + goto error; + if (symbol) { tk->symbol = kstrdup(symbol, GFP_KERNEL); if (!tk->symbol) @@ -313,6 +317,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, error: kfree(tk->tp.call.name); kfree(tk->symbol); + free_percpu(tk->nhit); kfree(tk); return ERR_PTR(ret); } @@ -327,6 +332,7 @@ static void free_trace_kprobe(struct trace_kprobe *tk) kfree(tk->tp.call.class->system); kfree(tk->tp.call.name); kfree(tk->symbol); + free_percpu(tk->nhit); kfree(tk); } @@ -874,9 +880,14 @@ static const struct file_operations kprobe_events_ops = { static int probes_profile_seq_show(struct seq_file *m, void *v) { struct trace_kprobe *tk = v; + unsigned long nhit = 0; + int cpu; + + for_each_possible_cpu(cpu) + nhit += *per_cpu_ptr(tk->nhit, cpu); seq_printf(m, " %-44s %15lu %15lu\n", - trace_event_name(&tk->tp.call), tk->nhit, + trace_event_name(&tk->tp.call), nhit, tk->rp.kp.nmissed); return 0; @@ -1225,7 +1236,7 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) { struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); - tk->nhit++; + raw_cpu_inc(*tk->nhit); if (tk->tp.flags & TP_FLAG_TRACE) kprobe_trace_func(tk, regs); @@ -1242,7 +1253,7 @@ kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) { struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp); - tk->nhit++; + raw_cpu_inc(*tk->nhit); if (tk->tp.flags & TP_FLAG_TRACE) kretprobe_trace_func(tk, ri, regs); -- cgit From cb2517653fccaf9f9b4ae968c7ee005c1bbacdc5 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 5 Feb 2016 09:08:36 +0000 Subject: sched/debug: Make schedstats a runtime tunable that is disabled by default schedstats is very useful during debugging and performance tuning but it incurs overhead to calculate the stats. As such, even though it can be disabled at build time, it is often enabled as the information is useful. This patch adds a kernel command-line and sysctl tunable to enable or disable schedstats on demand (when it's built in). It is disabled by default as someone who knows they need it can also learn to enable it when necessary. The benefits are dependent on how scheduler-intensive the workload is. If it is then the patch reduces the number of cycles spent calculating the stats with a small benefit from reducing the cache footprint of the scheduler. These measurements were taken from a 48-core 2-socket machine with Xeon(R) E5-2670 v3 cpus although they were also tested on a single socket machine 8-core machine with Intel i7-3770 processors. netperf-tcp 4.5.0-rc1 4.5.0-rc1 vanilla nostats-v3r1 Hmean 64 560.45 ( 0.00%) 575.98 ( 2.77%) Hmean 128 766.66 ( 0.00%) 795.79 ( 3.80%) Hmean 256 950.51 ( 0.00%) 981.50 ( 3.26%) Hmean 1024 1433.25 ( 0.00%) 1466.51 ( 2.32%) Hmean 2048 2810.54 ( 0.00%) 2879.75 ( 2.46%) Hmean 3312 4618.18 ( 0.00%) 4682.09 ( 1.38%) Hmean 4096 5306.42 ( 0.00%) 5346.39 ( 0.75%) Hmean 8192 10581.44 ( 0.00%) 10698.15 ( 1.10%) Hmean 16384 18857.70 ( 0.00%) 18937.61 ( 0.42%) Small gains here, UDP_STREAM showed nothing intresting and neither did the TCP_RR tests. The gains on the 8-core machine were very similar. tbench4 4.5.0-rc1 4.5.0-rc1 vanilla nostats-v3r1 Hmean mb/sec-1 500.85 ( 0.00%) 522.43 ( 4.31%) Hmean mb/sec-2 984.66 ( 0.00%) 1018.19 ( 3.41%) Hmean mb/sec-4 1827.91 ( 0.00%) 1847.78 ( 1.09%) Hmean mb/sec-8 3561.36 ( 0.00%) 3611.28 ( 1.40%) Hmean mb/sec-16 5824.52 ( 0.00%) 5929.03 ( 1.79%) Hmean mb/sec-32 10943.10 ( 0.00%) 10802.83 ( -1.28%) Hmean mb/sec-64 15950.81 ( 0.00%) 16211.31 ( 1.63%) Hmean mb/sec-128 15302.17 ( 0.00%) 15445.11 ( 0.93%) Hmean mb/sec-256 14866.18 ( 0.00%) 15088.73 ( 1.50%) Hmean mb/sec-512 15223.31 ( 0.00%) 15373.69 ( 0.99%) Hmean mb/sec-1024 14574.25 ( 0.00%) 14598.02 ( 0.16%) Hmean mb/sec-2048 13569.02 ( 0.00%) 13733.86 ( 1.21%) Hmean mb/sec-3072 12865.98 ( 0.00%) 13209.23 ( 2.67%) Small gains of 2-4% at low thread counts and otherwise flat. The gains on the 8-core machine were slightly different tbench4 on 8-core i7-3770 single socket machine Hmean mb/sec-1 442.59 ( 0.00%) 448.73 ( 1.39%) Hmean mb/sec-2 796.68 ( 0.00%) 794.39 ( -0.29%) Hmean mb/sec-4 1322.52 ( 0.00%) 1343.66 ( 1.60%) Hmean mb/sec-8 2611.65 ( 0.00%) 2694.86 ( 3.19%) Hmean mb/sec-16 2537.07 ( 0.00%) 2609.34 ( 2.85%) Hmean mb/sec-32 2506.02 ( 0.00%) 2578.18 ( 2.88%) Hmean mb/sec-64 2511.06 ( 0.00%) 2569.16 ( 2.31%) Hmean mb/sec-128 2313.38 ( 0.00%) 2395.50 ( 3.55%) Hmean mb/sec-256 2110.04 ( 0.00%) 2177.45 ( 3.19%) Hmean mb/sec-512 2072.51 ( 0.00%) 2053.97 ( -0.89%) In constract, this shows a relatively steady 2-3% gain at higher thread counts. Due to the nature of the patch and the type of workload, it's not a surprise that the result will depend on the CPU used. hackbench-pipes 4.5.0-rc1 4.5.0-rc1 vanilla nostats-v3r1 Amean 1 0.0637 ( 0.00%) 0.0660 ( -3.59%) Amean 4 0.1229 ( 0.00%) 0.1181 ( 3.84%) Amean 7 0.1921 ( 0.00%) 0.1911 ( 0.52%) Amean 12 0.3117 ( 0.00%) 0.2923 ( 6.23%) Amean 21 0.4050 ( 0.00%) 0.3899 ( 3.74%) Amean 30 0.4586 ( 0.00%) 0.4433 ( 3.33%) Amean 48 0.5910 ( 0.00%) 0.5694 ( 3.65%) Amean 79 0.8663 ( 0.00%) 0.8626 ( 0.43%) Amean 110 1.1543 ( 0.00%) 1.1517 ( 0.22%) Amean 141 1.4457 ( 0.00%) 1.4290 ( 1.16%) Amean 172 1.7090 ( 0.00%) 1.6924 ( 0.97%) Amean 192 1.9126 ( 0.00%) 1.9089 ( 0.19%) Some small gains and losses and while the variance data is not included, it's close to the noise. The UMA machine did not show anything particularly different pipetest 4.5.0-rc1 4.5.0-rc1 vanilla nostats-v2r2 Min Time 4.13 ( 0.00%) 3.99 ( 3.39%) 1st-qrtle Time 4.38 ( 0.00%) 4.27 ( 2.51%) 2nd-qrtle Time 4.46 ( 0.00%) 4.39 ( 1.57%) 3rd-qrtle Time 4.56 ( 0.00%) 4.51 ( 1.10%) Max-90% Time 4.67 ( 0.00%) 4.60 ( 1.50%) Max-93% Time 4.71 ( 0.00%) 4.65 ( 1.27%) Max-95% Time 4.74 ( 0.00%) 4.71 ( 0.63%) Max-99% Time 4.88 ( 0.00%) 4.79 ( 1.84%) Max Time 4.93 ( 0.00%) 4.83 ( 2.03%) Mean Time 4.48 ( 0.00%) 4.39 ( 1.91%) Best99%Mean Time 4.47 ( 0.00%) 4.39 ( 1.91%) Best95%Mean Time 4.46 ( 0.00%) 4.38 ( 1.93%) Best90%Mean Time 4.45 ( 0.00%) 4.36 ( 1.98%) Best50%Mean Time 4.36 ( 0.00%) 4.25 ( 2.49%) Best10%Mean Time 4.23 ( 0.00%) 4.10 ( 3.13%) Best5%Mean Time 4.19 ( 0.00%) 4.06 ( 3.20%) Best1%Mean Time 4.13 ( 0.00%) 4.00 ( 3.39%) Small improvement and similar gains were seen on the UMA machine. The gain is small but it stands to reason that doing less work in the scheduler is a good thing. The downside is that the lack of schedstats and tracepoints may be surprising to experts doing performance analysis until they find the existence of the schedstats= parameter or schedstats sysctl. It will be automatically activated for latencytop and sleep profiling to alleviate the problem. For tracepoints, there is a simple warning as it's not safe to activate schedstats in the context when it's known the tracepoint may be wanted but is unavailable. Signed-off-by: Mel Gorman Reviewed-by: Matt Fleming Reviewed-by: Srikar Dronamraju Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454663316-22048-1-git-send-email-mgorman@techsingularity.net Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 5 ++ Documentation/sysctl/kernel.txt | 8 +++ include/linux/latencytop.h | 3 + include/linux/sched.h | 4 ++ include/linux/sched/sysctl.h | 4 ++ kernel/latencytop.c | 14 ++++- kernel/profile.c | 1 + kernel/sched/core.c | 70 +++++++++++++++++++++- kernel/sched/debug.c | 102 +++++++++++++++++--------------- kernel/sched/fair.c | 113 +++++++++++++++++++++++++----------- kernel/sched/sched.h | 1 + kernel/sched/stats.h | 8 ++- kernel/sysctl.c | 13 ++++- 13 files changed, 256 insertions(+), 90 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 551ecf09c8dd..ed47b609530b 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3528,6 +3528,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. sched_debug [KNL] Enables verbose scheduler debug messages. + schedstats= [KNL,X86] Enable or disable scheduled statistics. + Allowed values are enable and disable. This feature + incurs a small amount of overhead in the scheduler + but is useful for debugging and performance tuning. + skew_tick= [KNL] Offset the periodic timer tick per cpu to mitigate xtime_lock contention on larger systems, and/or RCU lock contention on all systems with CONFIG_MAXSMP set. diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index a93b414672a7..87119dc9bc64 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -760,6 +760,14 @@ rtsig-nr shows the number of RT signals currently queued. ============================================================== +sched_schedstats: + +Enables/disables scheduler statistics. Enabling this feature +incurs a small amount of overhead in the scheduler but is +useful for debugging and performance tuning. + +============================================================== + sg-big-buff: This file shows the size of the generic SCSI (sg) buffer. diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h index e23121f9d82a..59ccab297ae0 100644 --- a/include/linux/latencytop.h +++ b/include/linux/latencytop.h @@ -37,6 +37,9 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter) void clear_all_latency_tracing(struct task_struct *p); +extern int sysctl_latencytop(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); + #else static inline void diff --git a/include/linux/sched.h b/include/linux/sched.h index a10494a94cc3..a292c4b7e94c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -920,6 +920,10 @@ static inline int sched_info_on(void) #endif } +#ifdef CONFIG_SCHEDSTATS +void force_schedstat_enabled(void); +#endif + enum cpu_idle_type { CPU_IDLE, CPU_NOT_IDLE, diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index c9e4731cf10b..4f080ab4f2cd 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -95,4 +95,8 @@ extern int sysctl_numa_balancing(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +extern int sysctl_schedstats(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); + #endif /* _SCHED_SYSCTL_H */ diff --git a/kernel/latencytop.c b/kernel/latencytop.c index a02812743a7e..b5c30d9f46c5 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -47,12 +47,12 @@ * of times) */ -#include #include #include #include #include #include +#include #include #include #include @@ -289,4 +289,16 @@ static int __init init_lstats_procfs(void) proc_create("latency_stats", 0644, NULL, &lstats_fops); return 0; } + +int sysctl_latencytop(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int err; + + err = proc_dointvec(table, write, buffer, lenp, ppos); + if (latencytop_enabled) + force_schedstat_enabled(); + + return err; +} device_initcall(init_lstats_procfs); diff --git a/kernel/profile.c b/kernel/profile.c index 99513e1160e5..51369697466e 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -59,6 +59,7 @@ int profile_setup(char *str) if (!strncmp(str, sleepstr, strlen(sleepstr))) { #ifdef CONFIG_SCHEDSTATS + force_schedstat_enabled(); prof_on = SLEEP_PROFILING; if (str[strlen(sleepstr)] == ',') str += strlen(sleepstr) + 1; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 24fcdbf28b18..7e548bde67ee 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2093,7 +2093,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) ttwu_queue(p, cpu); stat: - ttwu_stat(p, cpu, wake_flags); + if (schedstat_enabled()) + ttwu_stat(p, cpu, wake_flags); out: raw_spin_unlock_irqrestore(&p->pi_lock, flags); @@ -2141,7 +2142,8 @@ static void try_to_wake_up_local(struct task_struct *p) ttwu_activate(rq, p, ENQUEUE_WAKEUP); ttwu_do_wakeup(rq, p, 0); - ttwu_stat(p, smp_processor_id(), 0); + if (schedstat_enabled()) + ttwu_stat(p, smp_processor_id(), 0); out: raw_spin_unlock(&p->pi_lock); } @@ -2210,6 +2212,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #endif #ifdef CONFIG_SCHEDSTATS + /* Even if schedstat is disabled, there should not be garbage */ memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif @@ -2281,6 +2284,69 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, #endif #endif +DEFINE_STATIC_KEY_FALSE(sched_schedstats); + +#ifdef CONFIG_SCHEDSTATS +static void set_schedstats(bool enabled) +{ + if (enabled) + static_branch_enable(&sched_schedstats); + else + static_branch_disable(&sched_schedstats); +} + +void force_schedstat_enabled(void) +{ + if (!schedstat_enabled()) { + pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); + static_branch_enable(&sched_schedstats); + } +} + +static int __init setup_schedstats(char *str) +{ + int ret = 0; + if (!str) + goto out; + + if (!strcmp(str, "enable")) { + set_schedstats(true); + ret = 1; + } else if (!strcmp(str, "disable")) { + set_schedstats(false); + ret = 1; + } +out: + if (!ret) + pr_warn("Unable to parse schedstats=\n"); + + return ret; +} +__setup("schedstats=", setup_schedstats); + +#ifdef CONFIG_PROC_SYSCTL +int sysctl_schedstats(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + int err; + int state = static_branch_likely(&sched_schedstats); + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + t = *table; + t.data = &state; + err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + if (write) + set_schedstats(state); + return err; +} +#endif +#endif + /* * fork()/clone()-time setup: */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 641511771ae6..7cfa87bd8b89 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -75,16 +75,18 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group PN(se->vruntime); PN(se->sum_exec_runtime); #ifdef CONFIG_SCHEDSTATS - PN(se->statistics.wait_start); - PN(se->statistics.sleep_start); - PN(se->statistics.block_start); - PN(se->statistics.sleep_max); - PN(se->statistics.block_max); - PN(se->statistics.exec_max); - PN(se->statistics.slice_max); - PN(se->statistics.wait_max); - PN(se->statistics.wait_sum); - P(se->statistics.wait_count); + if (schedstat_enabled()) { + PN(se->statistics.wait_start); + PN(se->statistics.sleep_start); + PN(se->statistics.block_start); + PN(se->statistics.sleep_max); + PN(se->statistics.block_max); + PN(se->statistics.exec_max); + PN(se->statistics.slice_max); + PN(se->statistics.wait_max); + PN(se->statistics.wait_sum); + P(se->statistics.wait_count); + } #endif P(se->load.weight); #ifdef CONFIG_SMP @@ -122,10 +124,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) (long long)(p->nvcsw + p->nivcsw), p->prio); #ifdef CONFIG_SCHEDSTATS - SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", - SPLIT_NS(p->se.statistics.wait_sum), - SPLIT_NS(p->se.sum_exec_runtime), - SPLIT_NS(p->se.statistics.sum_sleep_runtime)); + if (schedstat_enabled()) { + SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", + SPLIT_NS(p->se.statistics.wait_sum), + SPLIT_NS(p->se.sum_exec_runtime), + SPLIT_NS(p->se.statistics.sum_sleep_runtime)); + } #else SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", 0LL, 0L, @@ -313,17 +317,18 @@ do { \ #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); #define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); - P(yld_count); - - P(sched_count); - P(sched_goidle); #ifdef CONFIG_SMP P64(avg_idle); P64(max_idle_balance_cost); #endif - P(ttwu_count); - P(ttwu_local); + if (schedstat_enabled()) { + P(yld_count); + P(sched_count); + P(sched_goidle); + P(ttwu_count); + P(ttwu_local); + } #undef P #undef P64 @@ -569,38 +574,39 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) nr_switches = p->nvcsw + p->nivcsw; #ifdef CONFIG_SCHEDSTATS - PN(se.statistics.sum_sleep_runtime); - PN(se.statistics.wait_start); - PN(se.statistics.sleep_start); - PN(se.statistics.block_start); - PN(se.statistics.sleep_max); - PN(se.statistics.block_max); - PN(se.statistics.exec_max); - PN(se.statistics.slice_max); - PN(se.statistics.wait_max); - PN(se.statistics.wait_sum); - P(se.statistics.wait_count); - PN(se.statistics.iowait_sum); - P(se.statistics.iowait_count); P(se.nr_migrations); - P(se.statistics.nr_migrations_cold); - P(se.statistics.nr_failed_migrations_affine); - P(se.statistics.nr_failed_migrations_running); - P(se.statistics.nr_failed_migrations_hot); - P(se.statistics.nr_forced_migrations); - P(se.statistics.nr_wakeups); - P(se.statistics.nr_wakeups_sync); - P(se.statistics.nr_wakeups_migrate); - P(se.statistics.nr_wakeups_local); - P(se.statistics.nr_wakeups_remote); - P(se.statistics.nr_wakeups_affine); - P(se.statistics.nr_wakeups_affine_attempts); - P(se.statistics.nr_wakeups_passive); - P(se.statistics.nr_wakeups_idle); - { + if (schedstat_enabled()) { u64 avg_atom, avg_per_cpu; + PN(se.statistics.sum_sleep_runtime); + PN(se.statistics.wait_start); + PN(se.statistics.sleep_start); + PN(se.statistics.block_start); + PN(se.statistics.sleep_max); + PN(se.statistics.block_max); + PN(se.statistics.exec_max); + PN(se.statistics.slice_max); + PN(se.statistics.wait_max); + PN(se.statistics.wait_sum); + P(se.statistics.wait_count); + PN(se.statistics.iowait_sum); + P(se.statistics.iowait_count); + P(se.statistics.nr_migrations_cold); + P(se.statistics.nr_failed_migrations_affine); + P(se.statistics.nr_failed_migrations_running); + P(se.statistics.nr_failed_migrations_hot); + P(se.statistics.nr_forced_migrations); + P(se.statistics.nr_wakeups); + P(se.statistics.nr_wakeups_sync); + P(se.statistics.nr_wakeups_migrate); + P(se.statistics.nr_wakeups_local); + P(se.statistics.nr_wakeups_remote); + P(se.statistics.nr_wakeups_affine); + P(se.statistics.nr_wakeups_affine_attempts); + P(se.statistics.nr_wakeups_passive); + P(se.statistics.nr_wakeups_idle); + avg_atom = p->se.sum_exec_runtime; if (nr_switches) avg_atom = div64_ul(avg_atom, nr_switches); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 56b7d4b83947..51a45502d8a6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -20,8 +20,8 @@ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ -#include #include +#include #include #include #include @@ -755,7 +755,9 @@ static void update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct task_struct *p; - u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; + u64 delta; + + delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; if (entity_is_task(se)) { p = task_of(se); @@ -776,22 +778,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) se->statistics.wait_sum += delta; se->statistics.wait_start = 0; } -#else -static inline void -update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -} - -static inline void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -} -#endif /* * Task is being enqueued - update stats: */ -static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +static inline void +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { /* * Are we enqueueing a waiting task? (for current tasks @@ -802,7 +794,7 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) } static inline void -update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { /* * Mark the end of the wait period if dequeueing a @@ -810,7 +802,40 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) */ if (se != cfs_rq->curr) update_stats_wait_end(cfs_rq, se); + + if (flags & DEQUEUE_SLEEP) { + if (entity_is_task(se)) { + struct task_struct *tsk = task_of(se); + + if (tsk->state & TASK_INTERRUPTIBLE) + se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); + if (tsk->state & TASK_UNINTERRUPTIBLE) + se->statistics.block_start = rq_clock(rq_of(cfs_rq)); + } + } + +} +#else +static inline void +update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} + +static inline void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} + +static inline void +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} + +static inline void +update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +{ } +#endif /* * We are picking a new current task - update its stats: @@ -3102,6 +3127,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) static void check_enqueue_throttle(struct cfs_rq *cfs_rq); +static inline void check_schedstat_required(void) +{ +#ifdef CONFIG_SCHEDSTATS + if (schedstat_enabled()) + return; + + /* Force schedstat enabled if a dependent tracepoint is active */ + if (trace_sched_stat_wait_enabled() || + trace_sched_stat_sleep_enabled() || + trace_sched_stat_iowait_enabled() || + trace_sched_stat_blocked_enabled() || + trace_sched_stat_runtime_enabled()) { + pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, " + "stat_blocked and stat_runtime require the " + "kernel parameter schedstats=enabled or " + "kernel.sched_schedstats=1\n"); + } +#endif +} + static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { @@ -3122,11 +3167,15 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); - enqueue_sleeper(cfs_rq, se); + if (schedstat_enabled()) + enqueue_sleeper(cfs_rq, se); } - update_stats_enqueue(cfs_rq, se); - check_spread(cfs_rq, se); + check_schedstat_required(); + if (schedstat_enabled()) { + update_stats_enqueue(cfs_rq, se); + check_spread(cfs_rq, se); + } if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); se->on_rq = 1; @@ -3193,19 +3242,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_curr(cfs_rq); dequeue_entity_load_avg(cfs_rq, se); - update_stats_dequeue(cfs_rq, se); - if (flags & DEQUEUE_SLEEP) { -#ifdef CONFIG_SCHEDSTATS - if (entity_is_task(se)) { - struct task_struct *tsk = task_of(se); - - if (tsk->state & TASK_INTERRUPTIBLE) - se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); - if (tsk->state & TASK_UNINTERRUPTIBLE) - se->statistics.block_start = rq_clock(rq_of(cfs_rq)); - } -#endif - } + if (schedstat_enabled()) + update_stats_dequeue(cfs_rq, se, flags); clear_buddies(cfs_rq, se); @@ -3279,7 +3317,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * a CPU. So account for the time it spent waiting on the * runqueue. */ - update_stats_wait_end(cfs_rq, se); + if (schedstat_enabled()) + update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); update_load_avg(se, 1); } @@ -3292,7 +3331,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * least twice that of our own weight (i.e. dont track it * when there are only lesser-weight tasks around): */ - if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { + if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { se->statistics.slice_max = max(se->statistics.slice_max, se->sum_exec_runtime - se->prev_sum_exec_runtime); } @@ -3375,9 +3414,13 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* throttle cfs_rqs exceeding runtime */ check_cfs_rq_runtime(cfs_rq); - check_spread(cfs_rq, prev); + if (schedstat_enabled()) { + check_spread(cfs_rq, prev); + if (prev->on_rq) + update_stats_wait_start(cfs_rq, prev); + } + if (prev->on_rq) { - update_stats_wait_start(cfs_rq, prev); /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); /* in !on_rq case, update occurred at dequeue */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 10f16374df7f..1d583870e1a6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1022,6 +1022,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ extern struct static_key_false sched_numa_balancing; +extern struct static_key_false sched_schedstats; static inline u64 global_rt_period(void) { diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index b0fbc7632de5..70b3b6a20fb0 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -29,9 +29,10 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) if (rq) rq->rq_sched_info.run_delay += delta; } -# define schedstat_inc(rq, field) do { (rq)->field++; } while (0) -# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) -# define schedstat_set(var, val) do { var = (val); } while (0) +# define schedstat_enabled() static_branch_unlikely(&sched_schedstats) +# define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0) +# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0) +# define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) #else /* !CONFIG_SCHEDSTATS */ static inline void rq_sched_info_arrive(struct rq *rq, unsigned long long delta) @@ -42,6 +43,7 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) static inline void rq_sched_info_depart(struct rq *rq, unsigned long long delta) {} +# define schedstat_enabled() 0 # define schedstat_inc(rq, field) do { } while (0) # define schedstat_add(rq, field, amt) do { } while (0) # define schedstat_set(var, val) do { } while (0) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 97715fd9e790..f5102fabef7f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -350,6 +350,17 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_SCHEDSTATS + { + .procname = "sched_schedstats", + .data = NULL, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_schedstats, + .extra1 = &zero, + .extra2 = &one, + }, +#endif /* CONFIG_SCHEDSTATS */ #endif /* CONFIG_SMP */ #ifdef CONFIG_NUMA_BALANCING { @@ -505,7 +516,7 @@ static struct ctl_table kern_table[] = { .data = &latencytop_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = sysctl_latencytop, }, #endif #ifdef CONFIG_BLK_DEV_INITRD -- cgit From a63f38cc4ccfa076f87fc3d0c276ee62e710f953 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 3 Feb 2016 13:44:12 -0800 Subject: locking/lockdep: Convert hash tables to hlists Mike said: : CONFIG_UBSAN_ALIGNMENT breaks x86-64 kernel with lockdep enabled, i.e. : kernel with CONFIG_UBSAN_ALIGNMENT=y fails to load without even any error : message. : : The problem is that ubsan callbacks use spinlocks and might be called : before lockdep is initialized. Particularly this line in the : reserve_ebda_region function causes problem: : : lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES); : : If i put lockdep_init() before reserve_ebda_region call in : x86_64_start_reservations kernel loads well. Fix this ordering issue permanently: change lockdep so that it uses hlists for the hash tables. Unlike a list_head, an hlist_head is in its initialized state when it is all-zeroes, so lockdep is ready for operation immediately upon boot - lockdep_init() need not have run. The patch will also save some memory. Probably lockdep_init() and lockdep_initialized can be done away with now. Suggested-by: Mike Krinkin Reported-by: Mike Krinkin Signed-off-by: Andrew Morton Cc: Andrey Ryabinin Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Cc: mm-commits@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 4 ++-- kernel/locking/lockdep.c | 42 +++++++++++++++++++----------------------- 2 files changed, 21 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index c57e424d914b..4dca42fd32f5 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -66,7 +66,7 @@ struct lock_class { /* * class-hash: */ - struct list_head hash_entry; + struct hlist_node hash_entry; /* * global list of all lock-classes: @@ -199,7 +199,7 @@ struct lock_chain { u8 irq_context; u8 depth; u16 base; - struct list_head entry; + struct hlist_node entry; u64 chain_key; }; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index c7710e4092ef..716547fdb873 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -292,7 +292,7 @@ LIST_HEAD(all_lock_classes); #define __classhashfn(key) hash_long((unsigned long)key, CLASSHASH_BITS) #define classhashentry(key) (classhash_table + __classhashfn((key))) -static struct list_head classhash_table[CLASSHASH_SIZE]; +static struct hlist_head classhash_table[CLASSHASH_SIZE]; /* * We put the lock dependency chains into a hash-table as well, to cache @@ -303,7 +303,7 @@ static struct list_head classhash_table[CLASSHASH_SIZE]; #define __chainhashfn(chain) hash_long(chain, CHAINHASH_BITS) #define chainhashentry(chain) (chainhash_table + __chainhashfn((chain))) -static struct list_head chainhash_table[CHAINHASH_SIZE]; +static struct hlist_head chainhash_table[CHAINHASH_SIZE]; /* * The hash key of the lock dependency chains is a hash itself too: @@ -666,7 +666,7 @@ static inline struct lock_class * look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) { struct lockdep_subclass_key *key; - struct list_head *hash_head; + struct hlist_head *hash_head; struct lock_class *class; #ifdef CONFIG_DEBUG_LOCKDEP @@ -719,7 +719,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return NULL; - list_for_each_entry_rcu(class, hash_head, hash_entry) { + hlist_for_each_entry_rcu(class, hash_head, hash_entry) { if (class->key == key) { /* * Huh! same key, different name? Did someone trample @@ -742,7 +742,7 @@ static inline struct lock_class * register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) { struct lockdep_subclass_key *key; - struct list_head *hash_head; + struct hlist_head *hash_head; struct lock_class *class; DEBUG_LOCKS_WARN_ON(!irqs_disabled()); @@ -774,7 +774,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) * We have to do the hash-walk again, to avoid races * with another CPU: */ - list_for_each_entry_rcu(class, hash_head, hash_entry) { + hlist_for_each_entry_rcu(class, hash_head, hash_entry) { if (class->key == key) goto out_unlock_set; } @@ -805,7 +805,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) * We use RCU's safe list-add method to make * parallel walking of the hash-list safe: */ - list_add_tail_rcu(&class->hash_entry, hash_head); + hlist_add_head_rcu(&class->hash_entry, hash_head); /* * Add it to the global list of classes: */ @@ -2021,7 +2021,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, u64 chain_key) { struct lock_class *class = hlock_class(hlock); - struct list_head *hash_head = chainhashentry(chain_key); + struct hlist_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; struct held_lock *hlock_curr; int i, j; @@ -2037,7 +2037,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, * We can walk it lock-free, because entries only get added * to the hash: */ - list_for_each_entry_rcu(chain, hash_head, entry) { + hlist_for_each_entry_rcu(chain, hash_head, entry) { if (chain->chain_key == chain_key) { cache_hit: debug_atomic_inc(chain_lookup_hits); @@ -2061,7 +2061,7 @@ cache_hit: /* * We have to walk the chain again locked - to avoid duplicates: */ - list_for_each_entry(chain, hash_head, entry) { + hlist_for_each_entry(chain, hash_head, entry) { if (chain->chain_key == chain_key) { graph_unlock(); goto cache_hit; @@ -2095,7 +2095,7 @@ cache_hit: } chain_hlocks[chain->base + j] = class - lock_classes; } - list_add_tail_rcu(&chain->entry, hash_head); + hlist_add_head_rcu(&chain->entry, hash_head); debug_atomic_inc(chain_lookup_misses); inc_chains(); @@ -3879,7 +3879,7 @@ void lockdep_reset(void) nr_process_chains = 0; debug_locks = 1; for (i = 0; i < CHAINHASH_SIZE; i++) - INIT_LIST_HEAD(chainhash_table + i); + INIT_HLIST_HEAD(chainhash_table + i); raw_local_irq_restore(flags); } @@ -3898,7 +3898,7 @@ static void zap_class(struct lock_class *class) /* * Unhash the class and remove it from the all_lock_classes list: */ - list_del_rcu(&class->hash_entry); + hlist_del_rcu(&class->hash_entry); list_del_rcu(&class->lock_entry); RCU_INIT_POINTER(class->key, NULL); @@ -3921,7 +3921,7 @@ static inline int within(const void *addr, void *start, unsigned long size) void lockdep_free_key_range(void *start, unsigned long size) { struct lock_class *class; - struct list_head *head; + struct hlist_head *head; unsigned long flags; int i; int locked; @@ -3934,9 +3934,7 @@ void lockdep_free_key_range(void *start, unsigned long size) */ for (i = 0; i < CLASSHASH_SIZE; i++) { head = classhash_table + i; - if (list_empty(head)) - continue; - list_for_each_entry_rcu(class, head, hash_entry) { + hlist_for_each_entry_rcu(class, head, hash_entry) { if (within(class->key, start, size)) zap_class(class); else if (within(class->name, start, size)) @@ -3966,7 +3964,7 @@ void lockdep_free_key_range(void *start, unsigned long size) void lockdep_reset_lock(struct lockdep_map *lock) { struct lock_class *class; - struct list_head *head; + struct hlist_head *head; unsigned long flags; int i, j; int locked; @@ -3991,9 +3989,7 @@ void lockdep_reset_lock(struct lockdep_map *lock) locked = graph_lock(); for (i = 0; i < CLASSHASH_SIZE; i++) { head = classhash_table + i; - if (list_empty(head)) - continue; - list_for_each_entry_rcu(class, head, hash_entry) { + hlist_for_each_entry_rcu(class, head, hash_entry) { int match = 0; for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) @@ -4031,10 +4027,10 @@ void lockdep_init(void) return; for (i = 0; i < CLASSHASH_SIZE; i++) - INIT_LIST_HEAD(classhash_table + i); + INIT_HLIST_HEAD(classhash_table + i); for (i = 0; i < CHAINHASH_SIZE; i++) - INIT_LIST_HEAD(chainhash_table + i); + INIT_HLIST_HEAD(chainhash_table + i); lockdep_initialized = 1; } -- cgit From 06bea3dbfe6a4c333c4333362c46bdf4d9e43504 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 4 Feb 2016 11:29:36 -0800 Subject: locking/lockdep: Eliminate lockdep_init() Lockdep is initialized at compile time now. Get rid of lockdep_init(). Signed-off-by: Andrey Ryabinin Signed-off-by: Andrew Morton Cc: Linus Torvalds Cc: Mike Krinkin Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Cc: mm-commits@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/c6x/kernel/setup.c | 2 - arch/microblaze/kernel/setup.c | 2 - arch/powerpc/kernel/setup_32.c | 2 - arch/powerpc/kernel/setup_64.c | 3 -- arch/s390/kernel/early.c | 1 - arch/sparc/kernel/head_64.S | 8 ---- arch/x86/lguest/boot.c | 6 --- include/linux/lockdep.h | 2 - init/main.c | 5 --- kernel/locking/lockdep.c | 59 --------------------------- tools/lib/lockdep/common.c | 5 --- tools/lib/lockdep/include/liblockdep/common.h | 1 - tools/lib/lockdep/preload.c | 2 - 13 files changed, 98 deletions(-) (limited to 'kernel') diff --git a/arch/c6x/kernel/setup.c b/arch/c6x/kernel/setup.c index 72e17f7ebd6f..786e36e2f61d 100644 --- a/arch/c6x/kernel/setup.c +++ b/arch/c6x/kernel/setup.c @@ -281,8 +281,6 @@ notrace void __init machine_init(unsigned long dt_ptr) */ set_ist(_vectors_start); - lockdep_init(); - /* * dtb is passed in from bootloader. * fdt is linked in blob. diff --git a/arch/microblaze/kernel/setup.c b/arch/microblaze/kernel/setup.c index 89a2a9394927..f31ebb5dc26c 100644 --- a/arch/microblaze/kernel/setup.c +++ b/arch/microblaze/kernel/setup.c @@ -130,8 +130,6 @@ void __init machine_early_init(const char *cmdline, unsigned int ram, memset(__bss_start, 0, __bss_stop-__bss_start); memset(_ssbss, 0, _esbss-_ssbss); - lockdep_init(); - /* initialize device tree for usage in early_printk */ early_init_devtree(_fdt_start); diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index ad8c9db61237..d544fa311757 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -114,8 +114,6 @@ extern unsigned int memset_nocache_branch; /* Insn to be replaced by NOP */ notrace void __init machine_init(u64 dt_ptr) { - lockdep_init(); - /* Enable early debugging if any specified (see udbg.h) */ udbg_early_init(); diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 5c03a6a9b054..f98be8383a39 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -255,9 +255,6 @@ void __init early_setup(unsigned long dt_ptr) setup_paca(&boot_paca); fixup_boot_paca(); - /* Initialize lockdep early or else spinlocks will blow */ - lockdep_init(); - /* -------- printk is now safe to use ------- */ /* Enable early debugging if any specified (see udbg.h) */ diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index c55576bbaa1f..a0684de5a93b 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -448,7 +448,6 @@ void __init startup_init(void) rescue_initrd(); clear_bss_section(); init_kernel_storage_key(); - lockdep_init(); lockdep_off(); setup_lowcore_early(); setup_facility_list(); diff --git a/arch/sparc/kernel/head_64.S b/arch/sparc/kernel/head_64.S index f2d30cab5b3f..cd1f592cd347 100644 --- a/arch/sparc/kernel/head_64.S +++ b/arch/sparc/kernel/head_64.S @@ -696,14 +696,6 @@ tlb_fixup_done: call __bzero sub %o1, %o0, %o1 -#ifdef CONFIG_LOCKDEP - /* We have this call this super early, as even prom_init can grab - * spinlocks and thus call into the lockdep code. - */ - call lockdep_init - nop -#endif - call prom_init mov %l7, %o0 ! OpenPROM cif handler diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 4ba229ac3f4f..f56cc418c87d 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1520,12 +1520,6 @@ __init void lguest_init(void) */ reserve_top_address(lguest_data.reserve_mem); - /* - * If we don't initialize the lock dependency checker now, it crashes - * atomic_notifier_chain_register, then paravirt_disable_iospace. - */ - lockdep_init(); - /* Hook in our special panic hypercall code. */ atomic_notifier_chain_register(&panic_notifier_list, &paniced); diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 4dca42fd32f5..d026b190c530 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -261,7 +261,6 @@ struct held_lock { /* * Initialization, self-test and debugging-output methods: */ -extern void lockdep_init(void); extern void lockdep_info(void); extern void lockdep_reset(void); extern void lockdep_reset_lock(struct lockdep_map *lock); @@ -392,7 +391,6 @@ static inline void lockdep_on(void) # define lockdep_set_current_reclaim_state(g) do { } while (0) # define lockdep_clear_current_reclaim_state() do { } while (0) # define lockdep_trace_alloc(g) do { } while (0) -# define lockdep_init() do { } while (0) # define lockdep_info() do { } while (0) # define lockdep_init_map(lock, name, key, sub) \ do { (void)(name); (void)(key); } while (0) diff --git a/init/main.c b/init/main.c index 58c9e374704b..b3008bcfb1dc 100644 --- a/init/main.c +++ b/init/main.c @@ -499,11 +499,6 @@ asmlinkage __visible void __init start_kernel(void) char *command_line; char *after_dashes; - /* - * Need to run as early as possible, to initialize the - * lockdep hash: - */ - lockdep_init(); set_task_stack_end_magic(&init_task); smp_setup_processor_id(); debug_objects_early_init(); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 716547fdb873..3261214323fa 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -123,8 +123,6 @@ static inline int debug_locks_off_graph_unlock(void) return ret; } -static int lockdep_initialized; - unsigned long nr_list_entries; static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; @@ -433,19 +431,6 @@ unsigned int nr_process_chains; unsigned int max_lockdep_depth; #ifdef CONFIG_DEBUG_LOCKDEP -/* - * We cannot printk in early bootup code. Not even early_printk() - * might work. So we mark any initialization errors and printk - * about it later on, in lockdep_info(). - */ -static int lockdep_init_error; -static const char *lock_init_error; -static unsigned long lockdep_init_trace_data[20]; -static struct stack_trace lockdep_init_trace = { - .max_entries = ARRAY_SIZE(lockdep_init_trace_data), - .entries = lockdep_init_trace_data, -}; - /* * Various lockdep statistics: */ @@ -669,20 +654,6 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) struct hlist_head *hash_head; struct lock_class *class; -#ifdef CONFIG_DEBUG_LOCKDEP - /* - * If the architecture calls into lockdep before initializing - * the hashes then we'll warn about it later. (we cannot printk - * right now) - */ - if (unlikely(!lockdep_initialized)) { - lockdep_init(); - lockdep_init_error = 1; - lock_init_error = lock->name; - save_stack_trace(&lockdep_init_trace); - } -#endif - if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { debug_locks_off(); printk(KERN_ERR @@ -4013,28 +3984,6 @@ out_restore: raw_local_irq_restore(flags); } -void lockdep_init(void) -{ - int i; - - /* - * Some architectures have their own start_kernel() - * code which calls lockdep_init(), while we also - * call lockdep_init() from the start_kernel() itself, - * and we want to initialize the hashes only once: - */ - if (lockdep_initialized) - return; - - for (i = 0; i < CLASSHASH_SIZE; i++) - INIT_HLIST_HEAD(classhash_table + i); - - for (i = 0; i < CHAINHASH_SIZE; i++) - INIT_HLIST_HEAD(chainhash_table + i); - - lockdep_initialized = 1; -} - void __init lockdep_info(void) { printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n"); @@ -4061,14 +4010,6 @@ void __init lockdep_info(void) printk(" per task-struct memory footprint: %lu bytes\n", sizeof(struct held_lock) * MAX_LOCK_DEPTH); - -#ifdef CONFIG_DEBUG_LOCKDEP - if (lockdep_init_error) { - printk("WARNING: lockdep init error: lock '%s' was acquired before lockdep_init().\n", lock_init_error); - printk("Call stack leading to lockdep invocation was:\n"); - print_stack_trace(&lockdep_init_trace, 0); - } -#endif } static void diff --git a/tools/lib/lockdep/common.c b/tools/lib/lockdep/common.c index 9be663340f0a..d1c89cc06f5f 100644 --- a/tools/lib/lockdep/common.c +++ b/tools/lib/lockdep/common.c @@ -11,11 +11,6 @@ static __thread struct task_struct current_obj; bool debug_locks = true; bool debug_locks_silent; -__attribute__((constructor)) static void liblockdep_init(void) -{ - lockdep_init(); -} - __attribute__((destructor)) static void liblockdep_exit(void) { debug_check_no_locks_held(); diff --git a/tools/lib/lockdep/include/liblockdep/common.h b/tools/lib/lockdep/include/liblockdep/common.h index a60c14b9662a..6e66277ec437 100644 --- a/tools/lib/lockdep/include/liblockdep/common.h +++ b/tools/lib/lockdep/include/liblockdep/common.h @@ -44,7 +44,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, void lock_release(struct lockdep_map *lock, int nested, unsigned long ip); extern void debug_check_no_locks_freed(const void *from, unsigned long len); -extern void lockdep_init(void); #define STATIC_LOCKDEP_MAP_INIT(_name, _key) \ { .name = (_name), .key = (void *)(_key), } diff --git a/tools/lib/lockdep/preload.c b/tools/lib/lockdep/preload.c index 21cdf869a01b..52844847569c 100644 --- a/tools/lib/lockdep/preload.c +++ b/tools/lib/lockdep/preload.c @@ -439,7 +439,5 @@ __attribute__((constructor)) static void init_preload(void) ll_pthread_rwlock_unlock = dlsym(RTLD_NEXT, "pthread_rwlock_unlock"); #endif - lockdep_init(); - __init_state = done; } -- cgit From 4142c3ebb685bb338b7d96090d8f90ff49065ff6 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 25 Jan 2016 17:07:39 -0500 Subject: sched/numa: Spread memory according to CPU and memory use The pseudo-interleaving in NUMA placement has a fundamental problem: using hard usage thresholds to spread memory equally between nodes can prevent workloads from converging, or keep memory "trapped" on nodes where the workload is barely running any more. In order for workloads to properly converge, the memory migration should not be stopped when nodes reach parity, but instead be distributed according to how heavily memory is used from each node. This way memory migration and task migration reinforce each other, instead of one putting the brakes on the other. Remove the hard thresholds from the pseudo-interleaving code, and instead use a more gradual policy on memory placement. This also seems to improve convergence of workloads that do not run flat out, but sleep in between bursts of activity. We still want to slow down NUMA scanning and migration once a workload has settled on a few actively used nodes, so keep the 3/4 hysteresis in place. Keep track of whether a workload is actively running on multiple nodes, so task_numa_migrate does a full scan of the system for better task placement. In the case of running 3 SPECjbb2005 instances on a 4 node system, this code seems to result in fairer distribution of memory between nodes, with more memory bandwidth for each instance. Signed-off-by: Rik van Riel Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: mgorman@suse.de Link: http://lkml.kernel.org/r/20160125170739.2fc9a641@annuminas.surriel.com [ Minor readability tweaks. ] Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 87 +++++++++++++++++++++++++++++------------------------ 1 file changed, 47 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 51a45502d8a6..7ce24a456322 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -932,10 +932,11 @@ struct numa_group { spinlock_t lock; /* nr_tasks, tasks */ int nr_tasks; pid_t gid; + int active_nodes; struct rcu_head rcu; - nodemask_t active_nodes; unsigned long total_faults; + unsigned long max_faults_cpu; /* * Faults_cpu is used to decide whether memory should move * towards the CPU. As a consequence, these stats are weighted @@ -994,6 +995,18 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; } +/* + * A node triggering more than 1/3 as many NUMA faults as the maximum is + * considered part of a numa group's pseudo-interleaving set. Migrations + * between these nodes are slowed down, to allow things to settle down. + */ +#define ACTIVE_NODE_FRACTION 3 + +static bool numa_is_active_node(int nid, struct numa_group *ng) +{ + return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu; +} + /* Handle placement on systems where not all nodes are directly connected. */ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, int maxdist, bool task) @@ -1143,27 +1156,23 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, return true; /* - * Do not migrate if the destination is not a node that - * is actively used by this numa group. + * Destination node is much more heavily used than the source + * node? Allow migration. */ - if (!node_isset(dst_nid, ng->active_nodes)) - return false; - - /* - * Source is a node that is not actively used by this - * numa group, while the destination is. Migrate. - */ - if (!node_isset(src_nid, ng->active_nodes)) + if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) * + ACTIVE_NODE_FRACTION) return true; /* - * Both source and destination are nodes in active - * use by this numa group. Maximize memory bandwidth - * by migrating from more heavily used groups, to less - * heavily used ones, spreading the load around. - * Use a 1/4 hysteresis to avoid spurious page movement. + * Distribute memory according to CPU & memory use on each node, + * with 3/4 hysteresis to avoid unnecessary memory migrations: + * + * faults_cpu(dst) 3 faults_cpu(src) + * --------------- * - > --------------- + * faults_mem(dst) 4 faults_mem(src) */ - return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4); + return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 > + group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; } static unsigned long weighted_cpuload(const int cpu); @@ -1509,7 +1518,7 @@ static int task_numa_migrate(struct task_struct *p) .best_task = NULL, .best_imp = 0, - .best_cpu = -1 + .best_cpu = -1, }; struct sched_domain *sd; unsigned long taskweight, groupweight; @@ -1561,8 +1570,7 @@ static int task_numa_migrate(struct task_struct *p) * multiple NUMA nodes; in order to better consolidate the group, * we need to check other locations. */ - if (env.best_cpu == -1 || (p->numa_group && - nodes_weight(p->numa_group->active_nodes) > 1)) { + if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) { for_each_online_node(nid) { if (nid == env.src_nid || nid == p->numa_preferred_nid) continue; @@ -1597,12 +1605,14 @@ static int task_numa_migrate(struct task_struct *p) * trying for a better one later. Do not set the preferred node here. */ if (p->numa_group) { + struct numa_group *ng = p->numa_group; + if (env.best_cpu == -1) nid = env.src_nid; else nid = env.dst_nid; - if (node_isset(nid, p->numa_group->active_nodes)) + if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng)) sched_setnuma(p, env.dst_nid); } @@ -1652,20 +1662,15 @@ static void numa_migrate_preferred(struct task_struct *p) } /* - * Find the nodes on which the workload is actively running. We do this by + * Find out how many nodes on the workload is actively running on. Do this by * tracking the nodes from which NUMA hinting faults are triggered. This can * be different from the set of nodes where the workload's memory is currently * located. - * - * The bitmask is used to make smarter decisions on when to do NUMA page - * migrations, To prevent flip-flopping, and excessive page migrations, nodes - * are added when they cause over 6/16 of the maximum number of faults, but - * only removed when they drop below 3/16. */ -static void update_numa_active_node_mask(struct numa_group *numa_group) +static void numa_group_count_active_nodes(struct numa_group *numa_group) { unsigned long faults, max_faults = 0; - int nid; + int nid, active_nodes = 0; for_each_online_node(nid) { faults = group_faults_cpu(numa_group, nid); @@ -1675,12 +1680,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group) for_each_online_node(nid) { faults = group_faults_cpu(numa_group, nid); - if (!node_isset(nid, numa_group->active_nodes)) { - if (faults > max_faults * 6 / 16) - node_set(nid, numa_group->active_nodes); - } else if (faults < max_faults * 3 / 16) - node_clear(nid, numa_group->active_nodes); + if (faults * ACTIVE_NODE_FRACTION > max_faults) + active_nodes++; } + + numa_group->max_faults_cpu = max_faults; + numa_group->active_nodes = active_nodes; } /* @@ -1971,7 +1976,7 @@ static void task_numa_placement(struct task_struct *p) update_task_scan_period(p, fault_types[0], fault_types[1]); if (p->numa_group) { - update_numa_active_node_mask(p->numa_group); + numa_group_count_active_nodes(p->numa_group); spin_unlock_irq(group_lock); max_nid = preferred_group_nid(p, max_group_nid); } @@ -2015,14 +2020,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, return; atomic_set(&grp->refcount, 1); + grp->active_nodes = 1; + grp->max_faults_cpu = 0; spin_lock_init(&grp->lock); grp->gid = p->pid; /* Second half of the array tracks nids where faults happen */ grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * nr_node_ids; - node_set(task_node(current), grp->active_nodes); - for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) grp->faults[i] = p->numa_faults[i]; @@ -2136,6 +2141,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) bool migrated = flags & TNF_MIGRATED; int cpu_node = task_node(current); int local = !!(flags & TNF_FAULT_LOCAL); + struct numa_group *ng; int priv; if (!static_branch_likely(&sched_numa_balancing)) @@ -2176,9 +2182,10 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) * actively using should be counted as local. This allows the * scan rate to slow down when a workload has settled down. */ - if (!priv && !local && p->numa_group && - node_isset(cpu_node, p->numa_group->active_nodes) && - node_isset(mem_node, p->numa_group->active_nodes)) + ng = p->numa_group; + if (!priv && !local && ng && ng->active_nodes > 1 && + numa_is_active_node(cpu_node, ng) && + numa_is_active_node(mem_node, ng)) local = 1; task_numa_placement(p); -- cgit From 041bd12e272c53a35c54c13875839bcb98c999ce Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 9 Feb 2016 16:11:26 -0500 Subject: Revert "workqueue: make sure delayed work run in local cpu" This reverts commit 874bbfe600a660cba9c776b3957b1ce393151b76. Workqueue used to implicity guarantee that work items queued without explicit CPU specified are put on the local CPU. Recent changes in timer broke the guarantee and led to vmstat breakage which was fixed by 176bed1de5bf ("vmstat: explicitly schedule per-cpu work on the CPU we need it to run on"). vmstat is the most likely to expose the issue and it's quite possible that there are other similar problems which are a lot more difficult to trigger. As a preventive measure, 874bbfe600a6 ("workqueue: make sure delayed work run in local cpu") was applied to restore the local CPU guarnatee. Unfortunately, the change exposed a bug in timer code which got fixed by 22b886dd1018 ("timers: Use proper base migration in add_timer_on()"). Due to code restructuring, the commit couldn't be backported beyond certain point and stable kernels which only had 874bbfe600a6 started crashing. The local CPU guarantee was accidental more than anything else and we want to get rid of it anyway. As, with the vmstat case fixed, 874bbfe600a6 is causing more problems than it's fixing, it has been decided to take the chance and officially break the guarantee by reverting the commit. A debug feature will be added to force foreign CPU assignment to expose cases relying on the guarantee and fixes for the individual cases will be backported to stable as necessary. Signed-off-by: Tejun Heo Fixes: 874bbfe600a6 ("workqueue: make sure delayed work run in local cpu") Link: http://lkml.kernel.org/g/20160120211926.GJ10810@quack.suse.cz Cc: stable@vger.kernel.org Cc: Mike Galbraith Cc: Henrique de Moraes Holschuh Cc: Daniel Bilik Cc: Jan Kara Cc: Shaohua Li Cc: Sasha Levin Cc: Ben Hutchings Cc: Thomas Gleixner Cc: Daniel Bilik Cc: Jiri Slaby Cc: Michal Hocko --- kernel/workqueue.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index dc7faad63646..5e63d3b719ae 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1464,13 +1464,13 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, timer_stats_timer_set_start_info(&dwork->timer); dwork->wq = wq; - /* timer isn't guaranteed to run in this cpu, record earlier */ - if (cpu == WORK_CPU_UNBOUND) - cpu = raw_smp_processor_id(); dwork->cpu = cpu; timer->expires = jiffies + delay; - add_timer_on(timer, cpu); + if (unlikely(cpu != WORK_CPU_UNBOUND)) + add_timer_on(timer, cpu); + else + add_timer(timer); } /** -- cgit From ef557180447fa9a7a0affd3abb21ecceb4b5e125 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 9 Feb 2016 17:59:38 -0500 Subject: workqueue: schedule WORK_CPU_UNBOUND work on wq_unbound_cpumask CPUs WORK_CPU_UNBOUND work items queued to a bound workqueue always run locally. This is a good thing normally, but not when the user has asked us to keep unbound work away from certain CPUs. Round robin these to wq_unbound_cpumask CPUs instead, as perturbation avoidance trumps performance. tj: Cosmetic and comment changes. WARN_ON_ONCE() dropped from empty (wq_unbound_cpumask AND cpu_online_mask). If we want that, it should be done when config changes. Signed-off-by: Mike Galbraith Signed-off-by: Tejun Heo --- kernel/workqueue.c | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5e63d3b719ae..054774605d2f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -301,7 +301,11 @@ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ static LIST_HEAD(workqueues); /* PR: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? */ -static cpumask_var_t wq_unbound_cpumask; /* PL: low level cpumask for all unbound wqs */ +/* PL: allowable cpus for unbound wqs and work items */ +static cpumask_var_t wq_unbound_cpumask; + +/* CPU where unbound work was last round robin scheduled from this CPU */ +static DEFINE_PER_CPU(int, wq_rr_cpu_last); /* the per-cpu worker pools */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], @@ -1298,6 +1302,32 @@ static bool is_chained_work(struct workqueue_struct *wq) return worker && worker->current_pwq->wq == wq; } +/* + * When queueing an unbound work item to a wq, prefer local CPU if allowed + * by wq_unbound_cpumask. Otherwise, round robin among the allowed ones to + * avoid perturbing sensitive tasks. + */ +static int wq_select_unbound_cpu(int cpu) +{ + int new_cpu; + + if (cpumask_test_cpu(cpu, wq_unbound_cpumask)) + return cpu; + if (cpumask_empty(wq_unbound_cpumask)) + return cpu; + + new_cpu = __this_cpu_read(wq_rr_cpu_last); + new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask); + if (unlikely(new_cpu >= nr_cpu_ids)) { + new_cpu = cpumask_first_and(wq_unbound_cpumask, cpu_online_mask); + if (unlikely(new_cpu >= nr_cpu_ids)) + return cpu; + } + __this_cpu_write(wq_rr_cpu_last, new_cpu); + + return new_cpu; +} + static void __queue_work(int cpu, struct workqueue_struct *wq, struct work_struct *work) { @@ -1323,7 +1353,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, return; retry: if (req_cpu == WORK_CPU_UNBOUND) - cpu = raw_smp_processor_id(); + cpu = wq_select_unbound_cpu(raw_smp_processor_id()); /* pwq which will be used unless @work is executing elsewhere */ if (!(wq->flags & WQ_UNBOUND)) -- cgit From f303fccb82928790ec58eea82722bd5c54d300b3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 9 Feb 2016 17:59:38 -0500 Subject: workqueue: implement "workqueue.debug_force_rr_cpu" debug feature Workqueue used to guarantee local execution for work items queued without explicit target CPU. The guarantee is gone now which can break some usages in subtle ways. To flush out those cases, this patch implements a debug feature which forces round-robin CPU selection for all such work items. The debug feature defaults to off and can be enabled with a kernel parameter. The default can be flipped with a debug config option. If you hit this commit during bisection, please refer to 041bd12e272c ("Revert "workqueue: make sure delayed work run in local cpu"") for more information and ping me. Signed-off-by: Tejun Heo --- Documentation/kernel-parameters.txt | 11 +++++++++++ kernel/workqueue.c | 23 +++++++++++++++++++++-- lib/Kconfig.debug | 15 +++++++++++++++ 3 files changed, 47 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 87d40a72f6a1..cda2ead39093 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -4230,6 +4230,17 @@ bytes respectively. Such letter suffixes can also be entirely omitted. The default value of this parameter is determined by the config option CONFIG_WQ_POWER_EFFICIENT_DEFAULT. + workqueue.debug_force_rr_cpu + Workqueue used to implicitly guarantee that work + items queued without explicit CPU specified are put + on the local CPU. This guarantee is no longer true + and while local CPU is still preferred work items + may be put on foreign CPUs. This debug option + forces round-robin CPU selection to flush out + usages which depend on the now broken guarantee. + When enabled, memory and cache locality will be + impacted. + x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of default x2apic cluster mode on platforms supporting x2apic. diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 054774605d2f..51d77e7c0989 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -307,6 +307,18 @@ static cpumask_var_t wq_unbound_cpumask; /* CPU where unbound work was last round robin scheduled from this CPU */ static DEFINE_PER_CPU(int, wq_rr_cpu_last); +/* + * Local execution of unbound work items is no longer guaranteed. The + * following always forces round-robin CPU selection on unbound work items + * to uncover usages which depend on it. + */ +#ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU +static bool wq_debug_force_rr_cpu = true; +#else +static bool wq_debug_force_rr_cpu = false; +#endif +module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644); + /* the per-cpu worker pools */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools); @@ -1309,10 +1321,17 @@ static bool is_chained_work(struct workqueue_struct *wq) */ static int wq_select_unbound_cpu(int cpu) { + static bool printed_dbg_warning; int new_cpu; - if (cpumask_test_cpu(cpu, wq_unbound_cpumask)) - return cpu; + if (likely(!wq_debug_force_rr_cpu)) { + if (cpumask_test_cpu(cpu, wq_unbound_cpumask)) + return cpu; + } else if (!printed_dbg_warning) { + pr_warn("workqueue: round-robin CPU selection forced, expect performance impact\n"); + printed_dbg_warning = true; + } + if (cpumask_empty(wq_unbound_cpumask)) return cpu; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index ecb9e75614bf..8bfd1aca7a3d 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1400,6 +1400,21 @@ config RCU_EQS_DEBUG endmenu # "RCU Debugging" +config DEBUG_WQ_FORCE_RR_CPU + bool "Force round-robin CPU selection for unbound work items" + depends on DEBUG_KERNEL + default n + help + Workqueue used to implicitly guarantee that work items queued + without explicit CPU specified are put on the local CPU. This + guarantee is no longer true and while local CPU is still + preferred work items may be put on foreign CPUs. Kernel + parameter "workqueue.debug_force_rr_cpu" is added to force + round-robin CPU selection to flush out usages which depend on the + now broken guarantee. This config option enables the debug + feature by default. When enabled, memory and cache locality will + be impacted. + config DEBUG_BLOCK_EXT_DEVT bool "Force extended block device numbers and spread them" depends on DEBUG_KERNEL -- cgit From d6e022f1d207a161cd88e08ef0371554680ffc46 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 3 Feb 2016 13:54:25 -0500 Subject: workqueue: handle NUMA_NO_NODE for unbound pool_workqueue lookup When looking up the pool_workqueue to use for an unbound workqueue, workqueue assumes that the target CPU is always bound to a valid NUMA node. However, currently, when a CPU goes offline, the mapping is destroyed and cpu_to_node() returns NUMA_NO_NODE. This has always been broken but hasn't triggered often enough before 874bbfe600a6 ("workqueue: make sure delayed work run in local cpu"). After the commit, workqueue forcifully assigns the local CPU for delayed work items without explicit target CPU to fix a different issue. This widens the window where CPU can go offline while a delayed work item is pending causing delayed work items dispatched with target CPU set to an already offlined CPU. The resulting NUMA_NO_NODE mapping makes workqueue try to queue the work item on a NULL pool_workqueue and thus crash. While 874bbfe600a6 has been reverted for a different reason making the bug less visible again, it can still happen. Fix it by mapping NUMA_NO_NODE to the default pool_workqueue from unbound_pwq_by_node(). This is a temporary workaround. The long term solution is keeping CPU -> NODE mapping stable across CPU off/online cycles which is being worked on. Signed-off-by: Tejun Heo Reported-by: Mike Galbraith Cc: Tang Chen Cc: Rafael J. Wysocki Cc: Len Brown Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/g/1454424264.11183.46.camel@gmail.com Link: http://lkml.kernel.org/g/1453702100-2597-1-git-send-email-tangchen@cn.fujitsu.com --- kernel/workqueue.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 51d77e7c0989..7ff5dc7d2ac5 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -586,6 +586,16 @@ static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq, int node) { assert_rcu_or_wq_mutex_or_pool_mutex(wq); + + /* + * XXX: @node can be NUMA_NO_NODE if CPU goes offline while a + * delayed item is pending. The plan is to keep CPU -> NODE + * mapping valid and stable across CPU on/offlines. Once that + * happens, this workaround can be removed. + */ + if (unlikely(node == NUMA_NO_NODE)) + return wq->dfl_pwq; + return rcu_dereference_raw(wq->numa_pwq_tbl[node]); } -- cgit From a1b14d27ed0965838350f1377ff97c93ee383492 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 10 Feb 2016 16:47:11 +0100 Subject: bpf: fix branch offset adjustment on backjumps after patching ctx expansion When ctx access is used, the kernel often needs to expand/rewrite instructions, so after that patching, branch offsets have to be adjusted for both forward and backward jumps in the new eBPF program, but for backward jumps it fails to account the delta. Meaning, for example, if the expansion happens exactly on the insn that sits at the jump target, it doesn't fix up the back jump offset. Analysis on what the check in adjust_branches() is currently doing: /* adjust offset of jmps if necessary */ if (i < pos && i + insn->off + 1 > pos) insn->off += delta; else if (i > pos && i + insn->off + 1 < pos) insn->off -= delta; First condition (forward jumps): Before: After: insns[0] insns[0] insns[1] <--- i/insn insns[1] <--- i/insn insns[2] <--- pos insns[P] <--- pos insns[3] insns[P] `------| delta insns[4] <--- target_X insns[P] `-----| insns[5] insns[3] insns[4] <--- target_X insns[5] First case is if we cross pos-boundary and the jump instruction was before pos. This is handeled correctly. I.e. if i == pos, then this would mean our jump that we currently check was the patchlet itself that we just injected. Since such patchlets are self-contained and have no awareness of any insns before or after the patched one, the delta is correctly not adjusted. Also, for the second condition in case of i + insn->off + 1 == pos, means we jump to that newly patched instruction, so no offset adjustment are needed. That part is correct. Second condition (backward jumps): Before: After: insns[0] insns[0] insns[1] <--- target_X insns[1] <--- target_X insns[2] <--- pos <-- target_Y insns[P] <--- pos <-- target_Y insns[3] insns[P] `------| delta insns[4] <--- i/insn insns[P] `-----| insns[5] insns[3] insns[4] <--- i/insn insns[5] Second interesting case is where we cross pos-boundary and the jump instruction was after pos. Backward jump with i == pos would be impossible and pose a bug somewhere in the patchlet, so the first condition checking i > pos is okay only by itself. However, i + insn->off + 1 < pos does not always work as intended to trigger the adjustment. It works when jump targets would be far off where the delta wouldn't matter. But, for example, where the fixed insn->off before pointed to pos (target_Y), it now points to pos + delta, so that additional room needs to be taken into account for the check. This means that i) both tests here need to be adjusted into pos + delta, and ii) for the second condition, the test needs to be <= as pos itself can be a target in the backjump, too. Fixes: 9bac3d6d548e ("bpf: allow extended BPF programs access skb fields") Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d1d3e8f57de9..2e7f7ab739e4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2082,7 +2082,7 @@ static void adjust_branches(struct bpf_prog *prog, int pos, int delta) /* adjust offset of jmps if necessary */ if (i < pos && i + insn->off + 1 > pos) insn->off += delta; - else if (i > pos && i + insn->off + 1 < pos) + else if (i > pos + delta && i + insn->off + 1 <= pos + delta) insn->off -= delta; } } -- cgit From f7b382b988233b5851eddf4531651ffe4133e88c Mon Sep 17 00:00:00 2001 From: Abhilash Jindal Date: Sun, 31 Jan 2016 14:29:01 -0500 Subject: PM/freezer: y2038, use boottime to compare tstamps Wall time obtained from do_gettimeofday gives 32 bit timeval which can only represent time until January 2038. This patch moves to ktime_t, a 64-bit time. Also, wall time is susceptible to sudden jumps due to user setting the time or due to NTP. Boot time is constantly increasing time better suited for subtracting two timestamps. Signed-off-by: Abhilash Jindal Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/process.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 564f786df470..df058bed53ce 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -30,13 +30,12 @@ static int try_to_freeze_tasks(bool user_only) unsigned long end_time; unsigned int todo; bool wq_busy = false; - struct timeval start, end; - u64 elapsed_msecs64; + ktime_t start, end, elapsed; unsigned int elapsed_msecs; bool wakeup = false; int sleep_usecs = USEC_PER_MSEC; - do_gettimeofday(&start); + start = ktime_get_boottime(); end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs); @@ -78,10 +77,9 @@ static int try_to_freeze_tasks(bool user_only) sleep_usecs *= 2; } - do_gettimeofday(&end); - elapsed_msecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); - do_div(elapsed_msecs64, NSEC_PER_MSEC); - elapsed_msecs = elapsed_msecs64; + end = ktime_get_boottime(); + elapsed = ktime_sub(end, start); + elapsed_msecs = ktime_to_ms(elapsed); if (todo) { pr_cont("\n"); -- cgit From 22e09b333f0b395b3eb6ab6efa4b3284e2c06810 Mon Sep 17 00:00:00 2001 From: saurabh Date: Wed, 28 Oct 2015 08:54:01 +0530 Subject: PM / suspend: replacing printk replacing printk(s) with appropriate pr_info and pr_err in order to fix checkpatch.pl warnings Signed-off-by: Saurabh Sengar Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index f9fe133c13e2..230a77225e2e 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -248,7 +248,7 @@ static int suspend_test(int level) { #ifdef CONFIG_PM_DEBUG if (pm_test_level == level) { - printk(KERN_INFO "suspend debug: Waiting for %d second(s).\n", + pr_info("suspend debug: Waiting for %d second(s).\n", pm_test_delay); mdelay(pm_test_delay * 1000); return 1; @@ -320,7 +320,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) error = dpm_suspend_late(PMSG_SUSPEND); if (error) { - printk(KERN_ERR "PM: late suspend of devices failed\n"); + pr_err("PM: late suspend of devices failed\n"); goto Platform_finish; } error = platform_suspend_prepare_late(state); @@ -329,7 +329,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) error = dpm_suspend_noirq(PMSG_SUSPEND); if (error) { - printk(KERN_ERR "PM: noirq suspend of devices failed\n"); + pr_err("PM: noirq suspend of devices failed\n"); goto Platform_early_resume; } error = platform_suspend_prepare_noirq(state); -- cgit From 4a389810bc3cb0e73443104f0827e81e23cb1e12 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 11 Feb 2016 16:13:14 -0800 Subject: kernel/locking/lockdep.c: convert hash tables to hlists Mike said: : CONFIG_UBSAN_ALIGNMENT breaks x86-64 kernel with lockdep enabled, i. e : kernel with CONFIG_UBSAN_ALIGNMENT fails to load without even any error : message. : : The problem is that ubsan callbacks use spinlocks and might be called : before lockdep is initialized. Particularly this line in the : reserve_ebda_region function causes problem: : : lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES); : : If i put lockdep_init() before reserve_ebda_region call in : x86_64_start_reservations kernel loads well. Fix this ordering issue permanently: change lockdep so that it uses hlists for the hash tables. Unlike a list_head, an hlist_head is in its initialized state when it is all-zeroes, so lockdep is ready for operation immediately upon boot - lockdep_init() need not have run. The patch will also save some memory. lockdep_init() and lockdep_initialized can be done away with now - a 4.6 patch has been prepared to do this. Reported-by: Mike Krinkin Suggested-by: Mike Krinkin Cc: Andrey Ryabinin Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/lockdep.h | 4 ++-- kernel/locking/lockdep.c | 42 +++++++++++++++++++----------------------- 2 files changed, 21 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index c57e424d914b..4dca42fd32f5 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -66,7 +66,7 @@ struct lock_class { /* * class-hash: */ - struct list_head hash_entry; + struct hlist_node hash_entry; /* * global list of all lock-classes: @@ -199,7 +199,7 @@ struct lock_chain { u8 irq_context; u8 depth; u16 base; - struct list_head entry; + struct hlist_node entry; u64 chain_key; }; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 60ace56618f6..7537e568dabe 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -292,7 +292,7 @@ LIST_HEAD(all_lock_classes); #define __classhashfn(key) hash_long((unsigned long)key, CLASSHASH_BITS) #define classhashentry(key) (classhash_table + __classhashfn((key))) -static struct list_head classhash_table[CLASSHASH_SIZE]; +static struct hlist_head classhash_table[CLASSHASH_SIZE]; /* * We put the lock dependency chains into a hash-table as well, to cache @@ -303,7 +303,7 @@ static struct list_head classhash_table[CLASSHASH_SIZE]; #define __chainhashfn(chain) hash_long(chain, CHAINHASH_BITS) #define chainhashentry(chain) (chainhash_table + __chainhashfn((chain))) -static struct list_head chainhash_table[CHAINHASH_SIZE]; +static struct hlist_head chainhash_table[CHAINHASH_SIZE]; /* * The hash key of the lock dependency chains is a hash itself too: @@ -666,7 +666,7 @@ static inline struct lock_class * look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) { struct lockdep_subclass_key *key; - struct list_head *hash_head; + struct hlist_head *hash_head; struct lock_class *class; #ifdef CONFIG_DEBUG_LOCKDEP @@ -719,7 +719,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return NULL; - list_for_each_entry_rcu(class, hash_head, hash_entry) { + hlist_for_each_entry_rcu(class, hash_head, hash_entry) { if (class->key == key) { /* * Huh! same key, different name? Did someone trample @@ -742,7 +742,7 @@ static inline struct lock_class * register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) { struct lockdep_subclass_key *key; - struct list_head *hash_head; + struct hlist_head *hash_head; struct lock_class *class; DEBUG_LOCKS_WARN_ON(!irqs_disabled()); @@ -774,7 +774,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) * We have to do the hash-walk again, to avoid races * with another CPU: */ - list_for_each_entry_rcu(class, hash_head, hash_entry) { + hlist_for_each_entry_rcu(class, hash_head, hash_entry) { if (class->key == key) goto out_unlock_set; } @@ -805,7 +805,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) * We use RCU's safe list-add method to make * parallel walking of the hash-list safe: */ - list_add_tail_rcu(&class->hash_entry, hash_head); + hlist_add_head_rcu(&class->hash_entry, hash_head); /* * Add it to the global list of classes: */ @@ -2017,7 +2017,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, u64 chain_key) { struct lock_class *class = hlock_class(hlock); - struct list_head *hash_head = chainhashentry(chain_key); + struct hlist_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; struct held_lock *hlock_curr; int i, j; @@ -2033,7 +2033,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, * We can walk it lock-free, because entries only get added * to the hash: */ - list_for_each_entry_rcu(chain, hash_head, entry) { + hlist_for_each_entry_rcu(chain, hash_head, entry) { if (chain->chain_key == chain_key) { cache_hit: debug_atomic_inc(chain_lookup_hits); @@ -2057,7 +2057,7 @@ cache_hit: /* * We have to walk the chain again locked - to avoid duplicates: */ - list_for_each_entry(chain, hash_head, entry) { + hlist_for_each_entry(chain, hash_head, entry) { if (chain->chain_key == chain_key) { graph_unlock(); goto cache_hit; @@ -2091,7 +2091,7 @@ cache_hit: } chain_hlocks[chain->base + j] = class - lock_classes; } - list_add_tail_rcu(&chain->entry, hash_head); + hlist_add_head_rcu(&chain->entry, hash_head); debug_atomic_inc(chain_lookup_misses); inc_chains(); @@ -3875,7 +3875,7 @@ void lockdep_reset(void) nr_process_chains = 0; debug_locks = 1; for (i = 0; i < CHAINHASH_SIZE; i++) - INIT_LIST_HEAD(chainhash_table + i); + INIT_HLIST_HEAD(chainhash_table + i); raw_local_irq_restore(flags); } @@ -3894,7 +3894,7 @@ static void zap_class(struct lock_class *class) /* * Unhash the class and remove it from the all_lock_classes list: */ - list_del_rcu(&class->hash_entry); + hlist_del_rcu(&class->hash_entry); list_del_rcu(&class->lock_entry); RCU_INIT_POINTER(class->key, NULL); @@ -3917,7 +3917,7 @@ static inline int within(const void *addr, void *start, unsigned long size) void lockdep_free_key_range(void *start, unsigned long size) { struct lock_class *class; - struct list_head *head; + struct hlist_head *head; unsigned long flags; int i; int locked; @@ -3930,9 +3930,7 @@ void lockdep_free_key_range(void *start, unsigned long size) */ for (i = 0; i < CLASSHASH_SIZE; i++) { head = classhash_table + i; - if (list_empty(head)) - continue; - list_for_each_entry_rcu(class, head, hash_entry) { + hlist_for_each_entry_rcu(class, head, hash_entry) { if (within(class->key, start, size)) zap_class(class); else if (within(class->name, start, size)) @@ -3962,7 +3960,7 @@ void lockdep_free_key_range(void *start, unsigned long size) void lockdep_reset_lock(struct lockdep_map *lock) { struct lock_class *class; - struct list_head *head; + struct hlist_head *head; unsigned long flags; int i, j; int locked; @@ -3987,9 +3985,7 @@ void lockdep_reset_lock(struct lockdep_map *lock) locked = graph_lock(); for (i = 0; i < CLASSHASH_SIZE; i++) { head = classhash_table + i; - if (list_empty(head)) - continue; - list_for_each_entry_rcu(class, head, hash_entry) { + hlist_for_each_entry_rcu(class, head, hash_entry) { int match = 0; for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) @@ -4027,10 +4023,10 @@ void lockdep_init(void) return; for (i = 0; i < CLASSHASH_SIZE; i++) - INIT_LIST_HEAD(classhash_table + i); + INIT_HLIST_HEAD(classhash_table + i); for (i = 0; i < CHAINHASH_SIZE; i++) - INIT_LIST_HEAD(chainhash_table + i); + INIT_HLIST_HEAD(chainhash_table + i); lockdep_initialized = 1; } -- cgit From db78c22230d0bcc8b27b81f05b39f104f08232c5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 11 Feb 2016 16:13:17 -0800 Subject: mm: fix pfn_t vs highmem The pfn_t type uses an unsigned long to store a pfn + flags value. On a 64-bit platform the upper 12 bits of an unsigned long are never used for storing the value of a pfn. However, this is not true on highmem platforms, all 32-bits of a pfn value are used to address a 44-bit physical address space. A pfn_t needs to store a 64-bit value. Link: https://bugzilla.kernel.org/show_bug.cgi?id=112211 Fixes: 01c8f1c44b83 ("mm, dax, gpu: convert vm_insert_mixed to pfn_t") Signed-off-by: Dan Williams Reported-by: Stuart Foster Reported-by: Julian Margetson Tested-by: Julian Margetson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pfn.h | 2 +- include/linux/pfn_t.h | 19 +++++++++---------- kernel/memremap.c | 2 +- 3 files changed, 11 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/pfn.h b/include/linux/pfn.h index 2d8e49711b63..1132953235c0 100644 --- a/include/linux/pfn.h +++ b/include/linux/pfn.h @@ -10,7 +10,7 @@ * backing is indicated by flags in the high bits of the value. */ typedef struct { - unsigned long val; + u64 val; } pfn_t; #endif diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index 37448ab5fb5c..94994810c7c0 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -9,14 +9,13 @@ * PFN_DEV - pfn is not covered by system memmap by default * PFN_MAP - pfn has a dynamic page mapping established by a device driver */ -#define PFN_FLAGS_MASK (((unsigned long) ~PAGE_MASK) \ - << (BITS_PER_LONG - PAGE_SHIFT)) -#define PFN_SG_CHAIN (1UL << (BITS_PER_LONG - 1)) -#define PFN_SG_LAST (1UL << (BITS_PER_LONG - 2)) -#define PFN_DEV (1UL << (BITS_PER_LONG - 3)) -#define PFN_MAP (1UL << (BITS_PER_LONG - 4)) - -static inline pfn_t __pfn_to_pfn_t(unsigned long pfn, unsigned long flags) +#define PFN_FLAGS_MASK (((u64) ~PAGE_MASK) << (BITS_PER_LONG_LONG - PAGE_SHIFT)) +#define PFN_SG_CHAIN (1ULL << (BITS_PER_LONG_LONG - 1)) +#define PFN_SG_LAST (1ULL << (BITS_PER_LONG_LONG - 2)) +#define PFN_DEV (1ULL << (BITS_PER_LONG_LONG - 3)) +#define PFN_MAP (1ULL << (BITS_PER_LONG_LONG - 4)) + +static inline pfn_t __pfn_to_pfn_t(unsigned long pfn, u64 flags) { pfn_t pfn_t = { .val = pfn | (flags & PFN_FLAGS_MASK), }; @@ -29,7 +28,7 @@ static inline pfn_t pfn_to_pfn_t(unsigned long pfn) return __pfn_to_pfn_t(pfn, 0); } -extern pfn_t phys_to_pfn_t(phys_addr_t addr, unsigned long flags); +extern pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags); static inline bool pfn_t_has_page(pfn_t pfn) { @@ -87,7 +86,7 @@ static inline pmd_t pfn_t_pmd(pfn_t pfn, pgprot_t pgprot) #ifdef __HAVE_ARCH_PTE_DEVMAP static inline bool pfn_t_devmap(pfn_t pfn) { - const unsigned long flags = PFN_DEV|PFN_MAP; + const u64 flags = PFN_DEV|PFN_MAP; return (pfn.val & flags) == flags; } diff --git a/kernel/memremap.c b/kernel/memremap.c index 70ee3775de24..2c468dea60bc 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -150,7 +150,7 @@ void devm_memunmap(struct device *dev, void *addr) } EXPORT_SYMBOL(devm_memunmap); -pfn_t phys_to_pfn_t(phys_addr_t addr, unsigned long flags) +pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags) { return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags); } -- cgit From 223ffb29f9723a4b485cacf6dc7e6d639fffc322 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 11 Feb 2016 13:34:49 -0500 Subject: cgroup: provide cgroup_nov1= to disable controllers in v1 mounts Testing cgroup2 can be painful with system software automatically mounting and populating all cgroup controllers in v1 mode. Sometimes they can be unmounted from rc.local, sometimes even that is too late. Provide a commandline option to disable certain controllers in v1 mounts, so that they remain available for cgroup2 mounts. Example use: cgroup_no_v1=memory,cpu cgroup_no_v1=all Disabling will be confirmed at boot-time as such: [ 0.013770] Disabling cpu control group subsystem in v1 mounts [ 0.016004] Disabling memory control group subsystem in v1 mounts Signed-off-by: Johannes Weiner Signed-off-by: Tejun Heo --- kernel/cgroup.c | 43 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d27904c193da..7ad61915967f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -180,6 +180,9 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root); */ static bool cgrp_dfl_root_visible; +/* Controllers blocked by the commandline in v1 */ +static unsigned long cgroup_no_v1_mask; + /* some controllers are not supported in the default hierarchy */ static unsigned long cgrp_dfl_root_inhibit_ss_mask; @@ -241,6 +244,11 @@ static bool cgroup_ssid_enabled(int ssid) return static_key_enabled(cgroup_subsys_enabled_key[ssid]); } +static bool cgroup_ssid_no_v1(int ssid) +{ + return cgroup_no_v1_mask & (1 << ssid); +} + /** * cgroup_on_dfl - test whether a cgroup is on the default hierarchy * @cgrp: the cgroup of interest @@ -1678,6 +1686,8 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) continue; if (!cgroup_ssid_enabled(i)) continue; + if (cgroup_ssid_no_v1(i)) + continue; /* Mutually exclusive option 'all' + subsystem name */ if (all_ss) @@ -1698,7 +1708,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) */ if (all_ss || (!one_ss && !opts->none && !opts->name)) for_each_subsys(ss, i) - if (cgroup_ssid_enabled(i)) + if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i)) opts->subsys_mask |= (1 << i); /* @@ -5324,6 +5334,10 @@ int __init cgroup_init(void) continue; } + if (cgroup_ssid_no_v1(ssid)) + printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n", + ss->name); + cgrp_dfl_root.subsys_mask |= 1 << ss->id; if (!ss->dfl_cftypes) @@ -5750,6 +5764,33 @@ static int __init cgroup_disable(char *str) } __setup("cgroup_disable=", cgroup_disable); +static int __init cgroup_no_v1(char *str) +{ + struct cgroup_subsys *ss; + char *token; + int i; + + while ((token = strsep(&str, ",")) != NULL) { + if (!*token) + continue; + + if (!strcmp(token, "all")) { + cgroup_no_v1_mask = ~0UL; + break; + } + + for_each_subsys(ss, i) { + if (strcmp(token, ss->name) && + strcmp(token, ss->legacy_name)) + continue; + + cgroup_no_v1_mask |= 1 << i; + } + } + return 1; +} +__setup("cgroup_no_v1=", cgroup_no_v1); + /** * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest -- cgit From 5fd7a09cfb8c6852f596c1f8c891c6158395250e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 11 Aug 2015 18:03:23 +0200 Subject: atomic: Export fetch_or() Export fetch_or() that's implemented and used internally by the scheduler. We are going to use it for NO_HZ so make it generally available. Reviewed-by: Chris Metcalf Cc: Christoph Lameter Cc: Chris Metcalf Cc: Ingo Molnar Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker --- include/linux/atomic.h | 21 +++++++++++++++++++++ kernel/sched/core.c | 14 -------------- 2 files changed, 21 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/atomic.h b/include/linux/atomic.h index 301de78d65f7..6c502cb13c95 100644 --- a/include/linux/atomic.h +++ b/include/linux/atomic.h @@ -548,6 +548,27 @@ static inline int atomic_dec_if_positive(atomic_t *v) } #endif +/** + * fetch_or - perform *ptr |= mask and return old value of *ptr + * @ptr: pointer to value + * @mask: mask to OR on the value + * + * cmpxchg based fetch_or, macro so it works for different integer types + */ +#ifndef fetch_or +#define fetch_or(ptr, mask) \ +({ typeof(*(ptr)) __old, __val = *(ptr); \ + for (;;) { \ + __old = cmpxchg((ptr), __val, __val | (mask)); \ + if (__old == __val) \ + break; \ + __val = __old; \ + } \ + __old; \ +}) +#endif + + #ifdef CONFIG_GENERIC_ATOMIC64 #include #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9503d590e5ef..7142feb4b92d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -453,20 +453,6 @@ static inline void init_hrtick(void) } #endif /* CONFIG_SCHED_HRTICK */ -/* - * cmpxchg based fetch_or, macro so it works for different integer types - */ -#define fetch_or(ptr, val) \ -({ typeof(*(ptr)) __old, __val = *(ptr); \ - for (;;) { \ - __old = cmpxchg((ptr), __val, __val | (val)); \ - if (__old == __val) \ - break; \ - __val = __old; \ - } \ - __old; \ -}) - #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) /* * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, -- cgit From 8537bb95a63e4be330359721f0ecd422f4a4c0ca Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 7 Dec 2015 16:55:23 +0100 Subject: nohz: Implement wide kick on top of irq work It simplifies it and allows wide kick to be performed, even when IRQs are disabled, without an asynchronous level in the middle. This comes at a cost of some more overhead on features like perf and posix cpu timers slow-paths, which is probably not much important for nohz full users. Requested-by: Peter Zijlstra Reviewed-by: Chris Metcalf Cc: Christoph Lameter Cc: Chris Metcalf Cc: Ingo Molnar Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker --- kernel/time/tick-sched.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 0b17424349eb..548a4e2551a9 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -234,24 +234,20 @@ void tick_nohz_full_kick_cpu(int cpu) irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu); } -static void nohz_full_kick_ipi(void *info) -{ - /* Empty, the tick restart happens on tick_nohz_irq_exit() */ -} - /* * Kick all full dynticks CPUs in order to force these to re-evaluate * their dependency on the tick and restart it if necessary. */ void tick_nohz_full_kick_all(void) { + int cpu; + if (!tick_nohz_full_running) return; preempt_disable(); - smp_call_function_many(tick_nohz_full_mask, - nohz_full_kick_ipi, NULL, false); - tick_nohz_full_kick(); + for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask) + tick_nohz_full_kick_cpu(cpu); preempt_enable(); } -- cgit From f944b5a7aff05a244a6c8cac297819af09a199e4 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 14 Jan 2016 10:54:13 +0100 Subject: genirq: Use a common macro to go through the actions list The irq code browses the list of actions differently to inspect the element one by one. Even if it is not a problem, for the sake of consistent code, provide a macro similar to for_each_irq_desc in order to have the same loop to go through the actions list and use it in the code. [ tglx: Renamed the macro ] Signed-off-by: Daniel Lezcano Link: http://lkml.kernel.org/r/1452765253-31148-1-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 6 ++---- kernel/irq/internals.h | 3 +++ kernel/irq/manage.c | 8 +++----- kernel/irq/proc.c | 2 +- kernel/irq/spurious.c | 4 +--- 5 files changed, 10 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 57bff7857e87..a15b5485b446 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -136,10 +136,9 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) { irqreturn_t retval = IRQ_NONE; unsigned int flags = 0, irq = desc->irq_data.irq; - struct irqaction *action = desc->action; + struct irqaction *action; - /* action might have become NULL since we dropped the lock */ - while (action) { + for_each_action_of_desc(desc, action) { irqreturn_t res; trace_irq_handler_entry(irq, action); @@ -173,7 +172,6 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) } retval |= res; - action = action->next; } add_interrupt_randomness(irq, flags); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index fcab63c66905..eab521fc547c 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -131,6 +131,9 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc) #define IRQ_GET_DESC_CHECK_GLOBAL (_IRQ_DESC_CHECK) #define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU) +#define for_each_action_of_desc(desc, act) \ + for (act = desc->act; act; act = act->next) + struct irq_desc * __irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, unsigned int check); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 841187239adc..3ddd2297ee95 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -144,13 +144,11 @@ int irq_can_set_affinity(unsigned int irq) */ void irq_set_thread_affinity(struct irq_desc *desc) { - struct irqaction *action = desc->action; + struct irqaction *action; - while (action) { + for_each_action_of_desc(desc, action) if (action->thread) set_bit(IRQTF_AFFINITY, &action->thread_flags); - action = action->next; - } } #ifdef CONFIG_GENERIC_PENDING_IRQ @@ -994,7 +992,7 @@ void irq_wake_thread(unsigned int irq, void *dev_id) return; raw_spin_lock_irqsave(&desc->lock, flags); - for (action = desc->action; action; action = action->next) { + for_each_action_of_desc(desc, action) { if (action->dev_id == dev_id) { if (action->thread) __irq_wake_thread(desc, action); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index a2c02fd5d6d0..4e1b94726818 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -291,7 +291,7 @@ static int name_unique(unsigned int irq, struct irqaction *new_action) int ret = 1; raw_spin_lock_irqsave(&desc->lock, flags); - for (action = desc->action ; action; action = action->next) { + for_each_action_of_desc(desc, action) { if ((action != new_action) && action->name && !strcmp(new_action->name, action->name)) { ret = 0; diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 32144175458d..5707f97a3e6a 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -211,14 +211,12 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) * desc->lock here. See synchronize_irq(). */ raw_spin_lock_irqsave(&desc->lock, flags); - action = desc->action; - while (action) { + for_each_action_of_desc(desc, action) { printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler); if (action->thread_fn) printk(KERN_CONT " threaded [<%p>] %pf", action->thread_fn, action->thread_fn); printk(KERN_CONT "\n"); - action = action->next; } raw_spin_unlock_irqrestore(&desc->lock, flags); } -- cgit From fc4fa6e112c0f999fab022a4eb7f6614bb47c7ab Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Sun, 13 Dec 2015 15:26:11 +0900 Subject: treewide: Fix typo in printk This patch fix spelling typos found in printk and Kconfig. Signed-off-by: Masanari Iida Acked-by: Randy Dunlap Signed-off-by: Jiri Kosina --- drivers/atm/firestream.c | 2 +- drivers/crypto/nx/nx-842.c | 2 +- drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c | 2 +- drivers/input/touchscreen/wdt87xx_i2c.c | 2 +- drivers/net/ethernet/nuvoton/w90p910_ether.c | 2 +- drivers/net/wireless/realtek/rtlwifi/rtl8821ae/dm.c | 2 +- drivers/usb/gadget/legacy/Kconfig | 3 +-- kernel/time/timekeeping.c | 2 +- lib/842/842_decompress.c | 2 +- net/openvswitch/vport-geneve.c | 2 +- tools/testing/selftests/timers/alarmtimer-suspend.c | 2 +- 11 files changed, 11 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/drivers/atm/firestream.c b/drivers/atm/firestream.c index 82f2ae0d7cc4..a969a7e443be 100644 --- a/drivers/atm/firestream.c +++ b/drivers/atm/firestream.c @@ -168,7 +168,7 @@ static char *res_strings[] = { "reserved 14", "Unrecognized cell", "reserved 16", - "reassemby abort: AAL5 abort", + "reassembly abort: AAL5 abort", "packet purged", "packet ageing timeout", "channel ageing timeout", diff --git a/drivers/crypto/nx/nx-842.c b/drivers/crypto/nx/nx-842.c index 046c1c45411b..d94e25df503b 100644 --- a/drivers/crypto/nx/nx-842.c +++ b/drivers/crypto/nx/nx-842.c @@ -308,7 +308,7 @@ int nx842_crypto_compress(struct crypto_tfm *tfm, h = !n && add_header ? hdrsize : 0; if (ignore) - pr_warn("interal error, ignore is set %x\n", ignore); + pr_warn("internal error, ignore is set %x\n", ignore); ret = compress(ctx, &p, &hdr->group[n], &c, &ignore, h); if (ret) diff --git a/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c index 85dc3f989ff7..704d08744116 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c @@ -64,7 +64,7 @@ const char *usnic_ib_qp_grp_state_to_string(enum ib_qp_state state) case IB_QPS_ERR: return "ERR"; default: - return "UNKOWN STATE"; + return "UNKNOWN STATE"; } } diff --git a/drivers/input/touchscreen/wdt87xx_i2c.c b/drivers/input/touchscreen/wdt87xx_i2c.c index 515c20a6e10f..73861ad22df4 100644 --- a/drivers/input/touchscreen/wdt87xx_i2c.c +++ b/drivers/input/touchscreen/wdt87xx_i2c.c @@ -848,7 +848,7 @@ static int wdt87xx_do_update_firmware(struct i2c_client *client, error = wdt87xx_get_sysparam(client, &wdt->param); if (error) dev_err(&client->dev, - "failed to refresh system paramaters: %d\n", error); + "failed to refresh system parameters: %d\n", error); out: enable_irq(client->irq); mutex_unlock(&wdt->fw_mutex); diff --git a/drivers/net/ethernet/nuvoton/w90p910_ether.c b/drivers/net/ethernet/nuvoton/w90p910_ether.c index afa445842f3e..52d9a94aebb9 100644 --- a/drivers/net/ethernet/nuvoton/w90p910_ether.c +++ b/drivers/net/ethernet/nuvoton/w90p910_ether.c @@ -1038,7 +1038,7 @@ static int w90p910_ether_probe(struct platform_device *pdev) error = register_netdev(dev); if (error != 0) { - dev_err(&pdev->dev, "Regiter EMC w90p910 FAILED\n"); + dev_err(&pdev->dev, "Register EMC w90p910 FAILED\n"); error = -ENODEV; goto failed_put_rmiiclk; } diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/dm.c b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/dm.c index b57cfd965196..95dcbff4673b 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/dm.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/dm.c @@ -626,7 +626,7 @@ static void rtl8821ae_dm_find_minimum_rssi(struct ieee80211_hw *hw) rtl_dm_dig->min_undec_pwdb_for_dm = rtlpriv->dm.entry_min_undec_sm_pwdb; RT_TRACE(rtlpriv, COMP_BB_POWERSAVING, DBG_LOUD, - "AP Ext Port or disconnet PWDB = 0x%x\n", + "AP Ext Port or disconnect PWDB = 0x%x\n", rtl_dm_dig->min_undec_pwdb_for_dm); } RT_TRACE(rtlpriv, COMP_DIG, DBG_LOUD, diff --git a/drivers/usb/gadget/legacy/Kconfig b/drivers/usb/gadget/legacy/Kconfig index 4d682ad7bf23..fb1cdd9af472 100644 --- a/drivers/usb/gadget/legacy/Kconfig +++ b/drivers/usb/gadget/legacy/Kconfig @@ -103,8 +103,7 @@ config USB_ETH - CDC Ethernet Emulation Model (EEM) is a newer standard that has a simpler interface that can be used by more USB hardware. - RNDIS support is an additional option, more demanding than than - subset. + RNDIS support is an additional option, more demanding than subset. Within the USB device, this gadget driver exposes a network device "usbX", where X depends on what other networking devices you have. diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b1356b7ae570..0d4cc7601df7 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -131,7 +131,7 @@ static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n"); } else { if (offset > (max_cycles >> 1)) { - printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the the '%s' clock's 50%% safety margin (%lld)\n", + printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the '%s' clock's 50%% safety margin (%lld)\n", offset, name, max_cycles >> 1); printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n"); } diff --git a/lib/842/842_decompress.c b/lib/842/842_decompress.c index 8881dad2a6a0..a2a941f8112d 100644 --- a/lib/842/842_decompress.c +++ b/lib/842/842_decompress.c @@ -250,7 +250,7 @@ static int do_op(struct sw842_param *p, u8 o) case OP_ACTION_NOOP: break; default: - pr_err("Interal error, invalid op %x\n", op); + pr_err("Internal error, invalid op %x\n", op); return -EINVAL; } diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index efb736bb6855..69f1de58a3b4 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -133,6 +133,6 @@ static void __exit ovs_geneve_tnl_exit(void) module_init(ovs_geneve_tnl_init); module_exit(ovs_geneve_tnl_exit); -MODULE_DESCRIPTION("OVS: Geneve swiching port"); +MODULE_DESCRIPTION("OVS: Geneve switching port"); MODULE_LICENSE("GPL"); MODULE_ALIAS("vport-type-5"); diff --git a/tools/testing/selftests/timers/alarmtimer-suspend.c b/tools/testing/selftests/timers/alarmtimer-suspend.c index 72cacf5383dd..2b361b830395 100644 --- a/tools/testing/selftests/timers/alarmtimer-suspend.c +++ b/tools/testing/selftests/timers/alarmtimer-suspend.c @@ -153,7 +153,7 @@ int main(void) alarmcount = 0; if (timer_create(alarm_clock_id, &se, &tm1) == -1) { - printf("timer_create failled, %s unspported?\n", + printf("timer_create failed, %s unsupported?\n", clockstring(alarm_clock_id)); break; } -- cgit From 1e9877902dc7e11d2be038371c6fbf2dfcd469d7 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:01:54 -0800 Subject: mm/gup: Introduce get_user_pages_remote() For protection keys, we need to understand whether protections should be enforced in software or not. In general, we enforce protections when working on our own task, but not when on others. We call these "current" and "remote" operations. This patch introduces a new get_user_pages() variant: get_user_pages_remote() Which is a replacement for when get_user_pages() is called on non-current tsk/mm. We also introduce a new gup flag: FOLL_REMOTE which can be used for the "__" gup variants to get this new behavior. The uprobes is_trap_at_addr() location holds mmap_sem and calls get_user_pages(current->mm) on an instruction address. This makes it a pretty unique gup caller. Being an instruction access and also really originating from the kernel (vs. the app), I opted to consider this a 'remote' access where protection keys will not be enforced. Without protection keys, this patch should not change any behavior. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrea Arcangeli Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Kirill A. Shutemov Cc: Linus Torvalds Cc: Naoya Horiguchi Cc: Peter Zijlstra Cc: Rik van Riel Cc: Srikar Dronamraju Cc: Vlastimil Babka Cc: jack@suse.cz Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210154.3F0E51EA@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- drivers/gpu/drm/etnaviv/etnaviv_gem.c | 6 +++--- drivers/gpu/drm/i915/i915_gem_userptr.c | 10 +++++----- drivers/infiniband/core/umem_odp.c | 8 ++++---- fs/exec.c | 8 ++++++-- include/linux/mm.h | 5 +++++ kernel/events/uprobes.c | 10 ++++++++-- mm/gup.c | 27 ++++++++++++++++++++++----- mm/memory.c | 2 +- mm/process_vm_access.c | 11 ++++++++--- security/tomoyo/domain.c | 9 ++++++++- virt/kvm/async_pf.c | 8 +++++++- 11 files changed, 77 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index 4b519e4309b2..97d4457be8d2 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c @@ -753,9 +753,9 @@ static struct page **etnaviv_gem_userptr_do_get_pages( down_read(&mm->mmap_sem); while (pinned < npages) { - ret = get_user_pages(task, mm, ptr, npages - pinned, - !etnaviv_obj->userptr.ro, 0, - pvec + pinned, NULL); + ret = get_user_pages_remote(task, mm, ptr, npages - pinned, + !etnaviv_obj->userptr.ro, 0, + pvec + pinned, NULL); if (ret < 0) break; diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index 59e45b3a6937..90dbf8121210 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -584,11 +584,11 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work) down_read(&mm->mmap_sem); while (pinned < npages) { - ret = get_user_pages(work->task, mm, - obj->userptr.ptr + pinned * PAGE_SIZE, - npages - pinned, - !obj->userptr.read_only, 0, - pvec + pinned, NULL); + ret = get_user_pages_remote(work->task, mm, + obj->userptr.ptr + pinned * PAGE_SIZE, + npages - pinned, + !obj->userptr.read_only, 0, + pvec + pinned, NULL); if (ret < 0) break; diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index e69bf266049d..75077a018675 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -572,10 +572,10 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, * complex (and doesn't gain us much performance in most use * cases). */ - npages = get_user_pages(owning_process, owning_mm, user_virt, - gup_num_pages, - access_mask & ODP_WRITE_ALLOWED_BIT, 0, - local_page_list, NULL); + npages = get_user_pages_remote(owning_process, owning_mm, + user_virt, gup_num_pages, + access_mask & ODP_WRITE_ALLOWED_BIT, + 0, local_page_list, NULL); up_read(&owning_mm->mmap_sem); if (npages < 0) diff --git a/fs/exec.c b/fs/exec.c index dcd4ac7d3f1e..d885b98b6a00 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -198,8 +198,12 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, return NULL; } #endif - ret = get_user_pages(current, bprm->mm, pos, - 1, write, 1, &page, NULL); + /* + * We are doing an exec(). 'current' is the process + * doing the exec and bprm->mm is the new process's mm. + */ + ret = get_user_pages_remote(current, bprm->mm, pos, 1, write, + 1, &page, NULL); if (ret <= 0) return NULL; diff --git a/include/linux/mm.h b/include/linux/mm.h index b1d4b8c7f7cd..faf3b709eead 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1225,6 +1225,10 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int foll_flags, struct page **pages, struct vm_area_struct **vmas, int *nonblocking); +long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + struct vm_area_struct **vmas); long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages, @@ -2170,6 +2174,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma, #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ #define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ #define FOLL_MLOCK 0x1000 /* lock present pages */ +#define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, void *data); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 0167679182c0..8eef5f55d3f0 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -299,7 +299,7 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, retry: /* Read the page with vaddr into memory */ - ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma); + ret = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma); if (ret <= 0) return ret; @@ -1700,7 +1700,13 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) if (likely(result == 0)) goto out; - result = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL); + /* + * The NULL 'tsk' here ensures that any faults that occur here + * will not be accounted to the task. 'mm' *is* current->mm, + * but we treat this as a 'remote' access since it is + * essentially a kernel access to the memory. + */ + result = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &page, NULL); if (result < 0) return result; diff --git a/mm/gup.c b/mm/gup.c index 7bf19ffa2199..36ca850936c9 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -870,7 +870,7 @@ long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, EXPORT_SYMBOL(get_user_pages_unlocked); /* - * get_user_pages() - pin user pages in memory + * get_user_pages_remote() - pin user pages in memory * @tsk: the task_struct to use for page fault accounting, or * NULL if faults are not to be recorded. * @mm: mm_struct of target mm @@ -924,12 +924,29 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * should use get_user_pages because it cannot pass * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault. */ -long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, int write, - int force, struct page **pages, struct vm_area_struct **vmas) +long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + struct vm_area_struct **vmas) { return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, - pages, vmas, NULL, false, FOLL_TOUCH); + pages, vmas, NULL, false, + FOLL_TOUCH | FOLL_REMOTE); +} +EXPORT_SYMBOL(get_user_pages_remote); + +/* + * This is the same as get_user_pages_remote() for the time + * being. + */ +long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + struct vm_area_struct **vmas) +{ + return __get_user_pages_locked(tsk, mm, start, nr_pages, + write, force, pages, vmas, NULL, false, + FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages); diff --git a/mm/memory.c b/mm/memory.c index 38090ca37a08..8bfbad0cca8c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3685,7 +3685,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, void *maddr; struct page *page = NULL; - ret = get_user_pages(tsk, mm, addr, 1, + ret = get_user_pages_remote(tsk, mm, addr, 1, write, 1, &page, &vma); if (ret <= 0) { #ifndef CONFIG_HAVE_IOREMAP_PROT diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 5d453e58ddbf..07514d41ebcc 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -98,9 +98,14 @@ static int process_vm_rw_single_vec(unsigned long addr, int pages = min(nr_pages, max_pages_per_loop); size_t bytes; - /* Get the pages we're interested in */ - pages = get_user_pages_unlocked(task, mm, pa, pages, - vm_write, 0, process_pages); + /* + * Get the pages we're interested in. We must + * add FOLL_REMOTE because task/mm might not + * current/current->mm + */ + pages = __get_user_pages_unlocked(task, mm, pa, pages, + vm_write, 0, process_pages, + FOLL_REMOTE); if (pages <= 0) return -EFAULT; diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c index 38651454ed08..ade7c6cad172 100644 --- a/security/tomoyo/domain.c +++ b/security/tomoyo/domain.c @@ -874,7 +874,14 @@ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos, } /* Same with get_arg_page(bprm, pos, 0) in fs/exec.c */ #ifdef CONFIG_MMU - if (get_user_pages(current, bprm->mm, pos, 1, 0, 1, &page, NULL) <= 0) + /* + * This is called at execve() time in order to dig around + * in the argv/environment of the new proceess + * (represented by bprm). 'current' is the process doing + * the execve(). + */ + if (get_user_pages_remote(current, bprm->mm, pos, 1, + 0, 1, &page, NULL) <= 0) return false; #else page = bprm->page[pos / PAGE_SIZE]; diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 353159922456..d604e87a510a 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -79,7 +79,13 @@ static void async_pf_execute(struct work_struct *work) might_sleep(); - get_user_pages_unlocked(NULL, mm, addr, 1, 1, 0, NULL); + /* + * This work is run asynchromously to the task which owns + * mm and might be done in another context, so we must + * use FOLL_REMOTE. + */ + __get_user_pages_unlocked(NULL, mm, addr, 1, 1, 0, NULL, FOLL_REMOTE); + kvm_async_page_present_sync(vcpu, apf); spin_lock(&vcpu->async_pf.lock); -- cgit From a79a908fd2b080977b45bf103184b81c9d11ad07 Mon Sep 17 00:00:00 2001 From: Aditya Kali Date: Fri, 29 Jan 2016 02:54:06 -0600 Subject: cgroup: introduce cgroup namespaces Introduce the ability to create new cgroup namespace. The newly created cgroup namespace remembers the cgroup of the process at the point of creation of the cgroup namespace (referred as cgroupns-root). The main purpose of cgroup namespace is to virtualize the contents of /proc/self/cgroup file. Processes inside a cgroup namespace are only able to see paths relative to their namespace root (unless they are moved outside of their cgroupns-root, at which point they will see a relative path from their cgroupns-root). For a correctly setup container this enables container-tools (like libcontainer, lxc, lmctfy, etc.) to create completely virtualized containers without leaking system level cgroup hierarchy to the task. This patch only implements the 'unshare' part of the cgroupns. Signed-off-by: Aditya Kali Signed-off-by: Serge Hallyn Signed-off-by: Tejun Heo --- fs/proc/namespaces.c | 3 + include/linux/cgroup.h | 49 ++++++++++++++ include/linux/nsproxy.h | 2 + include/linux/proc_ns.h | 4 ++ kernel/cgroup.c | 173 +++++++++++++++++++++++++++++++++++++++++++++++- kernel/cpuset.c | 8 +-- kernel/fork.c | 2 +- kernel/nsproxy.c | 19 +++++- 8 files changed, 250 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 276f12431dbf..72cb26f85d58 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -28,6 +28,9 @@ static const struct proc_ns_operations *ns_entries[] = { &userns_operations, #endif &mntns_operations, +#ifdef CONFIG_CGROUPS + &cgroupns_operations, +#endif }; static const char *proc_ns_get_link(struct dentry *dentry, diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 2162dca88dc0..a20320c666fd 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -17,6 +17,11 @@ #include #include #include +#include +#include +#include +#include +#include #include @@ -611,4 +616,48 @@ static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {} #endif /* CONFIG_CGROUP_DATA */ +struct cgroup_namespace { + atomic_t count; + struct ns_common ns; + struct user_namespace *user_ns; + struct css_set *root_cset; +}; + +extern struct cgroup_namespace init_cgroup_ns; + +#ifdef CONFIG_CGROUPS + +void free_cgroup_ns(struct cgroup_namespace *ns); + +struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, + struct user_namespace *user_ns, + struct cgroup_namespace *old_ns); + +char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns); + +#else /* !CONFIG_CGROUPS */ + +static inline void free_cgroup_ns(struct cgroup_namespace *ns) { } +static inline struct cgroup_namespace * +copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, + struct cgroup_namespace *old_ns) +{ + return old_ns; +} + +#endif /* !CONFIG_CGROUPS */ + +static inline void get_cgroup_ns(struct cgroup_namespace *ns) +{ + if (ns) + atomic_inc(&ns->count); +} + +static inline void put_cgroup_ns(struct cgroup_namespace *ns) +{ + if (ns && atomic_dec_and_test(&ns->count)) + free_cgroup_ns(ns); +} + #endif /* _LINUX_CGROUP_H */ diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index 35fa08fd7739..ac0d65bef5d0 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -8,6 +8,7 @@ struct mnt_namespace; struct uts_namespace; struct ipc_namespace; struct pid_namespace; +struct cgroup_namespace; struct fs_struct; /* @@ -33,6 +34,7 @@ struct nsproxy { struct mnt_namespace *mnt_ns; struct pid_namespace *pid_ns_for_children; struct net *net_ns; + struct cgroup_namespace *cgroup_ns; }; extern struct nsproxy init_nsproxy; diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 42dfc615dbf8..de0e7719d4c5 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -9,6 +9,8 @@ struct pid_namespace; struct nsproxy; struct path; +struct task_struct; +struct inode; struct proc_ns_operations { const char *name; @@ -24,6 +26,7 @@ extern const struct proc_ns_operations ipcns_operations; extern const struct proc_ns_operations pidns_operations; extern const struct proc_ns_operations userns_operations; extern const struct proc_ns_operations mntns_operations; +extern const struct proc_ns_operations cgroupns_operations; /* * We always define these enumerators @@ -34,6 +37,7 @@ enum { PROC_UTS_INIT_INO = 0xEFFFFFFEU, PROC_USER_INIT_INO = 0xEFFFFFFDU, PROC_PID_INIT_INO = 0xEFFFFFFCU, + PROC_CGROUP_INIT_INO = 0xEFFFFFFBU, }; #ifdef CONFIG_PROC_FS diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7ad61915967f..b001c5d36bec 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -59,6 +59,9 @@ #include #include #include +#include +#include +#include #include /* @@ -212,6 +215,15 @@ static unsigned long have_fork_callback __read_mostly; static unsigned long have_exit_callback __read_mostly; static unsigned long have_free_callback __read_mostly; +/* cgroup namespace for init task */ +struct cgroup_namespace init_cgroup_ns = { + .count = { .counter = 2, }, + .user_ns = &init_user_ns, + .ns.ops = &cgroupns_operations, + .ns.inum = PROC_CGROUP_INIT_INO, + .root_cset = &init_css_set, +}; + /* Ditto for the can_fork callback. */ static unsigned long have_canfork_callback __read_mostly; @@ -2177,6 +2189,35 @@ static struct file_system_type cgroup2_fs_type = { .kill_sb = cgroup_kill_sb, }; +static char *cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns) +{ + struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); + int ret; + + ret = kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); + if (ret < 0 || ret >= buflen) + return NULL; + return buf; +} + +char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns) +{ + char *ret; + + mutex_lock(&cgroup_mutex); + spin_lock_bh(&css_set_lock); + + ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns); + + spin_unlock_bh(&css_set_lock); + mutex_unlock(&cgroup_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(cgroup_path_ns); + /** * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy * @task: target task @@ -2204,7 +2245,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) if (root) { cgrp = task_cgroup_from_root(task, root); - path = cgroup_path(cgrp, buf, buflen); + path = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns); } else { /* if no hierarchy exists, everyone is in "/" */ if (strlcpy(buf, "/", buflen) < buflen) @@ -5297,6 +5338,8 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); + get_user_ns(init_cgroup_ns.user_ns); + mutex_lock(&cgroup_mutex); /* Add init_css_set to the hash table */ @@ -5438,7 +5481,8 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, * " (deleted)" is appended to the cgroup path. */ if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { - path = cgroup_path(cgrp, buf, PATH_MAX); + path = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, + current->nsproxy->cgroup_ns); if (!path) { retval = -ENAMETOOLONG; goto out_unlock; @@ -5720,7 +5764,9 @@ static void cgroup_release_agent(struct work_struct *work) if (!pathbuf || !agentbuf) goto out; - path = cgroup_path(cgrp, pathbuf, PATH_MAX); + spin_lock_bh(&css_set_lock); + path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); + spin_unlock_bh(&css_set_lock); if (!path) goto out; @@ -5931,6 +5977,127 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) #endif /* CONFIG_SOCK_CGROUP_DATA */ +/* cgroup namespaces */ + +static struct cgroup_namespace *alloc_cgroup_ns(void) +{ + struct cgroup_namespace *new_ns; + int ret; + + new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL); + if (!new_ns) + return ERR_PTR(-ENOMEM); + ret = ns_alloc_inum(&new_ns->ns); + if (ret) { + kfree(new_ns); + return ERR_PTR(ret); + } + atomic_set(&new_ns->count, 1); + new_ns->ns.ops = &cgroupns_operations; + return new_ns; +} + +void free_cgroup_ns(struct cgroup_namespace *ns) +{ + put_css_set(ns->root_cset); + put_user_ns(ns->user_ns); + ns_free_inum(&ns->ns); + kfree(ns); +} +EXPORT_SYMBOL(free_cgroup_ns); + +struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, + struct user_namespace *user_ns, + struct cgroup_namespace *old_ns) +{ + struct cgroup_namespace *new_ns = NULL; + struct css_set *cset = NULL; + int err; + + BUG_ON(!old_ns); + + if (!(flags & CLONE_NEWCGROUP)) { + get_cgroup_ns(old_ns); + return old_ns; + } + + /* Allow only sysadmin to create cgroup namespace. */ + err = -EPERM; + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) + goto err_out; + + mutex_lock(&cgroup_mutex); + spin_lock_bh(&css_set_lock); + + cset = task_css_set(current); + get_css_set(cset); + + spin_unlock_bh(&css_set_lock); + mutex_unlock(&cgroup_mutex); + + err = -ENOMEM; + new_ns = alloc_cgroup_ns(); + if (!new_ns) + goto err_out; + + new_ns->user_ns = get_user_ns(user_ns); + new_ns->root_cset = cset; + + return new_ns; + +err_out: + if (cset) + put_css_set(cset); + kfree(new_ns); + return ERR_PTR(err); +} + +static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) +{ + return container_of(ns, struct cgroup_namespace, ns); +} + +static int cgroupns_install(struct nsproxy *nsproxy, void *ns) +{ + pr_info("setns not supported for cgroup namespace"); + return -EINVAL; +} + +static struct ns_common *cgroupns_get(struct task_struct *task) +{ + struct cgroup_namespace *ns = NULL; + struct nsproxy *nsproxy; + + task_lock(task); + nsproxy = task->nsproxy; + if (nsproxy) { + ns = nsproxy->cgroup_ns; + get_cgroup_ns(ns); + } + task_unlock(task); + + return ns ? &ns->ns : NULL; +} + +static void cgroupns_put(struct ns_common *ns) +{ + put_cgroup_ns(to_cg_ns(ns)); +} + +const struct proc_ns_operations cgroupns_operations = { + .name = "cgroup", + .type = CLONE_NEWCGROUP, + .get = cgroupns_get, + .put = cgroupns_put, + .install = cgroupns_install, +}; + +static __init int cgroup_namespaces_init(void) +{ + return 0; +} +subsys_initcall(cgroup_namespaces_init); + #ifdef CONFIG_CGROUP_DEBUG static struct cgroup_subsys_state * debug_css_alloc(struct cgroup_subsys_state *parent_css) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 41989ab4db57..d393125b228c 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2714,10 +2714,10 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, goto out; retval = -ENAMETOOLONG; - rcu_read_lock(); - css = task_css(tsk, cpuset_cgrp_id); - p = cgroup_path(css->cgroup, buf, PATH_MAX); - rcu_read_unlock(); + css = task_get_css(tsk, cpuset_cgrp_id); + p = cgroup_path_ns(css->cgroup, buf, PATH_MAX, + current->nsproxy->cgroup_ns); + css_put(css); if (!p) goto out_free; seq_puts(m, p); diff --git a/kernel/fork.c b/kernel/fork.c index 2e391c754ae7..6611a6267949 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1884,7 +1884,7 @@ static int check_unshare_flags(unsigned long unshare_flags) if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| - CLONE_NEWUSER|CLONE_NEWPID)) + CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP)) return -EINVAL; /* * Not implemented, but pretend it works if there is nothing diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 49746c81ad8d..782102e59eed 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -25,6 +25,7 @@ #include #include #include +#include static struct kmem_cache *nsproxy_cachep; @@ -39,6 +40,9 @@ struct nsproxy init_nsproxy = { #ifdef CONFIG_NET .net_ns = &init_net, #endif +#ifdef CONFIG_CGROUPS + .cgroup_ns = &init_cgroup_ns, +#endif }; static inline struct nsproxy *create_nsproxy(void) @@ -92,6 +96,13 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, goto out_pid; } + new_nsp->cgroup_ns = copy_cgroup_ns(flags, user_ns, + tsk->nsproxy->cgroup_ns); + if (IS_ERR(new_nsp->cgroup_ns)) { + err = PTR_ERR(new_nsp->cgroup_ns); + goto out_cgroup; + } + new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns); if (IS_ERR(new_nsp->net_ns)) { err = PTR_ERR(new_nsp->net_ns); @@ -101,6 +112,8 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, return new_nsp; out_net: + put_cgroup_ns(new_nsp->cgroup_ns); +out_cgroup: if (new_nsp->pid_ns_for_children) put_pid_ns(new_nsp->pid_ns_for_children); out_pid: @@ -128,7 +141,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) struct nsproxy *new_ns; if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWPID | CLONE_NEWNET)))) { + CLONE_NEWPID | CLONE_NEWNET | + CLONE_NEWCGROUP)))) { get_nsproxy(old_ns); return 0; } @@ -165,6 +179,7 @@ void free_nsproxy(struct nsproxy *ns) put_ipc_ns(ns->ipc_ns); if (ns->pid_ns_for_children) put_pid_ns(ns->pid_ns_for_children); + put_cgroup_ns(ns->cgroup_ns); put_net(ns->net_ns); kmem_cache_free(nsproxy_cachep, ns); } @@ -180,7 +195,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, int err = 0; if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWNET | CLONE_NEWPID))) + CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP))) return 0; user_ns = new_cred ? new_cred->user_ns : current_user_ns(); -- cgit From a0530e087e648263f81a81d62ca020f66b54bcb0 Mon Sep 17 00:00:00 2001 From: Aditya Kali Date: Fri, 29 Jan 2016 02:54:07 -0600 Subject: cgroup: cgroup namespace setns support setns on a cgroup namespace is allowed only if task has CAP_SYS_ADMIN in its current user-namespace and over the user-namespace associated with target cgroupns. No implicit cgroup changes happen with attaching to another cgroupns. It is expected that the somone moves the attaching process under the target cgroupns-root. Signed-off-by: Aditya Kali Signed-off-by: Serge E. Hallyn Signed-off-by: Tejun Heo --- kernel/cgroup.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b001c5d36bec..b086a461be23 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -6057,10 +6057,23 @@ static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) return container_of(ns, struct cgroup_namespace, ns); } -static int cgroupns_install(struct nsproxy *nsproxy, void *ns) +static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns) { - pr_info("setns not supported for cgroup namespace"); - return -EINVAL; + struct cgroup_namespace *cgroup_ns = to_cg_ns(ns); + + if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) || + !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + /* Don't need to do anything if we are attaching to our own cgroupns. */ + if (cgroup_ns == nsproxy->cgroup_ns) + return 0; + + get_cgroup_ns(cgroup_ns); + put_cgroup_ns(nsproxy->cgroup_ns); + nsproxy->cgroup_ns = cgroup_ns; + + return 0; } static struct ns_common *cgroupns_get(struct task_struct *task) -- cgit From ed82571b1a14ab2bfbede2bb2c209700495749fc Mon Sep 17 00:00:00 2001 From: Serge Hallyn Date: Fri, 29 Jan 2016 02:54:09 -0600 Subject: cgroup: mount cgroupns-root when inside non-init cgroupns This patch enables cgroup mounting inside userns when a process as appropriate privileges. The cgroup filesystem mounted is rooted at the cgroupns-root. Thus, in a container-setup, only the hierarchy under the cgroupns-root is exposed inside the container. This allows container management tools to run inside the containers without depending on any global state. Signed-off-by: Serge Hallyn Signed-off-by: Tejun Heo Signed-off-by: Tejun Heo --- kernel/cgroup.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b086a461be23..24989022ff62 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1994,6 +1994,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, { bool is_v2 = fs_type == &cgroup2_fs_type; struct super_block *pinned_sb = NULL; + struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; struct cgroup_subsys *ss; struct cgroup_root *root; struct cgroup_sb_opts opts; @@ -2002,6 +2003,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int i; bool new_sb; + get_cgroup_ns(ns); + + /* Check if the caller has permission to mount. */ + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) { + put_cgroup_ns(ns); + return ERR_PTR(-EPERM); + } + /* * The first time anyone tries to mount a cgroup, enable the list * linking each css_set to its tasks and fix up all existing tasks. @@ -2012,6 +2021,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (is_v2) { if (data) { pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); + put_cgroup_ns(ns); return ERR_PTR(-EINVAL); } cgrp_dfl_root_visible = true; @@ -2117,6 +2127,16 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, goto out_unlock; } + /* + * We know this subsystem has not yet been bound. Users in a non-init + * user namespace may only mount hierarchies with no bound subsystems, + * i.e. 'none,name=user1' + */ + if (!opts.none && !capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out_unlock; + } + root = kzalloc(sizeof(*root), GFP_KERNEL); if (!root) { ret = -ENOMEM; @@ -2135,12 +2155,37 @@ out_free: kfree(opts.release_agent); kfree(opts.name); - if (ret) + if (ret) { + put_cgroup_ns(ns); return ERR_PTR(ret); + } out_mount: dentry = kernfs_mount(fs_type, flags, root->kf_root, is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC, &new_sb); + + /* + * In non-init cgroup namespace, instead of root cgroup's + * dentry, we return the dentry corresponding to the + * cgroupns->root_cgrp. + */ + if (!IS_ERR(dentry) && ns != &init_cgroup_ns) { + struct dentry *nsdentry; + struct cgroup *cgrp; + + mutex_lock(&cgroup_mutex); + spin_lock_bh(&css_set_lock); + + cgrp = cset_cgroup_from_root(ns->root_cset, root); + + spin_unlock_bh(&css_set_lock); + mutex_unlock(&cgroup_mutex); + + nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb); + dput(dentry); + dentry = nsdentry; + } + if (IS_ERR(dentry) || !new_sb) cgroup_put(&root->cgrp); @@ -2153,6 +2198,7 @@ out_mount: deactivate_super(pinned_sb); } + put_cgroup_ns(ns); return dentry; } -- cgit From 1c53753e0df1ae4d21661053459e7c024a43f1d3 Mon Sep 17 00:00:00 2001 From: Serge Hallyn Date: Fri, 29 Jan 2016 02:54:11 -0600 Subject: Add FS_USERNS_FLAG to cgroup fs allowing root in a non-init user namespace to mount it. This should now be safe, because 1. non-init-root cannot mount a previously unbound subsystem 2. the task doing the mount must be privileged with respect to the user namespace owning the cgroup namespace 3. the mounted subsystem will have its current cgroup as the root dentry. the permissions will be unchanged, so tasks will receive no new privilege over the cgroups which they did not have on the original mounts. Signed-off-by: Serge Hallyn --- kernel/cgroup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 24989022ff62..afb1205fc789 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2227,12 +2227,14 @@ static struct file_system_type cgroup_fs_type = { .name = "cgroup", .mount = cgroup_mount, .kill_sb = cgroup_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; static struct file_system_type cgroup2_fs_type = { .name = "cgroup2", .mount = cgroup_mount, .kill_sb = cgroup_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; static char *cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, -- cgit From 3223d052b79eb9b620d170584c417d60a8bfd649 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Thu, 11 Feb 2016 11:59:38 +0900 Subject: sched/core: Remove dead statement in __schedule() Remove an unnecessary assignment of variable not used any more. ( This has no runtime effects as GCC is smart enough to optimize this out. ) Signed-off-by: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455159578-17256-1-git-send-email-byungchul.park@lge.com [ Edited the changelog. ] Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7e548bde67ee..87ca0bee1140 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3346,7 +3346,6 @@ static void __sched notrace __schedule(bool preempt) trace_sched_switch(preempt, prev, next); rq = context_switch(rq, prev, next); /* unlocks the rq */ - cpu = cpu_of(rq); } else { lockdep_unpin_lock(&rq->lock); raw_spin_unlock_irq(&rq->lock); -- cgit From c219b7ddb6a3524a61a89438d86e4a8639706308 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 10 Feb 2016 12:04:22 -0500 Subject: sched/deadline: Fix trivial typo in printk() message It's "too much" not "to much". Signed-off-by: Steven Rostedt Acked-by: Juri Lelli Cc: Jiri Kosina Cc: Juri Lelli Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160210120422.4ca77e68@gandalf.local.home Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index cd64c979d0e1..57b939c81bce 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -420,7 +420,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * entity. */ if (dl_time_before(dl_se->deadline, rq_clock(rq))) { - printk_deferred_once("sched: DL replenish lagged to much\n"); + printk_deferred_once("sched: DL replenish lagged too much\n"); dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; } -- cgit From b4f75d44bed1bdbb14ac704bfc38f62a3675e591 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 9 Feb 2016 20:11:20 +0000 Subject: perf/core: Remove bogus UP_CANCELED hotplug state If CPU_UP_PREPARE fails the perf hotplug code calls perf_event_exit_cpu(), which is a pointless exercise. The cpu is not online, so the smp function calls return -ENXIO. So the result is a list walk to call noops. Remove it. Signed-off-by: Thomas Gleixner Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Link: http://lkml.kernel.org/r/20160209201007.682184765@linutronix.de Signed-off-by: Ingo Molnar --- kernel/events/core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 5946460b2425..474ffeafe363 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9286,7 +9286,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) perf_event_init_cpu(cpu); break; - case CPU_UP_CANCELED: case CPU_DOWN_PREPARE: perf_event_exit_cpu(cpu); break; -- cgit From 27ca9236c96f4a21b72a2b4f08260efeab951bd0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 9 Feb 2016 20:11:26 +0000 Subject: perf/core: Remove the bogus and dangerous CPU_DOWN_FAILED hotplug state If CPU_DOWN_PREPARE fails the perf hotplug notifier is called for CPU_DOWN_FAILED and calls perf_event_init_cpu(), which checks whether the swhash is referenced. If yes it allocates a new hash and stores the pointer in the per cpu data structure. But at this point the cpu is still online, so there must be a valid hash already. By overwriting the pointer the existing hash is not longer accessible. Remove the CPU_DOWN_FAILED state, as there is nothing to (re)allocate. Signed-off-by: Thomas Gleixner Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Link: http://lkml.kernel.org/r/20160209201007.763417379@linutronix.de Signed-off-by: Ingo Molnar --- kernel/events/core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 474ffeafe363..4aa64a85a7b7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9282,7 +9282,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - case CPU_DOWN_FAILED: perf_event_init_cpu(cpu); break; -- cgit From 059fcd8cd16622da6513804a7a3e826d152c6c96 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 9 Feb 2016 20:11:34 +0000 Subject: perf/core: Plug potential memory leak in CPU_UP_PREPARE If CPU_UP_PREPARE is called it is not guaranteed, that a previously allocated and assigned hash has been freed already, but perf_event_init_cpu() unconditionally allocates and assignes a new hash if the swhash is referenced. By overwriting the pointer the existing hash is not longer accessible. Verify that there is no hash assigned on this cpu before allocating and assigning a new one. Signed-off-by: Thomas Gleixner Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Link: http://lkml.kernel.org/r/20160209201007.843269966@linutronix.de Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 4aa64a85a7b7..0d58522103cd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9206,7 +9206,7 @@ static void perf_event_init_cpu(int cpu) struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); mutex_lock(&swhash->hlist_mutex); - if (swhash->hlist_refcount > 0) { + if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) { struct swevent_hlist *hlist; hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); -- cgit From 3b364d7b587db0f0eeafde0f271e0698187de776 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 9 Feb 2016 20:11:40 +0000 Subject: perf/core: Remove unused arguments from a bunch of functions No functional change, just less confusing to read. Signed-off-by: Thomas Gleixner Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Link: http://lkml.kernel.org/r/20160209201007.921540566@linutronix.de Signed-off-by: Ingo Molnar --- kernel/events/core.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 0d58522103cd..94c47e3f9a0a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6710,7 +6710,7 @@ static void swevent_hlist_release(struct swevent_htable *swhash) kfree_rcu(hlist, rcu_head); } -static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) +static void swevent_hlist_put_cpu(int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); @@ -6722,15 +6722,15 @@ static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) mutex_unlock(&swhash->hlist_mutex); } -static void swevent_hlist_put(struct perf_event *event) +static void swevent_hlist_put(void) { int cpu; for_each_possible_cpu(cpu) - swevent_hlist_put_cpu(event, cpu); + swevent_hlist_put_cpu(cpu); } -static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) +static int swevent_hlist_get_cpu(int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); int err = 0; @@ -6753,14 +6753,13 @@ exit: return err; } -static int swevent_hlist_get(struct perf_event *event) +static int swevent_hlist_get(void) { - int err; - int cpu, failed_cpu; + int err, cpu, failed_cpu; get_online_cpus(); for_each_possible_cpu(cpu) { - err = swevent_hlist_get_cpu(event, cpu); + err = swevent_hlist_get_cpu(cpu); if (err) { failed_cpu = cpu; goto fail; @@ -6773,7 +6772,7 @@ fail: for_each_possible_cpu(cpu) { if (cpu == failed_cpu) break; - swevent_hlist_put_cpu(event, cpu); + swevent_hlist_put_cpu(cpu); } put_online_cpus(); @@ -6789,7 +6788,7 @@ static void sw_perf_event_destroy(struct perf_event *event) WARN_ON(event->parent); static_key_slow_dec(&perf_swevent_enabled[event_id]); - swevent_hlist_put(event); + swevent_hlist_put(); } static int perf_swevent_init(struct perf_event *event) @@ -6820,7 +6819,7 @@ static int perf_swevent_init(struct perf_event *event) if (!event->parent) { int err; - err = swevent_hlist_get(event); + err = swevent_hlist_get(); if (err) return err; -- cgit From 8ad7b378d0d016309014cae0f640434bca7b5e11 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 9 Feb 2016 11:15:13 -0800 Subject: futex: Rename barrier references in ordering guarantees Ingo suggested we rename how we reference barriers A and B regarding futex ordering guarantees. This patch replaces, for both barriers, MB (A) with smp_mb(); (A), such that: - We explicitly state that the barriers are SMP, and - We standardize how we reference these across futex.c helping readers follow what barrier does what and where. Suggested-by: Ingo Molnar Signed-off-by: Davidlohr Bueso Reviewed-by: Thomas Gleixner Cc: Chris Mason Cc: Darren Hart Cc: Hugh Dickins Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: dave@stgolabs.net Link: http://lkml.kernel.org/r/1455045314-8305-2-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/futex.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 5d6ce6413ef1..08ac7009488b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -124,16 +124,16 @@ * futex_wait(futex, val); * * waiters++; (a) - * mb(); (A) <-- paired with -. - * | - * lock(hash_bucket(futex)); | - * | - * uval = *futex; | - * | *futex = newval; - * | sys_futex(WAKE, futex); - * | futex_wake(futex); - * | - * `-------> mb(); (B) + * smp_mb(); (A) <-- paired with -. + * | + * lock(hash_bucket(futex)); | + * | + * uval = *futex; | + * | *futex = newval; + * | sys_futex(WAKE, futex); + * | futex_wake(futex); + * | + * `--------> smp_mb(); (B) * if (uval == val) * queue(); * unlock(hash_bucket(futex)); @@ -334,7 +334,7 @@ static inline void futex_get_mm(union futex_key *key) /* * Ensure futex_get_mm() implies a full barrier such that * get_futex_key() implies a full barrier. This is relied upon - * as full barrier (B), see the ordering comment above. + * as smp_mb(); (B), see the ordering comment above. */ smp_mb__after_atomic(); } @@ -407,10 +407,10 @@ static void get_futex_key_refs(union futex_key *key) switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: - ihold(key->shared.inode); /* implies MB (B) */ + ihold(key->shared.inode); /* implies smp_mb(); (B) */ break; case FUT_OFF_MMSHARED: - futex_get_mm(key); /* implies MB (B) */ + futex_get_mm(key); /* implies smp_mb(); (B) */ break; default: /* @@ -418,7 +418,7 @@ static void get_futex_key_refs(union futex_key *key) * mm, therefore the only purpose of calling get_futex_key_refs * is because we need the barrier for the lockless waiter check. */ - smp_mb(); /* explicit MB (B) */ + smp_mb(); /* explicit smp_mb(); (B) */ } } @@ -497,7 +497,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) if (!fshared) { key->private.mm = mm; key->private.address = address; - get_futex_key_refs(key); /* implies MB (B) */ + get_futex_key_refs(key); /* implies smp_mb(); (B) */ return 0; } @@ -572,7 +572,7 @@ again: key->shared.pgoff = basepage_index(page); } - get_futex_key_refs(key); /* implies MB (B) */ + get_futex_key_refs(key); /* implies smp_mb(); (B) */ out: unlock_page(page); @@ -1864,7 +1864,7 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) q->lock_ptr = &hb->lock; - spin_lock(&hb->lock); /* implies MB (A) */ + spin_lock(&hb->lock); /* implies smp_mb(); (A) */ return hb; } -- cgit From 65d8fc777f6dcfee12785c057a6b57f679641c90 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 9 Feb 2016 11:15:14 -0800 Subject: futex: Remove requirement for lock_page() in get_futex_key() When dealing with key handling for shared futexes, we can drastically reduce the usage/need of the page lock. 1) For anonymous pages, the associated futex object is the mm_struct which does not require the page lock. 2) For inode based, keys, we can check under RCU read lock if the page mapping is still valid and take reference to the inode. This just leaves one rare race that requires the page lock in the slow path when examining the swapcache. Additionally realtime users currently have a problem with the page lock being contended for unbounded periods of time during futex operations. Task A get_futex_key() lock_page() ---> preempted Now any other task trying to lock that page will have to wait until task A gets scheduled back in, which is an unbound time. With this patch, we pretty much have a lockless futex_get_key(). Experiments show that this patch can boost/speedup the hashing of shared futexes with the perf futex benchmarks (which is good for measuring such change) by up to 45% when there are high (> 100) thread counts on a 60 core Westmere. Lower counts are pretty much in the noise range or less than 10%, but mid range can be seen at over 30% overall throughput (hash ops/sec). This makes anon-mem shared futexes much closer to its private counterpart. Signed-off-by: Mel Gorman [ Ported on top of thp refcount rework, changelog, comments, fixes. ] Signed-off-by: Davidlohr Bueso Reviewed-by: Thomas Gleixner Cc: Chris Mason Cc: Darren Hart Cc: Hugh Dickins Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: dave@stgolabs.net Link: http://lkml.kernel.org/r/1455045314-8305-3-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/futex.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 91 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 08ac7009488b..bae542e4b2e9 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -520,7 +520,20 @@ again: else err = 0; - lock_page(page); + /* + * The treatment of mapping from this point on is critical. The page + * lock protects many things but in this context the page lock + * stabilizes mapping, prevents inode freeing in the shared + * file-backed region case and guards against movement to swap cache. + * + * Strictly speaking the page lock is not needed in all cases being + * considered here and page lock forces unnecessarily serialization + * From this point on, mapping will be re-verified if necessary and + * page lock will be acquired only if it is unavoidable + */ + page = compound_head(page); + mapping = READ_ONCE(page->mapping); + /* * If page->mapping is NULL, then it cannot be a PageAnon * page; but it might be the ZERO_PAGE or in the gate area or @@ -536,19 +549,31 @@ again: * shmem_writepage move it from filecache to swapcache beneath us: * an unlikely race, but we do need to retry for page->mapping. */ - mapping = compound_head(page)->mapping; - if (!mapping) { - int shmem_swizzled = PageSwapCache(page); + if (unlikely(!mapping)) { + int shmem_swizzled; + + /* + * Page lock is required to identify which special case above + * applies. If this is really a shmem page then the page lock + * will prevent unexpected transitions. + */ + lock_page(page); + shmem_swizzled = PageSwapCache(page) || page->mapping; unlock_page(page); put_page(page); + if (shmem_swizzled) goto again; + return -EFAULT; } /* * Private mappings are handled in a simple way. * + * If the futex key is stored on an anonymous page, then the associated + * object is the mm which is implicitly pinned by the calling process. + * * NOTE: When userspace waits on a MAP_SHARED mapping, even if * it's a read-only handle, it's expected that futexes attach to * the object not the particular process. @@ -566,16 +591,74 @@ again: key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ key->private.mm = mm; key->private.address = address; + + get_futex_key_refs(key); /* implies smp_mb(); (B) */ + } else { + struct inode *inode; + + /* + * The associated futex object in this case is the inode and + * the page->mapping must be traversed. Ordinarily this should + * be stabilised under page lock but it's not strictly + * necessary in this case as we just want to pin the inode, not + * update the radix tree or anything like that. + * + * The RCU read lock is taken as the inode is finally freed + * under RCU. If the mapping still matches expectations then the + * mapping->host can be safely accessed as being a valid inode. + */ + rcu_read_lock(); + + if (READ_ONCE(page->mapping) != mapping) { + rcu_read_unlock(); + put_page(page); + + goto again; + } + + inode = READ_ONCE(mapping->host); + if (!inode) { + rcu_read_unlock(); + put_page(page); + + goto again; + } + + /* + * Take a reference unless it is about to be freed. Previously + * this reference was taken by ihold under the page lock + * pinning the inode in place so i_lock was unnecessary. The + * only way for this check to fail is if the inode was + * truncated in parallel so warn for now if this happens. + * + * We are not calling into get_futex_key_refs() in file-backed + * cases, therefore a successful atomic_inc return below will + * guarantee that get_futex_key() will still imply smp_mb(); (B). + */ + if (WARN_ON_ONCE(!atomic_inc_not_zero(&inode->i_count))) { + rcu_read_unlock(); + put_page(page); + + goto again; + } + + /* Should be impossible but lets be paranoid for now */ + if (WARN_ON_ONCE(inode->i_mapping != mapping)) { + err = -EFAULT; + rcu_read_unlock(); + iput(inode); + + goto out; + } + key->both.offset |= FUT_OFF_INODE; /* inode-based key */ - key->shared.inode = mapping->host; + key->shared.inode = inode; key->shared.pgoff = basepage_index(page); + rcu_read_unlock(); } - get_futex_key_refs(key); /* implies smp_mb(); (B) */ - out: - unlock_page(page); put_page(page); return err; } -- cgit From 7dcd182bec271ab341b05b66b6006995795fc0e7 Mon Sep 17 00:00:00 2001 From: Jessica Yu Date: Tue, 16 Feb 2016 17:32:33 -0500 Subject: ftrace/module: remove ftrace module notifier Remove the ftrace module notifier in favor of directly calling ftrace_module_enable() and ftrace_release_mod() in the module loader. Hard-coding the function calls directly in the module loader removes dependence on the module notifier call chain and provides better visibility and control over what gets called when, which is important to kernel utilities such as livepatch. This fixes a notifier ordering issue in which the ftrace module notifier (and hence ftrace_module_enable()) for coming modules was being called after klp_module_notify(), which caused livepatch modules to initialize incorrectly. This patch removes dependence on the module notifier call chain in favor of hard coding the corresponding function calls in the module loader. This ensures that ftrace and livepatch code get called in the correct order on patch module load and unload. Fixes: 5156dca34a3e ("ftrace: Fix the race between ftrace and insmod") Signed-off-by: Jessica Yu Reviewed-by: Steven Rostedt Reviewed-by: Petr Mladek Acked-by: Rusty Russell Reviewed-by: Josh Poimboeuf Reviewed-by: Miroslav Benes Signed-off-by: Jiri Kosina --- include/linux/ftrace.h | 6 ++++-- kernel/module.c | 4 ++++ kernel/trace/ftrace.c | 36 +----------------------------------- 3 files changed, 9 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 0639dcc98195..76dc15e171eb 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -604,6 +604,7 @@ extern int ftrace_arch_read_dyn_info(char *buf, int size); extern int skip_trace(unsigned long ip); extern void ftrace_module_init(struct module *mod); +extern void ftrace_module_enable(struct module *mod); extern void ftrace_release_mod(struct module *mod); extern void ftrace_disable_daemon(void); @@ -613,8 +614,9 @@ static inline int skip_trace(unsigned long ip) { return 0; } static inline int ftrace_force_update(void) { return 0; } static inline void ftrace_disable_daemon(void) { } static inline void ftrace_enable_daemon(void) { } -static inline void ftrace_release_mod(struct module *mod) {} -static inline void ftrace_module_init(struct module *mod) {} +static inline void ftrace_module_init(struct module *mod) { } +static inline void ftrace_module_enable(struct module *mod) { } +static inline void ftrace_release_mod(struct module *mod) { } static inline __init int register_ftrace_command(struct ftrace_func_command *cmd) { return -EINVAL; diff --git a/kernel/module.c b/kernel/module.c index 8358f4697c0c..b05d466cfa02 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -981,6 +981,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, mod->exit(); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); + ftrace_release_mod(mod); + async_synchronize_full(); /* Store the name of the last unloaded module for diagnostic purposes */ @@ -3295,6 +3297,7 @@ fail: module_put(mod); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); + ftrace_release_mod(mod); free_module(mod); wake_up_all(&module_wq); return ret; @@ -3371,6 +3374,7 @@ static int complete_formation(struct module *mod, struct load_info *info) mod->state = MODULE_STATE_COMING; mutex_unlock(&module_mutex); + ftrace_module_enable(mod); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); return 0; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index eca592f977b2..57a6eea84694 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4961,7 +4961,7 @@ void ftrace_release_mod(struct module *mod) mutex_unlock(&ftrace_lock); } -static void ftrace_module_enable(struct module *mod) +void ftrace_module_enable(struct module *mod) { struct dyn_ftrace *rec; struct ftrace_page *pg; @@ -5038,38 +5038,8 @@ void ftrace_module_init(struct module *mod) ftrace_process_locs(mod, mod->ftrace_callsites, mod->ftrace_callsites + mod->num_ftrace_callsites); } - -static int ftrace_module_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - struct module *mod = data; - - switch (val) { - case MODULE_STATE_COMING: - ftrace_module_enable(mod); - break; - case MODULE_STATE_GOING: - ftrace_release_mod(mod); - break; - default: - break; - } - - return 0; -} -#else -static int ftrace_module_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - return 0; -} #endif /* CONFIG_MODULES */ -struct notifier_block ftrace_module_nb = { - .notifier_call = ftrace_module_notify, - .priority = INT_MIN, /* Run after anything that can remove kprobes */ -}; - void __init ftrace_init(void) { extern unsigned long __start_mcount_loc[]; @@ -5098,10 +5068,6 @@ void __init ftrace_init(void) __start_mcount_loc, __stop_mcount_loc); - ret = register_module_notifier(&ftrace_module_nb); - if (ret) - pr_warning("Failed to register trace ftrace module exit notifier\n"); - set_ftrace_early_filters(); return; -- cgit From 23217b443b4b0439c8b55d3be0482d3cd7fbc5ac Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Wed, 17 Feb 2016 21:04:41 +0100 Subject: workqueue: Replace usage of init_name with dev_set_name() The init_name property of the device struct is sort of a hack and should only be used for statically allocated devices. Since the device is dynamically allocated here it is safe to use the proper way to set a devices name by calling dev_set_name(). Signed-off-by: Lars-Peter Clausen Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7ff5dc7d2ac5..3a1c99b0c1b3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5222,8 +5222,8 @@ int workqueue_sysfs_register(struct workqueue_struct *wq) wq_dev->wq = wq; wq_dev->dev.bus = &wq_subsys; - wq_dev->dev.init_name = wq->name; wq_dev->dev.release = wq_device_release; + dev_set_name(&wq_dev->dev, "%s", wq->name); /* * unbound_attrs are created separately. Suppress uevent until -- cgit From cd0ea35ff5511cde299a61c21a95889b4a71464e Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:12 -0800 Subject: signals, pkeys: Notify userspace about protection key faults A protection key fault is very similar to any other access error. There must be a VMA, etc... We even want to take the same action (SIGSEGV) that we do with a normal access fault. However, we do need to let userspace know that something is different. We do this the same way what we did with SEGV_BNDERR with Memory Protection eXtensions (MPX): define a new SEGV code: SEGV_PKUERR. We add a siginfo field: si_pkey that reveals to userspace which protection key was set on the PTE that we faulted on. There is no other easy way for userspace to figure this out. They could parse smaps but that would be a bit cruel. We share space with in siginfo with _addr_bnd. #BR faults from MPX are completely separate from page faults (#PF) that trigger from protection key violations, so we never need both at the same time. Note that _pkey is a 64-bit value. The current hardware only supports 4-bit protection keys. We do this because there is _plenty_ of space in _sigfault and it is possible that future processors would support more than 4 bits of protection keys. The x86 code to actually fill in the siginfo is in the next patch. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Al Viro Cc: Amanieu d'Antras Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Palmer Dabbelt Cc: Peter Zijlstra Cc: Richard Weinberger Cc: Rik van Riel Cc: Sasha Levin Cc: Vegard Nossum Cc: Vladimir Davydov Cc: linux-arch@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210212.3A9B83AC@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- include/uapi/asm-generic/siginfo.h | 17 ++++++++++++----- kernel/signal.c | 4 ++++ 2 files changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h index 1e3552037a5a..90384d55225b 100644 --- a/include/uapi/asm-generic/siginfo.h +++ b/include/uapi/asm-generic/siginfo.h @@ -91,10 +91,15 @@ typedef struct siginfo { int _trapno; /* TRAP # which caused the signal */ #endif short _addr_lsb; /* LSB of the reported address */ - struct { - void __user *_lower; - void __user *_upper; - } _addr_bnd; + union { + /* used when si_code=SEGV_BNDERR */ + struct { + void __user *_lower; + void __user *_upper; + } _addr_bnd; + /* used when si_code=SEGV_PKUERR */ + u64 _pkey; + }; } _sigfault; /* SIGPOLL */ @@ -137,6 +142,7 @@ typedef struct siginfo { #define si_addr_lsb _sifields._sigfault._addr_lsb #define si_lower _sifields._sigfault._addr_bnd._lower #define si_upper _sifields._sigfault._addr_bnd._upper +#define si_pkey _sifields._sigfault._pkey #define si_band _sifields._sigpoll._band #define si_fd _sifields._sigpoll._fd #ifdef __ARCH_SIGSYS @@ -206,7 +212,8 @@ typedef struct siginfo { #define SEGV_MAPERR (__SI_FAULT|1) /* address not mapped to object */ #define SEGV_ACCERR (__SI_FAULT|2) /* invalid permissions for mapped object */ #define SEGV_BNDERR (__SI_FAULT|3) /* failed address bound checks */ -#define NSIGSEGV 3 +#define SEGV_PKUERR (__SI_FAULT|4) /* failed protection key checks */ +#define NSIGSEGV 4 /* * SIGBUS si_codes diff --git a/kernel/signal.c b/kernel/signal.c index 0508544c8ced..fe8ed298373c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2708,6 +2708,10 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) err |= __put_user(from->si_lower, &to->si_lower); err |= __put_user(from->si_upper, &to->si_upper); } +#endif +#ifdef SEGV_PKUERR + if (from->si_signo == SIGSEGV && from->si_code == SEGV_PKUERR) + err |= __put_user(from->si_pkey, &to->si_pkey); #endif break; case __SI_CHLD: -- cgit From d22025570e2ebfc68819b35c5d457e53d9337217 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 18 Feb 2016 11:44:24 -0500 Subject: cgroup: fix alloc_cgroup_ns() error handling in copy_cgroup_ns() alloc_cgroup_ns() returns an ERR_PTR value on error but copy_cgroup_ns() was checking for NULL for error. Fix it. Signed-off-by: Tejun Heo Reported-by: Dan Carpenter --- kernel/cgroup.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index afb1205fc789..d92d91a4bb3e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -6083,10 +6083,11 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, spin_unlock_bh(&css_set_lock); mutex_unlock(&cgroup_mutex); - err = -ENOMEM; new_ns = alloc_cgroup_ns(); - if (!new_ns) + if (IS_ERR(new_ns)) { + err = PTR_ERR(new_ns); goto err_out; + } new_ns->user_ns = get_user_ns(user_ns); new_ns->root_cset = cset; -- cgit From 9273a8bbf58a15051e53a777389a502420ddc60e Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Wed, 17 Feb 2016 13:11:29 -0800 Subject: devm_memremap_release(): fix memremap'd addr handling The pmem driver calls devm_memremap() to map a persistent memory range. When the pmem driver is unloaded, this memremap'd range is not released so the kernel will leak a vma. Fix devm_memremap_release() to handle a given memremap'd address properly. Signed-off-by: Toshi Kani Acked-by: Dan Williams Cc: Christoph Hellwig Cc: Ross Zwisler Cc: Matthew Wilcox Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 2c468dea60bc..7a1b5c3ef14e 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -114,7 +114,7 @@ EXPORT_SYMBOL(memunmap); static void devm_memremap_release(struct device *dev, void *res) { - memunmap(res); + memunmap(*(void **)res); } static int devm_memremap_match(struct device *dev, void *res, void *match_data) -- cgit From 6e22c8366416251a3d88ba6c92d13d595089f0ed Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 12 Feb 2016 12:46:00 -0800 Subject: tracing, kasan: Silence Kasan warning in check_stack of stack_tracer When enabling stack trace via "echo 1 > /proc/sys/kernel/stack_tracer_enabled", the below KASAN warning is triggered: BUG: KASAN: stack-out-of-bounds in check_stack+0x344/0x848 at addr ffffffc0689ebab8 Read of size 8 by task ksoftirqd/4/29 page:ffffffbdc3a27ac0 count:0 mapcount:0 mapping: (null) index:0x0 flags: 0x0() page dumped because: kasan: bad access detected CPU: 4 PID: 29 Comm: ksoftirqd/4 Not tainted 4.5.0-rc1 #129 Hardware name: Freescale Layerscape 2085a RDB Board (DT) Call trace: [] dump_backtrace+0x0/0x3a0 [] show_stack+0x24/0x30 [] dump_stack+0xd8/0x168 [] kasan_report_error+0x6a0/0x920 [] kasan_report+0x70/0xb8 [] __asan_load8+0x60/0x78 [] check_stack+0x344/0x848 [] stack_trace_call+0x1c4/0x370 [] ftrace_ops_no_ops+0x2c0/0x590 [] ftrace_graph_call+0x0/0x14 [] fpsimd_thread_switch+0x24/0x1e8 [] __switch_to+0x34/0x218 [] __schedule+0x3ac/0x15b8 [] schedule+0x5c/0x178 [] smpboot_thread_fn+0x350/0x960 [] kthread+0x1d8/0x2b0 [] ret_from_fork+0x10/0x40 Memory state around the buggy address: ffffffc0689eb980: 00 00 00 00 00 00 00 00 f1 f1 f1 f1 00 f4 f4 f4 ffffffc0689eba00: f3 f3 f3 f3 00 00 00 00 00 00 00 00 00 00 00 00 >ffffffc0689eba80: 00 00 f1 f1 f1 f1 00 f4 f4 f4 f3 f3 f3 f3 00 00 ^ ffffffc0689ebb00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ffffffc0689ebb80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 The stacker tracer traverses the whole kernel stack when saving the max stack trace. It may touch the stack red zones to cause the warning. So, just disable the instrumentation to silence the warning. Link: http://lkml.kernel.org/r/1455309960-18930-1-git-send-email-yang.shi@linaro.org Signed-off-by: Yang Shi Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 202df6cffcca..2a1abbaca10e 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -156,7 +156,11 @@ check_stack(unsigned long ip, unsigned long *stack) for (; p < top && i < stack_trace_max.nr_entries; p++) { if (stack_dump_trace[i] == ULONG_MAX) break; - if (*p == stack_dump_trace[i]) { + /* + * The READ_ONCE_NOCHECK is used to let KASAN know that + * this is not a stack-out-of-bounds error. + */ + if ((READ_ONCE_NOCHECK(*p)) == stack_dump_trace[i]) { stack_dump_trace[x] = stack_dump_trace[i++]; this_size = stack_trace_index[x++] = (top - p) * sizeof(unsigned long); -- cgit From 6bbd9a05a1f9839873a9290b5b7c6fafde8447ba Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Fri, 19 Feb 2016 13:53:10 -0500 Subject: bpf: grab rcu read lock for bpf_percpu_hash_update bpf_percpu_hash_update() expects rcu lock to be held and warns if it's not, which pointed out a missing rcu read lock. Fixes: 15a07b338 ("bpf: add lookup/update support for per-cpu hash and array maps") Signed-off-by: Sasha Levin Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/hashtab.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index fd5db8fe9360..a68e95133fcd 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -619,7 +619,13 @@ out: int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, u64 map_flags) { - return __htab_percpu_map_update_elem(map, key, value, map_flags, true); + int ret; + + rcu_read_lock(); + ret = __htab_percpu_map_update_elem(map, key, value, map_flags, true); + rcu_read_unlock(); + + return ret; } static const struct bpf_map_ops htab_percpu_ops = { -- cgit From 568b329a02f75ed3aaae5eb2cca384cb9e09cb29 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 17 Feb 2016 19:58:57 -0800 Subject: perf: generalize perf_callchain . avoid walking the stack when there is no room left in the buffer . generalize get_perf_callchain() to be called from bpf helper Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- arch/x86/include/asm/stacktrace.h | 2 +- arch/x86/kernel/cpu/perf_event.c | 4 ++-- arch/x86/kernel/dumpstack.c | 6 ++++-- arch/x86/kernel/stacktrace.c | 18 +++++++++++------- arch/x86/oprofile/backtrace.c | 3 ++- include/linux/perf_event.h | 13 +++++++++++-- kernel/events/callchain.c | 32 ++++++++++++++++++++------------ kernel/events/internal.h | 2 -- 8 files changed, 51 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 70bbe39043a9..7c247e7404be 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -37,7 +37,7 @@ print_context_stack_bp(struct thread_info *tinfo, /* Generic stack tracer with callbacks */ struct stacktrace_ops { - void (*address)(void *data, unsigned long address, int reliable); + int (*address)(void *data, unsigned long address, int reliable); /* On negative return stop dumping */ int (*stack)(void *data, char *name); walk_stack_t walk_stack; diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 1b443db2db50..d276b31ca473 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2180,11 +2180,11 @@ static int backtrace_stack(void *data, char *name) return 0; } -static void backtrace_address(void *data, unsigned long addr, int reliable) +static int backtrace_address(void *data, unsigned long addr, int reliable) { struct perf_callchain_entry *entry = data; - perf_callchain_store(entry, addr); + return perf_callchain_store(entry, addr); } static const struct stacktrace_ops backtrace_ops = { diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 9c30acfadae2..0d1ff4b407d4 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -135,7 +135,8 @@ print_context_stack_bp(struct thread_info *tinfo, if (!__kernel_text_address(addr)) break; - ops->address(data, addr, 1); + if (ops->address(data, addr, 1)) + break; frame = frame->next_frame; ret_addr = &frame->return_address; print_ftrace_graph_addr(addr, data, ops, tinfo, graph); @@ -154,10 +155,11 @@ static int print_trace_stack(void *data, char *name) /* * Print one address/symbol entries per line. */ -static void print_trace_address(void *data, unsigned long addr, int reliable) +static int print_trace_address(void *data, unsigned long addr, int reliable) { touch_nmi_watchdog(); printk_stack_address(addr, reliable, data); + return 0; } static const struct stacktrace_ops print_trace_ops = { diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index fdd0c6430e5a..9ee98eefc44d 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -14,30 +14,34 @@ static int save_stack_stack(void *data, char *name) return 0; } -static void +static int __save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched) { struct stack_trace *trace = data; #ifdef CONFIG_FRAME_POINTER if (!reliable) - return; + return 0; #endif if (nosched && in_sched_functions(addr)) - return; + return 0; if (trace->skip > 0) { trace->skip--; - return; + return 0; } - if (trace->nr_entries < trace->max_entries) + if (trace->nr_entries < trace->max_entries) { trace->entries[trace->nr_entries++] = addr; + return 0; + } else { + return -1; /* no more room, stop walking the stack */ + } } -static void save_stack_address(void *data, unsigned long addr, int reliable) +static int save_stack_address(void *data, unsigned long addr, int reliable) { return __save_stack_address(data, addr, reliable, false); } -static void +static int save_stack_address_nosched(void *data, unsigned long addr, int reliable) { return __save_stack_address(data, addr, reliable, true); diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index 4e664bdb535a..cb31a4440e58 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -23,12 +23,13 @@ static int backtrace_stack(void *data, char *name) return 0; } -static void backtrace_address(void *data, unsigned long addr, int reliable) +static int backtrace_address(void *data, unsigned long addr, int reliable) { unsigned int *depth = data; if ((*depth)--) oprofile_add_trace(addr); + return 0; } static struct stacktrace_ops backtrace_ops = { diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b35a61a481fa..7da3c25999df 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -964,11 +964,20 @@ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry); extern void perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs); extern void perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs); +extern struct perf_callchain_entry * +get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, + bool crosstask, bool add_mark); +extern int get_callchain_buffers(void); +extern void put_callchain_buffers(void); -static inline void perf_callchain_store(struct perf_callchain_entry *entry, u64 ip) +static inline int perf_callchain_store(struct perf_callchain_entry *entry, u64 ip) { - if (entry->nr < PERF_MAX_STACK_DEPTH) + if (entry->nr < PERF_MAX_STACK_DEPTH) { entry->ip[entry->nr++] = ip; + return 0; + } else { + return -1; /* no more room, stop walking the stack */ + } } extern int sysctl_perf_event_paranoid; diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 9c418002b8c1..343c22f5e867 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -159,15 +159,24 @@ put_callchain_entry(int rctx) struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs) { - int rctx; - struct perf_callchain_entry *entry; - - int kernel = !event->attr.exclude_callchain_kernel; - int user = !event->attr.exclude_callchain_user; + bool kernel = !event->attr.exclude_callchain_kernel; + bool user = !event->attr.exclude_callchain_user; + /* Disallow cross-task user callchains. */ + bool crosstask = event->ctx->task && event->ctx->task != current; if (!kernel && !user) return NULL; + return get_perf_callchain(regs, 0, kernel, user, crosstask, true); +} + +struct perf_callchain_entry * +get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, + bool crosstask, bool add_mark) +{ + struct perf_callchain_entry *entry; + int rctx; + entry = get_callchain_entry(&rctx); if (rctx == -1) return NULL; @@ -175,10 +184,11 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) if (!entry) goto exit_put; - entry->nr = 0; + entry->nr = init_nr; if (kernel && !user_mode(regs)) { - perf_callchain_store(entry, PERF_CONTEXT_KERNEL); + if (add_mark) + perf_callchain_store(entry, PERF_CONTEXT_KERNEL); perf_callchain_kernel(entry, regs); } @@ -191,13 +201,11 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) } if (regs) { - /* - * Disallow cross-task user callchains. - */ - if (event->ctx->task && event->ctx->task != current) + if (crosstask) goto exit_put; - perf_callchain_store(entry, PERF_CONTEXT_USER); + if (add_mark) + perf_callchain_store(entry, PERF_CONTEXT_USER); perf_callchain_user(entry, regs); } } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 2bbad9c1274c..4199b6d193f5 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -182,8 +182,6 @@ DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) /* Callchain handling */ extern struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs); -extern int get_callchain_buffers(void); -extern void put_callchain_buffers(void); static inline int get_recursion_context(int *recursion) { -- cgit From d5a3b1f691865be576c2bffa708549b8cdccda19 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 17 Feb 2016 19:58:58 -0800 Subject: bpf: introduce BPF_MAP_TYPE_STACK_TRACE add new map type to store stack traces and corresponding helper bpf_get_stackid(ctx, map, flags) - walk user or kernel stack and return id @ctx: struct pt_regs* @map: pointer to stack_trace map @flags: bits 0-7 - numer of stack frames to skip bit 8 - collect user stack instead of kernel bit 9 - compare stacks by hash only bit 10 - if two different stacks hash into the same stackid discard old other bits - reserved Return: >= 0 stackid on success or negative error stackid is a 32-bit integer handle that can be further combined with other data (including other stackid) and used as a key into maps. Userspace will access stackmap using standard lookup/delete syscall commands to retrieve full stack trace for given stackid. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 21 +++++ kernel/bpf/Makefile | 3 + kernel/bpf/stackmap.c | 237 +++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 6 +- kernel/trace/bpf_trace.c | 2 + 6 files changed, 269 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/stackmap.c (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 90ee6ab24bc5..0cadbb7456c0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -237,6 +237,7 @@ extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; extern const struct bpf_func_proto bpf_get_current_comm_proto; extern const struct bpf_func_proto bpf_skb_vlan_push_proto; extern const struct bpf_func_proto bpf_skb_vlan_pop_proto; +extern const struct bpf_func_proto bpf_get_stackid_proto; /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2ee0fde1bf96..d3e77da8e9e8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -83,6 +83,7 @@ enum bpf_map_type { BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_MAP_TYPE_PERCPU_HASH, BPF_MAP_TYPE_PERCPU_ARRAY, + BPF_MAP_TYPE_STACK_TRACE, }; enum bpf_prog_type { @@ -272,6 +273,20 @@ enum bpf_func_id { */ BPF_FUNC_perf_event_output, BPF_FUNC_skb_load_bytes, + + /** + * bpf_get_stackid(ctx, map, flags) - walk user or kernel stack and return id + * @ctx: struct pt_regs* + * @map: pointer to stack_trace map + * @flags: bits 0-7 - numer of stack frames to skip + * bit 8 - collect user stack instead of kernel + * bit 9 - compare stacks by hash only + * bit 10 - if two different stacks hash into the same stackid + * discard old + * other bits - reserved + * Return: >= 0 stackid on success or negative error + */ + BPF_FUNC_get_stackid, __BPF_FUNC_MAX_ID, }; @@ -294,6 +309,12 @@ enum bpf_func_id { /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ #define BPF_F_TUNINFO_IPV6 (1ULL << 0) +/* BPF_FUNC_get_stackid flags. */ +#define BPF_F_SKIP_FIELD_MASK 0xffULL +#define BPF_F_USER_STACK (1ULL << 8) +#define BPF_F_FAST_STACK_CMP (1ULL << 9) +#define BPF_F_REUSE_STACKID (1ULL << 10) + /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 13272582eee0..8a932d079c24 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -2,3 +2,6 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o +ifeq ($(CONFIG_PERF_EVENTS),y) +obj-$(CONFIG_BPF_SYSCALL) += stackmap.o +endif diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c new file mode 100644 index 000000000000..8a60ee14a977 --- /dev/null +++ b/kernel/bpf/stackmap.c @@ -0,0 +1,237 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include + +struct stack_map_bucket { + struct rcu_head rcu; + u32 hash; + u32 nr; + u64 ip[]; +}; + +struct bpf_stack_map { + struct bpf_map map; + u32 n_buckets; + struct stack_map_bucket __rcu *buckets[]; +}; + +/* Called from syscall */ +static struct bpf_map *stack_map_alloc(union bpf_attr *attr) +{ + u32 value_size = attr->value_size; + struct bpf_stack_map *smap; + u64 cost, n_buckets; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->key_size != 4 || + value_size < 8 || value_size % 8 || + value_size / 8 > PERF_MAX_STACK_DEPTH) + return ERR_PTR(-EINVAL); + + /* hash table size must be power of 2 */ + n_buckets = roundup_pow_of_two(attr->max_entries); + + cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); + if (cost >= U32_MAX - PAGE_SIZE) + return ERR_PTR(-E2BIG); + + smap = kzalloc(cost, GFP_USER | __GFP_NOWARN); + if (!smap) { + smap = vzalloc(cost); + if (!smap) + return ERR_PTR(-ENOMEM); + } + + err = -E2BIG; + cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); + if (cost >= U32_MAX - PAGE_SIZE) + goto free_smap; + + smap->map.map_type = attr->map_type; + smap->map.key_size = attr->key_size; + smap->map.value_size = value_size; + smap->map.max_entries = attr->max_entries; + smap->n_buckets = n_buckets; + smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + err = get_callchain_buffers(); + if (err) + goto free_smap; + + return &smap->map; + +free_smap: + kvfree(smap); + return ERR_PTR(err); +} + +static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) +{ + struct pt_regs *regs = (struct pt_regs *) (long) r1; + struct bpf_map *map = (struct bpf_map *) (long) r2; + struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); + struct perf_callchain_entry *trace; + struct stack_map_bucket *bucket, *new_bucket, *old_bucket; + u32 max_depth = map->value_size / 8; + /* stack_map_alloc() checks that max_depth <= PERF_MAX_STACK_DEPTH */ + u32 init_nr = PERF_MAX_STACK_DEPTH - max_depth; + u32 skip = flags & BPF_F_SKIP_FIELD_MASK; + u32 hash, id, trace_nr, trace_len; + bool user = flags & BPF_F_USER_STACK; + bool kernel = !user; + u64 *ips; + + if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | + BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) + return -EINVAL; + + trace = get_perf_callchain(regs, init_nr, kernel, user, false, false); + + if (unlikely(!trace)) + /* couldn't fetch the stack trace */ + return -EFAULT; + + /* get_perf_callchain() guarantees that trace->nr >= init_nr + * and trace-nr <= PERF_MAX_STACK_DEPTH, so trace_nr <= max_depth + */ + trace_nr = trace->nr - init_nr; + + if (trace_nr <= skip) + /* skipping more than usable stack trace */ + return -EFAULT; + + trace_nr -= skip; + trace_len = trace_nr * sizeof(u64); + ips = trace->ip + skip + init_nr; + hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0); + id = hash & (smap->n_buckets - 1); + bucket = rcu_dereference(smap->buckets[id]); + + if (bucket && bucket->hash == hash) { + if (flags & BPF_F_FAST_STACK_CMP) + return id; + if (bucket->nr == trace_nr && + memcmp(bucket->ip, ips, trace_len) == 0) + return id; + } + + /* this call stack is not in the map, try to add it */ + if (bucket && !(flags & BPF_F_REUSE_STACKID)) + return -EEXIST; + + new_bucket = kmalloc(sizeof(struct stack_map_bucket) + map->value_size, + GFP_ATOMIC | __GFP_NOWARN); + if (unlikely(!new_bucket)) + return -ENOMEM; + + memcpy(new_bucket->ip, ips, trace_len); + memset(new_bucket->ip + trace_len / 8, 0, map->value_size - trace_len); + new_bucket->hash = hash; + new_bucket->nr = trace_nr; + + old_bucket = xchg(&smap->buckets[id], new_bucket); + if (old_bucket) + kfree_rcu(old_bucket, rcu); + return id; +} + +const struct bpf_func_proto bpf_get_stackid_proto = { + .func = bpf_get_stackid, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; + +/* Called from syscall or from eBPF program */ +static void *stack_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); + struct stack_map_bucket *bucket; + u32 id = *(u32 *)key; + + if (unlikely(id >= smap->n_buckets)) + return NULL; + bucket = rcu_dereference(smap->buckets[id]); + return bucket ? bucket->ip : NULL; +} + +static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + return -EINVAL; +} + +static int stack_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + return -EINVAL; +} + +/* Called from syscall or from eBPF program */ +static int stack_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); + struct stack_map_bucket *old_bucket; + u32 id = *(u32 *)key; + + if (unlikely(id >= smap->n_buckets)) + return -E2BIG; + + old_bucket = xchg(&smap->buckets[id], NULL); + if (old_bucket) { + kfree_rcu(old_bucket, rcu); + return 0; + } else { + return -ENOENT; + } +} + +/* Called when map->refcnt goes to zero, either from workqueue or from syscall */ +static void stack_map_free(struct bpf_map *map) +{ + struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); + int i; + + synchronize_rcu(); + + for (i = 0; i < smap->n_buckets; i++) + if (smap->buckets[i]) + kfree_rcu(smap->buckets[i], rcu); + kvfree(smap); + put_callchain_buffers(); +} + +static const struct bpf_map_ops stack_map_ops = { + .map_alloc = stack_map_alloc, + .map_free = stack_map_free, + .map_get_next_key = stack_map_get_next_key, + .map_lookup_elem = stack_map_lookup_elem, + .map_update_elem = stack_map_update_elem, + .map_delete_elem = stack_map_delete_elem, +}; + +static struct bpf_map_type_list stack_map_type __read_mostly = { + .ops = &stack_map_ops, + .type = BPF_MAP_TYPE_STACK_TRACE, +}; + +static int __init register_stack_map(void) +{ + bpf_register_map_type(&stack_map_type); + return 0; +} +late_initcall(register_stack_map); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d1d3e8f57de9..42ba4ccc020b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -246,6 +246,7 @@ static const struct { {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call}, {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read}, {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_output}, + {BPF_MAP_TYPE_STACK_TRACE, BPF_FUNC_get_stackid}, }; static void print_verifier_state(struct verifier_env *env) @@ -911,8 +912,11 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) * don't allow any other map type to be passed into * the special func; */ - if (bool_func && bool_map != bool_func) + if (bool_func && bool_map != bool_func) { + verbose("cannot pass map_type %d into func %d\n", + map->map_type, func_id); return -EINVAL; + } } return 0; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 326a75e884db..4b8caa392b86 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -299,6 +299,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_perf_event_read_proto; case BPF_FUNC_perf_event_output: return &bpf_perf_event_output_proto; + case BPF_FUNC_get_stackid: + return &bpf_get_stackid_proto; default: return NULL; } -- cgit From 59ceeaaf355fa0fb16558ef7c24413c804932ada Mon Sep 17 00:00:00 2001 From: Simon Guinot Date: Thu, 10 Sep 2015 00:15:18 +0200 Subject: kernel/resource.c: fix muxed resource handling in __request_region() In __request_region, if a conflict with a BUSY and MUXED resource is detected, then the caller goes to sleep and waits for the resource to be released. A pointer on the conflicting resource is kept. At wake-up this pointer is used as a parent to retry to request the region. A first problem is that this pointer might well be invalid (if for example the conflicting resource have already been freed). Another problem is that the next call to __request_region() fails to detect a remaining conflict. The previously conflicting resource is passed as a parameter and __request_region() will look for a conflict among the children of this resource and not at the resource itself. It is likely to succeed anyway, even if there is still a conflict. Instead, the parent of the conflicting resource should be passed to __request_region(). As a fix, this patch doesn't update the parent resource pointer in the case we have to wait for a muxed region right after. Reported-and-tested-by: Vincent Pelletier Signed-off-by: Simon Guinot Tested-by: Vincent Donnefort Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- kernel/resource.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 09c0597840b0..3669d1bfc425 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1083,9 +1083,10 @@ struct resource * __request_region(struct resource *parent, if (!conflict) break; if (conflict != parent) { - parent = conflict; - if (!(conflict->flags & IORESOURCE_BUSY)) + if (!(conflict->flags & IORESOURCE_BUSY)) { + parent = conflict; continue; + } } if (conflict->flags & flags & IORESOURCE_MUXED) { add_wait_queue(&muxed_resource_wait, &wait); -- cgit From a1db74209483a24c861c848b4bb79a4d945ef6fa Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Wed, 30 Dec 2015 07:35:30 -0500 Subject: module: replace copy_module_from_fd with kernel version Replace copy_module_from_fd() with kernel_read_file_from_fd(). Although none of the upstreamed LSMs define a kernel_module_from_file hook, IMA is called, based on policy, to prevent unsigned kernel modules from being loaded by the original kernel module syscall and to measure/appraise signed kernel modules. The security function security_kernel_module_from_file() was called prior to reading a kernel module. Preventing unsigned kernel modules from being loaded by the original kernel module syscall remains on the pre-read kernel_read_file() security hook. Instead of reading the kernel module twice, once for measuring/appraising and again for loading the kernel module, the signature validation is moved to the kernel_post_read_file() security hook. This patch removes the security_kernel_module_from_file() hook and security call. Signed-off-by: Mimi Zohar Acked-by: Kees Cook Acked-by: Luis R. Rodriguez Cc: Rusty Russell --- include/linux/fs.h | 1 + include/linux/ima.h | 6 ---- include/linux/lsm_hooks.h | 7 ---- include/linux/security.h | 5 --- kernel/module.c | 68 +++++---------------------------------- security/integrity/ima/ima_main.c | 35 ++++++++------------ security/security.c | 12 ------- 7 files changed, 22 insertions(+), 112 deletions(-) (limited to 'kernel') diff --git a/include/linux/fs.h b/include/linux/fs.h index 9c85deae1bf2..fb08b668c37a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2578,6 +2578,7 @@ extern int do_pipe_flags(int *, int); enum kernel_read_file_id { READING_FIRMWARE = 1, + READING_MODULE, READING_MAX_ID }; diff --git a/include/linux/ima.h b/include/linux/ima.h index 6adcaea8101c..e6516cbbe9bf 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -18,7 +18,6 @@ extern int ima_bprm_check(struct linux_binprm *bprm); extern int ima_file_check(struct file *file, int mask, int opened); extern void ima_file_free(struct file *file); extern int ima_file_mmap(struct file *file, unsigned long prot); -extern int ima_module_check(struct file *file); extern int ima_read_file(struct file *file, enum kernel_read_file_id id); extern int ima_post_read_file(struct file *file, void *buf, loff_t size, enum kernel_read_file_id id); @@ -44,11 +43,6 @@ static inline int ima_file_mmap(struct file *file, unsigned long prot) return 0; } -static inline int ima_module_check(struct file *file) -{ - return 0; -} - static inline int ima_read_file(struct file *file, enum kernel_read_file_id id) { return 0; diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index d32b7bd13635..cdee11cbcdf1 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -546,12 +546,6 @@ * userspace to load a kernel module with the given name. * @kmod_name name of the module requested by the kernel * Return 0 if successful. - * @kernel_module_from_file: - * Load a kernel module from userspace. - * @file contains the file structure pointing to the file containing - * the kernel module to load. If the module is being loaded from a blob, - * this argument will be NULL. - * Return 0 if permission is granted. * @kernel_read_file: * Read a file specified by userspace. * @file contains the file structure pointing to the file being read @@ -1725,7 +1719,6 @@ struct security_hook_heads { struct list_head kernel_read_file; struct list_head kernel_post_read_file; struct list_head kernel_module_request; - struct list_head kernel_module_from_file; struct list_head task_fix_setuid; struct list_head task_setpgid; struct list_head task_getpgid; diff --git a/include/linux/security.h b/include/linux/security.h index 071fb747fdbb..157f0cb1e4d2 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -859,11 +859,6 @@ static inline int security_kernel_module_request(char *kmod_name) return 0; } -static inline int security_kernel_module_from_file(struct file *file) -{ - return 0; -} - static inline int security_kernel_read_file(struct file *file, enum kernel_read_file_id id) { diff --git a/kernel/module.c b/kernel/module.c index 8358f4697c0c..955410928696 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2654,7 +2654,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, if (info->len < sizeof(*(info->hdr))) return -ENOEXEC; - err = security_kernel_module_from_file(NULL); + err = security_kernel_read_file(NULL, READING_MODULE); if (err) return err; @@ -2672,63 +2672,6 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, return 0; } -/* Sets info->hdr and info->len. */ -static int copy_module_from_fd(int fd, struct load_info *info) -{ - struct fd f = fdget(fd); - int err; - struct kstat stat; - loff_t pos; - ssize_t bytes = 0; - - if (!f.file) - return -ENOEXEC; - - err = security_kernel_module_from_file(f.file); - if (err) - goto out; - - err = vfs_getattr(&f.file->f_path, &stat); - if (err) - goto out; - - if (stat.size > INT_MAX) { - err = -EFBIG; - goto out; - } - - /* Don't hand 0 to vmalloc, it whines. */ - if (stat.size == 0) { - err = -EINVAL; - goto out; - } - - info->hdr = vmalloc(stat.size); - if (!info->hdr) { - err = -ENOMEM; - goto out; - } - - pos = 0; - while (pos < stat.size) { - bytes = kernel_read(f.file, pos, (char *)(info->hdr) + pos, - stat.size - pos); - if (bytes < 0) { - vfree(info->hdr); - err = bytes; - goto out; - } - if (bytes == 0) - break; - pos += bytes; - } - info->len = pos; - -out: - fdput(f); - return err; -} - static void free_copy(struct load_info *info) { vfree(info->hdr); @@ -3589,8 +3532,10 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) { - int err; struct load_info info = { }; + loff_t size; + void *hdr; + int err; err = may_init_module(); if (err) @@ -3602,9 +3547,12 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) |MODULE_INIT_IGNORE_VERMAGIC)) return -EINVAL; - err = copy_module_from_fd(fd, &info); + err = kernel_read_file_from_fd(fd, &hdr, &size, INT_MAX, + READING_MODULE); if (err) return err; + info.hdr = hdr; + info.len = size; return load_module(&info, uargs, flags); } diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index bbb80df28fb1..5da0b9c00072 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -315,28 +315,6 @@ int ima_file_check(struct file *file, int mask, int opened) } EXPORT_SYMBOL_GPL(ima_file_check); -/** - * ima_module_check - based on policy, collect/store/appraise measurement. - * @file: pointer to the file to be measured/appraised - * - * Measure/appraise kernel modules based on policy. - * - * On success return 0. On integrity appraisal error, assuming the file - * is in policy and IMA-appraisal is in enforcing mode, return -EACCES. - */ -int ima_module_check(struct file *file) -{ - if (!file) { -#ifndef CONFIG_MODULE_SIG_FORCE - if ((ima_appraise & IMA_APPRAISE_MODULES) && - (ima_appraise & IMA_APPRAISE_ENFORCE)) - return -EACCES; /* INTEGRITY_UNKNOWN */ -#endif - return 0; /* We rely on module signature checking */ - } - return process_measurement(file, NULL, 0, MAY_EXEC, MODULE_CHECK, 0); -} - /** * ima_read_file - pre-measure/appraise hook decision based on policy * @file: pointer to the file to be measured/appraised/audit @@ -350,6 +328,14 @@ int ima_module_check(struct file *file) */ int ima_read_file(struct file *file, enum kernel_read_file_id read_id) { + if (!file && read_id == READING_MODULE) { +#ifndef CONFIG_MODULE_SIG_FORCE + if ((ima_appraise & IMA_APPRAISE_MODULES) && + (ima_appraise & IMA_APPRAISE_ENFORCE)) + return -EACCES; /* INTEGRITY_UNKNOWN */ +#endif + return 0; /* We rely on module signature checking */ + } return 0; } @@ -378,6 +364,9 @@ int ima_post_read_file(struct file *file, void *buf, loff_t size, return 0; } + if (!file && read_id == READING_MODULE) /* MODULE_SIG_FORCE enabled */ + return 0; + if (!file || !buf || size == 0) { /* should never happen */ if (ima_appraise & IMA_APPRAISE_ENFORCE) return -EACCES; @@ -386,6 +375,8 @@ int ima_post_read_file(struct file *file, void *buf, loff_t size, if (read_id == READING_FIRMWARE) func = FIRMWARE_CHECK; + else if (read_id == READING_MODULE) + func = MODULE_CHECK; return process_measurement(file, buf, size, MAY_READ, func, 0); } diff --git a/security/security.c b/security/security.c index 8e699f98a600..3644b0344d29 100644 --- a/security/security.c +++ b/security/security.c @@ -889,16 +889,6 @@ int security_kernel_module_request(char *kmod_name) return call_int_hook(kernel_module_request, 0, kmod_name); } -int security_kernel_module_from_file(struct file *file) -{ - int ret; - - ret = call_int_hook(kernel_module_from_file, 0, file); - if (ret) - return ret; - return ima_module_check(file); -} - int security_kernel_read_file(struct file *file, enum kernel_read_file_id id) { int ret; @@ -1705,8 +1695,6 @@ struct security_hook_heads security_hook_heads = { LIST_HEAD_INIT(security_hook_heads.kernel_create_files_as), .kernel_module_request = LIST_HEAD_INIT(security_hook_heads.kernel_module_request), - .kernel_module_from_file = - LIST_HEAD_INIT(security_hook_heads.kernel_module_from_file), .kernel_read_file = LIST_HEAD_INIT(security_hook_heads.kernel_read_file), .kernel_post_read_file = -- cgit From b804defe4297157a9ff45863769efe9a01953398 Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Thu, 14 Jan 2016 20:59:14 -0500 Subject: kexec: replace call to copy_file_from_fd() with kernel version Replace copy_file_from_fd() with kernel_read_file_from_fd(). Two new identifiers named READING_KEXEC_IMAGE and READING_KEXEC_INITRAMFS are defined for measuring, appraising or auditing the kexec image and initramfs. Changelog v3: - return -EBADF, not -ENOEXEC - identifier change - split patch, moving copy_file_from_fd() to a separate patch - split patch, moving IMA changes to a separate patch v0: - use kstat file size type loff_t, not size_t - Calculate the file hash from the in memory buffer - Dave Young Signed-off-by: Mimi Zohar Acked-by: Kees Cook Acked-by: Luis R. Rodriguez Cc: Eric Biederman Acked-by: Dave Young --- include/linux/fs.h | 2 ++ kernel/kexec_file.c | 73 +++++++---------------------------------------------- 2 files changed, 11 insertions(+), 64 deletions(-) (limited to 'kernel') diff --git a/include/linux/fs.h b/include/linux/fs.h index fb08b668c37a..52567252288e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2579,6 +2579,8 @@ extern int do_pipe_flags(int *, int); enum kernel_read_file_id { READING_FIRMWARE = 1, READING_MODULE, + READING_KEXEC_IMAGE, + READING_KEXEC_INITRAMFS, READING_MAX_ID }; diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 007b791f676d..b696c3f3708f 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -33,65 +34,6 @@ size_t __weak kexec_purgatory_size = 0; static int kexec_calculate_store_digests(struct kimage *image); -static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len) -{ - struct fd f = fdget(fd); - int ret; - struct kstat stat; - loff_t pos; - ssize_t bytes = 0; - - if (!f.file) - return -EBADF; - - ret = vfs_getattr(&f.file->f_path, &stat); - if (ret) - goto out; - - if (stat.size > INT_MAX) { - ret = -EFBIG; - goto out; - } - - /* Don't hand 0 to vmalloc, it whines. */ - if (stat.size == 0) { - ret = -EINVAL; - goto out; - } - - *buf = vmalloc(stat.size); - if (!*buf) { - ret = -ENOMEM; - goto out; - } - - pos = 0; - while (pos < stat.size) { - bytes = kernel_read(f.file, pos, (char *)(*buf) + pos, - stat.size - pos); - if (bytes < 0) { - vfree(*buf); - ret = bytes; - goto out; - } - - if (bytes == 0) - break; - pos += bytes; - } - - if (pos != stat.size) { - ret = -EBADF; - vfree(*buf); - goto out; - } - - *buf_len = pos; -out: - fdput(f); - return ret; -} - /* Architectures can provide this probe function */ int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, unsigned long buf_len) @@ -182,16 +124,17 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, { int ret = 0; void *ldata; + loff_t size; - ret = copy_file_from_fd(kernel_fd, &image->kernel_buf, - &image->kernel_buf_len); + ret = kernel_read_file_from_fd(kernel_fd, &image->kernel_buf, + &size, INT_MAX, READING_KEXEC_IMAGE); if (ret) return ret; + image->kernel_buf_len = size; /* Call arch image probe handlers */ ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, image->kernel_buf_len); - if (ret) goto out; @@ -206,10 +149,12 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, #endif /* It is possible that there no initramfs is being loaded */ if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { - ret = copy_file_from_fd(initrd_fd, &image->initrd_buf, - &image->initrd_buf_len); + ret = kernel_read_file_from_fd(initrd_fd, &image->initrd_buf, + &size, INT_MAX, + READING_KEXEC_INITRAMFS); if (ret) goto out; + image->initrd_buf_len = size; } if (cmdline_len) { -- cgit From 8e2fe1d9f1a20924f98ea46931a1d7fb092aa876 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 19 Feb 2016 23:05:22 +0100 Subject: bpf: add new arg_type that allows for 0 sized stack buffer Currently, when we pass a buffer from the eBPF stack into a helper function, the function proto indicates argument types as ARG_PTR_TO_STACK and ARG_CONST_STACK_SIZE pair. If R contains the former, then R must be of the latter type. Then, verifier checks whether the buffer points into eBPF stack, is initialized, etc. The verifier also guarantees that the constant value passed in R is greater than 0, so helper functions don't need to test for it and can always assume a non-NULL initialized buffer as well as non-0 buffer size. This patch adds a new argument types ARG_CONST_STACK_SIZE_OR_ZERO that allows to also pass NULL as R and 0 as R into the helper function. Such helper functions, of course, need to be able to handle these cases internally then. Verifier guarantees that either R == NULL && R == 0 or R != NULL && R != 0 (like the case of ARG_CONST_STACK_SIZE), any other combinations are not possible to load. I went through various options of extending the verifier, and introducing the type ARG_CONST_STACK_SIZE_OR_ZERO seems to have most minimal changes needed to the verifier. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + kernel/bpf/verifier.c | 42 ++++++++++++++++++++++++++++++++---------- 2 files changed, 33 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0cadbb7456c0..51e498e5470e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -65,6 +65,7 @@ enum bpf_arg_type { */ ARG_PTR_TO_STACK, /* any pointer to eBPF program stack */ ARG_CONST_STACK_SIZE, /* number of bytes accessed from stack */ + ARG_CONST_STACK_SIZE_OR_ZERO, /* number of bytes accessed from stack or 0 */ ARG_PTR_TO_CTX, /* pointer to context */ ARG_ANYTHING, /* any (initialized) argument is ok */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 42ba4ccc020b..36dc497deaa3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -779,15 +779,24 @@ static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) * bytes from that pointer, make sure that it's within stack boundary * and all elements of stack are initialized */ -static int check_stack_boundary(struct verifier_env *env, - int regno, int access_size) +static int check_stack_boundary(struct verifier_env *env, int regno, + int access_size, bool zero_size_allowed) { struct verifier_state *state = &env->cur_state; struct reg_state *regs = state->regs; int off, i; - if (regs[regno].type != PTR_TO_STACK) + if (regs[regno].type != PTR_TO_STACK) { + if (zero_size_allowed && access_size == 0 && + regs[regno].type == CONST_IMM && + regs[regno].imm == 0) + return 0; + + verbose("R%d type=%s expected=%s\n", regno, + reg_type_str[regs[regno].type], + reg_type_str[PTR_TO_STACK]); return -EACCES; + } off = regs[regno].imm; if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || @@ -830,15 +839,24 @@ static int check_func_arg(struct verifier_env *env, u32 regno, return 0; } - if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY || + if (arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE) { expected_type = PTR_TO_STACK; - } else if (arg_type == ARG_CONST_STACK_SIZE) { + } else if (arg_type == ARG_CONST_STACK_SIZE || + arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { expected_type = CONST_IMM; } else if (arg_type == ARG_CONST_MAP_PTR) { expected_type = CONST_PTR_TO_MAP; } else if (arg_type == ARG_PTR_TO_CTX) { expected_type = PTR_TO_CTX; + } else if (arg_type == ARG_PTR_TO_STACK) { + expected_type = PTR_TO_STACK; + /* One exception here. In case function allows for NULL to be + * passed in as argument, it's a CONST_IMM type. Final test + * happens during stack boundary checking. + */ + if (reg->type == CONST_IMM && reg->imm == 0) + expected_type = CONST_IMM; } else { verbose("unsupported arg_type %d\n", arg_type); return -EFAULT; @@ -868,8 +886,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno, verbose("invalid map_ptr to access map->key\n"); return -EACCES; } - err = check_stack_boundary(env, regno, (*mapp)->key_size); - + err = check_stack_boundary(env, regno, (*mapp)->key_size, + false); } else if (arg_type == ARG_PTR_TO_MAP_VALUE) { /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity @@ -879,9 +897,12 @@ static int check_func_arg(struct verifier_env *env, u32 regno, verbose("invalid map_ptr to access map->value\n"); return -EACCES; } - err = check_stack_boundary(env, regno, (*mapp)->value_size); + err = check_stack_boundary(env, regno, (*mapp)->value_size, + false); + } else if (arg_type == ARG_CONST_STACK_SIZE || + arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { + bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO); - } else if (arg_type == ARG_CONST_STACK_SIZE) { /* bpf_xxx(..., buf, len) call will access 'len' bytes * from stack pointer 'buf'. Check it * note: regno == len, regno - 1 == buf @@ -891,7 +912,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno, verbose("ARG_CONST_STACK_SIZE cannot be first argument\n"); return -EACCES; } - err = check_stack_boundary(env, regno - 1, reg->imm); + err = check_stack_boundary(env, regno - 1, reg->imm, + zero_size_allowed); } return err; -- cgit From d2aa1acad22f1bdd0cfa67b3861800e392254454 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Feb 2016 14:41:13 -0800 Subject: mm/init: Add 'rodata=off' boot cmdline parameter to disable read-only kernel mappings It may be useful to debug writes to the readonly sections of memory, so provide a cmdline "rodata=off" to allow for this. This can be expanded in the future to support "log" and "write" modes, but that will need to be architecture-specific. This also makes KDB software breakpoints more usable, as read-only mappings can now be disabled on any kernel. Suggested-by: H. Peter Anvin Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: David Brown Cc: Denys Vlasenko Cc: Emese Revfy Cc: Linus Torvalds Cc: Mathias Krause Cc: Michael Ellerman Cc: PaX Team Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Link: http://lkml.kernel.org/r/1455748879-21872-3-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 4 ++++ init/main.c | 27 +++++++++++++++++++++++---- kernel/debug/kdb/kdb_bp.c | 4 +--- 3 files changed, 28 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 9a53c929f017..000336733a6a 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3491,6 +3491,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. ro [KNL] Mount root device read-only on boot + rodata= [KNL] + on Mark read-only kernel memory as read-only (default). + off Leave read-only kernel memory writable for debugging. + root= [KNL] Root filesystem See name_to_dev_t comment in init/do_mounts.c. diff --git a/init/main.c b/init/main.c index 58c9e374704b..928a3438f7ac 100644 --- a/init/main.c +++ b/init/main.c @@ -93,9 +93,6 @@ static int kernel_init(void *); extern void init_IRQ(void); extern void fork_init(void); extern void radix_tree_init(void); -#ifndef CONFIG_DEBUG_RODATA -static inline void mark_rodata_ro(void) { } -#endif /* * Debug helper: via this flag we know that we are in 'early bootup code' @@ -929,6 +926,28 @@ static int try_to_run_init_process(const char *init_filename) static noinline void __init kernel_init_freeable(void); +#ifdef CONFIG_DEBUG_RODATA +static bool rodata_enabled = true; +static int __init set_debug_rodata(char *str) +{ + return strtobool(str, &rodata_enabled); +} +__setup("rodata=", set_debug_rodata); + +static void mark_readonly(void) +{ + if (rodata_enabled) + mark_rodata_ro(); + else + pr_info("Kernel memory protection disabled.\n"); +} +#else +static inline void mark_readonly(void) +{ + pr_warn("This architecture does not have kernel memory protection.\n"); +} +#endif + static int __ref kernel_init(void *unused) { int ret; @@ -937,7 +956,7 @@ static int __ref kernel_init(void *unused) /* need to finish all async __init code before freeing the memory */ async_synchronize_full(); free_initmem(); - mark_rodata_ro(); + mark_readonly(); system_state = SYSTEM_RUNNING; numa_default_policy(); diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index e1dbf4a2c69e..90ff129c88a2 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -153,13 +153,11 @@ static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp) } else { kdb_printf("%s: failed to set breakpoint at 0x%lx\n", __func__, bp->bp_addr); -#ifdef CONFIG_DEBUG_RODATA if (!bp->bp_type) { kdb_printf("Software breakpoints are unavailable.\n" - " Change the kernel CONFIG_DEBUG_RODATA=n\n" + " Boot the kernel with rodata=off\n" " OR use hw breaks: help bph\n"); } -#endif return 1; } return 0; -- cgit From b598dde354de22d87f664a7b99b8c21437da8efb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Feb 2016 22:25:45 -0500 Subject: cgroup: fix error return value of cgroup_addrm_files() cgroup_addrm_files() incorrectly returned 0 after add failure. Fix it. Signed-off-by: Tejun Heo Acked-by: Johannes Weiner --- kernel/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7ad61915967f..68b032df77f5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3369,7 +3369,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, bool is_add) { struct cftype *cft, *cft_end = NULL; - int ret; + int ret = 0; lockdep_assert_held(&cgroup_mutex); @@ -3398,7 +3398,7 @@ restart: cgroup_rm_file(cgrp, cft); } } - return 0; + return ret; } static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) -- cgit From 5eb385cc5ae1b31fbcdd727854a00c5a083f6b9b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Feb 2016 22:25:46 -0500 Subject: Revert "cgroup: add cgroup_subsys->css_e_css_changed()" This reverts commit 56c807ba4e91f0980567b6a69de239677879b17f. cgroup_subsys->css_e_css_changed() was supposed to be used by cgroup writeback support; however, the change to per-inode cgroup association made it unnecessary and the callback doesn't have any user. Remove it. Signed-off-by: Tejun Heo Acked-by: Johannes Weiner --- include/linux/cgroup-defs.h | 1 - kernel/cgroup.c | 18 ------------------ 2 files changed, 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 789471dba6fb..4f3c0dac26b5 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -434,7 +434,6 @@ struct cgroup_subsys { void (*css_released)(struct cgroup_subsys_state *css); void (*css_free)(struct cgroup_subsys_state *css); void (*css_reset)(struct cgroup_subsys_state *css); - void (*css_e_css_changed)(struct cgroup_subsys_state *css); int (*can_attach)(struct cgroup_taskset *tset); void (*cancel_attach)(struct cgroup_taskset *tset); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 68b032df77f5..7727b6e43e10 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3127,24 +3127,6 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, } } - /* - * The effective csses of all the descendants (excluding @cgrp) may - * have changed. Subsystems can optionally subscribe to this event - * by implementing ->css_e_css_changed() which is invoked if any of - * the effective csses seen from the css's cgroup may have changed. - */ - for_each_subsys(ss, ssid) { - struct cgroup_subsys_state *this_css = cgroup_css(cgrp, ss); - struct cgroup_subsys_state *css; - - if (!ss->css_e_css_changed || !this_css) - continue; - - css_for_each_descendant_pre(css, this_css) - if (css != this_css) - ss->css_e_css_changed(css); - } - kernfs_activate(cgrp->kn); ret = 0; out_unlock: -- cgit From 8699b7762a623c46ced891b3cf490058b56cf99c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Feb 2016 22:25:46 -0500 Subject: cgroup: s/child_subsys_mask/subtree_ss_mask/ For consistency with cgroup->subtree_control. * cgroup->child_subsys_mask -> cgroup->subtree_ss_mask * cgroup_calc_child_subsys_mask() -> cgroup_calc_subtree_ss_mask() * cgroup_refresh_child_subsys_mask() -> cgroup_refresh_subtree_ss_mask() No functional changes. Signed-off-by: Tejun Heo Acked-by: Johannes Weiner --- include/linux/cgroup-defs.h | 11 +++++------ kernel/cgroup.c | 48 ++++++++++++++++++++++----------------------- 2 files changed, 29 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 4f3c0dac26b5..c68ae7f0fb5f 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -253,13 +253,12 @@ struct cgroup { /* * The bitmask of subsystems enabled on the child cgroups. * ->subtree_control is the one configured through - * "cgroup.subtree_control" while ->child_subsys_mask is the - * effective one which may have more subsystems enabled. - * Controller knobs are made available iff it's enabled in - * ->subtree_control. + * "cgroup.subtree_control" while ->child_ss_mask is the effective + * one which may have more subsystems enabled. Controller knobs + * are made available iff it's enabled in ->subtree_control. */ - unsigned int subtree_control; - unsigned int child_subsys_mask; + unsigned long subtree_control; + unsigned long subtree_ss_mask; /* Private pointers for each registered subsystem */ struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7727b6e43e10..f3cd67bfe6c0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -391,10 +391,10 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, /* * This function is used while updating css associations and thus - * can't test the csses directly. Use ->child_subsys_mask. + * can't test the csses directly. Use ->subtree_ss_mask. */ while (cgroup_parent(cgrp) && - !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id))) + !(cgroup_parent(cgrp)->subtree_ss_mask & (1 << ss->id))) cgrp = cgroup_parent(cgrp); return cgroup_css(cgrp, ss); @@ -1256,7 +1256,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft) } /** - * cgroup_calc_child_subsys_mask - calculate child_subsys_mask + * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask * @cgrp: the target cgroup * @subtree_control: the new subtree_control mask to consider * @@ -1268,8 +1268,8 @@ static umode_t cgroup_file_mode(const struct cftype *cft) * @subtree_control is to be applied to @cgrp. The returned mask is always * a superset of @subtree_control and follows the usual hierarchy rules. */ -static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp, - unsigned long subtree_control) +static unsigned long cgroup_calc_subtree_ss_mask(struct cgroup *cgrp, + unsigned long subtree_control) { struct cgroup *parent = cgroup_parent(cgrp); unsigned long cur_ss_mask = subtree_control; @@ -1293,7 +1293,7 @@ static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp, * to non-default hierarchies. */ if (parent) - new_ss_mask &= parent->child_subsys_mask; + new_ss_mask &= parent->subtree_ss_mask; else new_ss_mask &= cgrp->root->subsys_mask; @@ -1306,16 +1306,16 @@ static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp, } /** - * cgroup_refresh_child_subsys_mask - update child_subsys_mask + * cgroup_refresh_subtree_ss_mask - update subtree_ss_mask * @cgrp: the target cgroup * - * Update @cgrp->child_subsys_mask according to the current - * @cgrp->subtree_control using cgroup_calc_child_subsys_mask(). + * Update @cgrp->subtree_ss_mask according to the current + * @cgrp->subtree_control using cgroup_calc_subtree_ss_mask(). */ -static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp) +static void cgroup_refresh_subtree_ss_mask(struct cgroup *cgrp) { - cgrp->child_subsys_mask = - cgroup_calc_child_subsys_mask(cgrp, cgrp->subtree_control); + cgrp->subtree_ss_mask = + cgroup_calc_subtree_ss_mask(cgrp, cgrp->subtree_control); } /** @@ -1542,7 +1542,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, src_root->subsys_mask &= ~(1 << ssid); scgrp->subtree_control &= ~(1 << ssid); - cgroup_refresh_child_subsys_mask(scgrp); + cgroup_refresh_subtree_ss_mask(scgrp); /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; @@ -1550,7 +1550,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, static_branch_enable(cgroup_subsys_on_dfl_key[ssid]); } else { dcgrp->subtree_control |= 1 << ssid; - cgroup_refresh_child_subsys_mask(dcgrp); + cgroup_refresh_subtree_ss_mask(dcgrp); static_branch_disable(cgroup_subsys_on_dfl_key[ssid]); } @@ -2523,11 +2523,11 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, lockdep_assert_held(&cgroup_mutex); /* - * Except for the root, child_subsys_mask must be zero for a cgroup + * Except for the root, subtree_ss_mask must be zero for a cgroup * with tasks so that child cgroups don't compete against tasks. */ if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) && - dst_cgrp->child_subsys_mask) + dst_cgrp->subtree_ss_mask) return -EBUSY; /* look up the dst cset for each src cset and link it to src */ @@ -2880,7 +2880,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v) * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy * @cgrp: root of the subtree to update csses for * - * @cgrp's child_subsys_mask has changed and its subtree's (self excluded) + * @cgrp's subtree_ss_mask has changed and its subtree's (self excluded) * css associations need to be updated accordingly. This function looks up * all css_sets which are attached to the subtree, creates the matching * updated css_sets and migrates the tasks to the new ones. @@ -2902,7 +2902,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { struct cgrp_cset_link *link; - /* self is not affected by child_subsys_mask change */ + /* self is not affected by subtree_ss_mask change */ if (css->cgroup == cgrp) continue; @@ -3034,9 +3034,9 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, * depending on subsystem dependencies. */ old_sc = cgrp->subtree_control; - old_ss = cgrp->child_subsys_mask; + old_ss = cgrp->subtree_ss_mask; new_sc = (old_sc | enable) & ~disable; - new_ss = cgroup_calc_child_subsys_mask(cgrp, new_sc); + new_ss = cgroup_calc_subtree_ss_mask(cgrp, new_sc); css_enable = ~old_ss & new_ss; css_disable = old_ss & ~new_ss; @@ -3069,7 +3069,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, } cgrp->subtree_control = new_sc; - cgrp->child_subsys_mask = new_ss; + cgrp->subtree_ss_mask = new_ss; /* * Create new csses or make the existing ones visible. A css is @@ -3135,7 +3135,7 @@ out_unlock: err_undo_css: cgrp->subtree_control = old_sc; - cgrp->child_subsys_mask = old_ss; + cgrp->subtree_ss_mask = old_ss; for_each_subsys(ss, ssid) { if (!(enable & (1 << ssid))) @@ -4969,7 +4969,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, /* let's create and online css's */ for_each_subsys(ss, ssid) { - if (parent->child_subsys_mask & (1 << ssid)) { + if (parent->subtree_ss_mask & (1 << ssid)) { ret = create_css(cgrp, ss, parent->subtree_control & (1 << ssid)); if (ret) @@ -4983,7 +4983,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, */ if (!cgroup_on_dfl(cgrp)) { cgrp->subtree_control = parent->subtree_control; - cgroup_refresh_child_subsys_mask(cgrp); + cgroup_refresh_subtree_ss_mask(cgrp); } kernfs_activate(kn); -- cgit From b4e0eeafba61b141c3af22d6636be3f477c5d3bd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Feb 2016 22:25:46 -0500 Subject: cgroup: convert for_each_subsys_which() to do-while style for_each_subsys_which() allows iterating subsystems specified in a subsystem bitmask; unfortunately, it requires the mask to be an unsigned long l-value which can be inconvenient and makes it awkward to use a smaller type for subsystem masks. This patch converts for_each_subsy_which() to do-while style which allows it to drop the l-value requirement. The new iterator is named do_each_subsys_mask() / while_each_subsys_mask(). Signed-off-by: Tejun Heo Cc: Aleksa Sarai Acked-by: Johannes Weiner --- kernel/cgroup.c | 72 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f3cd67bfe6c0..5d102980dc27 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -514,22 +514,28 @@ static int notify_on_release(const struct cgroup *cgrp) (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) /** - * for_each_subsys_which - filter for_each_subsys with a bitmask + * do_each_subsys_mask - filter for_each_subsys with a bitmask * @ss: the iteration cursor * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end - * @ss_maskp: a pointer to the bitmask + * @ss_mask: the bitmask * * The block will only run for cases where the ssid-th bit (1 << ssid) of - * mask is set to 1. + * @ss_mask is set. */ -#define for_each_subsys_which(ss, ssid, ss_maskp) \ - if (!CGROUP_SUBSYS_COUNT) /* to avoid spurious gcc warning */ \ +#define do_each_subsys_mask(ss, ssid, ss_mask) do { \ + unsigned long __ss_mask = (ss_mask); \ + if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */ \ (ssid) = 0; \ - else \ - for_each_set_bit(ssid, ss_maskp, CGROUP_SUBSYS_COUNT) \ - if (((ss) = cgroup_subsys[ssid]) && false) \ - break; \ - else + break; \ + } \ + for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) { \ + (ss) = cgroup_subsys[ssid]; \ + { + +#define while_each_subsys_mask() \ + } \ + } \ +} while (false) /* iterate across the hierarchies */ #define for_each_root(root) \ @@ -1284,8 +1290,9 @@ static unsigned long cgroup_calc_subtree_ss_mask(struct cgroup *cgrp, while (true) { unsigned long new_ss_mask = cur_ss_mask; - for_each_subsys_which(ss, ssid, &cur_ss_mask) + do_each_subsys_mask(ss, ssid, cur_ss_mask) { new_ss_mask |= ss->depends_on; + } while_each_subsys_mask(); /* * Mask out subsystems which aren't available. This can @@ -1469,7 +1476,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, lockdep_assert_held(&cgroup_mutex); - for_each_subsys_which(ss, ssid, &ss_mask) { + do_each_subsys_mask(ss, ssid, ss_mask) { /* if @ss has non-root csses attached to it, can't move */ if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss))) return -EBUSY; @@ -1477,14 +1484,14 @@ static int rebind_subsystems(struct cgroup_root *dst_root, /* can't move between two non-dummy roots either */ if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root) return -EBUSY; - } + } while_each_subsys_mask(); /* skip creating root files on dfl_root for inhibited subsystems */ tmp_ss_mask = ss_mask; if (dst_root == &cgrp_dfl_root) tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask; - for_each_subsys_which(ss, ssid, &tmp_ss_mask) { + do_each_subsys_mask(ss, ssid, tmp_ss_mask) { struct cgroup *scgrp = &ss->root->cgrp; int tssid; @@ -1507,19 +1514,19 @@ static int rebind_subsystems(struct cgroup_root *dst_root, continue; } - for_each_subsys_which(ss, tssid, &tmp_ss_mask) { + do_each_subsys_mask(ss, tssid, tmp_ss_mask) { if (tssid == ssid) break; css_clear_dir(cgroup_css(scgrp, ss), dcgrp); - } + } while_each_subsys_mask(); return ret; - } + } while_each_subsys_mask(); /* * Nothing can fail from this point on. Remove files for the * removed subsystems and rebind each subsystem. */ - for_each_subsys_which(ss, ssid, &ss_mask) { + do_each_subsys_mask(ss, ssid, ss_mask) { struct cgroup_root *src_root = ss->root; struct cgroup *scgrp = &src_root->cgrp; struct cgroup_subsys_state *css = cgroup_css(scgrp, ss); @@ -1556,7 +1563,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, if (ss->bind) ss->bind(css); - } + } while_each_subsys_mask(); kernfs_activate(dcgrp->kn); return 0; @@ -2838,12 +2845,12 @@ static void cgroup_print_ss_mask(struct seq_file *seq, unsigned long ss_mask) bool printed = false; int ssid; - for_each_subsys_which(ss, ssid, &ss_mask) { + do_each_subsys_mask(ss, ssid, ss_mask) { if (printed) seq_putc(seq, ' '); seq_printf(seq, "%s", ss->name); printed = true; - } + } while_each_subsys_mask(); if (printed) seq_putc(seq, '\n'); } @@ -2956,11 +2963,9 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, */ buf = strstrip(buf); while ((tok = strsep(&buf, " "))) { - unsigned long tmp_ss_mask = ~cgrp_dfl_root_inhibit_ss_mask; - if (tok[0] == '\0') continue; - for_each_subsys_which(ss, ssid, &tmp_ss_mask) { + do_each_subsys_mask(ss, ssid, ~cgrp_dfl_root_inhibit_ss_mask) { if (!cgroup_ssid_enabled(ssid) || strcmp(tok + 1, ss->name)) continue; @@ -2975,7 +2980,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, return -EINVAL; } break; - } + } while_each_subsys_mask(); if (ssid == CGROUP_SUBSYS_COUNT) return -EINVAL; } @@ -3049,7 +3054,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, * still around. In such cases, wait till it's gone using * offline_waitq. */ - for_each_subsys_which(ss, ssid, &css_enable) { + do_each_subsys_mask(ss, ssid, css_enable) { cgroup_for_each_live_child(child, cgrp) { DEFINE_WAIT(wait); @@ -3066,7 +3071,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, return restart_syscall(); } - } + } while_each_subsys_mask(); cgrp->subtree_control = new_sc; cgrp->subtree_ss_mask = new_ss; @@ -5509,11 +5514,11 @@ int cgroup_can_fork(struct task_struct *child) struct cgroup_subsys *ss; int i, j, ret; - for_each_subsys_which(ss, i, &have_canfork_callback) { + do_each_subsys_mask(ss, i, have_canfork_callback) { ret = ss->can_fork(child); if (ret) goto out_revert; - } + } while_each_subsys_mask(); return 0; @@ -5598,8 +5603,9 @@ void cgroup_post_fork(struct task_struct *child) * css_set; otherwise, @child might change state between ->fork() * and addition to css_set. */ - for_each_subsys_which(ss, i, &have_fork_callback) + do_each_subsys_mask(ss, i, have_fork_callback) { ss->fork(child); + } while_each_subsys_mask(); } /** @@ -5642,8 +5648,9 @@ void cgroup_exit(struct task_struct *tsk) } /* see cgroup_post_fork() for details */ - for_each_subsys_which(ss, i, &have_exit_callback) + do_each_subsys_mask(ss, i, have_exit_callback) { ss->exit(tsk); + } while_each_subsys_mask(); } void cgroup_free(struct task_struct *task) @@ -5652,8 +5659,9 @@ void cgroup_free(struct task_struct *task) struct cgroup_subsys *ss; int ssid; - for_each_subsys_which(ss, ssid, &have_free_callback) + do_each_subsys_mask(ss, ssid, have_free_callback) { ss->free(task); + } while_each_subsys_mask(); put_css_set(cset); } -- cgit From 996cd1fb7383cf087496e8a441bb10b9873b1eb6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Feb 2016 22:25:46 -0500 Subject: cgroup: use do_each_subsys_mask() where applicable There are several places in cgroup_subtree_control_write() which can use do_each_subsys_mask() instead of manual mask testing. Use it. No functional changes. Signed-off-by: Tejun Heo Acked-by: Johannes Weiner --- kernel/cgroup.c | 35 ++++++++++++----------------------- 1 file changed, 12 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5d102980dc27..1e561bd990b9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3082,10 +3082,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, * dependency. An invisible css is made visible when the userland * explicitly enables it. */ - for_each_subsys(ss, ssid) { - if (!(enable & (1 << ssid))) - continue; - + do_each_subsys_mask(ss, ssid, enable) { cgroup_for_each_live_child(child, cgrp) { if (css_enable & (1 << ssid)) ret = create_css(child, ss, @@ -3096,7 +3093,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, if (ret) goto err_undo_css; } - } + } while_each_subsys_mask(); /* * At this point, cgroup_e_css() results reflect the new csses @@ -3115,10 +3112,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, * state if it's made visible again later. Controllers which may * be depended upon should provide ->css_reset() for this purpose. */ - for_each_subsys(ss, ssid) { - if (!(disable & (1 << ssid))) - continue; - + do_each_subsys_mask(ss, ssid, disable) { cgroup_for_each_live_child(child, cgrp) { struct cgroup_subsys_state *css = cgroup_css(child, ss); @@ -3130,7 +3124,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, ss->css_reset(css); } } - } + } while_each_subsys_mask(); kernfs_activate(cgrp->kn); ret = 0; @@ -3142,10 +3136,7 @@ err_undo_css: cgrp->subtree_control = old_sc; cgrp->subtree_ss_mask = old_ss; - for_each_subsys(ss, ssid) { - if (!(enable & (1 << ssid))) - continue; - + do_each_subsys_mask(ss, ssid, enable) { cgroup_for_each_live_child(child, cgrp) { struct cgroup_subsys_state *css = cgroup_css(child, ss); @@ -3157,7 +3148,7 @@ err_undo_css: else css_clear_dir(css, NULL); } - } + } while_each_subsys_mask(); goto out_unlock; } @@ -4973,14 +4964,12 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, goto out_destroy; /* let's create and online css's */ - for_each_subsys(ss, ssid) { - if (parent->subtree_ss_mask & (1 << ssid)) { - ret = create_css(cgrp, ss, - parent->subtree_control & (1 << ssid)); - if (ret) - goto out_destroy; - } - } + do_each_subsys_mask(ss, ssid, parent->subtree_ss_mask) { + ret = create_css(cgrp, ss, + parent->subtree_control & (1 << ssid)); + if (ret) + goto out_destroy; + } while_each_subsys_mask(); /* * On the default hierarchy, a child doesn't automatically inherit -- cgit From 6e5c830770f9045a17b1b931c3e11fbd5591e630 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Feb 2016 22:25:47 -0500 Subject: cgroup: make cgroup subsystem masks u16 After the recent do_each_subsys_mask() conversion, there's no reason to use ulong for subsystem masks. We'll be adding more subsystem masks to persistent data structures, let's reduce its size to u16 which should be enough for now and the foreseeable future. This doesn't create any noticeable behavior differences. v2: Johannes spotted that the initial patch missed cgroup_no_v1_mask. Converted. Signed-off-by: Tejun Heo Acked-by: Johannes Weiner --- include/linux/cgroup-defs.h | 4 ++-- kernel/cgroup.c | 50 ++++++++++++++++++++++----------------------- 2 files changed, 26 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index c68ae7f0fb5f..0abf6aa86c81 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -257,8 +257,8 @@ struct cgroup { * one which may have more subsystems enabled. Controller knobs * are made available iff it's enabled in ->subtree_control. */ - unsigned long subtree_control; - unsigned long subtree_ss_mask; + u16 subtree_control; + u16 subtree_ss_mask; /* Private pointers for each registered subsystem */ struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1e561bd990b9..7669f68077b8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -181,10 +181,10 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root); static bool cgrp_dfl_root_visible; /* Controllers blocked by the commandline in v1 */ -static unsigned long cgroup_no_v1_mask; +static u16 cgroup_no_v1_mask; /* some controllers are not supported in the default hierarchy */ -static unsigned long cgrp_dfl_root_inhibit_ss_mask; +static u16 cgrp_dfl_root_inhibit_ss_mask; /* The list of hierarchy roots */ @@ -208,19 +208,18 @@ static u64 css_serial_nr_next = 1; * fork/exit handlers to call. This avoids us having to do extra work in the * fork/exit path to check which subsystems have fork/exit callbacks. */ -static unsigned long have_fork_callback __read_mostly; -static unsigned long have_exit_callback __read_mostly; -static unsigned long have_free_callback __read_mostly; +static u16 have_fork_callback __read_mostly; +static u16 have_exit_callback __read_mostly; +static u16 have_free_callback __read_mostly; /* Ditto for the can_fork callback. */ -static unsigned long have_canfork_callback __read_mostly; +static u16 have_canfork_callback __read_mostly; static struct file_system_type cgroup2_fs_type; static struct cftype cgroup_dfl_base_files[]; static struct cftype cgroup_legacy_base_files[]; -static int rebind_subsystems(struct cgroup_root *dst_root, - unsigned long ss_mask); +static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); static void css_task_iter_advance(struct css_task_iter *it); static int cgroup_destroy_locked(struct cgroup *cgrp); static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, @@ -1274,11 +1273,10 @@ static umode_t cgroup_file_mode(const struct cftype *cft) * @subtree_control is to be applied to @cgrp. The returned mask is always * a superset of @subtree_control and follows the usual hierarchy rules. */ -static unsigned long cgroup_calc_subtree_ss_mask(struct cgroup *cgrp, - unsigned long subtree_control) +static u16 cgroup_calc_subtree_ss_mask(struct cgroup *cgrp, u16 subtree_control) { struct cgroup *parent = cgroup_parent(cgrp); - unsigned long cur_ss_mask = subtree_control; + u16 cur_ss_mask = subtree_control; struct cgroup_subsys *ss; int ssid; @@ -1288,7 +1286,7 @@ static unsigned long cgroup_calc_subtree_ss_mask(struct cgroup *cgrp, return cur_ss_mask; while (true) { - unsigned long new_ss_mask = cur_ss_mask; + u16 new_ss_mask = cur_ss_mask; do_each_subsys_mask(ss, ssid, cur_ss_mask) { new_ss_mask |= ss->depends_on; @@ -1466,12 +1464,11 @@ err: return ret; } -static int rebind_subsystems(struct cgroup_root *dst_root, - unsigned long ss_mask) +static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) { struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; - unsigned long tmp_ss_mask; + u16 tmp_ss_mask; int ssid, i, ret; lockdep_assert_held(&cgroup_mutex); @@ -1507,7 +1504,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, */ if (dst_root == &cgrp_dfl_root) { if (cgrp_dfl_root_visible) { - pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n", + pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n", ret, ss_mask); pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); } @@ -1599,7 +1596,7 @@ static int cgroup_show_options(struct seq_file *seq, } struct cgroup_sb_opts { - unsigned long subsys_mask; + u16 subsys_mask; unsigned int flags; char *release_agent; bool cpuset_clone_children; @@ -1612,13 +1609,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) { char *token, *o = data; bool all_ss = false, one_ss = false; - unsigned long mask = -1UL; + u16 mask = U16_MAX; struct cgroup_subsys *ss; int nr_opts = 0; int i; #ifdef CONFIG_CPUSETS - mask = ~(1U << cpuset_cgrp_id); + mask = ~((u16)1 << cpuset_cgrp_id); #endif memset(opts, 0, sizeof(*opts)); @@ -1745,7 +1742,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) int ret = 0; struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_sb_opts opts; - unsigned long added_mask, removed_mask; + u16 added_mask, removed_mask; if (root == &cgrp_dfl_root) { pr_err("remount is not allowed\n"); @@ -1893,7 +1890,7 @@ static void init_cgroup_root(struct cgroup_root *root, set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) +static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; @@ -2839,7 +2836,7 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) return 0; } -static void cgroup_print_ss_mask(struct seq_file *seq, unsigned long ss_mask) +static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) { struct cgroup_subsys *ss; bool printed = false; @@ -2950,8 +2947,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - unsigned long enable = 0, disable = 0; - unsigned long css_enable, css_disable, old_sc, new_sc, old_ss, new_ss; + u16 enable = 0, disable = 0; + u16 css_enable, css_disable, old_sc, new_sc, old_ss, new_ss; struct cgroup *cgrp, *child; struct cgroup_subsys *ss; char *tok; @@ -5255,7 +5252,7 @@ int __init cgroup_init_early(void) return 0; } -static unsigned long cgroup_disable_mask __initdata; +static u16 cgroup_disable_mask __initdata; /** * cgroup_init - cgroup initialization @@ -5269,6 +5266,7 @@ int __init cgroup_init(void) unsigned long key; int ssid; + BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); @@ -5754,7 +5752,7 @@ static int __init cgroup_no_v1(char *str) continue; if (!strcmp(token, "all")) { - cgroup_no_v1_mask = ~0UL; + cgroup_no_v1_mask = U16_MAX; break; } -- cgit From a7165264429b7b0d95557f306f310e77407fc2ee Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 23 Feb 2016 10:00:50 -0500 Subject: cgroup: s/cgrp_dfl_root_/cgrp_dfl_/ These var names are unnecessarily unwiedly and another similar variable will be added. Let's shorten them. Signed-off-by: Tejun Heo --- kernel/cgroup.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7669f68077b8..afbed523b22f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -178,13 +178,13 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root); * The default hierarchy always exists but is hidden until mounted for the * first time. This is for backward compatibility. */ -static bool cgrp_dfl_root_visible; +static bool cgrp_dfl_visible; /* Controllers blocked by the commandline in v1 */ static u16 cgroup_no_v1_mask; /* some controllers are not supported in the default hierarchy */ -static u16 cgrp_dfl_root_inhibit_ss_mask; +static u16 cgrp_dfl_inhibit_ss_mask; /* The list of hierarchy roots */ @@ -1486,7 +1486,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) /* skip creating root files on dfl_root for inhibited subsystems */ tmp_ss_mask = ss_mask; if (dst_root == &cgrp_dfl_root) - tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask; + tmp_ss_mask &= ~cgrp_dfl_inhibit_ss_mask; do_each_subsys_mask(ss, ssid, tmp_ss_mask) { struct cgroup *scgrp = &ss->root->cgrp; @@ -1503,7 +1503,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) * Just warn about it and continue. */ if (dst_root == &cgrp_dfl_root) { - if (cgrp_dfl_root_visible) { + if (cgrp_dfl_visible) { pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n", ret, ss_mask); pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); @@ -2006,7 +2006,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); return ERR_PTR(-EINVAL); } - cgrp_dfl_root_visible = true; + cgrp_dfl_visible = true; root = &cgrp_dfl_root; cgroup_get(&root->cgrp); goto out_mount; @@ -2858,7 +2858,7 @@ static int cgroup_root_controllers_show(struct seq_file *seq, void *v) struct cgroup *cgrp = seq_css(seq)->cgroup; cgroup_print_ss_mask(seq, cgrp->root->subsys_mask & - ~cgrp_dfl_root_inhibit_ss_mask); + ~cgrp_dfl_inhibit_ss_mask); return 0; } @@ -2962,7 +2962,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, while ((tok = strsep(&buf, " "))) { if (tok[0] == '\0') continue; - do_each_subsys_mask(ss, ssid, ~cgrp_dfl_root_inhibit_ss_mask) { + do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) { if (!cgroup_ssid_enabled(ssid) || strcmp(tok + 1, ss->name)) continue; @@ -5315,7 +5315,7 @@ int __init cgroup_init(void) cgrp_dfl_root.subsys_mask |= 1 << ss->id; if (!ss->dfl_cftypes) - cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id; + cgrp_dfl_inhibit_ss_mask |= 1 << ss->id; if (ss->dfl_cftypes == ss->legacy_cftypes) { WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes)); @@ -5386,7 +5386,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct cgroup *cgrp; int ssid, count = 0; - if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible) + if (root == &cgrp_dfl_root && !cgrp_dfl_visible) continue; seq_printf(m, "%d:", root->hierarchy_id); -- cgit From b38e42e962dbc2fbc3839ce70750881db7c9277e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 23 Feb 2016 10:00:50 -0500 Subject: cgroup: convert cgroup_subsys flag fields to bool bitfields Signed-off-by: Tejun Heo Cc: Li Zefan Cc: Ingo Molnar Cc: Peter Zijlstra --- include/linux/cgroup-defs.h | 6 +++--- kernel/cpuset.c | 2 +- kernel/sched/core.c | 2 +- kernel/sched/cpuacct.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 0abf6aa86c81..8fc3f0452289 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -444,7 +444,7 @@ struct cgroup_subsys { void (*free)(struct task_struct *task); void (*bind)(struct cgroup_subsys_state *root_css); - int early_init; + bool early_init:1; /* * If %false, this subsystem is properly hierarchical - @@ -458,8 +458,8 @@ struct cgroup_subsys { * cases. Eventually, all subsystems will be made properly * hierarchical and this will go away. */ - bool broken_hierarchy; - bool warned_broken_hierarchy; + bool broken_hierarchy:1; + bool warned_broken_hierarchy:1; /* the following two fields are initialized automtically during boot */ int id; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 41989ab4db57..90899837ea78 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2089,7 +2089,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .attach = cpuset_attach, .bind = cpuset_bind, .legacy_cftypes = files, - .early_init = 1, + .early_init = true, }; /** diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 44253adb3c36..0f5abc6e4ff3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8706,7 +8706,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .legacy_cftypes = cpu_files, - .early_init = 1, + .early_init = true, }; #endif /* CONFIG_CGROUP_SCHED */ diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index dd7cbb55bbf2..2ddaebf7469a 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -279,5 +279,5 @@ struct cgroup_subsys cpuacct_cgrp_subsys = { .css_alloc = cpuacct_css_alloc, .css_free = cpuacct_css_free, .legacy_cftypes = files, - .early_init = 1, + .early_init = true, }; -- cgit From f17fc25f2b4f4bd8edafe36af6d7379eb9db27a0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 23 Feb 2016 10:00:51 -0500 Subject: cgroup: make css_tryget_online_from_dir() also recognize cgroup2 fs The function currently returns -EBADF for a directory on the default hierarchy. Make it also recognize cgroup2_fs_type. This will be used for perf_event cgroup2 support. Signed-off-by: Tejun Heo --- kernel/cgroup.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index afbed523b22f..2b114368666c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5781,12 +5781,13 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys *ss) { struct kernfs_node *kn = kernfs_node_from_dentry(dentry); + struct file_system_type *s_type = dentry->d_sb->s_type; struct cgroup_subsys_state *css = NULL; struct cgroup *cgrp; /* is @dentry a cgroup dir? */ - if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || - kernfs_type(kn) != KERNFS_DIR) + if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) || + !kn || kernfs_type(kn) != KERNFS_DIR) return ERR_PTR(-EBADF); rcu_read_lock(); -- cgit From 62716ea0f2ea4253984008fd4a96a532674ac58f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 23 Feb 2016 10:00:51 -0500 Subject: cgroup: use ->subtree_control when testing no internal process rule No internal process rule is enforced by cgroup_migrate_prepare_dst() during process migration. It tests whether the target cgroup's ->child_subsys_mask is zero which is different from "subtree_control" write path which tests ->subtree_control. This hasn't mattered because up until now, both ->child_subsys_mask and ->subtree_control are zero or non-zero at the same time. However, with the planned addition of implicit controllers, this will no longer be true. This patch prepares for the change by making cgorup_migrate_prepare_dst() test ->subtree_control instead. Signed-off-by: Tejun Heo --- kernel/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2b114368666c..ac5451e7c458 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2527,11 +2527,11 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, lockdep_assert_held(&cgroup_mutex); /* - * Except for the root, subtree_ss_mask must be zero for a cgroup + * Except for the root, subtree_control must be zero for a cgroup * with tasks so that child cgroups don't compete against tasks. */ if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) && - dst_cgrp->subtree_ss_mask) + dst_cgrp->subtree_control) return -EBUSY; /* look up the dst cset for each src cset and link it to src */ -- cgit From 93f834df9c2d4e362dfdc4b05daa0a4e18814836 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Sat, 20 Feb 2016 14:32:24 -0800 Subject: devm_memremap: Fix error value when memremap failed devm_memremap() returns an ERR_PTR() value in case of error. However, it returns NULL when memremap() failed. This causes the caller, such as the pmem driver, to proceed and oops later. Change devm_memremap() to return ERR_PTR(-ENXIO) when memremap() failed. Signed-off-by: Toshi Kani Cc: Andrew Morton Cc: Reviewed-by: Ross Zwisler Signed-off-by: Dan Williams --- kernel/memremap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 2c468dea60bc..b04ea2f5fbfe 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -136,8 +136,10 @@ void *devm_memremap(struct device *dev, resource_size_t offset, if (addr) { *ptr = addr; devres_add(dev, ptr); - } else + } else { devres_free(ptr); + return ERR_PTR(-ENXIO); + } return addr; } -- cgit From bb53e416e02cd5406d5e1dbdd89432ff5875e982 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 Dec 2015 09:30:12 -0800 Subject: rcu: Assign false instead of 0 for ->core_needs_qs A zero seems to have escaped earlier true/false substitution efforts, so this commit changes 0 to false for the ->core_needs_qs boolean field. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e41dd4131f7a..b8bbf3b27241 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2390,7 +2390,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) if ((rnp->qsmask & mask) == 0) { raw_spin_unlock_irqrestore(&rnp->lock, flags); } else { - rdp->core_needs_qs = 0; + rdp->core_needs_qs = false; /* * This GP can't end until cpu checks in, so all of our -- cgit From 8994515cf0030e5020d67bb837a9d9a92441d0f7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 Dec 2015 09:59:43 -0800 Subject: rcu: Update rcu_report_qs_rsp() comment The header comment for rcu_report_qs_rsp() was obsolete, dating well before the advent of RCU grace-period kthreads. This commit therefore brings this comment back into alignment with current reality. Reported-by: Lihao Liang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b8bbf3b27241..a91836868ade 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2234,11 +2234,13 @@ static bool rcu_start_gp(struct rcu_state *rsp) } /* - * Report a full set of quiescent states to the specified rcu_state - * data structure. This involves cleaning up after the prior grace - * period and letting rcu_start_gp() start up the next grace period - * if one is needed. Note that the caller must hold rnp->lock, which - * is released before return. + * Report a full set of quiescent states to the specified rcu_state data + * structure. Invoke rcu_gp_kthread_wake() to awaken the grace-period + * kthread if another grace period is required. Whether we wake + * the grace-period kthread or it awakens itself for the next round + * of quiescent-state forcing, that kthread will clean up after the + * just-completed grace period. Note that the caller must hold rnp->lock, + * which is released before return. */ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) -- cgit From 4914950aaa12de8a2004b8031b45480526caf42b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 11 Dec 2015 13:48:43 -0800 Subject: rcu: Stop treating in-kernel CPU-bound workloads as errors Commit 4a81e8328d379 ("Reduce overhead of cond_resched() checks for RCU") handles the error case where a nohz_full loops indefinitely in the kernel with the scheduling-clock interrupt disabled. However, this handling includes IPIing the CPU running the offending loop, which is not what we want for real-time workloads. And there are starting to be real-time CPU-bound in-kernel workloads, and these must be handled without IPIing the CPU, at least not in the common case. Therefore, this situation can no longer be dismissed as an error case. This commit therefore splits the handling out, so that the setting of bits in the per-CPU rcu_sched_qs_mask variable is done relatively early, but if the problem persists, resched_cpu() is eventually used to IPI the CPU containing the offending loop. Assuming that in-kernel CPU-bound loops used by real-time tasks contain frequent calls cond_resched_rcu_qs() (as in more than once per few tens of milliseconds), the real-time tasks will never be IPIed. Signed-off-by: Paul E. McKenney Cc: Steven Rostedt --- kernel/rcu/tree.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a91836868ade..68f4bee3ecc3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1173,15 +1173,16 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, smp_mb(); /* ->cond_resched_completed before *rcrmp. */ WRITE_ONCE(*rcrmp, READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask); - resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ - rdp->rsp->jiffies_resched += 5; /* Enable beating. */ - } else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { - /* Time to beat on that CPU again! */ - resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ - rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ } + rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ } + /* And if it has been a really long time, kick the CPU as well. */ + if (ULONG_CMP_GE(jiffies, + rdp->rsp->gp_start + 2 * jiffies_till_sched_qs) || + ULONG_CMP_GE(jiffies, rdp->rsp->gp_start + jiffies_till_sched_qs)) + resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ + return 0; } -- cgit From 23a9bacd357ab8388c7e07101b186a4282874992 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 13 Dec 2015 08:57:10 -0800 Subject: rcu: Set rdp->gpwrap when CPU is idle Commit #e3663b1024d1 ("rcu: Handle gpnum/completed wrap while dyntick idle") sets rdp->gpwrap on the wrong side of the "if" statement in dyntick_save_progress_counter(), that is, it sets it when the CPU is not idle instead of when it is idle. Of course, if the CPU is not idle, its rdp->gpnum won't be lagging beind the global rsp->gpnum, which means that rdp->gpwrap will never be set. This commit therefore moves this code to the proper leg of that "if" statement. This change means that the "else" cause is just "return 0" and the "then" clause ends with "return 1", so also move the "return 0" to follow the "if", dropping the "else" clause. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 68f4bee3ecc3..976a166f3fa3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1083,13 +1083,12 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, rcu_sysidle_check_cpu(rdp, isidle, maxj); if ((rdp->dynticks_snap & 0x1) == 0) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); - return 1; - } else { if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rdp->mynode->gpnum)) WRITE_ONCE(rdp->gpwrap, true); - return 0; + return 1; } + return 0; } /* -- cgit From aa5a8988760142fe33e38790a578bd84ddf3e339 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 31 Dec 2015 16:27:06 -0800 Subject: rcutorture: Correct no-expedite console messages The "Disabled dynamic grace-period expediting" console message is currently printed unconditionally. This commit causes it to be output only when it is impossible to switch between normal and expedited grace periods, which was the original intent. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d2988d047d66..65ae0e5c35da 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -932,12 +932,14 @@ rcu_torture_writer(void *arg) int nsynctypes = 0; VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); - pr_alert("%s" TORTURE_FLAG - " Grace periods expedited from boot/sysfs for %s,\n", - torture_type, cur_ops->name); - pr_alert("%s" TORTURE_FLAG - " Testing of dynamic grace-period expediting diabled.\n", - torture_type); + if (!can_expedite) { + pr_alert("%s" TORTURE_FLAG + " Grace periods expedited from boot/sysfs for %s,\n", + torture_type, cur_ops->name); + pr_alert("%s" TORTURE_FLAG + " Disabled dynamic grace-period expediting.\n", + torture_type); + } /* Initialize synctype[] array. If none set, take default. */ if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1) -- cgit From 1914aab54319ed70608078df4f18ac4767753977 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Sat, 26 Dec 2015 21:41:44 +0800 Subject: rcu: Remove useless rcu_data_p when !PREEMPT_RCU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The related warning from gcc 6.0: In file included from kernel/rcu/tree.c:4630:0: kernel/rcu/tree_plugin.h:810:40: warning: ‘rcu_data_p’ defined but not used [-Wunused-const-variable] static struct rcu_data __percpu *const rcu_data_p = &rcu_sched_data; ^~~~~~~~~~ Also remove always redundant rcu_data_p in tree.c. Signed-off-by: Chen Gang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 - kernel/rcu/tree_plugin.h | 1 - 2 files changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 976a166f3fa3..1e44fce73839 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -108,7 +108,6 @@ RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); static struct rcu_state *const rcu_state_p; -static struct rcu_data __percpu *const rcu_data_p; LIST_HEAD(rcu_struct_flavors); /* Dump rcu_node combining tree at boot to verify correct setup. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 9467a8b7e756..e6b88ad51959 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -807,7 +807,6 @@ void exit_rcu(void) #else /* #ifdef CONFIG_PREEMPT_RCU */ static struct rcu_state *const rcu_state_p = &rcu_sched_state; -static struct rcu_data __percpu *const rcu_data_p = &rcu_sched_data; /* * Tell them what RCU they are running. -- cgit From 67c583a7de3433a971983490b37ad2bff3c55463 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Tue, 29 Dec 2015 12:18:47 +0800 Subject: RCU: Privatize rcu_node::lock In patch: "rcu: Add transitivity to remaining rcu_node ->lock acquisitions" All locking operations on rcu_node::lock are replaced with the wrappers because of the need of transitivity, which indicates we should never write code using LOCK primitives alone(i.e. without a proper barrier following) on rcu_node::lock outside those wrappers. We could detect this kind of misuses on rcu_node::lock in the future by adding __private modifier on rcu_node::lock. To privatize rcu_node::lock, unlock wrappers are also needed. Replacing spinlock unlocks with these wrappers not only privatizes rcu_node::lock but also makes it easier to figure out critical sections of rcu_node. This patch adds __private modifier to rcu_node::lock and makes every access to it wrapped by ACCESS_PRIVATE(). Besides, unlock wrappers are added and raw_spin_unlock(&rnp->lock) and its friends are replaced with those wrappers. Signed-off-by: Boqun Feng Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 103 ++++++++++++++++++++++++----------------------- kernel/rcu/tree.h | 42 ++++++++++++++----- kernel/rcu/tree_plugin.h | 26 ++++++------ 3 files changed, 96 insertions(+), 75 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1e44fce73839..d5a6d2f0cbf4 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1245,7 +1245,7 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) if (rnp->qsmask & (1UL << cpu)) dump_cpu_task(rnp->grplo + cpu); } - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -1265,12 +1265,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) raw_spin_lock_irqsave_rcu_node(rnp, flags); delta = jiffies - READ_ONCE(rsp->jiffies_stall); if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } WRITE_ONCE(rsp->jiffies_stall, jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* * OK, time to rat on our buddy... @@ -1291,7 +1291,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) ndetected++; } } - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } print_cpu_stall_info_end(); @@ -1356,7 +1356,7 @@ static void print_cpu_stall(struct rcu_state *rsp) if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall))) WRITE_ONCE(rsp->jiffies_stall, jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* * Attempt to revive the RCU machinery by forcing a context switch. @@ -1594,7 +1594,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, } unlock_out: if (rnp != rnp_root) - raw_spin_unlock(&rnp_root->lock); + raw_spin_unlock_rcu_node(rnp_root); out: if (c_out != NULL) *c_out = c; @@ -1814,7 +1814,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) return; } needwake = __note_gp_changes(rsp, rnp, rdp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (needwake) rcu_gp_kthread_wake(rsp); } @@ -1839,7 +1839,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) raw_spin_lock_irq_rcu_node(rnp); if (!READ_ONCE(rsp->gp_flags)) { /* Spurious wakeup, tell caller to go back to sleep. */ - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); return false; } WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */ @@ -1849,7 +1849,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) * Grace period already in progress, don't start another. * Not supposed to be able to happen. */ - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); return false; } @@ -1858,7 +1858,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) /* Record GP times before starting GP, hence smp_store_release(). */ smp_store_release(&rsp->gpnum, rsp->gpnum + 1); trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); /* * Apply per-leaf buffered online and offline operations to the @@ -1872,7 +1872,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) if (rnp->qsmaskinit == rnp->qsmaskinitnext && !rnp->wait_blkd_tasks) { /* Nothing to do on this leaf rcu_node structure. */ - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); continue; } @@ -1906,7 +1906,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) rcu_cleanup_dead_rnp(rnp); } - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); } /* @@ -1937,7 +1937,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) trace_rcu_grace_period_init(rsp->name, rnp->gpnum, rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); cond_resched_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); } @@ -1995,7 +1995,7 @@ static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time) raw_spin_lock_irq_rcu_node(rnp); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS); - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); } } @@ -2024,7 +2024,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) * safe for us to drop the lock in order to mark the grace * period as completed in all of the rcu_node structures. */ - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); /* * Propagate new ->completed value to rcu_node structures so @@ -2045,7 +2045,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; /* smp_mb() provided by prior unlock-lock pair. */ nocb += rcu_future_gp_cleanup(rsp, rnp); - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); cond_resched_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); rcu_gp_slow(rsp, gp_cleanup_delay); @@ -2067,7 +2067,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) READ_ONCE(rsp->gpnum), TPS("newreq")); } - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); } /* @@ -2246,7 +2246,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) { WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); - raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); rcu_gp_kthread_wake(rsp); } @@ -2276,7 +2276,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, * Our bit has already been cleared, or the * relevant grace period is already over, so done. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */ @@ -2288,7 +2288,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { /* Other bits still set at this level, so done. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } mask = rnp->grpmask; @@ -2298,7 +2298,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, break; } - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rnp_c = rnp; rnp = rnp->parent; raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -2330,7 +2330,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; /* Still need more quiescent states! */ } @@ -2347,7 +2347,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, /* Report up the rest of the hierarchy, tracking current ->gpnum. */ gps = rnp->gpnum; mask = rnp->grpmask; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. */ rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags); } @@ -2384,12 +2384,12 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) */ rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */ rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } mask = rdp->grpmask; if ((rnp->qsmask & mask) == 0) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } else { rdp->core_needs_qs = false; @@ -2600,10 +2600,11 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) rnp->qsmaskinit &= ~mask; rnp->qsmask &= ~mask; if (rnp->qsmaskinit) { - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp); + /* irqs remain disabled. */ return; } - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ } } @@ -2626,7 +2627,7 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ rnp->qsmaskinitnext &= ~mask; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* @@ -2860,7 +2861,7 @@ static void force_qs_rnp(struct rcu_state *rsp, rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags); } else { /* Nothing to do here, so just drop the lock. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } } @@ -2896,11 +2897,11 @@ static void force_quiescent_state(struct rcu_state *rsp) raw_spin_unlock(&rnp_old->fqslock); if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { rsp->n_force_qs_lh++; - raw_spin_unlock_irqrestore(&rnp_old->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); return; /* Someone beat us to it. */ } WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); - raw_spin_unlock_irqrestore(&rnp_old->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); rcu_gp_kthread_wake(rsp); } @@ -2926,7 +2927,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) if (cpu_needs_another_gp(rsp, rdp)) { raw_spin_lock_rcu_node(rcu_get_root(rsp)); /* irqs disabled. */ needwake = rcu_start_gp(rsp); - raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); if (needwake) rcu_gp_kthread_wake(rsp); } else { @@ -3017,7 +3018,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, raw_spin_lock_rcu_node(rnp_root); needwake = rcu_start_gp(rsp); - raw_spin_unlock(&rnp_root->lock); + raw_spin_unlock_rcu_node(rnp_root); if (needwake) rcu_gp_kthread_wake(rsp); } else { @@ -3437,14 +3438,14 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmaskinit == rnp->expmaskinitnext) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); continue; /* No new CPUs, nothing to do. */ } /* Update this node's mask, track old value for propagation. */ oldmask = rnp->expmaskinit; rnp->expmaskinit = rnp->expmaskinitnext; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* If was already nonzero, nothing to propagate. */ if (oldmask) @@ -3459,7 +3460,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) if (rnp_up->expmaskinit) done = true; rnp_up->expmaskinit |= mask; - raw_spin_unlock_irqrestore(&rnp_up->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags); if (done) break; mask = rnp_up->grpmask; @@ -3482,7 +3483,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) raw_spin_lock_irqsave_rcu_node(rnp, flags); WARN_ON_ONCE(rnp->expmask); rnp->expmask = rnp->expmaskinit; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -3523,11 +3524,11 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, if (!rnp->expmask) rcu_initiate_boost(rnp, flags); else - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); break; } if (rnp->parent == NULL) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (wake) { smp_mb(); /* EGP done before wake_up(). */ wake_up(&rsp->expedited_wq); @@ -3535,7 +3536,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, break; } mask = rnp->grpmask; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */ rnp = rnp->parent; raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ WARN_ON_ONCE(!(rnp->expmask & mask)); @@ -3570,7 +3571,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, raw_spin_lock_irqsave_rcu_node(rnp, flags); if (!(rnp->expmask & mask)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } rnp->expmask &= ~mask; @@ -3731,7 +3732,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, */ if (rcu_preempt_has_tasks(rnp)) rnp->exp_tasks = rnp->blkd_tasks.next; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* IPI the remaining CPUs for expedited quiescent state. */ mask = 1; @@ -3748,7 +3749,7 @@ retry_ipi: raw_spin_lock_irqsave_rcu_node(rnp, flags); if (cpu_online(cpu) && (rnp->expmask & mask)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); schedule_timeout_uninterruptible(1); if (cpu_online(cpu) && (rnp->expmask & mask)) @@ -3757,7 +3758,7 @@ retry_ipi: } if (!(rnp->expmask & mask)) mask_ofl_ipi &= ~mask; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* Report quiescent states for those that went offline. */ mask_ofl_test |= mask_ofl_ipi; @@ -4164,7 +4165,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) return; raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */ rnp->qsmaskinit |= mask; - raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* Interrupts remain disabled. */ } } @@ -4188,7 +4189,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->rsp = rsp; mutex_init(&rdp->exp_funnel_mutex); rcu_boot_init_nocb_percpu_data(rdp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* @@ -4216,7 +4217,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rcu_sysidle_init_percpu_data(rdp->dynticks); atomic_set(&rdp->dynticks->dynticks, (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ /* * Add CPU to leaf rcu_node pending-online bitmask. Any needed @@ -4237,7 +4238,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); rdp->core_needs_qs = false; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } static void rcu_prepare_cpu(int cpu) @@ -4359,7 +4360,7 @@ static int __init rcu_spawn_gp_kthread(void) sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); } - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); wake_up_process(t); } rcu_spawn_nocb_kthreads(); @@ -4450,8 +4451,8 @@ static void __init rcu_init_one(struct rcu_state *rsp) cpustride *= levelspread[i]; rnp = rsp->level[i]; for (j = 0; j < levelcnt[i]; j++, rnp++) { - raw_spin_lock_init(&rnp->lock); - lockdep_set_class_and_name(&rnp->lock, + raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock)); + lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock), &rcu_node_class[i], buf[i]); raw_spin_lock_init(&rnp->fqslock); lockdep_set_class_and_name(&rnp->fqslock, diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 83360b4f4352..4886d6a03353 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -149,8 +149,9 @@ struct rcu_dynticks { * Definition for node within the RCU grace-period-detection hierarchy. */ struct rcu_node { - raw_spinlock_t lock; /* Root rcu_node's lock protects some */ - /* rcu_state fields as well as following. */ + raw_spinlock_t __private lock; /* Root rcu_node's lock protects */ + /* some rcu_state fields as well as */ + /* following. */ unsigned long gpnum; /* Current grace period for this node. */ /* This will either be equal to or one */ /* behind the root rcu_node's gpnum. */ @@ -680,7 +681,7 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) #endif /* #else #ifdef CONFIG_PPC */ /* - * Wrappers for the rcu_node::lock acquire. + * Wrappers for the rcu_node::lock acquire and release. * * Because the rcu_nodes form a tree, the tree traversal locking will observe * different lock values, this in turn means that an UNLOCK of one level @@ -689,29 +690,48 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) * * In order to restore full ordering between tree levels, augment the regular * lock acquire functions with smp_mb__after_unlock_lock(). + * + * As ->lock of struct rcu_node is a __private field, therefore one should use + * these wrappers rather than directly call raw_spin_{lock,unlock}* on ->lock. */ static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp) { - raw_spin_lock(&rnp->lock); + raw_spin_lock(&ACCESS_PRIVATE(rnp, lock)); smp_mb__after_unlock_lock(); } +static inline void raw_spin_unlock_rcu_node(struct rcu_node *rnp) +{ + raw_spin_unlock(&ACCESS_PRIVATE(rnp, lock)); +} + static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp) { - raw_spin_lock_irq(&rnp->lock); + raw_spin_lock_irq(&ACCESS_PRIVATE(rnp, lock)); smp_mb__after_unlock_lock(); } -#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \ -do { \ - typecheck(unsigned long, flags); \ - raw_spin_lock_irqsave(&(rnp)->lock, flags); \ - smp_mb__after_unlock_lock(); \ +static inline void raw_spin_unlock_irq_rcu_node(struct rcu_node *rnp) +{ + raw_spin_unlock_irq(&ACCESS_PRIVATE(rnp, lock)); +} + +#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \ +do { \ + typecheck(unsigned long, flags); \ + raw_spin_lock_irqsave(&ACCESS_PRIVATE(rnp, lock), flags); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +#define raw_spin_unlock_irqrestore_rcu_node(rnp, flags) \ +do { \ + typecheck(unsigned long, flags); \ + raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(rnp, lock), flags); \ } while (0) static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp) { - bool locked = raw_spin_trylock(&rnp->lock); + bool locked = raw_spin_trylock(&ACCESS_PRIVATE(rnp, lock)); if (locked) smp_mb__after_unlock_lock(); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e6b88ad51959..43e9b4a4bcd9 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -235,7 +235,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) rnp->gp_tasks = &t->rcu_node_entry; if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) rnp->exp_tasks = &t->rcu_node_entry; - raw_spin_unlock(&rnp->lock); /* rrupts remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */ /* * Report the quiescent state for the expedited GP. This expedited @@ -489,7 +489,7 @@ void rcu_read_unlock_special(struct task_struct *t) !!rnp->gp_tasks); rcu_report_unblock_qs_rnp(rcu_state_p, rnp, flags); } else { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* Unboost if we were boosted. */ @@ -518,14 +518,14 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) raw_spin_lock_irqsave_rcu_node(rnp, flags); if (!rcu_preempt_blocked_readers_cgp(rnp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } t = list_entry(rnp->gp_tasks->prev, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) sched_show_task(t); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* @@ -990,7 +990,7 @@ static int rcu_boost(struct rcu_node *rnp) * might exit their RCU read-side critical sections on their own. */ if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return 0; } @@ -1027,7 +1027,7 @@ static int rcu_boost(struct rcu_node *rnp) */ t = container_of(tb, struct task_struct, rcu_node_entry); rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* Lock only for side effect: boosts task t's priority. */ rt_mutex_lock(&rnp->boost_mtx); rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */ @@ -1087,7 +1087,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { rnp->n_balk_exp_gp_tasks++; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } if (rnp->exp_tasks != NULL || @@ -1097,13 +1097,13 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) ULONG_CMP_GE(jiffies, rnp->boost_time))) { if (rnp->exp_tasks == NULL) rnp->boost_tasks = rnp->gp_tasks; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); t = rnp->boost_kthread_task; if (t) rcu_wake_cond(t, rnp->boost_kthread_status); } else { rcu_initiate_boost_trace(rnp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -1171,7 +1171,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, return PTR_ERR(t); raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->boost_kthread_task = t; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ @@ -1307,7 +1307,7 @@ static void rcu_prepare_kthreads(int cpu) static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } static void invoke_rcu_callbacks_kthread(void) @@ -1558,7 +1558,7 @@ static void rcu_prepare_for_idle(void) rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ needwake = rcu_accelerate_cbs(rsp, rnp, rdp); - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ if (needwake) rcu_gp_kthread_wake(rsp); } @@ -2058,7 +2058,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) raw_spin_lock_irqsave_rcu_node(rnp, flags); needwake = rcu_start_future_gp(rnp, rdp, &c); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (needwake) rcu_gp_kthread_wake(rdp->rsp); -- cgit From b354286effa52da6cb1b1f16604d41ff81b8c445 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Tue, 29 Dec 2015 12:18:48 +0800 Subject: irq: Privatize irq_common_data::state_use_accessors irq_common_data::state_use_accessors is not designed for public use. Therefore make it private so that people who write code accessing it directly will get blamed by sparse. Also #undef the macro __irqd_to_state after used in header files, so that the macro can't be misused. Signed-off-by: Boqun Feng Reviewed-by: Thomas Gleixner Signed-off-by: Paul E. McKenney --- include/linux/irq.h | 6 ++++-- kernel/irq/internals.h | 4 ++++ 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 3c1c96786248..cd14cd4a22b4 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -137,7 +137,7 @@ struct irq_domain; * @msi_desc: MSI descriptor */ struct irq_common_data { - unsigned int state_use_accessors; + unsigned int __private state_use_accessors; #ifdef CONFIG_NUMA unsigned int node; #endif @@ -208,7 +208,7 @@ enum { IRQD_FORWARDED_TO_VCPU = (1 << 20), }; -#define __irqd_to_state(d) ((d)->common->state_use_accessors) +#define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) static inline bool irqd_is_setaffinity_pending(struct irq_data *d) { @@ -299,6 +299,8 @@ static inline void irqd_clr_forwarded_to_vcpu(struct irq_data *d) __irqd_to_state(d) &= ~IRQD_FORWARDED_TO_VCPU; } +#undef __irqd_to_state + static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) { return d->hwirq; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index fcab63c66905..3d182932d2d1 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -160,6 +160,8 @@ irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags) __irq_put_desc_unlock(desc, flags, false); } +#define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) + /* * Manipulation functions for irq_data.state */ @@ -188,6 +190,8 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) return __irqd_to_state(d) & mask; } +#undef __irqd_to_state + static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) { __this_cpu_inc(*desc->kstat_irqs); -- cgit From 9fc9204ef9fdbe550fd81a554abce2b210e63e49 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Thu, 7 Jan 2016 19:05:19 -0500 Subject: rcu: Make rcu/tiny_plugin.h explicitly non-modular The Kconfig currently controlling compilation of this code is: init/Kconfig:config TINY_RCU init/Kconfig: bool ...meaning that it currently is not being built as a module by anyone. Lets remove the modular code that is essentially orphaned, so that when reading the code there is no doubt it is builtin-only. Since module_init translates to device_initcall in the non-modular case, the init ordering remains unchanged with this commit. We could consider moving this to an earlier initcall (subsys?) if desired. We also delete the MODULE_LICENSE tag etc. since all that information is already contained at the top of the file in the comments. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Signed-off-by: Paul Gortmaker Signed-off-by: Paul E. McKenney --- kernel/rcu/tiny_plugin.h | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index e492a5253e0f..196f0302e2f4 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -23,7 +23,7 @@ */ #include -#include +#include #include #include @@ -122,18 +122,7 @@ free_out: debugfs_remove_recursive(rcudir); return 1; } - -static void __exit rcutiny_trace_cleanup(void) -{ - debugfs_remove_recursive(rcudir); -} - -module_init(rcutiny_trace_init); -module_exit(rcutiny_trace_cleanup); - -MODULE_AUTHOR("Paul E. McKenney"); -MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); -MODULE_LICENSE("GPL"); +device_initcall(rcutiny_trace_init); static void check_cpu_stall(struct rcu_ctrlblk *rcp) { -- cgit From 4b455dc3e13064795ef2cd3415132df747e64063 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Jan 2016 22:44:45 -0800 Subject: rcu: Catch up rcu_report_qs_rdp() comment with reality Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d5a6d2f0cbf4..39f9c73d33c5 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2354,12 +2354,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, /* * Record a quiescent state for the specified CPU to that CPU's rcu_data - * structure. This must be either called from the specified CPU, or - * called when the specified CPU is known to be offline (and when it is - * also known that no other CPU is concurrently trying to help the offline - * CPU). The lastcomp argument is used to make sure we are still in the - * grace period of interest. We don't want to end the current grace period - * based on quiescent states detected in an earlier grace period! + * structure. This must be called from the specified CPU. */ static void rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) -- cgit From 4f2a848c567c72f778352d65cc7c155d1a0977fd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 1 Jan 2016 13:38:12 -0800 Subject: rcu: Export rcu_gp_is_normal() This commit exports rcu_gp_is_normal() in order to allow it to be used by rcutorture and rcuperf. Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 76b94e19430b..ca828b41c938 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -128,6 +128,7 @@ bool rcu_gp_is_normal(void) { return READ_ONCE(rcu_normal); } +EXPORT_SYMBOL_GPL(rcu_gp_is_normal); static atomic_t rcu_expedited_nesting = ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0); -- cgit From d045437a169f899dfb0f6f7ede24cc042543ced9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 24 Feb 2016 09:04:24 -0500 Subject: tracing: Fix showing function event in available_events The ftrace:function event is only displayed for parsing the function tracer data. It is not used to enable function tracing, and does not include an "enable" file in its event directory. Originally, this event was kept separate from other events because it did not have a ->reg parameter. But perf added a "reg" parameter for its use which caused issues, because it made the event available to functions where it was not compatible for. Commit 9b63776fa3ca9 "tracing: Do not enable function event with enable" added a TRACE_EVENT_FL_IGNORE_ENABLE flag that prevented the function event from being enabled by normal trace events. But this commit missed keeping the function event from being displayed by the "available_events" directory, which is used to show what events can be enabled by set_event. One documented way to enable all events is to: cat available_events > set_event But because the function event is displayed in the available_events, this now causes an INVALID error: cat: write error: Invalid argument Reported-by: Chunyu Hu Fixes: 9b63776fa3ca9 "tracing: Do not enable function event with enable" Cc: stable@vger.kernel.org # 3.4+ Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f333e57c4614..ab09829d3b97 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -869,7 +869,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos) * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. */ - if (call->class && call->class->reg) + if (call->class && call->class->reg && + !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) return file; } -- cgit From 84c4e620d35f49f486a900af214ad12276afb386 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Feb 2016 18:45:40 +0100 Subject: perf: Close install vs. exit race Consider the following scenario: CPU0 CPU1 ctx = find_get_ctx(); perf_event_exit_task_context() mutex_lock(&ctx->mutex); perf_install_in_context(ctx, ...); /* NO-OP */ mutex_unlock(&ctx->mutex); ... perf_release() WARN_ON_ONCE(event->state != STATE_EXIT); Since the event doesn't pass through perf_remove_from_context() because perf_install_in_context() NO-OPs because the ctx is dead, and perf_event_exit_task_context() will not observe the event because its not attached yet, the event->state will not be set. Solve this by revalidating ctx->task after we acquire ctx->mutex and failing the event creation as a whole. Tested-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dvyukov@google.com Cc: eranian@google.com Cc: oleg@redhat.com Cc: panand@redhat.com Cc: sasha.levin@oracle.com Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160224174947.626853419@infradead.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 0d58522103cd..d7b0316e3465 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2158,13 +2158,15 @@ perf_install_in_context(struct perf_event_context *ctx, */ raw_spin_lock_irq(&ctx->lock); task = ctx->task; + /* - * Worse, we cannot even rely on the ctx actually existing anymore. If - * between find_get_context() and perf_install_in_context() the task - * went through perf_event_exit_task() its dead and we should not be - * adding new events. + * If between ctx = find_get_context() and mutex_lock(&ctx->mutex) the + * ctx gets destroyed, we must not install an event into it. + * + * This is normally tested for after we acquire the mutex, so this is + * a sanity check. */ - if (task == TASK_TOMBSTONE) { + if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) { raw_spin_unlock_irq(&ctx->lock); return; } @@ -8389,10 +8391,19 @@ SYSCALL_DEFINE5(perf_event_open, if (move_group) { gctx = group_leader->ctx; mutex_lock_double(&gctx->mutex, &ctx->mutex); + if (gctx->task == TASK_TOMBSTONE) { + err = -ESRCH; + goto err_locked; + } } else { mutex_lock(&ctx->mutex); } + if (ctx->task == TASK_TOMBSTONE) { + err = -ESRCH; + goto err_locked; + } + if (!perf_event_validate_size(event)) { err = -E2BIG; goto err_locked; @@ -8563,12 +8574,14 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); + if (ctx->task == TASK_TOMBSTONE) { + err = -ESRCH; + goto err_unlock; + } + if (!exclusive_event_installable(event, ctx)) { - mutex_unlock(&ctx->mutex); - perf_unpin_context(ctx); - put_ctx(ctx); err = -EBUSY; - goto err_free; + goto err_unlock; } perf_install_in_context(ctx, event, cpu); @@ -8577,6 +8590,10 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, return event; +err_unlock: + mutex_unlock(&ctx->mutex); + perf_unpin_context(ctx); + put_ctx(ctx); err_free: free_event(event); err: -- cgit From 130056275ade730e7a79c110212c8815202773ee Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Feb 2016 18:45:41 +0100 Subject: perf: Do not double free In case of: err_file: fput(event_file), we'll end up calling perf_release() which in turn will free the event. Do not then free the event _again_. Tested-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dvyukov@google.com Cc: eranian@google.com Cc: oleg@redhat.com Cc: panand@redhat.com Cc: sasha.levin@oracle.com Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160224174947.697350349@infradead.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d7b0316e3465..211a5ce64e3f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8520,7 +8520,12 @@ err_context: perf_unpin_context(ctx); put_ctx(ctx); err_alloc: - free_event(event); + /* + * If event_file is set, the fput() above will have called ->release() + * and that will take care of freeing the event. + */ + if (!event_file) + free_event(event); err_cpus: put_online_cpus(); err_task: -- cgit From a4f4bb6d0c69d0bb573f1d9e6f1b806f9b038b19 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Feb 2016 18:45:42 +0100 Subject: perf: Allow perf_release() with !event->ctx In the err_file: fput(event_file) case, the event will not yet have been attached to a context. However perf_release() does assume it has been. Cure this. Tested-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dvyukov@google.com Cc: eranian@google.com Cc: oleg@redhat.com Cc: panand@redhat.com Cc: sasha.levin@oracle.com Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160224174947.793996260@infradead.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 211a5ce64e3f..d5299e2e435d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3754,9 +3754,19 @@ static void put_event(struct perf_event *event) */ int perf_event_release_kernel(struct perf_event *event) { - struct perf_event_context *ctx; + struct perf_event_context *ctx = event->ctx; struct perf_event *child, *tmp; + /* + * If we got here through err_file: fput(event_file); we will not have + * attached to a context yet. + */ + if (!ctx) { + WARN_ON_ONCE(event->attach_state & + (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP)); + goto no_ctx; + } + if (!is_kernel_event(event)) perf_remove_from_owner(event); @@ -3832,8 +3842,8 @@ again: } mutex_unlock(&event->child_mutex); - /* Must be the last reference */ - put_event(event); +no_ctx: + put_event(event); /* Must be the 'last' reference */ return 0; } EXPORT_SYMBOL_GPL(perf_event_release_kernel); -- cgit From 6f932e5be1503ab0783699e843db325d44c2fabb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Feb 2016 18:45:43 +0100 Subject: perf: Only update context time when active Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dvyukov@google.com Cc: eranian@google.com Cc: oleg@redhat.com Cc: panand@redhat.com Cc: sasha.levin@oracle.com Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160224174947.860690919@infradead.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d5299e2e435d..64698fbfad9f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2170,12 +2170,12 @@ perf_install_in_context(struct perf_event_context *ctx, raw_spin_unlock_irq(&ctx->lock); return; } - update_context_time(ctx); - /* - * Update cgrp time only if current cgrp matches event->cgrp. - * Must be done before calling add_event_to_ctx(). - */ - update_cgrp_time_from_event(event); + + if (ctx->is_active) { + update_context_time(ctx); + update_cgrp_time_from_event(event); + } + add_event_to_ctx(event, ctx); raw_spin_unlock_irq(&ctx->lock); -- cgit From a69b0ca4ac3bf5427b571f11cbf33f0a32b728d5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Feb 2016 18:45:44 +0100 Subject: perf: Fix cloning Alexander reported that when the 'original' context gets destroyed, no new clones happen. This can happen irrespective of the ctx switch optimization, any task can die, even the parent, and we want to continue monitoring the task hierarchy until we either close the event or no tasks are left in the hierarchy. perf_event_init_context() will attempt to pin the 'parent' context during clone(). At that point current is the parent, and since current cannot have exited while executing clone(), its context cannot have passed through perf_event_exit_task_context(). Therefore perf_pin_task_context() cannot observe ctx->task == TASK_TOMBSTONE. However, since inherit_event() does: if (parent_event->parent) parent_event = parent_event->parent; it looks at the 'original' event when it does: is_orphaned_event(). This can return true if the context that contains the this event has passed through perf_event_exit_task_context(). And thus we'll fail to clone the perf context. Fix this by adding a new state: STATE_DEAD, which is set by perf_release() to indicate that the filedesc (or kernel reference) is dead and there are no observers for our data left. Only for STATE_DEAD will is_orphaned_event() be true and inhibit cloning. STATE_EXIT is otherwise preserved such that is_event_hup() remains functional and will report when the observed task hierarchy becomes empty. Reported-by: Alexander Shishkin Tested-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dvyukov@google.com Cc: eranian@google.com Cc: oleg@redhat.com Cc: panand@redhat.com Cc: sasha.levin@oracle.com Cc: vince@deater.net Fixes: c6e5b73242d2 ("perf: Synchronously clean up child events") Link: http://lkml.kernel.org/r/20160224174947.919845295@infradead.org Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 1 + kernel/events/core.c | 29 ++++++++++++++--------------- 2 files changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b35a61a481fa..39156619e108 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -397,6 +397,7 @@ struct pmu { * enum perf_event_active_state - the states of a event */ enum perf_event_active_state { + PERF_EVENT_STATE_DEAD = -4, PERF_EVENT_STATE_EXIT = -3, PERF_EVENT_STATE_ERROR = -2, PERF_EVENT_STATE_OFF = -1, diff --git a/kernel/events/core.c b/kernel/events/core.c index 64698fbfad9f..92d6999a4f2f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1645,7 +1645,7 @@ out: static bool is_orphaned_event(struct perf_event *event) { - return event->state == PERF_EVENT_STATE_EXIT; + return event->state == PERF_EVENT_STATE_DEAD; } static inline int pmu_filter_match(struct perf_event *event) @@ -1732,7 +1732,6 @@ group_sched_out(struct perf_event *group_event, } #define DETACH_GROUP 0x01UL -#define DETACH_STATE 0x02UL /* * Cross CPU call to remove a performance event @@ -1752,8 +1751,6 @@ __perf_remove_from_context(struct perf_event *event, if (flags & DETACH_GROUP) perf_group_detach(event); list_del_event(event, ctx); - if (flags & DETACH_STATE) - event->state = PERF_EVENT_STATE_EXIT; if (!ctx->nr_events && ctx->is_active) { ctx->is_active = 0; @@ -3772,22 +3769,24 @@ int perf_event_release_kernel(struct perf_event *event) ctx = perf_event_ctx_lock(event); WARN_ON_ONCE(ctx->parent_ctx); - perf_remove_from_context(event, DETACH_GROUP | DETACH_STATE); - perf_event_ctx_unlock(event, ctx); + perf_remove_from_context(event, DETACH_GROUP); + raw_spin_lock_irq(&ctx->lock); /* - * At this point we must have event->state == PERF_EVENT_STATE_EXIT, - * either from the above perf_remove_from_context() or through - * perf_event_exit_event(). + * Mark this even as STATE_DEAD, there is no external reference to it + * anymore. * - * Therefore, anybody acquiring event->child_mutex after the below - * loop _must_ also see this, most importantly inherit_event() which - * will avoid placing more children on the list. + * Anybody acquiring event->child_mutex after the below loop _must_ + * also see this, most importantly inherit_event() which will avoid + * placing more children on the list. * * Thus this guarantees that we will in fact observe and kill _ALL_ * child events. */ - WARN_ON_ONCE(event->state != PERF_EVENT_STATE_EXIT); + event->state = PERF_EVENT_STATE_DEAD; + raw_spin_unlock_irq(&ctx->lock); + + perf_event_ctx_unlock(event, ctx); again: mutex_lock(&event->child_mutex); @@ -4000,7 +3999,7 @@ static bool is_event_hup(struct perf_event *event) { bool no_children; - if (event->state != PERF_EVENT_STATE_EXIT) + if (event->state > PERF_EVENT_STATE_EXIT) return false; mutex_lock(&event->child_mutex); @@ -8727,7 +8726,7 @@ perf_event_exit_event(struct perf_event *child_event, if (parent_event) perf_group_detach(child_event); list_del_event(child_event, child_ctx); - child_event->state = PERF_EVENT_STATE_EXIT; /* see perf_event_release_kernel() */ + child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */ raw_spin_unlock_irq(&child_ctx->lock); /* -- cgit From 9107c89e269d2738019861bb518e3d59bef01781 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Feb 2016 18:45:45 +0100 Subject: perf: Fix race between event install and jump_labels perf_install_in_context() relies upon the context switch hooks to have scheduled in events when the IPI misses its target -- after all, if the task has moved from the CPU (or wasn't running at all), it will have to context switch to run elsewhere. This however doesn't appear to be happening. It is possible for the IPI to not happen (task wasn't running) only to later observe the task running with an inactive context. The only possible explanation is that the context switch hooks are not called. Therefore put in a sync_sched() after toggling the jump_label to guarantee all CPUs will have them enabled before we install an event. A simple if (0->1) sync_sched() will not in fact work, because any further increment can race and complete before the sync_sched(). Therefore we must jump through some hoops. Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dvyukov@google.com Cc: eranian@google.com Cc: oleg@redhat.com Cc: panand@redhat.com Cc: sasha.levin@oracle.com Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160224174947.980211985@infradead.org Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 6 +++--- kernel/events/core.c | 49 ++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 44 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 39156619e108..f5c5a3fa2c81 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -906,7 +906,7 @@ perf_sw_event_sched(u32 event_id, u64 nr, u64 addr) } } -extern struct static_key_deferred perf_sched_events; +extern struct static_key_false perf_sched_events; static __always_inline bool perf_sw_migrate_enabled(void) @@ -925,7 +925,7 @@ static inline void perf_event_task_migrate(struct task_struct *task) static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { - if (static_key_false(&perf_sched_events.key)) + if (static_branch_unlikely(&perf_sched_events)) __perf_event_task_sched_in(prev, task); if (perf_sw_migrate_enabled() && task->sched_migrated) { @@ -942,7 +942,7 @@ static inline void perf_event_task_sched_out(struct task_struct *prev, { perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0); - if (static_key_false(&perf_sched_events.key)) + if (static_branch_unlikely(&perf_sched_events)) __perf_event_task_sched_out(prev, next); } diff --git a/kernel/events/core.c b/kernel/events/core.c index 92d6999a4f2f..ea064ca8dd3c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -321,7 +321,13 @@ enum event_type_t { * perf_sched_events : >0 events exist * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu */ -struct static_key_deferred perf_sched_events __read_mostly; + +static void perf_sched_delayed(struct work_struct *work); +DEFINE_STATIC_KEY_FALSE(perf_sched_events); +static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed); +static DEFINE_MUTEX(perf_sched_mutex); +static atomic_t perf_sched_count; + static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static DEFINE_PER_CPU(int, perf_sched_cb_usages); @@ -3536,12 +3542,22 @@ static void unaccount_event(struct perf_event *event) if (has_branch_stack(event)) dec = true; - if (dec) - static_key_slow_dec_deferred(&perf_sched_events); + if (dec) { + if (!atomic_add_unless(&perf_sched_count, -1, 1)) + schedule_delayed_work(&perf_sched_work, HZ); + } unaccount_event_cpu(event, event->cpu); } +static void perf_sched_delayed(struct work_struct *work) +{ + mutex_lock(&perf_sched_mutex); + if (atomic_dec_and_test(&perf_sched_count)) + static_branch_disable(&perf_sched_events); + mutex_unlock(&perf_sched_mutex); +} + /* * The following implement mutual exclusion of events on "exclusive" pmus * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled @@ -7780,8 +7796,28 @@ static void account_event(struct perf_event *event) if (is_cgroup_event(event)) inc = true; - if (inc) - static_key_slow_inc(&perf_sched_events.key); + if (inc) { + if (atomic_inc_not_zero(&perf_sched_count)) + goto enabled; + + mutex_lock(&perf_sched_mutex); + if (!atomic_read(&perf_sched_count)) { + static_branch_enable(&perf_sched_events); + /* + * Guarantee that all CPUs observe they key change and + * call the perf scheduling hooks before proceeding to + * install events that need them. + */ + synchronize_sched(); + } + /* + * Now that we have waited for the sync_sched(), allow further + * increments to by-pass the mutex. + */ + atomic_inc(&perf_sched_count); + mutex_unlock(&perf_sched_mutex); + } +enabled: account_event_cpu(event, event->cpu); } @@ -9344,9 +9380,6 @@ void __init perf_event_init(void) ret = init_hw_breakpoint(); WARN(ret, "hw_breakpoint initialization failed with: %d", ret); - /* do not patch jump label more than once per second */ - jump_label_rate_limit(&perf_sched_events, HZ); - /* * Build time assertion that we keep the data_head at the intended * location. IOW, validation we got the __reserved[] size right. -- cgit From 28a967c3a2f99fa3b5f762f25cb2a319d933571b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Feb 2016 18:45:46 +0100 Subject: perf: Cure event->pending_disable race Because event_sched_out() checks event->pending_disable _before_ actually disabling the event, it can happen that the event fires after it checks but before it gets disabled. This would leave event->pending_disable set and the queued irq_work will try and process it. However, if the event trigger was during schedule(), the event might have been de-scheduled by the time the irq_work runs, and perf_event_disable_local() will fail. Fix this by checking event->pending_disable _after_ we call event->pmu->del(). This depends on the latter being a compiler barrier, such that the compiler does not lift the load and re-creates the problem. Tested-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dvyukov@google.com Cc: eranian@google.com Cc: oleg@redhat.com Cc: panand@redhat.com Cc: sasha.levin@oracle.com Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160224174948.040469884@infradead.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index ea064ca8dd3c..de14b67a3b0b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1696,14 +1696,14 @@ event_sched_out(struct perf_event *event, perf_pmu_disable(event->pmu); + event->tstamp_stopped = tstamp; + event->pmu->del(event, 0); + event->oncpu = -1; event->state = PERF_EVENT_STATE_INACTIVE; if (event->pending_disable) { event->pending_disable = 0; event->state = PERF_EVENT_STATE_OFF; } - event->tstamp_stopped = tstamp; - event->pmu->del(event, 0); - event->oncpu = -1; if (!is_software_event(event)) cpuctx->active_oncpu--; -- cgit From 3cbaa59069677920186dcf502632ca1df4329f80 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Feb 2016 18:45:47 +0100 Subject: perf: Fix ctx time tracking by introducing EVENT_TIME Currently any ctx_sched_in() call will re-start the ctx time tracking, this means that calls like: ctx_sched_in(.event_type = EVENT_PINNED); ctx_sched_in(.event_type = EVENT_FLEXIBLE); will have a hole in their ctx time tracking. This is likely harmless but can confuse things a little. By adding EVENT_TIME, we can have the first ctx_sched_in() (is_active: 0 -> !0) start the time and any further ctx_sched_in() will leave the timestamps alone. Secondly, this allows for an early disable like: ctx_sched_out(.event_type = EVENT_TIME); which would update the ctx time (if the ctx is active) and any further calls to ctx_sched_out() would not further modify the ctx time. For ctx_sched_in() any 0 -> !0 transition will automatically include EVENT_TIME. For ctx_sched_out(), any transition that clears EVENT_ALL will automatically clear EVENT_TIME. These two rules ensure that under normal circumstances we need not bother with EVENT_TIME and get natural ctx time behaviour. Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dvyukov@google.com Cc: eranian@google.com Cc: oleg@redhat.com Cc: panand@redhat.com Cc: sasha.levin@oracle.com Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160224174948.100446561@infradead.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 42 ++++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index de14b67a3b0b..75bde93eb76f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -314,6 +314,7 @@ again: enum event_type_t { EVENT_FLEXIBLE = 0x1, EVENT_PINNED = 0x2, + EVENT_TIME = 0x4, EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, }; @@ -1294,16 +1295,18 @@ static u64 perf_event_time(struct perf_event *event) /* * Update the total_time_enabled and total_time_running fields for a event. - * The caller of this function needs to hold the ctx->lock. */ static void update_event_times(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; u64 run_end; + lockdep_assert_held(&ctx->lock); + if (event->state < PERF_EVENT_STATE_INACTIVE || event->group_leader->state < PERF_EVENT_STATE_INACTIVE) return; + /* * in cgroup mode, time_enabled represents * the time the event was enabled AND active @@ -2349,24 +2352,33 @@ static void ctx_sched_out(struct perf_event_context *ctx, } ctx->is_active &= ~event_type; + if (!(ctx->is_active & EVENT_ALL)) + ctx->is_active = 0; + if (ctx->task) { WARN_ON_ONCE(cpuctx->task_ctx != ctx); if (!ctx->is_active) cpuctx->task_ctx = NULL; } - update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx); - if (!ctx->nr_active) + is_active ^= ctx->is_active; /* changed bits */ + + if (is_active & EVENT_TIME) { + /* update (and stop) ctx time */ + update_context_time(ctx); + update_cgrp_time_from_cpuctx(cpuctx); + } + + if (!ctx->nr_active || !(is_active & EVENT_ALL)) return; perf_pmu_disable(ctx->pmu); - if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) { + if (is_active & EVENT_PINNED) { list_for_each_entry(event, &ctx->pinned_groups, group_entry) group_sched_out(event, cpuctx, ctx); } - if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) { + if (is_active & EVENT_FLEXIBLE) { list_for_each_entry(event, &ctx->flexible_groups, group_entry) group_sched_out(event, cpuctx, ctx); } @@ -2740,7 +2752,7 @@ ctx_sched_in(struct perf_event_context *ctx, if (likely(!ctx->nr_events)) return; - ctx->is_active |= event_type; + ctx->is_active |= (event_type | EVENT_TIME); if (ctx->task) { if (!is_active) cpuctx->task_ctx = ctx; @@ -2748,18 +2760,24 @@ ctx_sched_in(struct perf_event_context *ctx, WARN_ON_ONCE(cpuctx->task_ctx != ctx); } - now = perf_clock(); - ctx->timestamp = now; - perf_cgroup_set_timestamp(task, ctx); + is_active ^= ctx->is_active; /* changed bits */ + + if (is_active & EVENT_TIME) { + /* start ctx time */ + now = perf_clock(); + ctx->timestamp = now; + perf_cgroup_set_timestamp(task, ctx); + } + /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. */ - if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) + if (is_active & EVENT_PINNED) ctx_pinned_sched_in(ctx, cpuctx); /* Then walk through the lower prio flexible groups */ - if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) + if (is_active & EVENT_FLEXIBLE) ctx_flexible_sched_in(ctx, cpuctx); } -- cgit From 7fce250915efca0f8f51dddee3ae89bf30d86ca5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Feb 2016 18:45:48 +0100 Subject: perf: Fix scaling vs. perf_event_enable_on_exec() The recent commit 3e349507d12d ("perf: Fix perf_enable_on_exec() event scheduling") caused this by moving task_ctx_sched_out() from before __perf_event_mask_enable() to after it. The overlooked consequence of that change is that task_ctx_sched_out() would update the ctx time fields, and now __perf_event_mask_enable() uses stale time. In order to fix this, explicitly stop our context's time before enabling the event(s). Reported-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dvyukov@google.com Cc: eranian@google.com Cc: panand@redhat.com Cc: sasha.levin@oracle.com Cc: vince@deater.net Fixes: 3e349507d12d ("perf: Fix perf_enable_on_exec() event scheduling") Link: http://lkml.kernel.org/r/20160224174948.159242158@infradead.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 75bde93eb76f..d0030886c402 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3143,6 +3143,7 @@ static void perf_event_enable_on_exec(int ctxn) cpuctx = __get_cpu_context(ctx); perf_ctx_lock(cpuctx, ctx); + ctx_sched_out(ctx, cpuctx, EVENT_TIME); list_for_each_entry(event, &ctx->event_list, event_entry) enabled |= event_enable_on_exec(event, ctx); -- cgit From bd2afa49d194c6412c333e9fdd48bc5d06bb465d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Feb 2016 18:45:49 +0100 Subject: perf: Fix scaling vs. perf_event_enable() Similar to the perf_enable_on_exec(), ensure that event timings are consistent across perf_event_enable(). Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dvyukov@google.com Cc: eranian@google.com Cc: oleg@redhat.com Cc: panand@redhat.com Cc: sasha.levin@oracle.com Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160224174948.218288698@infradead.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 42 +++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d0030886c402..57c25faecfa5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2069,14 +2069,27 @@ static void add_event_to_ctx(struct perf_event *event, event->tstamp_stopped = tstamp; } -static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx); +static void ctx_sched_out(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, + enum event_type_t event_type); static void ctx_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type, struct task_struct *task); +static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + if (!cpuctx->task_ctx) + return; + + if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) + return; + + ctx_sched_out(ctx, cpuctx, EVENT_ALL); +} + static void perf_event_sched_in(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, struct task_struct *task) @@ -2227,17 +2240,18 @@ static void __perf_event_enable(struct perf_event *event, event->state <= PERF_EVENT_STATE_ERROR) return; - update_context_time(ctx); + if (ctx->is_active) + ctx_sched_out(ctx, cpuctx, EVENT_TIME); + __perf_event_mark_enabled(event); if (!ctx->is_active) return; if (!event_filter_match(event)) { - if (is_cgroup_event(event)) { - perf_cgroup_set_timestamp(current, ctx); // XXX ? + if (is_cgroup_event(event)) perf_cgroup_defer_enabled(event); - } + ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); return; } @@ -2245,8 +2259,10 @@ static void __perf_event_enable(struct perf_event *event, * If the event is in a group and isn't the group leader, * then don't put it on unless the group is on. */ - if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) + if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) { + ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); return; + } task_ctx = cpuctx->task_ctx; if (ctx->task) @@ -2658,18 +2674,6 @@ void __perf_event_task_sched_out(struct task_struct *task, perf_cgroup_sched_out(task, next); } -static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - if (!cpuctx->task_ctx) - return; - - if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) - return; - - ctx_sched_out(ctx, cpuctx, EVENT_ALL); -} - /* * Called with IRQs disabled */ -- cgit From a096309bc4677f60caa8e93fcc613a55073c51d4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Feb 2016 18:45:50 +0100 Subject: perf: Fix scaling vs. perf_install_in_context() Completely reworks perf_install_in_context() (again!) in order to ensure that there will be no ctx time hole between add_event_to_ctx() and any potential ctx_sched_in(). Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dvyukov@google.com Cc: eranian@google.com Cc: oleg@redhat.com Cc: panand@redhat.com Cc: sasha.levin@oracle.com Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160224174948.279399438@infradead.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 115 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 70 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 57c25faecfa5..25edabd207de 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -276,10 +276,10 @@ static void event_function_call(struct perf_event *event, event_f func, void *da return; } -again: if (task == TASK_TOMBSTONE) return; +again: if (!task_function_call(task, event_function, &efs)) return; @@ -289,13 +289,15 @@ again: * a concurrent perf_event_context_sched_out(). */ task = ctx->task; - if (task != TASK_TOMBSTONE) { - if (ctx->is_active) { - raw_spin_unlock_irq(&ctx->lock); - goto again; - } - func(event, NULL, ctx, data); + if (task == TASK_TOMBSTONE) { + raw_spin_unlock_irq(&ctx->lock); + return; + } + if (ctx->is_active) { + raw_spin_unlock_irq(&ctx->lock); + goto again; } + func(event, NULL, ctx, data); raw_spin_unlock_irq(&ctx->lock); } @@ -2116,49 +2118,68 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, /* * Cross CPU call to install and enable a performance event * - * Must be called with ctx->mutex held + * Very similar to remote_function() + event_function() but cannot assume that + * things like ctx->is_active and cpuctx->task_ctx are set. */ static int __perf_install_in_context(void *info) { - struct perf_event_context *ctx = info; + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); struct perf_event_context *task_ctx = cpuctx->task_ctx; + bool activate = true; + int ret = 0; raw_spin_lock(&cpuctx->ctx.lock); if (ctx->task) { raw_spin_lock(&ctx->lock); - /* - * If we hit the 'wrong' task, we've since scheduled and - * everything should be sorted, nothing to do! - */ task_ctx = ctx; - if (ctx->task != current) + + /* If we're on the wrong CPU, try again */ + if (task_cpu(ctx->task) != smp_processor_id()) { + ret = -ESRCH; goto unlock; + } /* - * If task_ctx is set, it had better be to us. + * If we're on the right CPU, see if the task we target is + * current, if not we don't have to activate the ctx, a future + * context switch will do that for us. */ - WARN_ON_ONCE(cpuctx->task_ctx != ctx && cpuctx->task_ctx); + if (ctx->task != current) + activate = false; + else + WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx); + } else if (task_ctx) { raw_spin_lock(&task_ctx->lock); } - ctx_resched(cpuctx, task_ctx); + if (activate) { + ctx_sched_out(ctx, cpuctx, EVENT_TIME); + add_event_to_ctx(event, ctx); + ctx_resched(cpuctx, task_ctx); + } else { + add_event_to_ctx(event, ctx); + } + unlock: perf_ctx_unlock(cpuctx, task_ctx); - return 0; + return ret; } /* - * Attach a performance event to a context + * Attach a performance event to a context. + * + * Very similar to event_function_call, see comment there. */ static void perf_install_in_context(struct perf_event_context *ctx, struct perf_event *event, int cpu) { - struct task_struct *task = NULL; + struct task_struct *task = READ_ONCE(ctx->task); lockdep_assert_held(&ctx->mutex); @@ -2166,42 +2187,46 @@ perf_install_in_context(struct perf_event_context *ctx, if (event->cpu != -1) event->cpu = cpu; + if (!task) { + cpu_function_call(cpu, __perf_install_in_context, event); + return; + } + + /* + * Should not happen, we validate the ctx is still alive before calling. + */ + if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) + return; + /* * Installing events is tricky because we cannot rely on ctx->is_active * to be set in case this is the nr_events 0 -> 1 transition. - * - * So what we do is we add the event to the list here, which will allow - * a future context switch to DTRT and then send a racy IPI. If the IPI - * fails to hit the right task, this means a context switch must have - * happened and that will have taken care of business. */ - raw_spin_lock_irq(&ctx->lock); - task = ctx->task; - +again: /* - * If between ctx = find_get_context() and mutex_lock(&ctx->mutex) the - * ctx gets destroyed, we must not install an event into it. - * - * This is normally tested for after we acquire the mutex, so this is - * a sanity check. + * Cannot use task_function_call() because we need to run on the task's + * CPU regardless of whether its current or not. */ + if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event)) + return; + + raw_spin_lock_irq(&ctx->lock); + task = ctx->task; if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) { + /* + * Cannot happen because we already checked above (which also + * cannot happen), and we hold ctx->mutex, which serializes us + * against perf_event_exit_task_context(). + */ raw_spin_unlock_irq(&ctx->lock); return; } - - if (ctx->is_active) { - update_context_time(ctx); - update_cgrp_time_from_event(event); - } - - add_event_to_ctx(event, ctx); raw_spin_unlock_irq(&ctx->lock); - - if (task) - task_function_call(task, __perf_install_in_context, ctx); - else - cpu_function_call(cpu, __perf_install_in_context, ctx); + /* + * Since !ctx->is_active doesn't mean anything, we must IPI + * unconditionally. + */ + goto again; } /* -- cgit From 0da4cf3e0a68c97ef811569804616a811f786729 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Feb 2016 18:45:51 +0100 Subject: perf: Robustify task_function_call() Since there is no serialization between task_function_call() doing task_curr() and the other CPU doing context switches, we could end up not sending an IPI even if we had to. And I'm not sure I still buy my own argument we're OK. Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dvyukov@google.com Cc: eranian@google.com Cc: oleg@redhat.com Cc: panand@redhat.com Cc: sasha.levin@oracle.com Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160224174948.340031200@infradead.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 25edabd207de..614614821f00 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -64,8 +64,17 @@ static void remote_function(void *data) struct task_struct *p = tfc->p; if (p) { - tfc->ret = -EAGAIN; - if (task_cpu(p) != smp_processor_id() || !task_curr(p)) + /* -EAGAIN */ + if (task_cpu(p) != smp_processor_id()) + return; + + /* + * Now that we're on right CPU with IRQs disabled, we can test + * if we hit the right task without races. + */ + + tfc->ret = -ESRCH; /* No such (running) process */ + if (p != current) return; } @@ -92,13 +101,17 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info) .p = p, .func = func, .info = info, - .ret = -ESRCH, /* No such (running) process */ + .ret = -EAGAIN, }; + int ret; - if (task_curr(p)) - smp_call_function_single(task_cpu(p), remote_function, &data, 1); + do { + ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1); + if (!ret) + ret = data.ret; + } while (ret == -EAGAIN); - return data.ret; + return ret; } /** @@ -169,19 +182,6 @@ static bool is_kernel_event(struct perf_event *event) * rely on ctx->is_active and therefore cannot use event_function_call(). * See perf_install_in_context(). * - * This is because we need a ctx->lock serialized variable (ctx->is_active) - * to reliably determine if a particular task/context is scheduled in. The - * task_curr() use in task_function_call() is racy in that a remote context - * switch is not a single atomic operation. - * - * As is, the situation is 'safe' because we set rq->curr before we do the - * actual context switch. This means that task_curr() will fail early, but - * we'll continue spinning on ctx->is_active until we've passed - * perf_event_task_sched_out(). - * - * Without this ctx->lock serialized variable we could have race where we find - * the task (and hence the context) would not be active while in fact they are. - * * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set. */ @@ -212,7 +212,7 @@ static int event_function(void *info) */ if (ctx->task) { if (ctx->task != current) { - ret = -EAGAIN; + ret = -ESRCH; goto unlock; } -- cgit From 379b656446a36c7a80b27cbdc34eec8e57b5f290 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:14 +0000 Subject: genirq: Add GENERIC_IRQ_IPI Kconfig symbol Select this to enable the generic IPI domain support Signed-off-by: Qais Yousef Cc: Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-4-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner --- kernel/irq/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 3b48dab80164..3bbfd6a9c475 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -64,6 +64,10 @@ config IRQ_DOMAIN_HIERARCHY bool select IRQ_DOMAIN +# Generic IRQ IPI support +config GENERIC_IRQ_IPI + bool + # Generic MSI interrupt support config GENERIC_MSI_IRQ bool -- cgit From ac0a0cd266d1f21236d5975ca6bced9b377a2a6a Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:18 +0000 Subject: genirq: Make irq_domain_alloc_descs() non static We will need to use this function to implement irq_reserve_ipi() later. So make it non static and move the prototype to irqdomain.h to allow using it outside irqdomain.c Signed-off-by: Qais Yousef Cc: Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-8-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner --- include/linux/irqdomain.h | 2 ++ kernel/irq/irqdomain.c | 6 ++---- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 130e1c3117c3..c466cc17b8e9 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -213,6 +213,8 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, extern struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode, enum irq_domain_bus_token bus_token); extern void irq_set_default_host(struct irq_domain *host); +extern int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, + irq_hw_number_t hwirq, int node); static inline struct fwnode_handle *of_node_to_fwnode(struct device_node *node) { diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 3e56d2f03e24..86811541f073 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -23,8 +23,6 @@ static DEFINE_MUTEX(irq_domain_mutex); static DEFINE_MUTEX(revmap_trees_mutex); static struct irq_domain *irq_default_domain; -static int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, - irq_hw_number_t hwirq, int node); static void irq_domain_check_hierarchy(struct irq_domain *domain); struct irqchip_fwid { @@ -840,8 +838,8 @@ const struct irq_domain_ops irq_domain_simple_ops = { }; EXPORT_SYMBOL_GPL(irq_domain_simple_ops); -static int irq_domain_alloc_descs(int virq, unsigned int cnt, - irq_hw_number_t hwirq, int node) +int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, + int node) { unsigned int hint; -- cgit From d17bf24e695290d3fe7943aca52ab48098a10653 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:19 +0000 Subject: genirq: Add a new generic IPI reservation code to irq core Add a generic mechanism to dynamically allocate an IPI. Depending on the underlying implementation this creates either a single Linux irq or a consective range of Linux irqs. The Linux irq is used later to send IPIs to other CPUs. [ tglx: Massaged the code and removed the 'consecutive mask' restriction for the single IRQ case ] Signed-off-by: Qais Yousef Cc: Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-9-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 3 + include/linux/irqdomain.h | 5 ++ kernel/irq/Makefile | 1 + kernel/irq/ipi.c | 137 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 146 insertions(+) create mode 100644 kernel/irq/ipi.c (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index a32b47fbf874..95f4f66f95f3 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -940,4 +940,7 @@ static inline u32 irq_reg_readl(struct irq_chip_generic *gc, return readl(gc->reg_base + reg_offset); } +/* Contrary to Linux irqs, for hardware irqs the irq number 0 is valid */ +#define INVALID_HWIRQ (~0UL) + #endif /* _LINUX_IRQ_H */ diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index c466cc17b8e9..ed48594e96d2 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -344,6 +344,11 @@ int irq_domain_xlate_onetwocell(struct irq_domain *d, struct device_node *ctrlr, const u32 *intspec, unsigned int intsize, irq_hw_number_t *out_hwirq, unsigned int *out_type); +/* IPI functions */ +unsigned int irq_reserve_ipi(struct irq_domain *domain, + const struct cpumask *dest); +void irq_destroy_ipi(unsigned int irq); + /* V2 interfaces to support hierarchy IRQ domains. */ extern struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, unsigned int virq); diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 2fc9cbdf35b6..2ee42e95a3ce 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -8,3 +8,4 @@ obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o obj-$(CONFIG_PM_SLEEP) += pm.o obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o +obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c new file mode 100644 index 000000000000..340af273429c --- /dev/null +++ b/kernel/irq/ipi.c @@ -0,0 +1,137 @@ +/* + * linux/kernel/irq/ipi.c + * + * Copyright (C) 2015 Imagination Technologies Ltd + * Author: Qais Yousef + * + * This file contains driver APIs to the IPI subsystem. + */ + +#define pr_fmt(fmt) "genirq/ipi: " fmt + +#include +#include + +/** + * irq_reserve_ipi() - Setup an IPI to destination cpumask + * @domain: IPI domain + * @dest: cpumask of cpus which can receive the IPI + * + * Allocate a virq that can be used to send IPI to any CPU in dest mask. + * + * On success it'll return linux irq number and 0 on failure + */ +unsigned int irq_reserve_ipi(struct irq_domain *domain, + const struct cpumask *dest) +{ + unsigned int nr_irqs, offset; + struct irq_data *data; + int virq, i; + + if (!domain ||!irq_domain_is_ipi(domain)) { + pr_warn("Reservation on a non IPI domain\n"); + return 0; + } + + if (!cpumask_subset(dest, cpu_possible_mask)) { + pr_warn("Reservation is not in possible_cpu_mask\n"); + return 0; + } + + nr_irqs = cpumask_weight(dest); + if (!nr_irqs) { + pr_warn("Reservation for empty destination mask\n"); + return 0; + } + + if (irq_domain_is_ipi_single(domain)) { + /* + * If the underlying implementation uses a single HW irq on + * all cpus then we only need a single Linux irq number for + * it. We have no restrictions vs. the destination mask. The + * underlying implementation can deal with holes nicely. + */ + nr_irqs = 1; + offset = 0; + } else { + unsigned int next; + + /* + * The IPI requires a seperate HW irq on each CPU. We require + * that the destination mask is consecutive. If an + * implementation needs to support holes, it can reserve + * several IPI ranges. + */ + offset = cpumask_first(dest); + /* + * Find a hole and if found look for another set bit after the + * hole. For now we don't support this scenario. + */ + next = cpumask_next_zero(offset, dest); + if (next < nr_cpu_ids) + next = cpumask_next(next, dest); + if (next < nr_cpu_ids) { + pr_warn("Destination mask has holes\n"); + return 0; + } + } + + virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE); + if (virq <= 0) { + pr_warn("Can't reserve IPI, failed to alloc descs\n"); + return 0; + } + + virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE, + (void *) dest, true); + + if (virq <= 0) { + pr_warn("Can't reserve IPI, failed to alloc hw irqs\n"); + goto free_descs; + } + + for (i = 0; i < nr_irqs; i++) { + data = irq_get_irq_data(virq + i); + cpumask_copy(data->common->affinity, dest); + data->common->ipi_offset = offset; + } + return virq; + +free_descs: + irq_free_descs(virq, nr_irqs); + return 0; +} + +/** + * irq_destroy_ipi() - unreserve an IPI that was previously allocated + * @irq: linux irq number to be destroyed + * + * Return the IPIs allocated with irq_reserve_ipi() to the system destroying + * all virqs associated with them. + */ +void irq_destroy_ipi(unsigned int irq) +{ + struct irq_data *data = irq_get_irq_data(irq); + struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL; + struct irq_domain *domain; + unsigned int nr_irqs; + + if (!irq || !data || !ipimask) + return; + + domain = data->domain; + if (WARN_ON(domain == NULL)) + return; + + if (!irq_domain_is_ipi(domain)) { + pr_warn("Trying to destroy a non IPI domain!\n"); + return; + } + + if (irq_domain_is_ipi_per_cpu(domain)) + nr_irqs = cpumask_weight(ipimask); + else + nr_irqs = 1; + + irq_domain_free_irqs(irq, nr_irqs); +} -- cgit From f9bce791ae2a1a10a965b30427f5507c1a77669f Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:20 +0000 Subject: genirq: Add a new function to get IPI reverse mapping When dealing with coprocessors we need to find out the actual hwirqs values to pass on to the firmware so that it knows what it needs to use to receive IPIs from and send IPIs to Linux cpus. [ tglx: Fixed the single hwirq IPI case. The hardware irq number does not change due to the cpu number ] Signed-off-by: Qais Yousef Cc: Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-10-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 1 + kernel/irq/ipi.c | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 95f4f66f95f3..10273dce058a 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -942,5 +942,6 @@ static inline u32 irq_reg_readl(struct irq_chip_generic *gc, /* Contrary to Linux irqs, for hardware irqs the irq number 0 is valid */ #define INVALID_HWIRQ (~0UL) +irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu); #endif /* _LINUX_IRQ_H */ diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index 340af273429c..6f34f2930bc0 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c @@ -135,3 +135,37 @@ void irq_destroy_ipi(unsigned int irq) irq_domain_free_irqs(irq, nr_irqs); } + +/** + * ipi_get_hwirq - Get the hwirq associated with an IPI to a cpu + * @irq: linux irq number + * @cpu: the target cpu + * + * When dealing with coprocessors IPI, we need to inform the coprocessor of + * the hwirq it needs to use to receive and send IPIs. + * + * Returns hwirq value on success and INVALID_HWIRQ on failure. + */ +irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu) +{ + struct irq_data *data = irq_get_irq_data(irq); + struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL; + + if (!data || !ipimask || cpu > nr_cpu_ids) + return INVALID_HWIRQ; + + if (!cpumask_test_cpu(cpu, ipimask)) + return INVALID_HWIRQ; + + /* + * Get the real hardware irq number if the underlying implementation + * uses a seperate irq per cpu. If the underlying implementation uses + * a single hardware irq for all cpus then the IPI send mechanism + * needs to take care of this. + */ + if (irq_domain_is_ipi_per_cpu(data->domain)) + data = irq_get_irq_data(irq + cpu - data->common->ipi_offset); + + return data ? irqd_to_hwirq(data) : INVALID_HWIRQ; +} +EXPORT_SYMBOL_GPL(ipi_get_hwirq); -- cgit From 3b8e29a82dd16c1f2061e0b955a71cd36eeb061b Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:22 +0000 Subject: genirq: Implement ipi_send_mask/single() Add APIs to send IPIs from driver and arch code. We have different functions because we allow architecture code to cache the irq descriptor to avoid lookups. Driver code has to use the irq number and is subject to more restrictive checks. [ tglx: Polish the implementation ] Signed-off-by: Qais Yousef Cc: Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-12-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 4 ++ kernel/irq/ipi.c | 157 +++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 160 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 3b3a5b817469..d5ebd94822d2 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -948,5 +948,9 @@ static inline u32 irq_reg_readl(struct irq_chip_generic *gc, /* Contrary to Linux irqs, for hardware irqs the irq number 0 is valid */ #define INVALID_HWIRQ (~0UL) irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu); +int __ipi_send_single(struct irq_desc *desc, unsigned int cpu); +int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest); +int ipi_send_single(unsigned int virq, unsigned int cpu); +int ipi_send_mask(unsigned int virq, const struct cpumask *dest); #endif /* _LINUX_IRQ_H */ diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index 6f34f2930bc0..c37f34b00a11 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c @@ -161,7 +161,7 @@ irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu) * Get the real hardware irq number if the underlying implementation * uses a seperate irq per cpu. If the underlying implementation uses * a single hardware irq for all cpus then the IPI send mechanism - * needs to take care of this. + * needs to take care of the cpu destinations. */ if (irq_domain_is_ipi_per_cpu(data->domain)) data = irq_get_irq_data(irq + cpu - data->common->ipi_offset); @@ -169,3 +169,158 @@ irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu) return data ? irqd_to_hwirq(data) : INVALID_HWIRQ; } EXPORT_SYMBOL_GPL(ipi_get_hwirq); + +static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data, + const struct cpumask *dest, unsigned int cpu) +{ + struct cpumask *ipimask = irq_data_get_affinity_mask(data); + + if (!chip || !ipimask) + return -EINVAL; + + if (!chip->ipi_send_single && !chip->ipi_send_mask) + return -EINVAL; + + if (cpu > nr_cpu_ids) + return -EINVAL; + + if (dest) { + if (!cpumask_subset(dest, ipimask)) + return -EINVAL; + } else { + if (!cpumask_test_cpu(cpu, ipimask)) + return -EINVAL; + } + return 0; +} + +/** + * __ipi_send_single - send an IPI to a target Linux SMP CPU + * @desc: pointer to irq_desc of the IRQ + * @cpu: destination CPU, must in the destination mask passed to + * irq_reserve_ipi() + * + * This function is for architecture or core code to speed up IPI sending. Not + * usable from driver code. + * + * Returns zero on success and negative error number on failure. + */ +int __ipi_send_single(struct irq_desc *desc, unsigned int cpu) +{ + struct irq_data *data = irq_desc_get_irq_data(desc); + struct irq_chip *chip = irq_data_get_irq_chip(data); + +#ifdef DEBUG + /* + * Minimise the overhead by omitting the checks for Linux SMP IPIs. + * Since the callers should be arch or core code which is generally + * trusted, only check for errors when debugging. + */ + if (WARN_ON_ONCE(ipi_send_verify(chip, data, NULL, cpu))) + return -EINVAL; +#endif + if (!chip->ipi_send_single) { + chip->ipi_send_mask(data, cpumask_of(cpu)); + return 0; + } + + /* FIXME: Store this information in irqdata flags */ + if (irq_domain_is_ipi_per_cpu(data->domain) && + cpu != data->common->ipi_offset) { + /* use the correct data for that cpu */ + unsigned irq = data->irq + cpu - data->common->ipi_offset; + + data = irq_get_irq_data(irq); + } + chip->ipi_send_single(data, cpu); + return 0; +} + +/** + * ipi_send_mask - send an IPI to target Linux SMP CPU(s) + * @desc: pointer to irq_desc of the IRQ + * @dest: dest CPU(s), must be a subset of the mask passed to + * irq_reserve_ipi() + * + * This function is for architecture or core code to speed up IPI sending. Not + * usable from driver code. + * + * Returns zero on success and negative error number on failure. + */ +int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest) +{ + struct irq_data *data = irq_desc_get_irq_data(desc); + struct irq_chip *chip = irq_data_get_irq_chip(data); + unsigned int cpu; + +#ifdef DEBUG + /* + * Minimise the overhead by omitting the checks for Linux SMP IPIs. + * Since the callers should be arch or core code which is generally + * trusted, only check for errors when debugging. + */ + if (WARN_ON_ONCE(ipi_send_verify(chip, data, dest, 0))) + return -EINVAL; +#endif + if (chip->ipi_send_mask) { + chip->ipi_send_mask(data, dest); + return 0; + } + + if (irq_domain_is_ipi_per_cpu(data->domain)) { + unsigned int base = data->irq; + + for_each_cpu(cpu, dest) { + unsigned irq = base + cpu - data->common->ipi_offset; + + data = irq_get_irq_data(irq); + chip->ipi_send_single(data, cpu); + } + } else { + for_each_cpu(cpu, dest) + chip->ipi_send_single(data, cpu); + } + return 0; +} + +/** + * ipi_send_single - Send an IPI to a single CPU + * @virq: linux irq number from irq_reserve_ipi() + * @cpu: destination CPU, must in the destination mask passed to + * irq_reserve_ipi() + * + * Returns zero on success and negative error number on failure. + */ +int ipi_send_single(unsigned int virq, unsigned int cpu) +{ + struct irq_desc *desc = irq_to_desc(virq); + struct irq_data *data = desc ? irq_desc_get_irq_data(desc) : NULL; + struct irq_chip *chip = data ? irq_data_get_irq_chip(data) : NULL; + + if (WARN_ON_ONCE(ipi_send_verify(chip, data, NULL, cpu))) + return -EINVAL; + + return __ipi_send_single(desc, cpu); +} +EXPORT_SYMBOL_GPL(ipi_send_single); + +/** + * ipi_send_mask - Send an IPI to target CPU(s) + * @virq: linux irq number from irq_reserve_ipi() + * @dest: dest CPU(s), must be a subset of the mask passed to + * irq_reserve_ipi() + * + * Returns zero on success and negative error number on failure. + */ +int ipi_send_mask(unsigned int virq, const struct cpumask *dest) +{ + struct irq_desc *desc = irq_to_desc(virq); + struct irq_data *data = desc ? irq_desc_get_irq_data(desc) : NULL; + struct irq_chip *chip = data ? irq_data_get_irq_chip(data) : NULL; + + if (WARN_ON_ONCE(ipi_send_verify(chip, data, dest, 0))) + return -EINVAL; + + return __ipi_send_mask(desc, dest); +} +EXPORT_SYMBOL_GPL(ipi_send_mask); -- cgit From 13b35686e8b934ff78f59cef0c65fa3a43f8eeaf Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Fri, 19 Feb 2016 09:46:37 +0100 Subject: wait.[ch]: Introduce the simple waitqueue (swait) implementation The existing wait queue support has support for custom wake up call backs, wake flags, wake key (passed to call back) and exclusive flags that allow wakers to be tagged as exclusive, for limiting the number of wakers. In a lot of cases, none of these features are used, and hence we can benefit from a slimmed down version that lowers memory overhead and reduces runtime overhead. The concept originated from -rt, where waitqueues are a constant source of trouble, as we can't convert the head lock to a raw spinlock due to fancy and long lasting callbacks. With the removal of custom callbacks, we can use a raw lock for queue list manipulations, hence allowing the simple wait support to be used in -rt. [Patch is from PeterZ which is based on Thomas version. Commit message is written by Paul G. Daniel: - Fixed some compile issues - Added non-lazy implementation of swake_up_locked as suggested by Boqun Feng.] Originally-by: Thomas Gleixner Signed-off-by: Daniel Wagner Acked-by: Peter Zijlstra (Intel) Cc: linux-rt-users@vger.kernel.org Cc: Boqun Feng Cc: Marcelo Tosatti Cc: Steven Rostedt Cc: Paul Gortmaker Cc: Paolo Bonzini Cc: "Paul E. McKenney" Link: http://lkml.kernel.org/r/1455871601-27484-2-git-send-email-wagi@monom.org Signed-off-by: Thomas Gleixner --- include/linux/swait.h | 172 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/Makefile | 2 +- kernel/sched/swait.c | 123 ++++++++++++++++++++++++++++++++++++ 3 files changed, 296 insertions(+), 1 deletion(-) create mode 100644 include/linux/swait.h create mode 100644 kernel/sched/swait.c (limited to 'kernel') diff --git a/include/linux/swait.h b/include/linux/swait.h new file mode 100644 index 000000000000..c1f9c62a8a50 --- /dev/null +++ b/include/linux/swait.h @@ -0,0 +1,172 @@ +#ifndef _LINUX_SWAIT_H +#define _LINUX_SWAIT_H + +#include +#include +#include +#include + +/* + * Simple wait queues + * + * While these are very similar to the other/complex wait queues (wait.h) the + * most important difference is that the simple waitqueue allows for + * deterministic behaviour -- IOW it has strictly bounded IRQ and lock hold + * times. + * + * In order to make this so, we had to drop a fair number of features of the + * other waitqueue code; notably: + * + * - mixing INTERRUPTIBLE and UNINTERRUPTIBLE sleeps on the same waitqueue; + * all wakeups are TASK_NORMAL in order to avoid O(n) lookups for the right + * sleeper state. + * + * - the exclusive mode; because this requires preserving the list order + * and this is hard. + * + * - custom wake functions; because you cannot give any guarantees about + * random code. + * + * As a side effect of this; the data structures are slimmer. + * + * One would recommend using this wait queue where possible. + */ + +struct task_struct; + +struct swait_queue_head { + raw_spinlock_t lock; + struct list_head task_list; +}; + +struct swait_queue { + struct task_struct *task; + struct list_head task_list; +}; + +#define __SWAITQUEUE_INITIALIZER(name) { \ + .task = current, \ + .task_list = LIST_HEAD_INIT((name).task_list), \ +} + +#define DECLARE_SWAITQUEUE(name) \ + struct swait_queue name = __SWAITQUEUE_INITIALIZER(name) + +#define __SWAIT_QUEUE_HEAD_INITIALIZER(name) { \ + .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ + .task_list = LIST_HEAD_INIT((name).task_list), \ +} + +#define DECLARE_SWAIT_QUEUE_HEAD(name) \ + struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INITIALIZER(name) + +extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name, + struct lock_class_key *key); + +#define init_swait_queue_head(q) \ + do { \ + static struct lock_class_key __key; \ + __init_swait_queue_head((q), #q, &__key); \ + } while (0) + +#ifdef CONFIG_LOCKDEP +# define __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name) \ + ({ init_swait_queue_head(&name); name; }) +# define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name) \ + struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name) +#else +# define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name) \ + DECLARE_SWAIT_QUEUE_HEAD(name) +#endif + +static inline int swait_active(struct swait_queue_head *q) +{ + return !list_empty(&q->task_list); +} + +extern void swake_up(struct swait_queue_head *q); +extern void swake_up_all(struct swait_queue_head *q); +extern void swake_up_locked(struct swait_queue_head *q); + +extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); +extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state); +extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state); + +extern void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait); +extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait); + +/* as per ___wait_event() but for swait, therefore "exclusive == 0" */ +#define ___swait_event(wq, condition, state, ret, cmd) \ +({ \ + struct swait_queue __wait; \ + long __ret = ret; \ + \ + INIT_LIST_HEAD(&__wait.task_list); \ + for (;;) { \ + long __int = prepare_to_swait_event(&wq, &__wait, state);\ + \ + if (condition) \ + break; \ + \ + if (___wait_is_interruptible(state) && __int) { \ + __ret = __int; \ + break; \ + } \ + \ + cmd; \ + } \ + finish_swait(&wq, &__wait); \ + __ret; \ +}) + +#define __swait_event(wq, condition) \ + (void)___swait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, \ + schedule()) + +#define swait_event(wq, condition) \ +do { \ + if (condition) \ + break; \ + __swait_event(wq, condition); \ +} while (0) + +#define __swait_event_timeout(wq, condition, timeout) \ + ___swait_event(wq, ___wait_cond_timeout(condition), \ + TASK_UNINTERRUPTIBLE, timeout, \ + __ret = schedule_timeout(__ret)) + +#define swait_event_timeout(wq, condition, timeout) \ +({ \ + long __ret = timeout; \ + if (!___wait_cond_timeout(condition)) \ + __ret = __swait_event_timeout(wq, condition, timeout); \ + __ret; \ +}) + +#define __swait_event_interruptible(wq, condition) \ + ___swait_event(wq, condition, TASK_INTERRUPTIBLE, 0, \ + schedule()) + +#define swait_event_interruptible(wq, condition) \ +({ \ + int __ret = 0; \ + if (!(condition)) \ + __ret = __swait_event_interruptible(wq, condition); \ + __ret; \ +}) + +#define __swait_event_interruptible_timeout(wq, condition, timeout) \ + ___swait_event(wq, ___wait_cond_timeout(condition), \ + TASK_INTERRUPTIBLE, timeout, \ + __ret = schedule_timeout(__ret)) + +#define swait_event_interruptible_timeout(wq, condition, timeout) \ +({ \ + long __ret = timeout; \ + if (!___wait_cond_timeout(condition)) \ + __ret = __swait_event_interruptible_timeout(wq, \ + condition, timeout); \ + __ret; \ +}) + +#endif /* _LINUX_SWAIT_H */ diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 67687973ce80..7d4cba227cbd 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -13,7 +13,7 @@ endif obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o -obj-y += wait.o completion.o idle.o +obj-y += wait.o swait.o completion.o idle.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c new file mode 100644 index 000000000000..82f0dff90030 --- /dev/null +++ b/kernel/sched/swait.c @@ -0,0 +1,123 @@ +#include +#include + +void __init_swait_queue_head(struct swait_queue_head *q, const char *name, + struct lock_class_key *key) +{ + raw_spin_lock_init(&q->lock); + lockdep_set_class_and_name(&q->lock, key, name); + INIT_LIST_HEAD(&q->task_list); +} +EXPORT_SYMBOL(__init_swait_queue_head); + +/* + * The thing about the wake_up_state() return value; I think we can ignore it. + * + * If for some reason it would return 0, that means the previously waiting + * task is already running, so it will observe condition true (or has already). + */ +void swake_up_locked(struct swait_queue_head *q) +{ + struct swait_queue *curr; + + if (list_empty(&q->task_list)) + return; + + curr = list_first_entry(&q->task_list, typeof(*curr), task_list); + wake_up_process(curr->task); + list_del_init(&curr->task_list); +} +EXPORT_SYMBOL(swake_up_locked); + +void swake_up(struct swait_queue_head *q) +{ + unsigned long flags; + + if (!swait_active(q)) + return; + + raw_spin_lock_irqsave(&q->lock, flags); + swake_up_locked(q); + raw_spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(swake_up); + +/* + * Does not allow usage from IRQ disabled, since we must be able to + * release IRQs to guarantee bounded hold time. + */ +void swake_up_all(struct swait_queue_head *q) +{ + struct swait_queue *curr; + LIST_HEAD(tmp); + + if (!swait_active(q)) + return; + + raw_spin_lock_irq(&q->lock); + list_splice_init(&q->task_list, &tmp); + while (!list_empty(&tmp)) { + curr = list_first_entry(&tmp, typeof(*curr), task_list); + + wake_up_state(curr->task, TASK_NORMAL); + list_del_init(&curr->task_list); + + if (list_empty(&tmp)) + break; + + raw_spin_unlock_irq(&q->lock); + raw_spin_lock_irq(&q->lock); + } + raw_spin_unlock_irq(&q->lock); +} +EXPORT_SYMBOL(swake_up_all); + +void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait) +{ + wait->task = current; + if (list_empty(&wait->task_list)) + list_add(&wait->task_list, &q->task_list); +} + +void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&q->lock, flags); + __prepare_to_swait(q, wait); + set_current_state(state); + raw_spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(prepare_to_swait); + +long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state) +{ + if (signal_pending_state(state, current)) + return -ERESTARTSYS; + + prepare_to_swait(q, wait, state); + + return 0; +} +EXPORT_SYMBOL(prepare_to_swait_event); + +void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait) +{ + __set_current_state(TASK_RUNNING); + if (!list_empty(&wait->task_list)) + list_del_init(&wait->task_list); +} + +void finish_swait(struct swait_queue_head *q, struct swait_queue *wait) +{ + unsigned long flags; + + __set_current_state(TASK_RUNNING); + + if (!list_empty_careful(&wait->task_list)) { + raw_spin_lock_irqsave(&q->lock, flags); + list_del_init(&wait->task_list); + raw_spin_unlock_irqrestore(&q->lock, flags); + } +} +EXPORT_SYMBOL(finish_swait); -- cgit From 065bb78c5b09df54d1c32e03227deb777ddff57b Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Fri, 19 Feb 2016 09:46:40 +0100 Subject: rcu: Do not call rcu_nocb_gp_cleanup() while holding rnp->lock rcu_nocb_gp_cleanup() is called while holding rnp->lock. Currently, this is okay because the wake_up_all() in rcu_nocb_gp_cleanup() will not enable the IRQs. lockdep is happy. By switching over using swait this is not true anymore. swake_up_all() enables the IRQs while processing the waiters. __do_softirq() can now run and will eventually call rcu_process_callbacks() which wants to grap nrp->lock. Let's move the rcu_nocb_gp_cleanup() call outside the lock before we switch over to swait. If we would hold the rnp->lock and use swait, lockdep reports following: ================================= [ INFO: inconsistent lock state ] 4.2.0-rc5-00025-g9a73ba0 #136 Not tainted --------------------------------- inconsistent {IN-SOFTIRQ-W} -> {SOFTIRQ-ON-W} usage. rcu_preempt/8 [HC0[0]:SC0[0]:HE1:SE1] takes: (rcu_node_1){+.?...}, at: [] rcu_gp_kthread+0xb97/0xeb0 {IN-SOFTIRQ-W} state was registered at: [] __lock_acquire+0xd5f/0x21e0 [] lock_acquire+0xdf/0x2b0 [] _raw_spin_lock_irqsave+0x59/0xa0 [] rcu_process_callbacks+0x141/0x3c0 [] __do_softirq+0x14d/0x670 [] irq_exit+0x104/0x110 [] smp_apic_timer_interrupt+0x46/0x60 [] apic_timer_interrupt+0x70/0x80 [] rq_attach_root+0xa6/0x100 [] cpu_attach_domain+0x16d/0x650 [] build_sched_domains+0x942/0xb00 [] sched_init_smp+0x509/0x5c1 [] kernel_init_freeable+0x172/0x28f [] kernel_init+0xe/0xe0 [] ret_from_fork+0x3f/0x70 irq event stamp: 76 hardirqs last enabled at (75): [] _raw_spin_unlock_irq+0x30/0x60 hardirqs last disabled at (76): [] _raw_spin_lock_irq+0x1f/0x90 softirqs last enabled at (0): [] copy_process.part.26+0x602/0x1cf0 softirqs last disabled at (0): [< (null)>] (null) other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(rcu_node_1); lock(rcu_node_1); *** DEADLOCK *** 1 lock held by rcu_preempt/8: #0: (rcu_node_1){+.?...}, at: [] rcu_gp_kthread+0xb97/0xeb0 stack backtrace: CPU: 0 PID: 8 Comm: rcu_preempt Not tainted 4.2.0-rc5-00025-g9a73ba0 #136 Hardware name: Dell Inc. PowerEdge R820/066N7P, BIOS 2.0.20 01/16/2014 0000000000000000 000000006d7e67d8 ffff881fb081fbd8 ffffffff818379e0 0000000000000000 ffff881fb0812a00 ffff881fb081fc38 ffffffff8110813b 0000000000000000 0000000000000001 ffff881f00000001 ffffffff8102fa4f Call Trace: [] dump_stack+0x4f/0x7b [] print_usage_bug+0x1db/0x1e0 [] ? save_stack_trace+0x2f/0x50 [] mark_lock+0x66d/0x6e0 [] ? check_usage_forwards+0x150/0x150 [] mark_held_locks+0x78/0xa0 [] ? _raw_spin_unlock_irq+0x30/0x60 [] trace_hardirqs_on_caller+0x168/0x220 [] trace_hardirqs_on+0xd/0x10 [] _raw_spin_unlock_irq+0x30/0x60 [] swake_up_all+0xb7/0xe0 [] rcu_gp_kthread+0xab1/0xeb0 [] ? trace_hardirqs_on_caller+0xff/0x220 [] ? _raw_spin_unlock_irq+0x41/0x60 [] ? rcu_barrier+0x20/0x20 [] kthread+0x104/0x120 [] ? _raw_spin_unlock_irq+0x30/0x60 [] ? kthread_create_on_node+0x260/0x260 [] ret_from_fork+0x3f/0x70 [] ? kthread_create_on_node+0x260/0x260 Signed-off-by: Daniel Wagner Acked-by: Peter Zijlstra (Intel) Cc: linux-rt-users@vger.kernel.org Cc: Boqun Feng Cc: Marcelo Tosatti Cc: Steven Rostedt Cc: Paul Gortmaker Cc: Paolo Bonzini Cc: "Paul E. McKenney" Link: http://lkml.kernel.org/r/1455871601-27484-5-git-send-email-wagi@monom.org Signed-off-by: Thomas Gleixner --- kernel/rcu/tree.c | 4 +++- kernel/rcu/tree.h | 3 ++- kernel/rcu/tree_plugin.h | 16 +++++++++++++--- 3 files changed, 18 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e41dd4131f7a..8e8c6ec1d30f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1614,7 +1614,6 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) int needmore; struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - rcu_nocb_gp_cleanup(rsp, rnp); rnp->need_future_gp[c & 0x1] = 0; needmore = rnp->need_future_gp[(c + 1) & 0x1]; trace_rcu_future_gp(rnp, rdp, c, @@ -2010,6 +2009,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) int nocb = 0; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); + wait_queue_head_t *sq; WRITE_ONCE(rsp->gp_activity, jiffies); raw_spin_lock_irq_rcu_node(rnp); @@ -2046,7 +2046,9 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; /* smp_mb() provided by prior unlock-lock pair. */ nocb += rcu_future_gp_cleanup(rsp, rnp); + sq = rcu_nocb_gp_get(rnp); raw_spin_unlock_irq(&rnp->lock); + rcu_nocb_gp_cleanup(sq); cond_resched_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); rcu_gp_slow(rsp, gp_cleanup_delay); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 83360b4f4352..10dedfbef09d 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -621,7 +621,8 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu); static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); +static wait_queue_head_t *rcu_nocb_gp_get(struct rcu_node *rnp); +static void rcu_nocb_gp_cleanup(wait_queue_head_t *sq); static void rcu_init_one_nocb(struct rcu_node *rnp); static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, bool lazy, unsigned long flags); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 9467a8b7e756..631aff61abe9 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1811,9 +1811,9 @@ early_param("rcu_nocb_poll", parse_rcu_nocb_poll); * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended * grace period. */ -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +static void rcu_nocb_gp_cleanup(wait_queue_head_t *sq) { - wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]); + wake_up_all(sq); } /* @@ -1829,6 +1829,11 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq; } +static wait_queue_head_t *rcu_nocb_gp_get(struct rcu_node *rnp) +{ + return &rnp->nocb_gp_wq[rnp->completed & 0x1]; +} + static void rcu_init_one_nocb(struct rcu_node *rnp) { init_waitqueue_head(&rnp->nocb_gp_wq[0]); @@ -2502,7 +2507,7 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) return false; } -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +static void rcu_nocb_gp_cleanup(wait_queue_head_t *sq) { } @@ -2510,6 +2515,11 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) { } +static wait_queue_head_t *rcu_nocb_gp_get(struct rcu_node *rnp) +{ + return NULL; +} + static void rcu_init_one_nocb(struct rcu_node *rnp) { } -- cgit From abedf8e2419fb873d919dd74de2e84b510259339 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Fri, 19 Feb 2016 09:46:41 +0100 Subject: rcu: Use simple wait queues where possible in rcutree As of commit dae6e64d2bcfd ("rcu: Introduce proper blocking to no-CBs kthreads GP waits") the RCU subsystem started making use of wait queues. Here we convert all additions of RCU wait queues to use simple wait queues, since they don't need the extra overhead of the full wait queue features. Originally this was done for RT kernels[1], since we would get things like... BUG: sleeping function called from invalid context at kernel/rtmutex.c:659 in_atomic(): 1, irqs_disabled(): 1, pid: 8, name: rcu_preempt Pid: 8, comm: rcu_preempt Not tainted Call Trace: [] __might_sleep+0xd0/0xf0 [] rt_spin_lock+0x24/0x50 [] __wake_up+0x36/0x70 [] rcu_gp_kthread+0x4d2/0x680 [] ? __init_waitqueue_head+0x50/0x50 [] ? rcu_gp_fqs+0x80/0x80 [] kthread+0xdb/0xe0 [] ? finish_task_switch+0x52/0x100 [] kernel_thread_helper+0x4/0x10 [] ? __init_kthread_worker+0x60/0x60 [] ? gs_change+0xb/0xb ...and hence simple wait queues were deployed on RT out of necessity (as simple wait uses a raw lock), but mainline might as well take advantage of the more streamline support as well. [1] This is a carry forward of work from v3.10-rt; the original conversion was by Thomas on an earlier -rt version, and Sebastian extended it to additional post-3.10 added RCU waiters; here I've added a commit log and unified the RCU changes into one, and uprev'd it to match mainline RCU. Signed-off-by: Daniel Wagner Acked-by: Peter Zijlstra (Intel) Cc: linux-rt-users@vger.kernel.org Cc: Boqun Feng Cc: Marcelo Tosatti Cc: Steven Rostedt Cc: Paul Gortmaker Cc: Paolo Bonzini Cc: "Paul E. McKenney" Link: http://lkml.kernel.org/r/1455871601-27484-6-git-send-email-wagi@monom.org Signed-off-by: Thomas Gleixner --- kernel/rcu/tree.c | 22 +++++++++++----------- kernel/rcu/tree.h | 13 +++++++------ kernel/rcu/tree_plugin.h | 26 +++++++++++++------------- 3 files changed, 31 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8e8c6ec1d30f..9fd5b628a88d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1634,7 +1634,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp) !READ_ONCE(rsp->gp_flags) || !rsp->gp_kthread) return; - wake_up(&rsp->gp_wq); + swake_up(&rsp->gp_wq); } /* @@ -2009,7 +2009,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) int nocb = 0; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); - wait_queue_head_t *sq; + struct swait_queue_head *sq; WRITE_ONCE(rsp->gp_activity, jiffies); raw_spin_lock_irq_rcu_node(rnp); @@ -2094,7 +2094,7 @@ static int __noreturn rcu_gp_kthread(void *arg) READ_ONCE(rsp->gpnum), TPS("reqwait")); rsp->gp_state = RCU_GP_WAIT_GPS; - wait_event_interruptible(rsp->gp_wq, + swait_event_interruptible(rsp->gp_wq, READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_INIT); rsp->gp_state = RCU_GP_DONE_GPS; @@ -2124,7 +2124,7 @@ static int __noreturn rcu_gp_kthread(void *arg) READ_ONCE(rsp->gpnum), TPS("fqswait")); rsp->gp_state = RCU_GP_WAIT_FQS; - ret = wait_event_interruptible_timeout(rsp->gp_wq, + ret = swait_event_interruptible_timeout(rsp->gp_wq, rcu_gp_fqs_check_wake(rsp, &gf), j); rsp->gp_state = RCU_GP_DOING_FQS; /* Locking provides needed memory barriers. */ @@ -2248,7 +2248,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); - rcu_gp_kthread_wake(rsp); + swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ } /* @@ -2902,7 +2902,7 @@ static void force_quiescent_state(struct rcu_state *rsp) } WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore(&rnp_old->lock, flags); - rcu_gp_kthread_wake(rsp); + swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ } /* @@ -3531,7 +3531,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, raw_spin_unlock_irqrestore(&rnp->lock, flags); if (wake) { smp_mb(); /* EGP done before wake_up(). */ - wake_up(&rsp->expedited_wq); + swake_up(&rsp->expedited_wq); } break; } @@ -3782,7 +3782,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) jiffies_start = jiffies; for (;;) { - ret = wait_event_interruptible_timeout( + ret = swait_event_timeout( rsp->expedited_wq, sync_rcu_preempt_exp_done(rnp_root), jiffies_stall); @@ -3790,7 +3790,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) return; if (ret < 0) { /* Hit a signal, disable CPU stall warnings. */ - wait_event(rsp->expedited_wq, + swait_event(rsp->expedited_wq, sync_rcu_preempt_exp_done(rnp_root)); return; } @@ -4484,8 +4484,8 @@ static void __init rcu_init_one(struct rcu_state *rsp) } } - init_waitqueue_head(&rsp->gp_wq); - init_waitqueue_head(&rsp->expedited_wq); + init_swait_queue_head(&rsp->gp_wq); + init_swait_queue_head(&rsp->expedited_wq); rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 10dedfbef09d..bbd235d0e71f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -27,6 +27,7 @@ #include #include #include +#include #include /* @@ -243,7 +244,7 @@ struct rcu_node { /* Refused to boost: not sure why, though. */ /* This can happen due to race conditions. */ #ifdef CONFIG_RCU_NOCB_CPU - wait_queue_head_t nocb_gp_wq[2]; + struct swait_queue_head nocb_gp_wq[2]; /* Place for rcu_nocb_kthread() to wait GP. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ int need_future_gp[2]; @@ -399,7 +400,7 @@ struct rcu_data { atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */ struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ struct rcu_head **nocb_follower_tail; - wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ + struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */ struct task_struct *nocb_kthread; int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ @@ -478,7 +479,7 @@ struct rcu_state { unsigned long gpnum; /* Current gp number. */ unsigned long completed; /* # of last completed gp. */ struct task_struct *gp_kthread; /* Task for grace periods. */ - wait_queue_head_t gp_wq; /* Where GP task waits. */ + struct swait_queue_head gp_wq; /* Where GP task waits. */ short gp_flags; /* Commands for GP task. */ short gp_state; /* GP kthread sleep state. */ @@ -506,7 +507,7 @@ struct rcu_state { unsigned long expedited_sequence; /* Take a ticket. */ atomic_long_t expedited_normal; /* # fallbacks to normal. */ atomic_t expedited_need_qs; /* # CPUs left to check in. */ - wait_queue_head_t expedited_wq; /* Wait for check-ins. */ + struct swait_queue_head expedited_wq; /* Wait for check-ins. */ int ncpus_snap; /* # CPUs seen last time. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ @@ -621,8 +622,8 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu); static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); -static wait_queue_head_t *rcu_nocb_gp_get(struct rcu_node *rnp); -static void rcu_nocb_gp_cleanup(wait_queue_head_t *sq); +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); static void rcu_init_one_nocb(struct rcu_node *rnp); static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, bool lazy, unsigned long flags); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 631aff61abe9..080bd202d360 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1811,9 +1811,9 @@ early_param("rcu_nocb_poll", parse_rcu_nocb_poll); * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended * grace period. */ -static void rcu_nocb_gp_cleanup(wait_queue_head_t *sq) +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) { - wake_up_all(sq); + swake_up_all(sq); } /* @@ -1829,15 +1829,15 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq; } -static wait_queue_head_t *rcu_nocb_gp_get(struct rcu_node *rnp) +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) { return &rnp->nocb_gp_wq[rnp->completed & 0x1]; } static void rcu_init_one_nocb(struct rcu_node *rnp) { - init_waitqueue_head(&rnp->nocb_gp_wq[0]); - init_waitqueue_head(&rnp->nocb_gp_wq[1]); + init_swait_queue_head(&rnp->nocb_gp_wq[0]); + init_swait_queue_head(&rnp->nocb_gp_wq[1]); } #ifndef CONFIG_RCU_NOCB_CPU_ALL @@ -1862,7 +1862,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) { /* Prior smp_mb__after_atomic() orders against prior enqueue. */ WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); - wake_up(&rdp_leader->nocb_wq); + swake_up(&rdp_leader->nocb_wq); } } @@ -2074,7 +2074,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) */ trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait")); for (;;) { - wait_event_interruptible( + swait_event_interruptible( rnp->nocb_gp_wq[c & 0x1], (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c))); if (likely(d)) @@ -2102,7 +2102,7 @@ wait_again: /* Wait for callbacks to appear. */ if (!rcu_nocb_poll) { trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); - wait_event_interruptible(my_rdp->nocb_wq, + swait_event_interruptible(my_rdp->nocb_wq, !READ_ONCE(my_rdp->nocb_leader_sleep)); /* Memory barrier handled by smp_mb() calls below and repoll. */ } else if (firsttime) { @@ -2177,7 +2177,7 @@ wait_again: * List was empty, wake up the follower. * Memory barriers supplied by atomic_long_add(). */ - wake_up(&rdp->nocb_wq); + swake_up(&rdp->nocb_wq); } } @@ -2198,7 +2198,7 @@ static void nocb_follower_wait(struct rcu_data *rdp) if (!rcu_nocb_poll) { trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "FollowerSleep"); - wait_event_interruptible(rdp->nocb_wq, + swait_event_interruptible(rdp->nocb_wq, READ_ONCE(rdp->nocb_follower_head)); } else if (firsttime) { /* Don't drown trace log with "Poll"! */ @@ -2357,7 +2357,7 @@ void __init rcu_init_nohz(void) static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { rdp->nocb_tail = &rdp->nocb_head; - init_waitqueue_head(&rdp->nocb_wq); + init_swait_queue_head(&rdp->nocb_wq); rdp->nocb_follower_tail = &rdp->nocb_follower_head; } @@ -2507,7 +2507,7 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) return false; } -static void rcu_nocb_gp_cleanup(wait_queue_head_t *sq) +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) { } @@ -2515,7 +2515,7 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) { } -static wait_queue_head_t *rcu_nocb_gp_get(struct rcu_node *rnp) +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) { return NULL; } -- cgit From 232d26373d310a941ef2ab46e53ea62fe076ed13 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Fri, 26 Feb 2016 19:14:14 -0800 Subject: jiffies: Use CLOCKSOURCE_MASK instead of constant The CLOCKSOURCE_MASK(32) macro expands to the same value, but makes code more readable. Signed-off-by: Alexander Kuleshov Signed-off-by: John Stultz Cc: Prarit Bhargava Cc: Richard Cochran Link: http://lkml.kernel.org/r/1456542854-22104-3-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/jiffies.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 347fecf86a3f..555e21f7b966 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -68,7 +68,7 @@ static struct clocksource clocksource_jiffies = { .name = "jiffies", .rating = 1, /* lowest valid rating*/ .read = jiffies_read, - .mask = 0xffffffff, /*32bits*/ + .mask = CLOCKSOURCE_MASK(32), .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ .shift = JIFFIES_SHIFT, .max_cycles = 10, -- cgit From 63253ad814db726d43c04011c752d83b7aaca998 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Fri, 26 Feb 2016 13:07:38 +0800 Subject: cgroup: fix a mistake in warning message There is a mistake about the print format name:id <--> %d:%s, which the name is 'char *' type and id is 'int' type. Change "name:id" to "id:name" instead to be consistent with "cgroup_subsys %d:%s". Signed-off-by: Xiubo Li Acked-by: Zefan Li Signed-off-by: Tejun Heo --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ac5451e7c458..fcfad82149b1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5235,7 +5235,7 @@ int __init cgroup_init_early(void) for_each_subsys(ss, i) { WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id, - "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n", + "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n", i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free, ss->id, ss->name); WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN, -- cgit From 39853cc0cdcf1b11f00f7f81e2f515a4d68ed209 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sun, 28 Feb 2016 22:22:37 -0600 Subject: bpf: Mark __bpf_prog_run() stack frame as non-standard objtool reports the following false positive warnings: kernel/bpf/core.o: warning: objtool: __bpf_prog_run()+0x5c: sibling call from callable instruction with changed frame pointer kernel/bpf/core.o: warning: objtool: __bpf_prog_run()+0x60: function has unreachable instruction kernel/bpf/core.o: warning: objtool: __bpf_prog_run()+0x64: function has unreachable instruction [...] It's confused by the following dynamic jump instruction in __bpf_prog_run():: jmp *(%r12,%rax,8) which corresponds to the following line in the C code: goto *jumptable[insn->code]; There's no way for objtool to deterministically find all possible branch targets for a dynamic jump, so it can't verify this code. In this case the jumps all stay within the function, and there's nothing unusual going on related to the stack, so we can whitelist the function. Signed-off-by: Josh Poimboeuf Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Chris J Arges Cc: Jiri Slaby Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Cc: netdev@vger.kernel.org Link: http://lkml.kernel.org/r/b90e6bf3fdbfb5c4cc1b164b965502e53cf48935.1456719558.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- kernel/bpf/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 972d9a8e4ac4..be0abf669ced 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -649,6 +650,7 @@ load_byte: WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); return 0; } +STACK_FRAME_NON_STANDARD(__bpf_prog_run); /* jump table */ bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) -- cgit From 8e05e96ac949c80704d0a38420bf60dcf18c938f Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sun, 28 Feb 2016 22:22:38 -0600 Subject: sched: Mark __schedule() stack frame as non-standard objtool reports the following warnings for __schedule(): kernel/sched/core.o: warning: objtool:__schedule()+0x3c0: duplicate frame pointer save kernel/sched/core.o: warning: objtool:__schedule()+0x3fd: sibling call from callable instruction with changed frame pointer kernel/sched/core.o: warning: objtool:__schedule()+0x40a: call without frame pointer save/setup kernel/sched/core.o: warning: objtool:__schedule()+0x7fd: frame pointer state mismatch kernel/sched/core.o: warning: objtool:__schedule()+0x421: frame pointer state mismatch Basically it's confused by two unusual attributes of the switch_to() macro: 1. It saves prev's frame pointer to the old stack and restores next's frame pointer from the new stack. 2. For new tasks it jumps directly to ret_from_fork. Eventually it would probably be a good idea to clean up the ret_from_fork hack so that new tasks are created with a valid initial stack, as suggested by Andy: https://lkml.kernel.org/r/CALCETrWsqCw4L1qKO9j9L5F+4ED4viuLQTFc=n1pKBZfFPQUFg@mail.gmail.com Then __schedule() could return normally into the new code and objtool hopefully wouldn't have a problem anymore. In the meantime, mark its stack frame as non-standard so we can have a baseline with no objtool warnings. The marker also serves as a reminder that this code could be improved a bit. Signed-off-by: Josh Poimboeuf Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Chris J Arges Cc: Jiri Slaby Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/91190e324ebd7fcd01748d508d0dfd4693e84d91.1456719558.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9503d590e5ef..641043dfc773 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -74,6 +74,7 @@ #include #include #include +#include #include #include @@ -3288,6 +3289,7 @@ static void __sched notrace __schedule(bool preempt) balance_callback(rq); } +STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */ static inline void sched_submit_work(struct task_struct *tsk) { -- cgit From 049369487e2068294b61cee19233be0ffac7d243 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sun, 28 Feb 2016 22:22:39 -0600 Subject: sched: Always inline context_switch() When CONFIG_GCOV is enabled, gcc decides to put context_switch() out-of-line, which is inconsistent with its normal behavior. It also causes an objtool warning because __schedule() no longer inlines context_switch(), so the "STACK_FRAME_NON_STANDARD(__schedule)" statement loses its effect. Signed-off-by: Josh Poimboeuf Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Chris J Arges Cc: Jiri Slaby Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/d62aee926b6e303394e34a06999a964dc2773cf6.1456719558.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 641043dfc773..bb0daabe0ffe 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2763,7 +2763,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) /* * context_switch - switch to the new MM and the new thread's register state. */ -static inline struct rq * +static __always_inline struct rq * context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { -- cgit From 869ae76147ffdf21ad24f0e599303cd58a2bb39f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 27 Feb 2016 23:11:28 +0100 Subject: uprobes: __create_xol_area() must nullify xol_mapping.fault As Jiri pointed out, this recent commit: f872f5400cc0 ("mm: Add a vm_special_mapping.fault() method") breaks uprobes: __create_xol_area() doesn't initialize the new ->fault() method and this obviously leads to kernel crash when the application tries to execute the probed insn after bp hit. We probably want to add uprobes_special_mapping_fault(), this allows to turn xol_area->xol_mapping into a single instance of vm_special_mapping. But we need a simple fix, so lets change __create_xol() to nullify the new member as Jiri suggests. Suggested-by: Jiri Olsa Reported-by: Jiri Olsa Signed-off-by: Oleg Nesterov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Pratyush Anand Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160227221128.GA29565@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 0167679182c0..5f6ce931f1ea 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1178,6 +1178,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) goto free_area; area->xol_mapping.name = "[uprobes]"; + area->xol_mapping.fault = NULL; area->xol_mapping.pages = area->pages; area->pages[0] = alloc_page(GFP_HIGHUSER); if (!area->pages[0]) -- cgit From 675965b00d734c985e4285f5bec7e524d15fc4e1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:27 +0000 Subject: perf: Export perf_event_sysfs_show() Required to use it in modular perf drivers. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221012.930735780@linutronix.de Signed-off-by: Ingo Molnar --- kernel/events/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index df5589fa8bf0..5dcc0bd08d11 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9446,6 +9446,7 @@ ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr, return 0; } +EXPORT_SYMBOL_GPL(perf_event_sysfs_show); static int __init perf_event_sysfs_init(void) { -- cgit From 6fe1f348b3dd1f700f9630562b7d38afd6949568 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 21 Jan 2016 22:24:16 +0100 Subject: sched/cgroup: Fix cgroup entity load tracking tear-down When a cgroup's CPU runqueue is destroyed, it should remove its remaining load accounting from its parent cgroup. The current site for doing so it unsuited because its far too late and unordered against other cgroup removal (->css_free() will be, but we're also in an RCU callback). Put it in the ->css_offline() callback, which is the start of cgroup destruction, right after the group has been made unavailable to userspace. The ->css_offline() callbacks are called in hierarchical order after the following v4.4 commit: aa226ff4a1ce ("cgroup: make sure a parent css isn't offlined before its children") Signed-off-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Johannes Weiner Cc: Li Zefan Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Tejun Heo Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160121212416.GL6357@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 +--- kernel/sched/fair.c | 37 +++++++++++++++++++++---------------- kernel/sched/sched.h | 2 +- 3 files changed, 23 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9503d590e5ef..ab814bf100e1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7860,11 +7860,9 @@ void sched_destroy_group(struct task_group *tg) void sched_offline_group(struct task_group *tg) { unsigned long flags; - int i; /* end participation in shares distribution */ - for_each_possible_cpu(i) - unregister_fair_sched_group(tg, i); + unregister_fair_sched_group(tg); spin_lock_irqsave(&task_group_lock, flags); list_del_rcu(&tg->list); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 56b7d4b83947..cce330329624 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8234,11 +8234,8 @@ void free_fair_sched_group(struct task_group *tg) for_each_possible_cpu(i) { if (tg->cfs_rq) kfree(tg->cfs_rq[i]); - if (tg->se) { - if (tg->se[i]) - remove_entity_load_avg(tg->se[i]); + if (tg->se) kfree(tg->se[i]); - } } kfree(tg->cfs_rq); @@ -8286,21 +8283,29 @@ err: return 0; } -void unregister_fair_sched_group(struct task_group *tg, int cpu) +void unregister_fair_sched_group(struct task_group *tg) { - struct rq *rq = cpu_rq(cpu); unsigned long flags; + struct rq *rq; + int cpu; - /* - * Only empty task groups can be destroyed; so we can speculatively - * check on_list without danger of it being re-added. - */ - if (!tg->cfs_rq[cpu]->on_list) - return; + for_each_possible_cpu(cpu) { + if (tg->se[cpu]) + remove_entity_load_avg(tg->se[cpu]); - raw_spin_lock_irqsave(&rq->lock, flags); - list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); - raw_spin_unlock_irqrestore(&rq->lock, flags); + /* + * Only empty task groups can be destroyed; so we can speculatively + * check on_list without danger of it being re-added. + */ + if (!tg->cfs_rq[cpu]->on_list) + continue; + + rq = cpu_rq(cpu); + + raw_spin_lock_irqsave(&rq->lock, flags); + list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); + raw_spin_unlock_irqrestore(&rq->lock, flags); + } } void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, @@ -8382,7 +8387,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) return 1; } -void unregister_fair_sched_group(struct task_group *tg, int cpu) { } +void unregister_fair_sched_group(struct task_group *tg) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 10f16374df7f..30ea2d871ba7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -313,7 +313,7 @@ extern int tg_nop(struct task_group *tg, void *data); extern void free_fair_sched_group(struct task_group *tg); extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); -extern void unregister_fair_sched_group(struct task_group *tg, int cpu); +extern void unregister_fair_sched_group(struct task_group *tg); extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *se, int cpu, struct sched_entity *parent); -- cgit From 48be3a67da7413d62e5efbcf2c73a9dddf61fb96 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Feb 2016 13:28:22 +0100 Subject: sched/deadline: Always calculate end of period on sched_yield() Steven noticed that occasionally a sched_yield() call would not result in a wait for the next period edge as expected. It turns out that when we call update_curr_dl() and end up with delta_exec <= 0, we will bail early and fail to throttle. Further inspection of the yield code revealed that yield_task_dl() clearing dl.runtime is wrong too, it will not account the last bit of runtime which could result in dl.runtime < 0, which in turn means that replenish would gift us with too much runtime. Fix both issues by not relying on the dl.runtime value for yield. Reported-by: Steven Rostedt Tested-by: Steven Rostedt Signed-off-by: Peter Zijlstra (Intel) Cc: Clark Williams Cc: Daniel Bristot de Oliveira Cc: John Kacur Cc: Juri Lelli Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160223122822.GP6357@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 57b939c81bce..04a569cdd613 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -399,6 +399,9 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, dl_se->runtime = pi_se->dl_runtime; } + if (dl_se->dl_yielded && dl_se->runtime > 0) + dl_se->runtime = 0; + /* * We keep moving the deadline away until we get some * available runtime for the entity. This ensures correct @@ -735,8 +738,11 @@ static void update_curr_dl(struct rq *rq) * approach need further study. */ delta_exec = rq_clock_task(rq) - curr->se.exec_start; - if (unlikely((s64)delta_exec <= 0)) + if (unlikely((s64)delta_exec <= 0)) { + if (unlikely(dl_se->dl_yielded)) + goto throttle; return; + } schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); @@ -749,8 +755,10 @@ static void update_curr_dl(struct rq *rq) sched_rt_avg_update(rq, delta_exec); - dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; - if (dl_runtime_exceeded(dl_se)) { + dl_se->runtime -= delta_exec; + +throttle: + if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { dl_se->dl_throttled = 1; __dequeue_task_dl(rq, curr, 0); if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) @@ -994,18 +1002,14 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) */ static void yield_task_dl(struct rq *rq) { - struct task_struct *p = rq->curr; - /* * We make the task go to sleep until its current deadline by * forcing its runtime to zero. This way, update_curr_dl() stops * it and the bandwidth timer will wake it up and will give it * new scheduling parameters (thanks to dl_yielded=1). */ - if (p->dl.runtime > 0) { - rq->curr->dl.dl_yielded = 1; - p->dl.runtime = 0; - } + rq->curr->dl.dl_yielded = 1; + update_rq_clock(rq); update_curr_dl(rq); /* -- cgit From 7400d3bbaa229eb8e7631d28fb34afd7cd2c96ff Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Fri, 15 Jan 2016 16:07:49 +0900 Subject: sched/fair: Avoid using decay_load_missed() with a negative value decay_load_missed() cannot handle nagative values, so we need to prevent using the function with a negative value. Reported-by: Dietmar Eggemann Signed-off-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E . McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: perterz@infradead.org Fixes: 59543275488d ("sched/fair: Prepare __update_cpu_load() to handle active tickless") Link: http://lkml.kernel.org/r/20160115070749.GA1914@X58A-UD3R Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 564104227498..3b3dc39671f8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4509,9 +4509,17 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, /* scale is effectively 1 << i now, and >> i divides by scale */ - old_load = this_rq->cpu_load[i] - tickless_load; + old_load = this_rq->cpu_load[i]; old_load = decay_load_missed(old_load, pending_updates - 1, i); - old_load += tickless_load; + if (tickless_load) { + old_load -= decay_load_missed(tickless_load, pending_updates - 1, i); + /* + * old_load can never be a negative value because a + * decayed tickless_load cannot be greater than the + * original tickless_load. + */ + old_load += tickless_load; + } new_load = this_load; /* * Round up the averaging division if load is increasing. This -- cgit From be68a682c00dfa7733021e1da386ba7182fc7bb9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 13 Jan 2016 17:01:29 +0100 Subject: sched/fair: Consolidate nohz CPU load update code Lets factorize a bit of code there. We'll even have a third user soon. While at it, standardize the idle update function name against the others. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Acked-by: Byungchul Park Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E . McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1452700891-21807-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 48 +++++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3b3dc39671f8..33130529e9b5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4542,6 +4542,25 @@ static unsigned long weighted_cpuload(const int cpu) } #ifdef CONFIG_NO_HZ_COMMON +static void __update_cpu_load_nohz(struct rq *this_rq, + unsigned long curr_jiffies, + unsigned long load, + int active) +{ + unsigned long pending_updates; + + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + if (pending_updates) { + this_rq->last_load_update_tick = curr_jiffies; + /* + * In the regular NOHZ case, we were idle, this means load 0. + * In the NOHZ_FULL case, we were non-idle, we should consider + * its weighted load. + */ + __update_cpu_load(this_rq, load, pending_updates, active); + } +} + /* * There is no sane way to deal with nohz on smp when using jiffies because the * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading @@ -4559,22 +4578,15 @@ static unsigned long weighted_cpuload(const int cpu) * Called from nohz_idle_balance() to update the load ratings before doing the * idle balance. */ -static void update_idle_cpu_load(struct rq *this_rq) +static void update_cpu_load_idle(struct rq *this_rq) { - unsigned long curr_jiffies = READ_ONCE(jiffies); - unsigned long load = weighted_cpuload(cpu_of(this_rq)); - unsigned long pending_updates; - /* * bail if there's load or we're actually up-to-date. */ - if (load || curr_jiffies == this_rq->last_load_update_tick) + if (weighted_cpuload(cpu_of(this_rq))) return; - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - this_rq->last_load_update_tick = curr_jiffies; - - __update_cpu_load(this_rq, load, pending_updates, 0); + __update_cpu_load_nohz(this_rq, READ_ONCE(jiffies), 0, 0); } /* @@ -4585,22 +4597,12 @@ void update_cpu_load_nohz(int active) struct rq *this_rq = this_rq(); unsigned long curr_jiffies = READ_ONCE(jiffies); unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0; - unsigned long pending_updates; if (curr_jiffies == this_rq->last_load_update_tick) return; raw_spin_lock(&this_rq->lock); - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - if (pending_updates) { - this_rq->last_load_update_tick = curr_jiffies; - /* - * In the regular NOHZ case, we were idle, this means load 0. - * In the NOHZ_FULL case, we were non-idle, we should consider - * its weighted load. - */ - __update_cpu_load(this_rq, load, pending_updates, active); - } + __update_cpu_load_nohz(this_rq, curr_jiffies, load, active); raw_spin_unlock(&this_rq->lock); } #endif /* CONFIG_NO_HZ */ @@ -4612,7 +4614,7 @@ void update_cpu_load_active(struct rq *this_rq) { unsigned long load = weighted_cpuload(cpu_of(this_rq)); /* - * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). + * See the mess around update_cpu_load_idle() / update_cpu_load_nohz(). */ this_rq->last_load_update_tick = jiffies; __update_cpu_load(this_rq, load, 1, 1); @@ -7906,7 +7908,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) if (time_after_eq(jiffies, rq->next_balance)) { raw_spin_lock_irq(&rq->lock); update_rq_clock(rq); - update_idle_cpu_load(rq); + update_cpu_load_idle(rq); raw_spin_unlock_irq(&rq->lock); rebalance_domains(rq, CPU_IDLE); } -- cgit From 41d93397334f3c3374810f45e7bcce9007d1a7bb Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Wed, 13 Jan 2016 16:42:38 +0800 Subject: sched/core: Remove duplicated sched_group_set_shares() prototype Signed-off-by: Dongsheng Yang Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1452674558-31897-1-git-send-email-yangds.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 91f0a7752058..3dc7b8b3f94c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -318,7 +318,6 @@ extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *se, int cpu, struct sched_entity *parent); extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); -extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); -- cgit From ff77e468535987b3d21b7bd4da15608ea3ce7d0b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 18 Jan 2016 15:27:07 +0100 Subject: sched/rt: Fix PI handling vs. sched_setscheduler() Andrea Parri reported: > I found that the following scenario (with CONFIG_RT_GROUP_SCHED=y) is not > handled correctly: > > T1 (prio = 20) > lock(rtmutex); > > T2 (prio = 20) > blocks on rtmutex (rt_nr_boosted = 0 on T1's rq) > > T1 (prio = 20) > sys_set_scheduler(prio = 0) > [new_effective_prio == oldprio] > T1 prio = 20 (rt_nr_boosted = 0 on T1's rq) > > The last step is incorrect as T1 is now boosted (c.f., rt_se_boosted()); > in particular, if we continue with > > T1 (prio = 20) > unlock(rtmutex) > wakeup(T2) > adjust_prio(T1) > [prio != rt_mutex_getprio(T1)] > dequeue(T1) > rt_nr_boosted = (unsigned long)(-1) > ... > T1 prio = 0 > > then we end up leaving rt_nr_boosted in an "inconsistent" state. > > The simple program attached could reproduce the previous scenario; note > that, as a consequence of the presence of this state, the "assertion" > > WARN_ON(!rt_nr_running && rt_nr_boosted) > > from dec_rt_group() may trigger. So normally we dequeue/enqueue tasks in sched_setscheduler(), which would ensure the accounting stays correct. However in the early PI path we fail to do so. So this was introduced at around v3.14, by: c365c292d059 ("sched: Consider pi boosting in setscheduler()") which fixed another problem exactly because that dequeue/enqueue, joy. Fix this by teaching rt about DEQUEUE_SAVE/ENQUEUE_RESTORE and have it preserve runqueue location with that option. This requires decoupling the on_rt_rq() state from being on the list. In order to allow for explicit movement during the SAVE/RESTORE, introduce {DE,EN}QUEUE_MOVE. We still must use SAVE/RESTORE in these cases to preserve other invariants. Respecting the SAVE/RESTORE flags also has the (nice) side-effect that things like sys_nice()/sys_sched_setaffinity() also do not reorder FIFO tasks (whereas they used to before this patch). Reported-by: Andrea Parri Tested-by: Andrea Parri Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ kernel/sched/core.c | 39 +++++++++++++---------- kernel/sched/rt.c | 86 +++++++++++++++++++++++++++++++++++---------------- kernel/sched/sched.h | 36 ++++++++++++++++----- 4 files changed, 113 insertions(+), 50 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index a292c4b7e94c..87e5f9886ac8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1293,6 +1293,8 @@ struct sched_rt_entity { unsigned long timeout; unsigned long watchdog_stamp; unsigned int time_slice; + unsigned short on_rq; + unsigned short on_list; struct sched_rt_entity *back; #ifdef CONFIG_RT_GROUP_SCHED diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 291552b4d8ee..bb565b4663c8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2221,6 +2221,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) __dl_clear_params(p); INIT_LIST_HEAD(&p->rt.run_list); + p->rt.timeout = 0; + p->rt.time_slice = sched_rr_timeslice; + p->rt.on_rq = 0; + p->rt.on_list = 0; #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); @@ -3531,7 +3535,7 @@ EXPORT_SYMBOL(default_wake_function); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE; + int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE; struct rq *rq; const struct sched_class *prev_class; @@ -3559,11 +3563,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio) trace_sched_pi_setprio(p, prio); oldprio = p->prio; + + if (oldprio == prio) + queue_flag &= ~DEQUEUE_MOVE; + prev_class = p->sched_class; queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); + dequeue_task(rq, p, queue_flag); if (running) put_prev_task(rq, p); @@ -3581,7 +3589,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (!dl_prio(p->normal_prio) || (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { p->dl.dl_boosted = 1; - enqueue_flag |= ENQUEUE_REPLENISH; + queue_flag |= ENQUEUE_REPLENISH; } else p->dl.dl_boosted = 0; p->sched_class = &dl_sched_class; @@ -3589,7 +3597,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (dl_prio(oldprio)) p->dl.dl_boosted = 0; if (oldprio < prio) - enqueue_flag |= ENQUEUE_HEAD; + queue_flag |= ENQUEUE_HEAD; p->sched_class = &rt_sched_class; } else { if (dl_prio(oldprio)) @@ -3604,7 +3612,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (running) p->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, p, enqueue_flag); + enqueue_task(rq, p, queue_flag); check_class_changed(rq, p, prev_class, oldprio); out_unlock: @@ -3960,6 +3968,7 @@ static int __sched_setscheduler(struct task_struct *p, const struct sched_class *prev_class; struct rq *rq; int reset_on_fork; + int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; /* may grab non-irq protected spin_locks */ BUG_ON(in_interrupt()); @@ -4142,17 +4151,14 @@ change: * itself. */ new_effective_prio = rt_mutex_get_effective_prio(p, newprio); - if (new_effective_prio == oldprio) { - __setscheduler_params(p, attr); - task_rq_unlock(rq, p, &flags); - return 0; - } + if (new_effective_prio == oldprio) + queue_flags &= ~DEQUEUE_MOVE; } queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); + dequeue_task(rq, p, queue_flags); if (running) put_prev_task(rq, p); @@ -4162,15 +4168,14 @@ change: if (running) p->sched_class->set_curr_task(rq); if (queued) { - int enqueue_flags = ENQUEUE_RESTORE; /* * We enqueue to tail when the priority of a task is * increased (user space view). */ - if (oldprio <= p->prio) - enqueue_flags |= ENQUEUE_HEAD; + if (oldprio < p->prio) + queue_flags |= ENQUEUE_HEAD; - enqueue_task(rq, p, enqueue_flags); + enqueue_task(rq, p, queue_flags); } check_class_changed(rq, p, prev_class, oldprio); @@ -7958,7 +7963,7 @@ void sched_move_task(struct task_struct *tsk) queued = task_on_rq_queued(tsk); if (queued) - dequeue_task(rq, tsk, DEQUEUE_SAVE); + dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); if (unlikely(running)) put_prev_task(rq, tsk); @@ -7982,7 +7987,7 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, tsk, ENQUEUE_RESTORE); + enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); task_rq_unlock(rq, tsk, &flags); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 8ec86abe0ea1..406a9c20b210 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -436,7 +436,7 @@ static void dequeue_top_rt_rq(struct rt_rq *rt_rq); static inline int on_rt_rq(struct sched_rt_entity *rt_se) { - return !list_empty(&rt_se->run_list); + return rt_se->on_rq; } #ifdef CONFIG_RT_GROUP_SCHED @@ -482,8 +482,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) return rt_se->my_q; } -static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head); -static void dequeue_rt_entity(struct sched_rt_entity *rt_se); +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); +static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { @@ -499,7 +499,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) if (!rt_se) enqueue_top_rt_rq(rt_rq); else if (!on_rt_rq(rt_se)) - enqueue_rt_entity(rt_se, false); + enqueue_rt_entity(rt_se, 0); if (rt_rq->highest_prio.curr < curr->prio) resched_curr(rq); @@ -516,7 +516,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) if (!rt_se) dequeue_top_rt_rq(rt_rq); else if (on_rt_rq(rt_se)) - dequeue_rt_entity(rt_se); + dequeue_rt_entity(rt_se, 0); } static inline int rt_rq_throttled(struct rt_rq *rt_rq) @@ -1166,7 +1166,30 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) dec_rt_group(rt_se, rt_rq); } -static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) +/* + * Change rt_se->run_list location unless SAVE && !MOVE + * + * assumes ENQUEUE/DEQUEUE flags match + */ +static inline bool move_entity(unsigned int flags) +{ + if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) + return false; + + return true; +} + +static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array) +{ + list_del_init(&rt_se->run_list); + + if (list_empty(array->queue + rt_se_prio(rt_se))) + __clear_bit(rt_se_prio(rt_se), array->bitmap); + + rt_se->on_list = 0; +} + +static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; @@ -1179,26 +1202,37 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) * get throttled and the current group doesn't have any other * active members. */ - if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) + if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) { + if (rt_se->on_list) + __delist_rt_entity(rt_se, array); return; + } - if (head) - list_add(&rt_se->run_list, queue); - else - list_add_tail(&rt_se->run_list, queue); - __set_bit(rt_se_prio(rt_se), array->bitmap); + if (move_entity(flags)) { + WARN_ON_ONCE(rt_se->on_list); + if (flags & ENQUEUE_HEAD) + list_add(&rt_se->run_list, queue); + else + list_add_tail(&rt_se->run_list, queue); + + __set_bit(rt_se_prio(rt_se), array->bitmap); + rt_se->on_list = 1; + } + rt_se->on_rq = 1; inc_rt_tasks(rt_se, rt_rq); } -static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) +static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; - list_del_init(&rt_se->run_list); - if (list_empty(array->queue + rt_se_prio(rt_se))) - __clear_bit(rt_se_prio(rt_se), array->bitmap); + if (move_entity(flags)) { + WARN_ON_ONCE(!rt_se->on_list); + __delist_rt_entity(rt_se, array); + } + rt_se->on_rq = 0; dec_rt_tasks(rt_se, rt_rq); } @@ -1207,7 +1241,7 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) * Because the prio of an upper entry depends on the lower * entries, we must remove entries top - down. */ -static void dequeue_rt_stack(struct sched_rt_entity *rt_se) +static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags) { struct sched_rt_entity *back = NULL; @@ -1220,31 +1254,31 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se) for (rt_se = back; rt_se; rt_se = rt_se->back) { if (on_rt_rq(rt_se)) - __dequeue_rt_entity(rt_se); + __dequeue_rt_entity(rt_se, flags); } } -static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rq *rq = rq_of_rt_se(rt_se); - dequeue_rt_stack(rt_se); + dequeue_rt_stack(rt_se, flags); for_each_sched_rt_entity(rt_se) - __enqueue_rt_entity(rt_se, head); + __enqueue_rt_entity(rt_se, flags); enqueue_top_rt_rq(&rq->rt); } -static void dequeue_rt_entity(struct sched_rt_entity *rt_se) +static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rq *rq = rq_of_rt_se(rt_se); - dequeue_rt_stack(rt_se); + dequeue_rt_stack(rt_se, flags); for_each_sched_rt_entity(rt_se) { struct rt_rq *rt_rq = group_rt_rq(rt_se); if (rt_rq && rt_rq->rt_nr_running) - __enqueue_rt_entity(rt_se, false); + __enqueue_rt_entity(rt_se, flags); } enqueue_top_rt_rq(&rq->rt); } @@ -1260,7 +1294,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) if (flags & ENQUEUE_WAKEUP) rt_se->timeout = 0; - enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); + enqueue_rt_entity(rt_se, flags); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); @@ -1271,7 +1305,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) struct sched_rt_entity *rt_se = &p->rt; update_curr_rt(rq); - dequeue_rt_entity(rt_se); + dequeue_rt_entity(rt_se, flags); dequeue_pushable_task(rq, p); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3dc7b8b3f94c..d3606e40ea0d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1130,18 +1130,40 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) extern const int sched_prio_to_weight[40]; extern const u32 sched_prio_to_wmult[40]; +/* + * {de,en}queue flags: + * + * DEQUEUE_SLEEP - task is no longer runnable + * ENQUEUE_WAKEUP - task just became runnable + * + * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks + * are in a known state which allows modification. Such pairs + * should preserve as much state as possible. + * + * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location + * in the runqueue. + * + * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) + * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) + * ENQUEUE_WAKING - sched_class::task_waking was called + * + */ + +#define DEQUEUE_SLEEP 0x01 +#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ +#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ + #define ENQUEUE_WAKEUP 0x01 -#define ENQUEUE_HEAD 0x02 +#define ENQUEUE_RESTORE 0x02 +#define ENQUEUE_MOVE 0x04 + +#define ENQUEUE_HEAD 0x08 +#define ENQUEUE_REPLENISH 0x10 #ifdef CONFIG_SMP -#define ENQUEUE_WAKING 0x04 /* sched_class::task_waking was called */ +#define ENQUEUE_WAKING 0x20 #else #define ENQUEUE_WAKING 0x00 #endif -#define ENQUEUE_REPLENISH 0x08 -#define ENQUEUE_RESTORE 0x10 - -#define DEQUEUE_SLEEP 0x01 -#define DEQUEUE_SAVE 0x02 #define RETRY_TASK ((void *)-1UL) -- cgit From d6ca41d7922ce0110a840ef4f8ec4afdd5a239d3 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 22 Feb 2016 16:26:50 -0500 Subject: sched/debug: Move the /sys/kernel/debug/sched_features file setup into debug.c As /sys/kernel/debug/sched_features is only created when SCHED_DEBUG is enabled, and the file debug.c is only compiled when SCHED_DEBUG is enabled, it makes sense to move sched_feature setup into that file and get rid of the #ifdef. Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Clark Williams Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160222212825.464193063@goodmis.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 133 --------------------------------------------------- kernel/sched/debug.c | 131 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 131 insertions(+), 133 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bb565b4663c8..ffaaeb8a9f98 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -66,7 +66,6 @@ #include #include #include -#include #include #include #include @@ -124,138 +123,6 @@ const_debug unsigned int sysctl_sched_features = #undef SCHED_FEAT -#ifdef CONFIG_SCHED_DEBUG -#define SCHED_FEAT(name, enabled) \ - #name , - -static const char * const sched_feat_names[] = { -#include "features.h" -}; - -#undef SCHED_FEAT - -static int sched_feat_show(struct seq_file *m, void *v) -{ - int i; - - for (i = 0; i < __SCHED_FEAT_NR; i++) { - if (!(sysctl_sched_features & (1UL << i))) - seq_puts(m, "NO_"); - seq_printf(m, "%s ", sched_feat_names[i]); - } - seq_puts(m, "\n"); - - return 0; -} - -#ifdef HAVE_JUMP_LABEL - -#define jump_label_key__true STATIC_KEY_INIT_TRUE -#define jump_label_key__false STATIC_KEY_INIT_FALSE - -#define SCHED_FEAT(name, enabled) \ - jump_label_key__##enabled , - -struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { -#include "features.h" -}; - -#undef SCHED_FEAT - -static void sched_feat_disable(int i) -{ - static_key_disable(&sched_feat_keys[i]); -} - -static void sched_feat_enable(int i) -{ - static_key_enable(&sched_feat_keys[i]); -} -#else -static void sched_feat_disable(int i) { }; -static void sched_feat_enable(int i) { }; -#endif /* HAVE_JUMP_LABEL */ - -static int sched_feat_set(char *cmp) -{ - int i; - int neg = 0; - - if (strncmp(cmp, "NO_", 3) == 0) { - neg = 1; - cmp += 3; - } - - for (i = 0; i < __SCHED_FEAT_NR; i++) { - if (strcmp(cmp, sched_feat_names[i]) == 0) { - if (neg) { - sysctl_sched_features &= ~(1UL << i); - sched_feat_disable(i); - } else { - sysctl_sched_features |= (1UL << i); - sched_feat_enable(i); - } - break; - } - } - - return i; -} - -static ssize_t -sched_feat_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[64]; - char *cmp; - int i; - struct inode *inode; - - if (cnt > 63) - cnt = 63; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - cmp = strstrip(buf); - - /* Ensure the static_key remains in a consistent state */ - inode = file_inode(filp); - inode_lock(inode); - i = sched_feat_set(cmp); - inode_unlock(inode); - if (i == __SCHED_FEAT_NR) - return -EINVAL; - - *ppos += cnt; - - return cnt; -} - -static int sched_feat_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, sched_feat_show, NULL); -} - -static const struct file_operations sched_feat_fops = { - .open = sched_feat_open, - .write = sched_feat_write, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static __init int sched_init_debug(void) -{ - debugfs_create_file("sched_features", 0644, NULL, NULL, - &sched_feat_fops); - - return 0; -} -late_initcall(sched_init_debug); -#endif /* CONFIG_SCHED_DEBUG */ - /* * Number of tasks to iterate in a single balance run. * Limited because this is done with IRQs disabled. diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 7cfa87bd8b89..d2dedfcbf84d 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "sched.h" @@ -58,6 +59,136 @@ static unsigned long nsec_low(unsigned long long nsec) #define SPLIT_NS(x) nsec_high(x), nsec_low(x) +#define SCHED_FEAT(name, enabled) \ + #name , + +static const char * const sched_feat_names[] = { +#include "features.h" +}; + +#undef SCHED_FEAT + +static int sched_feat_show(struct seq_file *m, void *v) +{ + int i; + + for (i = 0; i < __SCHED_FEAT_NR; i++) { + if (!(sysctl_sched_features & (1UL << i))) + seq_puts(m, "NO_"); + seq_printf(m, "%s ", sched_feat_names[i]); + } + seq_puts(m, "\n"); + + return 0; +} + +#ifdef HAVE_JUMP_LABEL + +#define jump_label_key__true STATIC_KEY_INIT_TRUE +#define jump_label_key__false STATIC_KEY_INIT_FALSE + +#define SCHED_FEAT(name, enabled) \ + jump_label_key__##enabled , + +struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { +#include "features.h" +}; + +#undef SCHED_FEAT + +static void sched_feat_disable(int i) +{ + static_key_disable(&sched_feat_keys[i]); +} + +static void sched_feat_enable(int i) +{ + static_key_enable(&sched_feat_keys[i]); +} +#else +static void sched_feat_disable(int i) { }; +static void sched_feat_enable(int i) { }; +#endif /* HAVE_JUMP_LABEL */ + +static int sched_feat_set(char *cmp) +{ + int i; + int neg = 0; + + if (strncmp(cmp, "NO_", 3) == 0) { + neg = 1; + cmp += 3; + } + + for (i = 0; i < __SCHED_FEAT_NR; i++) { + if (strcmp(cmp, sched_feat_names[i]) == 0) { + if (neg) { + sysctl_sched_features &= ~(1UL << i); + sched_feat_disable(i); + } else { + sysctl_sched_features |= (1UL << i); + sched_feat_enable(i); + } + break; + } + } + + return i; +} + +static ssize_t +sched_feat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + char *cmp; + int i; + struct inode *inode; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + cmp = strstrip(buf); + + /* Ensure the static_key remains in a consistent state */ + inode = file_inode(filp); + inode_lock(inode); + i = sched_feat_set(cmp); + inode_unlock(inode); + if (i == __SCHED_FEAT_NR) + return -EINVAL; + + *ppos += cnt; + + return cnt; +} + +static int sched_feat_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_feat_show, NULL); +} + +static const struct file_operations sched_feat_fops = { + .open = sched_feat_open, + .write = sched_feat_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static __init int sched_init_debug(void) +{ + debugfs_create_file("sched_features", 0644, NULL, NULL, + &sched_feat_fops); + + return 0; +} +late_initcall(sched_init_debug); + #ifdef CONFIG_FAIR_GROUP_SCHED static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) { -- cgit From 3866e845ed522258c77da2eaa9f849ef55206ca2 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 22 Feb 2016 16:26:51 -0500 Subject: sched/debug: Move sched_domain_sysctl to debug.c The sched_domain_sysctl setup is only enabled when SCHED_DEBUG is configured. As debug.c is only compiled when SCHED_DEBUG is configured as well, move the setup of sched_domain_sysctl into that file. Note, the (un)register_sched_domain_sysctl() functions had to be changed from static to allow access to them from core.c. Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Clark Williams Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160222212825.599278093@goodmis.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 178 --------------------------------------------------- kernel/sched/debug.c | 173 +++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 13 ++++ 3 files changed, 186 insertions(+), 178 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ffaaeb8a9f98..6dbb21f4b17c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -70,7 +70,6 @@ #include #include #include -#include #include #include @@ -5342,183 +5341,6 @@ static void migrate_tasks(struct rq *dead_rq) } #endif /* CONFIG_HOTPLUG_CPU */ -#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) - -static struct ctl_table sd_ctl_dir[] = { - { - .procname = "sched_domain", - .mode = 0555, - }, - {} -}; - -static struct ctl_table sd_ctl_root[] = { - { - .procname = "kernel", - .mode = 0555, - .child = sd_ctl_dir, - }, - {} -}; - -static struct ctl_table *sd_alloc_ctl_entry(int n) -{ - struct ctl_table *entry = - kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); - - return entry; -} - -static void sd_free_ctl_entry(struct ctl_table **tablep) -{ - struct ctl_table *entry; - - /* - * In the intermediate directories, both the child directory and - * procname are dynamically allocated and could fail but the mode - * will always be set. In the lowest directory the names are - * static strings and all have proc handlers. - */ - for (entry = *tablep; entry->mode; entry++) { - if (entry->child) - sd_free_ctl_entry(&entry->child); - if (entry->proc_handler == NULL) - kfree(entry->procname); - } - - kfree(*tablep); - *tablep = NULL; -} - -static int min_load_idx = 0; -static int max_load_idx = CPU_LOAD_IDX_MAX-1; - -static void -set_table_entry(struct ctl_table *entry, - const char *procname, void *data, int maxlen, - umode_t mode, proc_handler *proc_handler, - bool load_idx) -{ - entry->procname = procname; - entry->data = data; - entry->maxlen = maxlen; - entry->mode = mode; - entry->proc_handler = proc_handler; - - if (load_idx) { - entry->extra1 = &min_load_idx; - entry->extra2 = &max_load_idx; - } -} - -static struct ctl_table * -sd_alloc_ctl_domain_table(struct sched_domain *sd) -{ - struct ctl_table *table = sd_alloc_ctl_entry(14); - - if (table == NULL) - return NULL; - - set_table_entry(&table[0], "min_interval", &sd->min_interval, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[1], "max_interval", &sd->max_interval, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[2], "busy_idx", &sd->busy_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[3], "idle_idx", &sd->idle_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[5], "wake_idx", &sd->wake_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[7], "busy_factor", &sd->busy_factor, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[9], "cache_nice_tries", - &sd->cache_nice_tries, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[10], "flags", &sd->flags, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[11], "max_newidle_lb_cost", - &sd->max_newidle_lb_cost, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[12], "name", sd->name, - CORENAME_MAX_SIZE, 0444, proc_dostring, false); - /* &table[13] is terminator */ - - return table; -} - -static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) -{ - struct ctl_table *entry, *table; - struct sched_domain *sd; - int domain_num = 0, i; - char buf[32]; - - for_each_domain(cpu, sd) - domain_num++; - entry = table = sd_alloc_ctl_entry(domain_num + 1); - if (table == NULL) - return NULL; - - i = 0; - for_each_domain(cpu, sd) { - snprintf(buf, 32, "domain%d", i); - entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0555; - entry->child = sd_alloc_ctl_domain_table(sd); - entry++; - i++; - } - return table; -} - -static struct ctl_table_header *sd_sysctl_header; -static void register_sched_domain_sysctl(void) -{ - int i, cpu_num = num_possible_cpus(); - struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); - char buf[32]; - - WARN_ON(sd_ctl_dir[0].child); - sd_ctl_dir[0].child = entry; - - if (entry == NULL) - return; - - for_each_possible_cpu(i) { - snprintf(buf, 32, "cpu%d", i); - entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0555; - entry->child = sd_alloc_ctl_cpu_table(i); - entry++; - } - - WARN_ON(sd_sysctl_header); - sd_sysctl_header = register_sysctl_table(sd_ctl_root); -} - -/* may be called multiple times per register */ -static void unregister_sched_domain_sysctl(void) -{ - unregister_sysctl_table(sd_sysctl_header); - sd_sysctl_header = NULL; - if (sd_ctl_dir[0].child) - sd_free_ctl_entry(&sd_ctl_dir[0].child); -} -#else -static void register_sched_domain_sysctl(void) -{ -} -static void unregister_sched_domain_sysctl(void) -{ -} -#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */ - static void set_rq_online(struct rq *rq) { if (!rq->online) { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index d2dedfcbf84d..313c65fe2087 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -189,6 +189,179 @@ static __init int sched_init_debug(void) } late_initcall(sched_init_debug); +#ifdef CONFIG_SMP + +#ifdef CONFIG_SYSCTL + +static struct ctl_table sd_ctl_dir[] = { + { + .procname = "sched_domain", + .mode = 0555, + }, + {} +}; + +static struct ctl_table sd_ctl_root[] = { + { + .procname = "kernel", + .mode = 0555, + .child = sd_ctl_dir, + }, + {} +}; + +static struct ctl_table *sd_alloc_ctl_entry(int n) +{ + struct ctl_table *entry = + kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); + + return entry; +} + +static void sd_free_ctl_entry(struct ctl_table **tablep) +{ + struct ctl_table *entry; + + /* + * In the intermediate directories, both the child directory and + * procname are dynamically allocated and could fail but the mode + * will always be set. In the lowest directory the names are + * static strings and all have proc handlers. + */ + for (entry = *tablep; entry->mode; entry++) { + if (entry->child) + sd_free_ctl_entry(&entry->child); + if (entry->proc_handler == NULL) + kfree(entry->procname); + } + + kfree(*tablep); + *tablep = NULL; +} + +static int min_load_idx = 0; +static int max_load_idx = CPU_LOAD_IDX_MAX-1; + +static void +set_table_entry(struct ctl_table *entry, + const char *procname, void *data, int maxlen, + umode_t mode, proc_handler *proc_handler, + bool load_idx) +{ + entry->procname = procname; + entry->data = data; + entry->maxlen = maxlen; + entry->mode = mode; + entry->proc_handler = proc_handler; + + if (load_idx) { + entry->extra1 = &min_load_idx; + entry->extra2 = &max_load_idx; + } +} + +static struct ctl_table * +sd_alloc_ctl_domain_table(struct sched_domain *sd) +{ + struct ctl_table *table = sd_alloc_ctl_entry(14); + + if (table == NULL) + return NULL; + + set_table_entry(&table[0], "min_interval", &sd->min_interval, + sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[1], "max_interval", &sd->max_interval, + sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[2], "busy_idx", &sd->busy_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[3], "idle_idx", &sd->idle_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[5], "wake_idx", &sd->wake_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[7], "busy_factor", &sd->busy_factor, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[9], "cache_nice_tries", + &sd->cache_nice_tries, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[10], "flags", &sd->flags, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[11], "max_newidle_lb_cost", + &sd->max_newidle_lb_cost, + sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[12], "name", sd->name, + CORENAME_MAX_SIZE, 0444, proc_dostring, false); + /* &table[13] is terminator */ + + return table; +} + +static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) +{ + struct ctl_table *entry, *table; + struct sched_domain *sd; + int domain_num = 0, i; + char buf[32]; + + for_each_domain(cpu, sd) + domain_num++; + entry = table = sd_alloc_ctl_entry(domain_num + 1); + if (table == NULL) + return NULL; + + i = 0; + for_each_domain(cpu, sd) { + snprintf(buf, 32, "domain%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_domain_table(sd); + entry++; + i++; + } + return table; +} + +static struct ctl_table_header *sd_sysctl_header; +void register_sched_domain_sysctl(void) +{ + int i, cpu_num = num_possible_cpus(); + struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); + char buf[32]; + + WARN_ON(sd_ctl_dir[0].child); + sd_ctl_dir[0].child = entry; + + if (entry == NULL) + return; + + for_each_possible_cpu(i) { + snprintf(buf, 32, "cpu%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_cpu_table(i); + entry++; + } + + WARN_ON(sd_sysctl_header); + sd_sysctl_header = register_sysctl_table(sd_ctl_root); +} + +/* may be called multiple times per register */ +void unregister_sched_domain_sysctl(void) +{ + unregister_sysctl_table(sd_sysctl_header); + sd_sysctl_header = NULL; + if (sd_ctl_dir[0].child) + sd_free_ctl_entry(&sd_ctl_dir[0].child); +} +#endif /* CONFIG_SYSCTL */ +#endif /* CONFIG_SMP */ + #ifdef CONFIG_FAIR_GROUP_SCHED static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d3606e40ea0d..ef5875fff5b7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -908,6 +909,18 @@ static inline unsigned int group_first_cpu(struct sched_group *group) extern int group_balance_cpu(struct sched_group *sg); +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) +void register_sched_domain_sysctl(void); +void unregister_sched_domain_sysctl(void); +#else +static inline void register_sched_domain_sysctl(void) +{ +} +static inline void unregister_sched_domain_sysctl(void) +{ +} +#endif + #else static inline void sched_ttwu_pending(void) { } -- cgit From ef477183d06b0aa41c9e7c02cf5bfec41536e2c4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 22 Feb 2016 16:26:52 -0500 Subject: sched/debug: Add deadline scheduler bandwidth ratio to /proc/sched_debug Playing with SCHED_DEADLINE and cpusets, I found that I was unable to create new SCHED_DEADLINE tasks, with the error of EBUSY as if the bandwidth was already used up. I then realized there wa no way to see what bandwidth is used by the runqueues to debug the issue. By adding the dl_bw->bw and dl_bw->total_bw to the output of the deadline info in /proc/sched_debug, this allows us to see what bandwidth has been reserved and where a problem may exist. For example, before the issue we see the ratio of the bandwidth: # cat /proc/sys/kernel/sched_rt_runtime_us 950000 # cat /proc/sys/kernel/sched_rt_period_us 1000000 # grep dl /proc/sched_debug dl_rq[0]: .dl_nr_running : 0 .dl_bw->bw : 996147 .dl_bw->total_bw : 0 dl_rq[1]: .dl_nr_running : 0 .dl_bw->bw : 996147 .dl_bw->total_bw : 0 dl_rq[2]: .dl_nr_running : 0 .dl_bw->bw : 996147 .dl_bw->total_bw : 0 dl_rq[3]: .dl_nr_running : 0 .dl_bw->bw : 996147 .dl_bw->total_bw : 0 dl_rq[4]: .dl_nr_running : 0 .dl_bw->bw : 996147 .dl_bw->total_bw : 0 dl_rq[5]: .dl_nr_running : 0 .dl_bw->bw : 996147 .dl_bw->total_bw : 0 dl_rq[6]: .dl_nr_running : 0 .dl_bw->bw : 996147 .dl_bw->total_bw : 0 dl_rq[7]: .dl_nr_running : 0 .dl_bw->bw : 996147 .dl_bw->total_bw : 0 Note: (950000 / 1000000) << 20 == 996147 After I played with cpusets and hit the issue, the result is now: # grep dl /proc/sched_debug dl_rq[0]: .dl_nr_running : 0 .dl_bw->bw : 996147 .dl_bw->total_bw : -104857 dl_rq[1]: .dl_nr_running : 0 .dl_bw->bw : 996147 .dl_bw->total_bw : 104857 dl_rq[2]: .dl_nr_running : 0 .dl_bw->bw : 996147 .dl_bw->total_bw : 104857 dl_rq[3]: .dl_nr_running : 0 .dl_bw->bw : 996147 .dl_bw->total_bw : 104857 dl_rq[4]: .dl_nr_running : 0 .dl_bw->bw : 996147 .dl_bw->total_bw : -104857 dl_rq[5]: .dl_nr_running : 0 .dl_bw->bw : 996147 .dl_bw->total_bw : -104857 dl_rq[6]: .dl_nr_running : 0 .dl_bw->bw : 996147 .dl_bw->total_bw : -104857 dl_rq[7]: .dl_nr_running : 0 .dl_bw->bw : 996147 .dl_bw->total_bw : -104857 This shows that there is definitely a problem as we should never have a negative total bandwidth. Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Clark Williams Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160222212825.756849091@goodmis.org Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 313c65fe2087..4fbc3bd5ff60 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -566,8 +566,17 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) { + struct dl_bw *dl_bw; + SEQ_printf(m, "\ndl_rq[%d]:\n", cpu); SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running); +#ifdef CONFIG_SMP + dl_bw = &cpu_rq(cpu)->rd->dl_bw; +#else + dl_bw = &dl_rq->dl_bw; +#endif + SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw); + SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw); } extern __read_mostly int sched_clock_running; -- cgit From c3a990dc9fab590fb88608410f1cc2bc866bdf30 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 16 Feb 2016 18:37:46 -0500 Subject: sched/rt: Kick RT bandwidth timer immediately on start up I've been debugging why deadline tasks can cause the RT scheduler to throttle, even when the deadline tasks are only taking up 50% of the CPU and RT tasks are not even using 1% of the CPU. Here's what I found. In order to keep a CPU from being hogged by RT tasks, the deadline scheduler adds its run time (delta_exec) to the rt_time of the RT bandwidth. That way, if the two use more than 95% of the CPU within one second (default settings), the RT tasks are throttled to allow non RT tasks to run. Although the deadline tasks add their run time to the RT bandwidth, it lets the RT tasks do the accounting. This is where the problem lies. If a deadline task runs for a bit, and no RT tasks are running, then it will continually add to the RT rt_time that is used to calculate how much CPU the RT tasks use. But no RT period is in play, and this accumulation of the runtime never gets reset. When an RT task finally gets to run, and the watchdog goes off, it can see that the RT task has used more than it should of, because the deadline task added all this runtime to its rt_time. Then the RT task that just woke up gets throttled for no good reason. I also noticed that when an RT task is queued, it starts the timer to account for overload and such. But that timer goes off one period later, which may be too late and the extra rt_time will trigger a throttle. This is a quick work around to the problem. When a new RT task is queued, the bandwidth timer is set to go off immediately. Then the timer can clear out the extra time added to the rt_time while there was no RT task running. This stops my tests from triggering the throttle, and it will still throttle if an RT task runs too much, even while a deadline task is running. A better solution may be to subtract the bandwidth that the deadline task uses from the rt_runtime, and add it back when its finished. Then there wont be a need for runtime tracking of the time used by deadline tasks. I may play with that solution tomorrow. Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Clark Williams Cc: Daniel Bristot de Oliveira Cc: John Kacur Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160216183746.349ec98b@gandalf.local.home Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 406a9c20b210..a774b4dbf291 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -58,7 +58,15 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) raw_spin_lock(&rt_b->rt_runtime_lock); if (!rt_b->rt_period_active) { rt_b->rt_period_active = 1; - hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period); + /* + * SCHED_DEADLINE updates the bandwidth, as a run away + * RT task with a DL task could hog a CPU. But DL does + * not reset the period. If a deadline task was running + * without an RT task running, it can cause RT tasks to + * throttle when they start up. Kick the timer right away + * to update the period. + */ + hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0)); hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED); } raw_spin_unlock(&rt_b->rt_runtime_lock); -- cgit From 382c2fe994321d503647ce8ee329b9420dc7c1f9 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 10 Feb 2016 20:08:24 -0500 Subject: sched, time: Remove non-power-of-two divides from __acct_update_integrals() When running a microbenchmark calling an invalid syscall number in a loop, on a nohz_full CPU, we spend a full 9% of our CPU time in __acct_update_integrals(). This function converts cputime_t to jiffies, to a timeval, only to convert the timeval back to microseconds before discarding it. This patch leaves __acct_update_integrals() functionally equivalent, but speeds things up by about 12%, with 10 million calls to an invalid syscall number dropping from 3.7 to 3.25 seconds. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: clark@redhat.com Cc: eric.dumazet@gmail.com Cc: fweisbec@gmail.com Cc: luto@amacapital.net Link: http://lkml.kernel.org/r/1455152907-18495-2-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/tsacct.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 975cb49e32bf..460ee2bbfef3 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -93,9 +93,11 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) { struct mm_struct *mm; - /* convert pages-usec to Mbyte-usec */ - stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB; - stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB; + /* convert pages-nsec/1024 to Mbyte-usec, see __acct_update_integrals */ + stats->coremem = p->acct_rss_mem1 * PAGE_SIZE; + do_div(stats->coremem, 1000 * KB); + stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE; + do_div(stats->virtmem, 1000 * KB); mm = get_task_mm(p); if (mm) { /* adjust to KB unit */ @@ -125,22 +127,26 @@ static void __acct_update_integrals(struct task_struct *tsk, { if (likely(tsk->mm)) { cputime_t time, dtime; - struct timeval value; unsigned long flags; u64 delta; local_irq_save(flags); time = stime + utime; dtime = time - tsk->acct_timexpd; - jiffies_to_timeval(cputime_to_jiffies(dtime), &value); - delta = value.tv_sec; - delta = delta * USEC_PER_SEC + value.tv_usec; + /* Avoid division: cputime_t is often in nanoseconds already. */ + delta = cputime_to_nsecs(dtime); - if (delta == 0) + if (delta < TICK_NSEC) goto out; + tsk->acct_timexpd = time; - tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); - tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; + /* + * Divide by 1024 to avoid overflow, and to avoid division. + * The final unit reported to userspace is Mbyte-usecs, + * the rest of the math is done in xacct_add_tsk. + */ + tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10; + tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10; out: local_irq_restore(flags); } -- cgit From b2add86edd3bc050af350515e6ba26f4622c38f3 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 10 Feb 2016 20:08:25 -0500 Subject: acct, time: Change indentation in __acct_update_integrals() Change the indentation in __acct_update_integrals() to make the function a little easier to read. Suggested-by: Peter Zijlstra Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Acked-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Mike Galbraith Cc: clark@redhat.com Cc: eric.dumazet@gmail.com Cc: fweisbec@gmail.com Cc: luto@amacapital.net Link: http://lkml.kernel.org/r/1455152907-18495-3-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/tsacct.c | 51 ++++++++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 460ee2bbfef3..d12e815b7bcd 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -125,31 +125,32 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) static void __acct_update_integrals(struct task_struct *tsk, cputime_t utime, cputime_t stime) { - if (likely(tsk->mm)) { - cputime_t time, dtime; - unsigned long flags; - u64 delta; - - local_irq_save(flags); - time = stime + utime; - dtime = time - tsk->acct_timexpd; - /* Avoid division: cputime_t is often in nanoseconds already. */ - delta = cputime_to_nsecs(dtime); - - if (delta < TICK_NSEC) - goto out; - - tsk->acct_timexpd = time; - /* - * Divide by 1024 to avoid overflow, and to avoid division. - * The final unit reported to userspace is Mbyte-usecs, - * the rest of the math is done in xacct_add_tsk. - */ - tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10; - tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10; - out: - local_irq_restore(flags); - } + cputime_t time, dtime; + unsigned long flags; + u64 delta; + + if (!likely(tsk->mm)) + return; + + local_irq_save(flags); + time = stime + utime; + dtime = time - tsk->acct_timexpd; + /* Avoid division: cputime_t is often in nanoseconds already. */ + delta = cputime_to_nsecs(dtime); + + if (delta < TICK_NSEC) + goto out; + + tsk->acct_timexpd = time; + /* + * Divide by 1024 to avoid overflow, and to avoid division. + * The final unit reported to userspace is Mbyte-usecs, + * the rest of the math is done in xacct_add_tsk. + */ + tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10; + tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10; +out: + local_irq_restore(flags); } /** -- cgit From 9344c92c2e72e495f695caef8364b3dd73af0eab Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 10 Feb 2016 20:08:26 -0500 Subject: time, acct: Drop irq save & restore from __acct_update_integrals() It looks like all the call paths that lead to __acct_update_integrals() already have irqs disabled, and __acct_update_integrals() does not need to disable irqs itself. This is very convenient since about half the CPU time left in this function was spent in local_irq_save alone. Performance of a microbenchmark that calls an invalid syscall ten million times in a row on a nohz_full CPU improves 21% vs. 4.5-rc1 with both the removal of divisions from __acct_update_integrals() and this patch, with runtime dropping from 3.7 to 2.9 seconds. With these patches applied, the highest remaining cpu user in the trace is native_sched_clock, which is addressed in the next patch. For testing purposes I stuck a WARN_ON(!irqs_disabled()) test in __acct_update_integrals(). It did not trigger. Suggested-by: Peter Zijlstra Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Cc: Linus Torvalds Cc: Mike Galbraith Cc: clark@redhat.com Cc: eric.dumazet@gmail.com Cc: fweisbec@gmail.com Cc: luto@amacapital.net Link: http://lkml.kernel.org/r/1455152907-18495-4-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/tsacct.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/tsacct.c b/kernel/tsacct.c index d12e815b7bcd..f8e26ab963ed 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -126,20 +126,18 @@ static void __acct_update_integrals(struct task_struct *tsk, cputime_t utime, cputime_t stime) { cputime_t time, dtime; - unsigned long flags; u64 delta; if (!likely(tsk->mm)) return; - local_irq_save(flags); time = stime + utime; dtime = time - tsk->acct_timexpd; /* Avoid division: cputime_t is often in nanoseconds already. */ delta = cputime_to_nsecs(dtime); if (delta < TICK_NSEC) - goto out; + return; tsk->acct_timexpd = time; /* @@ -149,8 +147,6 @@ static void __acct_update_integrals(struct task_struct *tsk, */ tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10; tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10; -out: - local_irq_restore(flags); } /** @@ -160,9 +156,12 @@ out: void acct_update_integrals(struct task_struct *tsk) { cputime_t utime, stime; + unsigned long flags; + local_irq_save(flags); task_cputime(tsk, &utime, &stime); __acct_update_integrals(tsk, utime, stime); + local_irq_restore(flags); } /** -- cgit From ff9a9b4c4334b53b52ee9279f30bd5dd92ea9bdd Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 10 Feb 2016 20:08:27 -0500 Subject: sched, time: Switch VIRT_CPU_ACCOUNTING_GEN to jiffy granularity When profiling syscall overhead on nohz-full kernels, after removing __acct_update_integrals() from the profile, native_sched_clock() remains as the top CPU user. This can be reduced by moving VIRT_CPU_ACCOUNTING_GEN to jiffy granularity. This will reduce timing accuracy on nohz_full CPUs to jiffy based sampling, just like on normal CPUs. It results in totally removing native_sched_clock from the profile, and significantly speeding up the syscall entry and exit path, as well as irq entry and exit, and KVM guest entry & exit. Additionally, only call the more expensive functions (and advance the seqlock) when jiffies actually changed. This code relies on another CPU advancing jiffies when the system is busy. On a nohz_full system, this is done by a housekeeping CPU. A microbenchmark calling an invalid syscall number 10 million times in a row speeds up an additional 30% over the numbers with just the previous patches, for a total speedup of about 40% over 4.4 and 4.5-rc1. Run times for the microbenchmark: 4.4 3.8 seconds 4.5-rc1 3.7 seconds 4.5-rc1 + first patch 3.3 seconds 4.5-rc1 + first 3 patches 3.1 seconds 4.5-rc1 + all patches 2.3 seconds A non-NOHZ_FULL cpu (not the housekeeping CPU): all kernels 1.86 seconds Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: clark@redhat.com Cc: eric.dumazet@gmail.com Cc: fweisbec@gmail.com Cc: luto@amacapital.net Link: http://lkml.kernel.org/r/1455152907-18495-5-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index b2ab2ffb1adc..01d9898bc9a2 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -668,26 +668,25 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -static unsigned long long vtime_delta(struct task_struct *tsk) +static cputime_t vtime_delta(struct task_struct *tsk) { - unsigned long long clock; + unsigned long now = READ_ONCE(jiffies); - clock = local_clock(); - if (clock < tsk->vtime_snap) + if (time_before(now, (unsigned long)tsk->vtime_snap)) return 0; - return clock - tsk->vtime_snap; + return jiffies_to_cputime(now - tsk->vtime_snap); } static cputime_t get_vtime_delta(struct task_struct *tsk) { - unsigned long long delta = vtime_delta(tsk); + unsigned long now = READ_ONCE(jiffies); + unsigned long delta = now - tsk->vtime_snap; WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); - tsk->vtime_snap += delta; + tsk->vtime_snap = now; - /* CHECKME: always safe to convert nsecs to cputime? */ - return nsecs_to_cputime(delta); + return jiffies_to_cputime(delta); } static void __vtime_account_system(struct task_struct *tsk) @@ -699,6 +698,9 @@ static void __vtime_account_system(struct task_struct *tsk) void vtime_account_system(struct task_struct *tsk) { + if (!vtime_delta(tsk)) + return; + write_seqcount_begin(&tsk->vtime_seqcount); __vtime_account_system(tsk); write_seqcount_end(&tsk->vtime_seqcount); @@ -707,7 +709,8 @@ void vtime_account_system(struct task_struct *tsk) void vtime_gen_account_irq_exit(struct task_struct *tsk) { write_seqcount_begin(&tsk->vtime_seqcount); - __vtime_account_system(tsk); + if (vtime_delta(tsk)) + __vtime_account_system(tsk); if (context_tracking_in_user()) tsk->vtime_snap_whence = VTIME_USER; write_seqcount_end(&tsk->vtime_seqcount); @@ -718,16 +721,19 @@ void vtime_account_user(struct task_struct *tsk) cputime_t delta_cpu; write_seqcount_begin(&tsk->vtime_seqcount); - delta_cpu = get_vtime_delta(tsk); tsk->vtime_snap_whence = VTIME_SYS; - account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); + if (vtime_delta(tsk)) { + delta_cpu = get_vtime_delta(tsk); + account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); + } write_seqcount_end(&tsk->vtime_seqcount); } void vtime_user_enter(struct task_struct *tsk) { write_seqcount_begin(&tsk->vtime_seqcount); - __vtime_account_system(tsk); + if (vtime_delta(tsk)) + __vtime_account_system(tsk); tsk->vtime_snap_whence = VTIME_USER; write_seqcount_end(&tsk->vtime_seqcount); } @@ -742,7 +748,8 @@ void vtime_guest_enter(struct task_struct *tsk) * that can thus safely catch up with a tickless delta. */ write_seqcount_begin(&tsk->vtime_seqcount); - __vtime_account_system(tsk); + if (vtime_delta(tsk)) + __vtime_account_system(tsk); current->flags |= PF_VCPU; write_seqcount_end(&tsk->vtime_seqcount); } @@ -772,7 +779,7 @@ void arch_vtime_task_switch(struct task_struct *prev) write_seqcount_begin(¤t->vtime_seqcount); current->vtime_snap_whence = VTIME_SYS; - current->vtime_snap = sched_clock_cpu(smp_processor_id()); + current->vtime_snap = jiffies; write_seqcount_end(¤t->vtime_seqcount); } @@ -783,7 +790,7 @@ void vtime_init_idle(struct task_struct *t, int cpu) local_irq_save(flags); write_seqcount_begin(&t->vtime_seqcount); t->vtime_snap_whence = VTIME_SYS; - t->vtime_snap = sched_clock_cpu(cpu); + t->vtime_snap = jiffies; write_seqcount_end(&t->vtime_seqcount); local_irq_restore(flags); } -- cgit From f904f58263e1df5f70feb8b283f4bbe662847334 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 26 Feb 2016 14:54:56 +0100 Subject: sched/debug: Fix preempt_disable_ip recording for preempt_disable() The preempt_disable() invokes preempt_count_add() which saves the caller in ->preempt_disable_ip. It uses CALLER_ADDR1 which does not look for its caller but for the parent of the caller. Which means we get the correct caller for something like spin_lock() unless the architectures inlines those invocations. It is always wrong for preempt_disable() or local_bh_disable(). This patch makes the function get_lock_parent_ip() which tries CALLER_ADDR0,1,2 if the former is a locking function. This seems to record the preempt_disable() caller properly for preempt_disable() itself as well as for get_cpu_var() or local_bh_disable(). Steven asked for the get_parent_ip() -> get_lock_parent_ip() rename. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160226135456.GB18244@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 12 ++++++++++++ include/linux/sched.h | 2 -- kernel/sched/core.c | 14 ++------------ kernel/softirq.c | 4 ++-- 4 files changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index c2b340e23f62..6d9df3f7e334 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -713,6 +713,18 @@ static inline void __ftrace_enabled_restore(int enabled) #define CALLER_ADDR5 ((unsigned long)ftrace_return_address(5)) #define CALLER_ADDR6 ((unsigned long)ftrace_return_address(6)) +static inline unsigned long get_lock_parent_ip(void) +{ + unsigned long addr = CALLER_ADDR0; + + if (!in_lock_functions(addr)) + return addr; + addr = CALLER_ADDR1; + if (!in_lock_functions(addr)) + return addr; + return CALLER_ADDR2; +} + #ifdef CONFIG_IRQSOFF_TRACER extern void time_hardirqs_on(unsigned long a0, unsigned long a1); extern void time_hardirqs_off(unsigned long a0, unsigned long a1); diff --git a/include/linux/sched.h b/include/linux/sched.h index 87e5f9886ac8..9519b66fc21f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -182,8 +182,6 @@ extern void update_cpu_load_nohz(int active); static inline void update_cpu_load_nohz(int active) { } #endif -extern unsigned long get_parent_ip(unsigned long addr); - extern void dump_cpu_task(int cpu); struct seq_file; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6dbb21f4b17c..423452cae5cf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2946,16 +2946,6 @@ u64 scheduler_tick_max_deferment(void) } #endif -notrace unsigned long get_parent_ip(unsigned long addr) -{ - if (in_lock_functions(addr)) { - addr = CALLER_ADDR2; - if (in_lock_functions(addr)) - addr = CALLER_ADDR3; - } - return addr; -} - #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_PREEMPT_TRACER)) @@ -2977,7 +2967,7 @@ void preempt_count_add(int val) PREEMPT_MASK - 10); #endif if (preempt_count() == val) { - unsigned long ip = get_parent_ip(CALLER_ADDR1); + unsigned long ip = get_lock_parent_ip(); #ifdef CONFIG_DEBUG_PREEMPT current->preempt_disable_ip = ip; #endif @@ -3004,7 +2994,7 @@ void preempt_count_sub(int val) #endif if (preempt_count() == val) - trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); __preempt_count_sub(val); } EXPORT_SYMBOL(preempt_count_sub); diff --git a/kernel/softirq.c b/kernel/softirq.c index 479e4436f787..8aae49dd7da8 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -116,9 +116,9 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) if (preempt_count() == cnt) { #ifdef CONFIG_DEBUG_PREEMPT - current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1); + current->preempt_disable_ip = get_lock_parent_ip(); #endif - trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + trace_preempt_off(CALLER_ADDR0, get_lock_parent_ip()); } } EXPORT_SYMBOL(__local_bh_disable_ip); -- cgit From 801ccdbf018ca5dbd478756ece55cd6c7726ed5b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Feb 2016 15:01:49 +0100 Subject: sched/deadline: Remove superfluous call to switched_to_dl() if (A || B) { } else if (A && !B) { } If A we'll take the first branch, if !A we will not satisfy the second. Therefore the second branch will never be taken. Reported-by: luca abeni Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160225140149.GK6357@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 04a569cdd613..15abf04bf0b8 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1772,8 +1772,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, */ resched_curr(rq); #endif /* CONFIG_SMP */ - } else - switched_to_dl(rq, p); + } } const struct sched_class dl_sched_class = { -- cgit From 920c720aa5aa3900a7f1689228fdfc2580a91e7e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 1 Feb 2016 15:11:28 +0100 Subject: locking/mcs: Fix mcs_spin_lock() ordering Similar to commit b4b29f94856a ("locking/osq: Fix ordering of node initialisation in osq_lock") the use of xchg_acquire() is fundamentally broken with MCS like constructs. Furthermore, it turns out we rely on the global transitivity of this operation because the unlock path observes the pointer with a READ_ONCE(), not an smp_load_acquire(). This is non-critical because the MCS code isn't actually used and mostly serves as documentation, a stepping stone to the more complex things we've build on top of the idea. Reported-by: Andrea Parri Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Fixes: 3552a07a9c4a ("locking/mcs: Use acquire/release semantics") Signed-off-by: Ingo Molnar --- kernel/locking/mcs_spinlock.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 5b9102a47ea5..c835270f0c2f 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -67,7 +67,13 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) node->locked = 0; node->next = NULL; - prev = xchg_acquire(lock, node); + /* + * We rely on the full barrier with global transitivity implied by the + * below xchg() to order the initialization stores above against any + * observation of @node. And to provide the ACQUIRE ordering associated + * with a LOCK primitive. + */ + prev = xchg(lock, node); if (likely(prev == NULL)) { /* * Lock acquired, don't need to set node->locked to 1. Threads -- cgit From eaff0e7003cca6c2748b67ead2d4b1a8ad858fc7 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 10 Dec 2015 15:17:46 -0500 Subject: locking/pvqspinlock: Move lock stealing count tracking code into pv_queued_spin_steal_lock() This patch moves the lock stealing count tracking code into pv_queued_spin_steal_lock() instead of via a jacket function simplifying the code. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Douglas Hatch Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1449778666-13593-3-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock_paravirt.h | 16 +++++++++------- kernel/locking/qspinlock_stat.h | 13 ------------- 2 files changed, 9 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 87bb235c3448..78f04a232d3a 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -54,6 +54,11 @@ struct pv_node { u8 state; }; +/* + * Include queued spinlock statistics code + */ +#include "qspinlock_stat.h" + /* * By replacing the regular queued_spin_trylock() with the function below, * it will be called once when a lock waiter enter the PV slowpath before @@ -65,9 +70,11 @@ struct pv_node { static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock) { struct __qspinlock *l = (void *)lock; + int ret = !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && + (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0); - return !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && - (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0); + qstat_inc(qstat_pv_lock_stealing, ret); + return ret; } /* @@ -137,11 +144,6 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock) } #endif /* _Q_PENDING_BITS == 8 */ -/* - * Include queued spinlock statistics code - */ -#include "qspinlock_stat.h" - /* * Lock and MCS node addresses hash table for fast lookup * diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index 640dcecdd1df..869988d46124 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -279,19 +279,6 @@ static inline void __pv_wait(u8 *ptr, u8 val) #define pv_kick(c) __pv_kick(c) #define pv_wait(p, v) __pv_wait(p, v) -/* - * PV unfair trylock count tracking function - */ -static inline int qstat_spin_steal_lock(struct qspinlock *lock) -{ - int ret = pv_queued_spin_steal_lock(lock); - - qstat_inc(qstat_pv_lock_stealing, ret); - return ret; -} -#undef queued_spin_trylock -#define queued_spin_trylock(l) qstat_spin_steal_lock(l) - #else /* CONFIG_QUEUED_LOCK_STAT */ static inline void qstat_inc(enum qlock_stats stat, bool cond) { } -- cgit From cb037fdad6772df2d49fe61c97d7c0d8265bc918 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 10 Dec 2015 15:17:44 -0500 Subject: locking/qspinlock: Use smp_cond_acquire() in pending code The newly introduced smp_cond_acquire() was used to replace the slowpath lock acquisition loop. Similarly, the new function can also be applied to the pending bit locking loop. This patch uses the new function in that loop. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Douglas Hatch Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1449778666-13593-1-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 393d1874b9e0..ce2f75e32ae1 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -358,8 +358,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) * sequentiality; this is because not all clear_pending_set_locked() * implementations imply full barriers. */ - while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_MASK) - cpu_relax(); + smp_cond_acquire(!(atomic_read(&lock->val) & _Q_LOCKED_MASK)); /* * take ownership and clear the pending bit. @@ -435,7 +434,7 @@ queue: * * The PV pv_wait_head_or_lock function, if active, will acquire * the lock and return a non-zero value. So we have to skip the - * smp_load_acquire() call. As the next PV queue head hasn't been + * smp_cond_acquire() call. As the next PV queue head hasn't been * designated yet, there is no way for the locked value to become * _Q_SLOW_VAL. So both the set_locked() and the * atomic_cmpxchg_relaxed() calls will be safe. @@ -466,7 +465,7 @@ locked: break; } /* - * The smp_load_acquire() call above has provided the necessary + * The smp_cond_acquire() call above has provided the necessary * acquire semantics required for locking. At most two * iterations of this loop may be ran. */ -- cgit From 32d62510f949d3c8e83b9b3b844a84446611661b Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 10 Dec 2015 15:17:45 -0500 Subject: locking/pvqspinlock: Enable slowpath locking count tracking This patch enables the tracking of the number of slowpath locking operations performed. This can be used to compare against the number of lock stealing operations to see what percentage of locks are stolen versus acquired via the regular slowpath. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Douglas Hatch Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1449778666-13593-2-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock_paravirt.h | 5 +++++ kernel/locking/qspinlock_stat.h | 3 +++ 2 files changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 78f04a232d3a..21ede57f68b3 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -400,6 +400,11 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) if (READ_ONCE(pn->state) == vcpu_hashed) lp = (struct qspinlock **)1; + /* + * Tracking # of slowpath locking operations + */ + qstat_inc(qstat_pv_lock_slowpath, true); + for (;; waitcnt++) { /* * Set correct vCPU state to be used by queue node wait-early diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index 869988d46124..eb2a2c9bc3fc 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -22,6 +22,7 @@ * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake * pv_latency_kick - average latency (ns) of vCPU kick operation * pv_latency_wake - average latency (ns) from vCPU kick to wakeup + * pv_lock_slowpath - # of locking operations via the slowpath * pv_lock_stealing - # of lock stealing operations * pv_spurious_wakeup - # of spurious wakeups * pv_wait_again - # of vCPU wait's that happened after a vCPU kick @@ -45,6 +46,7 @@ enum qlock_stats { qstat_pv_kick_wake, qstat_pv_latency_kick, qstat_pv_latency_wake, + qstat_pv_lock_slowpath, qstat_pv_lock_stealing, qstat_pv_spurious_wakeup, qstat_pv_wait_again, @@ -70,6 +72,7 @@ static const char * const qstat_names[qstat_num + 1] = { [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup", [qstat_pv_latency_kick] = "pv_latency_kick", [qstat_pv_latency_wake] = "pv_latency_wake", + [qstat_pv_lock_slowpath] = "pv_lock_slowpath", [qstat_pv_lock_stealing] = "pv_lock_stealing", [qstat_pv_wait_again] = "pv_wait_again", [qstat_pv_wait_early] = "pv_wait_early", -- cgit From 1329ce6fbbe4536592dfcfc8d64d61bfeb598fe6 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Sun, 24 Jan 2016 18:23:43 -0800 Subject: locking/mutex: Allow next waiter lockless wakeup Make use of wake-queues and enable the wakeup to occur after releasing the wait_lock. This is similar to what we do with rtmutex top waiter, slightly shortening the critical region and allow other waiters to acquire the wait_lock sooner. In low contention cases it can also help the recently woken waiter to find the wait_lock available (fastpath) when it continues execution. Reviewed-by: Waiman Long Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Ding Tianhong Cc: Jason Low Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Waiman Long Cc: Will Deacon Link: http://lkml.kernel.org/r/20160125022343.GA3322@linux-uzut.site Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 0551c219c40e..e364b424b019 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -716,6 +716,7 @@ static inline void __mutex_unlock_common_slowpath(struct mutex *lock, int nested) { unsigned long flags; + WAKE_Q(wake_q); /* * As a performance measurement, release the lock before doing other @@ -743,11 +744,11 @@ __mutex_unlock_common_slowpath(struct mutex *lock, int nested) struct mutex_waiter, list); debug_mutex_wake_waiter(lock, waiter); - - wake_up_process(waiter->task); + wake_q_add(&wake_q, waiter->task); } spin_unlock_mutex(&lock->wait_lock, flags); + wake_up_q(&wake_q); } /* -- cgit From 5f18ab5c6bdba735b31a1e7c2618f48eae6b1b5c Mon Sep 17 00:00:00 2001 From: Alfredo Alvarez Fernandez Date: Thu, 11 Feb 2016 00:33:32 +0100 Subject: locking/lockdep: Prevent chain_key collisions The chain_key hashing macro iterate_chain_key(key1, key2) does not generate a new different value if both key1 and key2 are 0. In that case the generated value is again 0. This can lead to collisions which can result in lockdep not detecting deadlocks or circular dependencies. Avoid the problem by using class_idx (1-based) instead of class id (0-based) as an input for the hashing macro 'key2' in iterate_chain_key(key1, key2). The use of class id created collisions in cases like the following: 1.- Consider an initial state in which no class has been acquired yet. Under these circumstances an AA deadlock will not be detected by lockdep: lock [key1,key2]->new key (key1=old chain_key, key2=id) -------------------------- A [0,0]->0 A [0,0]->0 (collision) The newly generated chain_key collides with the one used before and as a result the check for a deadlock is skipped A simple test using liblockdep and a pthread mutex confirms the problem: (omitting stack traces) new class 0xe15038: 0x7ffc64950f20 acquire class [0xe15038] 0x7ffc64950f20 acquire class [0xe15038] 0x7ffc64950f20 hash chain already cached, key: 0000000000000000 tail class: [0xe15038] 0x7ffc64950f20 2.- Consider an ABBA in 2 different tasks and no class yet acquired. T1 [key1,key2]->new key T2[key1,key2]->new key -- -- A [0,0]->0 B [0,1]->1 B [0,1]->1 (collision) A In this case the collision prevents lockdep from creating the new dependency A->B. This in turn results in lockdep not detecting the circular dependency when T2 acquires A. Signed-off-by: Alfredo Alvarez Fernandez Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: sasha.levin@oracle.com Link: http://lkml.kernel.org/r/1455147212-2389-4-git-send-email-alfredoalvarezernandez@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 3261214323fa..6f446eb3c742 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2143,7 +2143,7 @@ static void check_chain_key(struct task_struct *curr) { #ifdef CONFIG_DEBUG_LOCKDEP struct held_lock *hlock, *prev_hlock = NULL; - unsigned int i, id; + unsigned int i; u64 chain_key = 0; for (i = 0; i < curr->lockdep_depth; i++) { @@ -2160,17 +2160,16 @@ static void check_chain_key(struct task_struct *curr) (unsigned long long)hlock->prev_chain_key); return; } - id = hlock->class_idx - 1; /* * Whoops ran out of static storage again? */ - if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) + if (DEBUG_LOCKS_WARN_ON(hlock->class_idx > MAX_LOCKDEP_KEYS)) return; if (prev_hlock && (prev_hlock->irq_context != hlock->irq_context)) chain_key = 0; - chain_key = iterate_chain_key(chain_key, id); + chain_key = iterate_chain_key(chain_key, hlock->class_idx); prev_hlock = hlock; } if (chain_key != curr->curr_chain_key) { @@ -3048,7 +3047,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, struct task_struct *curr = current; struct lock_class *class = NULL; struct held_lock *hlock; - unsigned int depth, id; + unsigned int depth; int chain_head = 0; int class_idx; u64 chain_key; @@ -3151,11 +3150,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, * The 'key ID' is what is the most compact key value to drive * the hash, not class->key. */ - id = class - lock_classes; /* * Whoops, we did it again.. ran straight out of our static allocation. */ - if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) + if (DEBUG_LOCKS_WARN_ON(class_idx > MAX_LOCKDEP_KEYS)) return 0; chain_key = curr->curr_chain_key; @@ -3173,7 +3171,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, chain_key = 0; chain_head = 1; } - chain_key = iterate_chain_key(chain_key, id); + chain_key = iterate_chain_key(chain_key, class_idx); if (nest_lock && !__lock_is_held(nest_lock)) return print_lock_nested_lock_not_held(curr, hlock, ip); -- cgit From 9e4e7554e755aaad8df0e26268787438735b4b76 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 29 Feb 2016 10:03:58 +0100 Subject: locking/lockdep: Detect chain_key collisions Add detection for chain_key collision under CONFIG_DEBUG_LOCKDEP. When a collision is detected the problem is reported and all lock debugging is turned off. Tested using liblockdep and the added tests before and after applying the fix, confirming both that the code added for the detection correctly reports the problem and that the fix actually fixes it. Tested tweaking lockdep to generate false collisions and verified that the problem is reported and that lock debugging is turned off. Also tested with lockdep's test suite after applying the patch: [ 0.000000] Good, all 253 testcases passed! | Signed-off-by: Alfredo Alvarez Fernandez Cc: Alfredo Alvarez Fernandez Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: sasha.levin@oracle.com Link: http://lkml.kernel.org/r/1455864533-7536-4-git-send-email-alfredoalvarezernandez@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 59 +++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 51 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 6f446eb3c742..f894a2cd9b2a 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1981,6 +1981,53 @@ struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i) return lock_classes + chain_hlocks[chain->base + i]; } +/* + * Returns the index of the first held_lock of the current chain + */ +static inline int get_first_held_lock(struct task_struct *curr, + struct held_lock *hlock) +{ + int i; + struct held_lock *hlock_curr; + + for (i = curr->lockdep_depth - 1; i >= 0; i--) { + hlock_curr = curr->held_locks + i; + if (hlock_curr->irq_context != hlock->irq_context) + break; + + } + + return ++i; +} + +/* + * Checks whether the chain and the current held locks are consistent + * in depth and also in content. If they are not it most likely means + * that there was a collision during the calculation of the chain_key. + * Returns: 0 not passed, 1 passed + */ +static int check_no_collision(struct task_struct *curr, + struct held_lock *hlock, + struct lock_chain *chain) +{ +#ifdef CONFIG_DEBUG_LOCKDEP + int i, j, id; + + i = get_first_held_lock(curr, hlock); + + if (DEBUG_LOCKS_WARN_ON(chain->depth != curr->lockdep_depth - (i - 1))) + return 0; + + for (j = 0; j < chain->depth - 1; j++, i++) { + id = curr->held_locks[i].class_idx - 1; + + if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) + return 0; + } +#endif + return 1; +} + /* * Look up a dependency chain. If the key is not present yet then * add it and return 1 - in this case the new dependency chain is @@ -1994,7 +2041,6 @@ static inline int lookup_chain_cache(struct task_struct *curr, struct lock_class *class = hlock_class(hlock); struct hlist_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; - struct held_lock *hlock_curr; int i, j; /* @@ -2012,6 +2058,9 @@ static inline int lookup_chain_cache(struct task_struct *curr, if (chain->chain_key == chain_key) { cache_hit: debug_atomic_inc(chain_lookup_hits); + if (!check_no_collision(curr, hlock, chain)) + return 0; + if (very_verbose(class)) printk("\nhash chain already cached, key: " "%016Lx tail class: [%p] %s\n", @@ -2049,13 +2098,7 @@ cache_hit: chain = lock_chains + nr_lock_chains++; chain->chain_key = chain_key; chain->irq_context = hlock->irq_context; - /* Find the first held_lock of current chain */ - for (i = curr->lockdep_depth - 1; i >= 0; i--) { - hlock_curr = curr->held_locks + i; - if (hlock_curr->irq_context != hlock->irq_context) - break; - } - i++; + i = get_first_held_lock(curr, hlock); chain->depth = curr->lockdep_depth + 1 - i; if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { chain->base = nr_chain_hlocks; -- cgit From ede5147d515694e012cd958ec874b9daf8a65fec Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 24 Feb 2016 14:37:53 +0000 Subject: Handle ISO 8601 leap seconds and encodings of midnight in mktime64() Handle the following ISO 8601 features in mktime64(): (1) Leap seconds. Leap seconds are indicated by the seconds parameter being the value 60. Handle this by treating it the same as 00 of the following minute. It has been pointed out that a minute may contain two leap seconds. However, pending discussion of what that looks like and how to handle it, I'm not going to concern myself with it. (2) Alternate encodings of midnight. Two different encodings of midnight are permitted - 00:00:00 and 24:00:00 - the first is midnight today and the second is midnight tomorrow and is exactly equivalent to the first with tomorrow's date. As it happens, we don't actually need to change mktime64() to handle either of these - just comment them as valid parameters. These facility will be used by the X.509 parser. Doing it in mktime64() makes the policy common to the whole kernel and easier to find. Signed-off-by: David Howells Acked-by: Arnd Bergmann cc: John Stultz cc: Rudolf Polzer cc: One Thousand Gnomes --- kernel/time/time.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index 86751c68e08d..be115b020d27 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -322,6 +322,13 @@ EXPORT_SYMBOL(timespec_trunc); * -year/100+year/400 terms, and add 10.] * * This algorithm was first published by Gauss (I think). + * + * A leap second can be indicated by calling this function with sec as + * 60 (allowable under ISO 8601). The leap second is treated the same + * as the following second since they don't exist in UNIX time. + * + * An encoding of midnight at the end of the day as 24:00:00 - ie. midnight + * tomorrow - (allowable under ISO 8601) is supported. */ time64_t mktime64(const unsigned int year0, const unsigned int mon0, const unsigned int day, const unsigned int hour, @@ -338,7 +345,7 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0, return ((((time64_t) (year/4 - year/100 + year/400 + 367*mon/12 + day) + year*365 - 719499 - )*24 + hour /* now have hours */ + )*24 + hour /* now have hours - midnight tomorrow handled here */ )*60 + min /* now have minutes */ )*60 + sec; /* finally seconds */ } -- cgit From 026842d148b920dc28f0499ede4950dcb098d4d5 Mon Sep 17 00:00:00 2001 From: Taeung Song Date: Fri, 26 Feb 2016 13:23:01 -0500 Subject: tracing/syscalls: Rename "/format" tracepoint field name "nr" to "__syscall_nr: Some tracepoint have multiple fields with the same name, "nr", the first one is a unique syscall ID, the other is a syscall argument: # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_io_getevents/format name: sys_enter_io_getevents ID: 747 format: field:unsigned short common_type; offset:0; size:2; signed:0; field:unsigned char common_flags; offset:2; size:1; signed:0; field:unsigned char common_preempt_count; offset:3; size:1; signed:0; field:int common_pid; offset:4; size:4; signed:1; field:int nr; offset:8; size:4; signed:1; field:aio_context_t ctx_id; offset:16; size:8; signed:0; field:long min_nr; offset:24; size:8; signed:0; field:long nr; offset:32; size:8; signed:0; field:struct io_event * events; offset:40; size:8; signed:0; field:struct timespec * timeout; offset:48; size:8; signed:0; print fmt: "ctx_id: 0x%08lx, min_nr: 0x%08lx, nr: 0x%08lx, events: 0x%08lx, timeout: 0x%08lx", ((unsigned long)(REC->ctx_id)), ((unsigned long)(REC->min_nr)), ((unsigned long)(REC->nr)), ((unsigned long)(REC->events)), ((unsigned long)(REC->timeout)) # Fix it by renaming the "/format" common tracepoint field "nr" to "__syscall_nr". Signed-off-by: Taeung Song [ Do not rename the struct member, just the '/format' field name ] Signed-off-by: Steven Rostedt Acked-by: Peter Zijlstra Cc: Jiri Olsa Cc: Lai Jiangshan Cc: Namhyung Kim Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160226132301.3ae065a4@gandalf.local.home Signed-off-by: Arnaldo Carvalho de Melo --- kernel/trace/trace_syscalls.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 0655afbea83f..d1663083d903 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -186,11 +186,11 @@ print_syscall_exit(struct trace_iterator *iter, int flags, extern char *__bad_type_size(void); -#define SYSCALL_FIELD(type, name) \ - sizeof(type) != sizeof(trace.name) ? \ +#define SYSCALL_FIELD(type, field, name) \ + sizeof(type) != sizeof(trace.field) ? \ __bad_type_size() : \ - #type, #name, offsetof(typeof(trace), name), \ - sizeof(trace.name), is_signed_type(type) + #type, #name, offsetof(typeof(trace), field), \ + sizeof(trace.field), is_signed_type(type) static int __init __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) @@ -261,7 +261,8 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call) int i; int offset = offsetof(typeof(trace), args); - ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); + ret = trace_define_field(call, SYSCALL_FIELD(int, nr, __syscall_nr), + FILTER_OTHER); if (ret) return ret; @@ -281,11 +282,12 @@ static int __init syscall_exit_define_fields(struct trace_event_call *call) struct syscall_trace_exit trace; int ret; - ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); + ret = trace_define_field(call, SYSCALL_FIELD(int, nr, __syscall_nr), + FILTER_OTHER); if (ret) return ret; - ret = trace_define_field(call, SYSCALL_FIELD(long, ret), + ret = trace_define_field(call, SYSCALL_FIELD(long, ret, ret), FILTER_OTHER); return ret; -- cgit From fa5ff8a1c43fc7b78353059899edf3cbedf54e9f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 28 Feb 2016 08:59:33 -0500 Subject: cgroup: fix and restructure error handling in copy_cgroup_ns() copy_cgroup_ns()'s error handling was broken and the attempt to fix it d22025570e2e ("cgroup: fix alloc_cgroup_ns() error handling in copy_cgroup_ns()") was broken too in that it ended up trying an ERR_PTR() value. There's only one place where copy_cgroup_ns() needs to perform cleanup after failure. Simplify and fix the error handling by removing the goto's. Signed-off-by: Tejun Heo Reported-by: Dan Carpenter Acked-by: Serge E. Hallyn --- kernel/cgroup.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d92d91a4bb3e..2c88149da848 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -6058,9 +6058,8 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns) { - struct cgroup_namespace *new_ns = NULL; - struct css_set *cset = NULL; - int err; + struct cgroup_namespace *new_ns; + struct css_set *cset; BUG_ON(!old_ns); @@ -6070,9 +6069,8 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, } /* Allow only sysadmin to create cgroup namespace. */ - err = -EPERM; if (!ns_capable(user_ns, CAP_SYS_ADMIN)) - goto err_out; + return ERR_PTR(-EPERM); mutex_lock(&cgroup_mutex); spin_lock_bh(&css_set_lock); @@ -6085,20 +6083,14 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, new_ns = alloc_cgroup_ns(); if (IS_ERR(new_ns)) { - err = PTR_ERR(new_ns); - goto err_out; + put_css_set(cset); + return new_ns; } new_ns->user_ns = get_user_ns(user_ns); new_ns->root_cset = cset; return new_ns; - -err_out: - if (cset) - put_css_set(cset); - kfree(new_ns); - return ERR_PTR(err); } static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) -- cgit From 89053aa9c711bd455a68844b4ed37c8b72ef1daa Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 1 Mar 2016 10:36:07 +0000 Subject: MODSIGN: linux/string.h should be #included to get memcpy() linux/string.h should be #included in module_signing.c to get memcpy(), lest the following occur: kernel/module_signing.c: In function 'mod_verify_sig': kernel/module_signing.c:57:2: error: implicit declaration of function 'memcpy' [-Werror=implicit-function-declaration] memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms)); ^ Reported-by: kbuild test robot Signed-off-by: David Howells --- kernel/module_signing.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/module_signing.c b/kernel/module_signing.c index 6528a79d998d..9cfa46d8d14f 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -11,6 +11,7 @@ #include #include +#include #include #include #include "module-internal.h" -- cgit From fa06235b8eb0ae87a962e023243dba1eb4e7160d Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 1 Mar 2016 19:56:30 +0300 Subject: cgroup: reset css on destruction An associated css can be around for quite a while after a cgroup directory has been removed. In general, it makes sense to reset it to defaults so as not to worry about any remnants. For instance, memory cgroup needs to reset memory.low, otherwise pages charged to a dead cgroup might never get reclaimed. There's ->css_reset callback, which would fit perfectly for the purpose. Currently, it's only called when a subsystem is disabled in the unified hierarchy and there are other subsystems dependant on it. Let's call it on css destruction as well. Suggested-by: Johannes Weiner Signed-off-by: Vladimir Davydov Signed-off-by: Tejun Heo --- kernel/cgroup.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fcfad82149b1..46529502e9d5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4787,6 +4787,9 @@ static void offline_css(struct cgroup_subsys_state *css) if (!(css->flags & CSS_ONLINE)) return; + if (ss->css_reset) + ss->css_reset(css); + if (ss->css_offline) ss->css_offline(css); -- cgit From 090e77c391dd983c8945b8e2e16d09f378d2e334 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:23 +0000 Subject: cpu/hotplug: Restructure FROZEN state handling There are only a few callbacks which really care about FROZEN vs. !FROZEN. No need to have extra states for this. Publish the frozen state in an extra variable which is updated under the hotplug lock and let the users interested deal with it w/o imposing that extra state checks on everyone. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182340.334912357@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpu.h | 2 ++ kernel/cpu.c | 69 ++++++++++++++++++++++------------------------------- 2 files changed, 31 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index d2ca8c38f9c4..f2fb54938ee6 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -118,6 +118,7 @@ enum { #ifdef CONFIG_SMP +extern bool cpuhp_tasks_frozen; /* Need to know about CPUs going up/down? */ #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) #define cpu_notifier(fn, pri) { \ @@ -177,6 +178,7 @@ extern void cpu_maps_update_done(void); #define cpu_notifier_register_done cpu_maps_update_done #else /* CONFIG_SMP */ +#define cpuhp_tasks_frozen 0 #define cpu_notifier(fn, pri) do { (void)(fn); } while (0) #define __cpu_notifier(fn, pri) do { (void)(fn); } while (0) diff --git a/kernel/cpu.c b/kernel/cpu.c index 5b9d39633ce9..41a6cb85c0af 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -29,6 +29,8 @@ #ifdef CONFIG_SMP /* Serializes the updates to cpu_online_mask, cpu_present_mask */ static DEFINE_MUTEX(cpu_add_remove_lock); +bool cpuhp_tasks_frozen; +EXPORT_SYMBOL_GPL(cpuhp_tasks_frozen); /* * The following two APIs (cpu_maps_update_begin/done) must be used when @@ -207,27 +209,30 @@ int __register_cpu_notifier(struct notifier_block *nb) return raw_notifier_chain_register(&cpu_chain, nb); } -static int __cpu_notify(unsigned long val, void *v, int nr_to_call, +static int __cpu_notify(unsigned long val, unsigned int cpu, int nr_to_call, int *nr_calls) { + unsigned long mod = cpuhp_tasks_frozen ? CPU_TASKS_FROZEN : 0; + void *hcpu = (void *)(long)cpu; + int ret; - ret = __raw_notifier_call_chain(&cpu_chain, val, v, nr_to_call, + ret = __raw_notifier_call_chain(&cpu_chain, val | mod, hcpu, nr_to_call, nr_calls); return notifier_to_errno(ret); } -static int cpu_notify(unsigned long val, void *v) +static int cpu_notify(unsigned long val, unsigned int cpu) { - return __cpu_notify(val, v, -1, NULL); + return __cpu_notify(val, cpu, -1, NULL); } #ifdef CONFIG_HOTPLUG_CPU -static void cpu_notify_nofail(unsigned long val, void *v) +static void cpu_notify_nofail(unsigned long val, unsigned int cpu) { - BUG_ON(cpu_notify(val, v)); + BUG_ON(cpu_notify(val, cpu)); } EXPORT_SYMBOL(register_cpu_notifier); EXPORT_SYMBOL(__register_cpu_notifier); @@ -311,27 +316,21 @@ static inline void check_for_tasks(int dead_cpu) read_unlock(&tasklist_lock); } -struct take_cpu_down_param { - unsigned long mod; - void *hcpu; -}; - /* Take this CPU down. */ static int take_cpu_down(void *_param) { - struct take_cpu_down_param *param = _param; - int err; + int err, cpu = smp_processor_id(); /* Ensure this CPU doesn't handle any more interrupts. */ err = __cpu_disable(); if (err < 0) return err; - cpu_notify(CPU_DYING | param->mod, param->hcpu); + cpu_notify(CPU_DYING, cpu); /* Give up timekeeping duties */ tick_handover_do_timer(); /* Park the stopper thread */ - stop_machine_park((long)param->hcpu); + stop_machine_park(cpu); return 0; } @@ -339,12 +338,6 @@ static int take_cpu_down(void *_param) static int _cpu_down(unsigned int cpu, int tasks_frozen) { int err, nr_calls = 0; - void *hcpu = (void *)(long)cpu; - unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; - struct take_cpu_down_param tcd_param = { - .mod = mod, - .hcpu = hcpu, - }; if (num_online_cpus() == 1) return -EBUSY; @@ -354,10 +347,12 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) cpu_hotplug_begin(); - err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); + cpuhp_tasks_frozen = tasks_frozen; + + err = __cpu_notify(CPU_DOWN_PREPARE, cpu, -1, &nr_calls); if (err) { nr_calls--; - __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); + __cpu_notify(CPU_DOWN_FAILED, cpu, nr_calls, NULL); pr_warn("%s: attempt to take down CPU %u failed\n", __func__, cpu); goto out_release; @@ -389,10 +384,10 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) /* * So now all preempt/rcu users must observe !cpu_active(). */ - err = stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); + err = stop_machine(take_cpu_down, NULL, cpumask_of(cpu)); if (err) { /* CPU didn't die: tell everyone. Can't complain. */ - cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); + cpu_notify_nofail(CPU_DOWN_FAILED, cpu); irq_unlock_sparse(); goto out_release; } @@ -419,14 +414,14 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) /* CPU is completely dead: tell everyone. Too late to complain. */ tick_cleanup_dead_cpu(cpu); - cpu_notify_nofail(CPU_DEAD | mod, hcpu); + cpu_notify_nofail(CPU_DEAD, cpu); check_for_tasks(cpu); out_release: cpu_hotplug_done(); if (!err) - cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu); + cpu_notify_nofail(CPU_POST_DEAD, cpu); return err; } @@ -485,10 +480,8 @@ void smpboot_thread_init(void) /* Requires cpu_add_remove_lock to be held */ static int _cpu_up(unsigned int cpu, int tasks_frozen) { - int ret, nr_calls = 0; - void *hcpu = (void *)(long)cpu; - unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; struct task_struct *idle; + int ret, nr_calls = 0; cpu_hotplug_begin(); @@ -507,7 +500,9 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) if (ret) goto out; - ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); + cpuhp_tasks_frozen = tasks_frozen; + + ret = __cpu_notify(CPU_UP_PREPARE, cpu, -1, &nr_calls); if (ret) { nr_calls--; pr_warn("%s: attempt to bring up CPU %u failed\n", @@ -523,11 +518,11 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) BUG_ON(!cpu_online(cpu)); /* Now call notifier in preparation. */ - cpu_notify(CPU_ONLINE | mod, hcpu); + cpu_notify(CPU_ONLINE, cpu); out_notify: if (ret != 0) - __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); + __cpu_notify(CPU_UP_CANCELED, cpu, nr_calls, NULL); out: cpu_hotplug_done(); @@ -719,13 +714,7 @@ core_initcall(cpu_hotplug_pm_sync_init); */ void notify_cpu_starting(unsigned int cpu) { - unsigned long val = CPU_STARTING; - -#ifdef CONFIG_PM_SLEEP_SMP - if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus)) - val = CPU_STARTING_FROZEN; -#endif /* CONFIG_PM_SLEEP_SMP */ - cpu_notify(val, (void *)(long)cpu); + cpu_notify(CPU_STARTING, cpu); } #endif /* CONFIG_SMP */ -- cgit From ba997462435f48ad1501320e9da8770fd40c59b1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:24 +0000 Subject: cpu/hotplug: Restructure cpu_up code Split out into separate functions, so we can convert it to a state machine. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182340.429389195@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 69 +++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 47 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 41a6cb85c0af..15a413639abc 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -228,6 +228,43 @@ static int cpu_notify(unsigned long val, unsigned int cpu) return __cpu_notify(val, cpu, -1, NULL); } +/* Notifier wrappers for transitioning to state machine */ +static int notify_prepare(unsigned int cpu) +{ + int nr_calls = 0; + int ret; + + ret = __cpu_notify(CPU_UP_PREPARE, cpu, -1, &nr_calls); + if (ret) { + nr_calls--; + printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n", + __func__, cpu); + __cpu_notify(CPU_UP_CANCELED, cpu, nr_calls, NULL); + } + return ret; +} + +static int notify_online(unsigned int cpu) +{ + cpu_notify(CPU_ONLINE, cpu); + return 0; +} + +static int bringup_cpu(unsigned int cpu) +{ + struct task_struct *idle = idle_thread_get(cpu); + int ret; + + /* Arch-specific enabling code. */ + ret = __cpu_up(cpu, idle); + if (ret) { + cpu_notify(CPU_UP_CANCELED, cpu); + return ret; + } + BUG_ON(!cpu_online(cpu)); + return 0; +} + #ifdef CONFIG_HOTPLUG_CPU static void cpu_notify_nofail(unsigned long val, unsigned int cpu) @@ -481,7 +518,7 @@ void smpboot_thread_init(void) static int _cpu_up(unsigned int cpu, int tasks_frozen) { struct task_struct *idle; - int ret, nr_calls = 0; + int ret; cpu_hotplug_begin(); @@ -496,33 +533,21 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) goto out; } + cpuhp_tasks_frozen = tasks_frozen; + ret = smpboot_create_threads(cpu); if (ret) goto out; - cpuhp_tasks_frozen = tasks_frozen; - - ret = __cpu_notify(CPU_UP_PREPARE, cpu, -1, &nr_calls); - if (ret) { - nr_calls--; - pr_warn("%s: attempt to bring up CPU %u failed\n", - __func__, cpu); - goto out_notify; - } - - /* Arch-specific enabling code. */ - ret = __cpu_up(cpu, idle); - - if (ret != 0) - goto out_notify; - BUG_ON(!cpu_online(cpu)); + ret = notify_prepare(cpu); + if (ret) + goto out; - /* Now call notifier in preparation. */ - cpu_notify(CPU_ONLINE, cpu); + ret = bringup_cpu(cpu); + if (ret) + goto out; -out_notify: - if (ret != 0) - __cpu_notify(CPU_UP_CANCELED, cpu, nr_calls, NULL); + notify_online(cpu); out: cpu_hotplug_done(); -- cgit From 984581728eb4b2e10baed3d606f85a119795b207 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:25 +0000 Subject: cpu/hotplug: Split out cpu down functions Split cpu_down in separate functions in preparation for state machine conversion. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182340.511796562@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 83 ++++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 53 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 15a413639abc..0b5d2596f3ec 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -266,11 +266,6 @@ static int bringup_cpu(unsigned int cpu) } #ifdef CONFIG_HOTPLUG_CPU - -static void cpu_notify_nofail(unsigned long val, unsigned int cpu) -{ - BUG_ON(cpu_notify(val, cpu)); -} EXPORT_SYMBOL(register_cpu_notifier); EXPORT_SYMBOL(__register_cpu_notifier); @@ -353,6 +348,25 @@ static inline void check_for_tasks(int dead_cpu) read_unlock(&tasklist_lock); } +static void cpu_notify_nofail(unsigned long val, unsigned int cpu) +{ + BUG_ON(cpu_notify(val, cpu)); +} + +static int notify_down_prepare(unsigned int cpu) +{ + int err, nr_calls = 0; + + err = __cpu_notify(CPU_DOWN_PREPARE, cpu, -1, &nr_calls); + if (err) { + nr_calls--; + __cpu_notify(CPU_DOWN_FAILED, cpu, nr_calls, NULL); + pr_warn("%s: attempt to take down CPU %u failed\n", + __func__, cpu); + } + return err; +} + /* Take this CPU down. */ static int take_cpu_down(void *_param) { @@ -371,29 +385,9 @@ static int take_cpu_down(void *_param) return 0; } -/* Requires cpu_add_remove_lock to be held */ -static int _cpu_down(unsigned int cpu, int tasks_frozen) +static int takedown_cpu(unsigned int cpu) { - int err, nr_calls = 0; - - if (num_online_cpus() == 1) - return -EBUSY; - - if (!cpu_online(cpu)) - return -EINVAL; - - cpu_hotplug_begin(); - - cpuhp_tasks_frozen = tasks_frozen; - - err = __cpu_notify(CPU_DOWN_PREPARE, cpu, -1, &nr_calls); - if (err) { - nr_calls--; - __cpu_notify(CPU_DOWN_FAILED, cpu, nr_calls, NULL); - pr_warn("%s: attempt to take down CPU %u failed\n", - __func__, cpu); - goto out_release; - } + int err; /* * By now we've cleared cpu_active_mask, wait for all preempt-disabled @@ -426,7 +420,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) /* CPU didn't die: tell everyone. Can't complain. */ cpu_notify_nofail(CPU_DOWN_FAILED, cpu); irq_unlock_sparse(); - goto out_release; + return err; } BUG_ON(cpu_online(cpu)); @@ -449,11 +443,40 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) /* This actually kills the CPU. */ __cpu_die(cpu); - /* CPU is completely dead: tell everyone. Too late to complain. */ tick_cleanup_dead_cpu(cpu); - cpu_notify_nofail(CPU_DEAD, cpu); + return 0; +} +static int notify_dead(unsigned int cpu) +{ + cpu_notify_nofail(CPU_DEAD, cpu); check_for_tasks(cpu); + return 0; +} + +/* Requires cpu_add_remove_lock to be held */ +static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) +{ + int err; + + if (num_online_cpus() == 1) + return -EBUSY; + + if (!cpu_online(cpu)) + return -EINVAL; + + cpu_hotplug_begin(); + + cpuhp_tasks_frozen = tasks_frozen; + + err = notify_down_prepare(cpu); + if (err) + goto out_release; + err = takedown_cpu(cpu); + if (err) + goto out_release; + + notify_dead(cpu); out_release: cpu_hotplug_done(); -- cgit From cff7d378d3fdbb53db9b6e2578b14855f401cd41 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:28 +0000 Subject: cpu/hotplug: Convert to a state machine for the control processor Move the split out steps into a callback array and let the cpu_up/down code iterate through the array functions. For now most of the callbacks are asymmetric to resemble the current hotplug maze. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182340.671816690@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpu.h | 9 +- include/linux/cpuhotplug.h | 13 +++ init/main.c | 15 +--- kernel/cpu.c | 202 +++++++++++++++++++++++++++++++++++++++------ 4 files changed, 194 insertions(+), 45 deletions(-) create mode 100644 include/linux/cpuhotplug.h (limited to 'kernel') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index f2fb54938ee6..78989f20420f 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -16,6 +16,7 @@ #include #include #include +#include struct device; struct device_node; @@ -27,6 +28,9 @@ struct cpu { struct device dev; }; +extern void boot_cpu_init(void); +extern void boot_cpu_state_init(void); + extern int register_cpu(struct cpu *cpu, int num); extern struct device *get_cpu_device(unsigned cpu); extern bool cpu_is_hotpluggable(unsigned cpu); @@ -267,11 +271,6 @@ static inline int disable_nonboot_cpus(void) { return 0; } static inline void enable_nonboot_cpus(void) {} #endif /* !CONFIG_PM_SLEEP_SMP */ -enum cpuhp_state { - CPUHP_OFFLINE, - CPUHP_ONLINE, -}; - void cpu_startup_entry(enum cpuhp_state state); void cpu_idle_poll_ctrl(bool enable); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h new file mode 100644 index 000000000000..d55c9e64acd7 --- /dev/null +++ b/include/linux/cpuhotplug.h @@ -0,0 +1,13 @@ +#ifndef __CPUHOTPLUG_H +#define __CPUHOTPLUG_H + +enum cpuhp_state { + CPUHP_OFFLINE, + CPUHP_CREATE_THREADS, + CPUHP_NOTIFY_PREPARE, + CPUHP_BRINGUP_CPU, + CPUHP_NOTIFY_ONLINE, + CPUHP_ONLINE, +}; + +#endif diff --git a/init/main.c b/init/main.c index 58c9e374704b..c2ea72362ee3 100644 --- a/init/main.c +++ b/init/main.c @@ -452,20 +452,6 @@ void __init parse_early_param(void) done = 1; } -/* - * Activate the first processor. - */ - -static void __init boot_cpu_init(void) -{ - int cpu = smp_processor_id(); - /* Mark the boot cpu "present", "online" etc for SMP and UP case */ - set_cpu_online(cpu, true); - set_cpu_active(cpu, true); - set_cpu_present(cpu, true); - set_cpu_possible(cpu, true); -} - void __init __weak smp_setup_processor_id(void) { } @@ -530,6 +516,7 @@ asmlinkage __visible void __init start_kernel(void) setup_command_line(command_line); setup_nr_cpu_ids(); setup_per_cpu_areas(); + boot_cpu_state_init(); smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ build_all_zonelists(NULL, NULL); diff --git a/kernel/cpu.c b/kernel/cpu.c index 0b5d2596f3ec..301851974b8d 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -22,10 +22,64 @@ #include #include #include + #include +#define CREATE_TRACE_POINTS +#include #include "smpboot.h" +/** + * cpuhp_cpu_state - Per cpu hotplug state storage + * @state: The current cpu state + * @target: The target state + */ +struct cpuhp_cpu_state { + enum cpuhp_state state; + enum cpuhp_state target; +}; + +static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); + +/** + * cpuhp_step - Hotplug state machine step + * @name: Name of the step + * @startup: Startup function of the step + * @teardown: Teardown function of the step + * @skip_onerr: Do not invoke the functions on error rollback + * Will go away once the notifiers are gone + */ +struct cpuhp_step { + const char *name; + int (*startup)(unsigned int cpu); + int (*teardown)(unsigned int cpu); + bool skip_onerr; +}; + +static struct cpuhp_step cpuhp_bp_states[]; + +/** + * cpuhp_invoke_callback _ Invoke the callbacks for a given state + * @cpu: The cpu for which the callback should be invoked + * @step: The step in the state machine + * @cb: The callback function to invoke + * + * Called from cpu hotplug and from the state register machinery + */ +static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state step, + int (*cb)(unsigned int)) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int ret = 0; + + if (cb) { + trace_cpuhp_enter(cpu, st->target, step, cb); + ret = cb(cpu); + trace_cpuhp_exit(cpu, st->state, step, ret); + } + return ret; +} + #ifdef CONFIG_SMP /* Serializes the updates to cpu_online_mask, cpu_present_mask */ static DEFINE_MUTEX(cpu_add_remove_lock); @@ -454,10 +508,29 @@ static int notify_dead(unsigned int cpu) return 0; } +#else +#define notify_down_prepare NULL +#define takedown_cpu NULL +#define notify_dead NULL +#endif + +#ifdef CONFIG_HOTPLUG_CPU +static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) +{ + for (st->state++; st->state < st->target; st->state++) { + struct cpuhp_step *step = cpuhp_bp_states + st->state; + + if (!step->skip_onerr) + cpuhp_invoke_callback(cpu, st->state, step->startup); + } +} + /* Requires cpu_add_remove_lock to be held */ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) { - int err; + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int prev_state, ret = 0; + bool hasdied = false; if (num_online_cpus() == 1) return -EBUSY; @@ -469,20 +542,25 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) cpuhp_tasks_frozen = tasks_frozen; - err = notify_down_prepare(cpu); - if (err) - goto out_release; - err = takedown_cpu(cpu); - if (err) - goto out_release; + prev_state = st->state; + st->target = CPUHP_OFFLINE; + for (; st->state > st->target; st->state--) { + struct cpuhp_step *step = cpuhp_bp_states + st->state; - notify_dead(cpu); + ret = cpuhp_invoke_callback(cpu, st->state, step->teardown); + if (ret) { + st->target = prev_state; + undo_cpu_down(cpu, st); + break; + } + } + hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE; -out_release: cpu_hotplug_done(); - if (!err) + /* This post dead nonsense must die */ + if (!ret && hasdied) cpu_notify_nofail(CPU_POST_DEAD, cpu); - return err; + return ret; } int cpu_down(unsigned int cpu) @@ -537,11 +615,22 @@ void smpboot_thread_init(void) register_cpu_notifier(&smpboot_thread_notifier); } +static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) +{ + for (st->state--; st->state > st->target; st->state--) { + struct cpuhp_step *step = cpuhp_bp_states + st->state; + + if (!step->skip_onerr) + cpuhp_invoke_callback(cpu, st->state, step->teardown); + } +} + /* Requires cpu_add_remove_lock to be held */ static int _cpu_up(unsigned int cpu, int tasks_frozen) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); struct task_struct *idle; - int ret; + int prev_state, ret = 0; cpu_hotplug_begin(); @@ -550,6 +639,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) goto out; } + /* Let it fail before we try to bring the cpu up */ idle = idle_thread_get(cpu); if (IS_ERR(idle)) { ret = PTR_ERR(idle); @@ -558,22 +648,22 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) cpuhp_tasks_frozen = tasks_frozen; - ret = smpboot_create_threads(cpu); - if (ret) - goto out; - - ret = notify_prepare(cpu); - if (ret) - goto out; - - ret = bringup_cpu(cpu); - if (ret) - goto out; - - notify_online(cpu); + prev_state = st->state; + st->target = CPUHP_ONLINE; + while (st->state < st->target) { + struct cpuhp_step *step; + + st->state++; + step = cpuhp_bp_states + st->state; + ret = cpuhp_invoke_callback(cpu, st->state, step->startup); + if (ret) { + st->target = prev_state; + undo_cpu_up(cpu, st); + break; + } + } out: cpu_hotplug_done(); - return ret; } @@ -767,6 +857,44 @@ void notify_cpu_starting(unsigned int cpu) #endif /* CONFIG_SMP */ +/* Boot processor state steps */ +static struct cpuhp_step cpuhp_bp_states[] = { + [CPUHP_OFFLINE] = { + .name = "offline", + .startup = NULL, + .teardown = NULL, + }, +#ifdef CONFIG_SMP + [CPUHP_CREATE_THREADS]= { + .name = "threads:create", + .startup = smpboot_create_threads, + .teardown = NULL, + }, + [CPUHP_NOTIFY_PREPARE] = { + .name = "notify:prepare", + .startup = notify_prepare, + .teardown = notify_dead, + .skip_onerr = true, + }, + [CPUHP_BRINGUP_CPU] = { + .name = "cpu:bringup", + .startup = bringup_cpu, + .teardown = takedown_cpu, + .skip_onerr = true, + }, + [CPUHP_NOTIFY_ONLINE] = { + .name = "notify:online", + .startup = notify_online, + .teardown = notify_down_prepare, + }, +#endif + [CPUHP_ONLINE] = { + .name = "online", + .startup = NULL, + .teardown = NULL, + }, +}; + /* * cpu_bit_bitmap[] is a special, "compressed" data structure that * represents all NR_CPUS bits binary values of 1<state = CPUHP_ONLINE; +} -- cgit From 4baa0afc6719cbf36a1e08551484a641926b3fd1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:29 +0000 Subject: cpu/hotplug: Convert the hotplugged cpu work to a state machine Move the functions which need to run on the hotplugged processor into a state machine array and let the code iterate through these functions. In a later state, this will grow synchronization points between the control processor and the hotplugged processor, so we can move the various architecture implementations of the synchronizations to the core. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182340.770651526@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpuhotplug.h | 4 +++ kernel/cpu.c | 81 +++++++++++++++++++++++++++++++++++++--------- 2 files changed, 70 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index d55c9e64acd7..d9303cca83d3 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -6,6 +6,10 @@ enum cpuhp_state { CPUHP_CREATE_THREADS, CPUHP_NOTIFY_PREPARE, CPUHP_BRINGUP_CPU, + CPUHP_AP_OFFLINE, + CPUHP_AP_NOTIFY_STARTING, + CPUHP_AP_ONLINE, + CPUHP_TEARDOWN_CPU, CPUHP_NOTIFY_ONLINE, CPUHP_ONLINE, }; diff --git a/kernel/cpu.c b/kernel/cpu.c index 301851974b8d..797723e81756 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -57,6 +57,7 @@ struct cpuhp_step { }; static struct cpuhp_step cpuhp_bp_states[]; +static struct cpuhp_step cpuhp_ap_states[]; /** * cpuhp_invoke_callback _ Invoke the callbacks for a given state @@ -304,6 +305,12 @@ static int notify_online(unsigned int cpu) return 0; } +static int notify_starting(unsigned int cpu) +{ + cpu_notify(CPU_STARTING, cpu); + return 0; +} + static int bringup_cpu(unsigned int cpu) { struct task_struct *idle = idle_thread_get(cpu); @@ -421,9 +428,17 @@ static int notify_down_prepare(unsigned int cpu) return err; } +static int notify_dying(unsigned int cpu) +{ + cpu_notify(CPU_DYING, cpu); + return 0; +} + /* Take this CPU down. */ static int take_cpu_down(void *_param) { + struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); + enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE); int err, cpu = smp_processor_id(); /* Ensure this CPU doesn't handle any more interrupts. */ @@ -431,7 +446,12 @@ static int take_cpu_down(void *_param) if (err < 0) return err; - cpu_notify(CPU_DYING, cpu); + /* Invoke the former CPU_DYING callbacks */ + for (; st->state > target; st->state--) { + struct cpuhp_step *step = cpuhp_ap_states + st->state; + + cpuhp_invoke_callback(cpu, st->state, step->teardown); + } /* Give up timekeeping duties */ tick_handover_do_timer(); /* Park the stopper thread */ @@ -512,6 +532,7 @@ static int notify_dead(unsigned int cpu) #define notify_down_prepare NULL #define takedown_cpu NULL #define notify_dead NULL +#define notify_dying NULL #endif #ifdef CONFIG_HOTPLUG_CPU @@ -615,6 +636,28 @@ void smpboot_thread_init(void) register_cpu_notifier(&smpboot_thread_notifier); } +/** + * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers + * @cpu: cpu that just started + * + * This function calls the cpu_chain notifiers with CPU_STARTING. + * It must be called by the arch code on the new cpu, before the new cpu + * enables interrupts and before the "boot" cpu returns from __cpu_up(). + */ +void notify_cpu_starting(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); + + while (st->state < target) { + struct cpuhp_step *step; + + st->state++; + step = cpuhp_ap_states + st->state; + cpuhp_invoke_callback(cpu, st->state, step->startup); + } +} + static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) { for (st->state--; st->state > st->target; st->state--) { @@ -842,19 +885,6 @@ core_initcall(cpu_hotplug_pm_sync_init); #endif /* CONFIG_PM_SLEEP_SMP */ -/** - * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers - * @cpu: cpu that just started - * - * This function calls the cpu_chain notifiers with CPU_STARTING. - * It must be called by the arch code on the new cpu, before the new cpu - * enables interrupts and before the "boot" cpu returns from __cpu_up(). - */ -void notify_cpu_starting(unsigned int cpu) -{ - cpu_notify(CPU_STARTING, cpu); -} - #endif /* CONFIG_SMP */ /* Boot processor state steps */ @@ -879,8 +909,12 @@ static struct cpuhp_step cpuhp_bp_states[] = { [CPUHP_BRINGUP_CPU] = { .name = "cpu:bringup", .startup = bringup_cpu, + .teardown = NULL, + }, + [CPUHP_TEARDOWN_CPU] = { + .name = "cpu:teardown", + .startup = NULL, .teardown = takedown_cpu, - .skip_onerr = true, }, [CPUHP_NOTIFY_ONLINE] = { .name = "notify:online", @@ -895,6 +929,23 @@ static struct cpuhp_step cpuhp_bp_states[] = { }, }; +/* Application processor state steps */ +static struct cpuhp_step cpuhp_ap_states[] = { +#ifdef CONFIG_SMP + [CPUHP_AP_NOTIFY_STARTING] = { + .name = "notify:starting", + .startup = notify_starting, + .teardown = notify_dying, + .skip_onerr = true, + }, +#endif + [CPUHP_ONLINE] = { + .name = "online", + .startup = NULL, + .teardown = NULL, + }, +}; + /* * cpu_bit_bitmap[] is a special, "compressed" data structure that * represents all NR_CPUS bits binary values of 1< Date: Fri, 26 Feb 2016 18:43:30 +0000 Subject: cpu/hotplug: Hand in target state to _cpu_up/down We want to be able to bringup/teardown the cpu to a particular state. Add a target argument to _cpu_up/down. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182340.862113133@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 797723e81756..a00f8f67e4ad 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -547,7 +547,8 @@ static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) } /* Requires cpu_add_remove_lock to be held */ -static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) +static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, + enum cpuhp_state target) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int prev_state, ret = 0; @@ -564,7 +565,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) cpuhp_tasks_frozen = tasks_frozen; prev_state = st->state; - st->target = CPUHP_OFFLINE; + st->target = target; for (; st->state > st->target; st->state--) { struct cpuhp_step *step = cpuhp_bp_states + st->state; @@ -584,7 +585,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) return ret; } -int cpu_down(unsigned int cpu) +static int do_cpu_down(unsigned int cpu, enum cpuhp_state target) { int err; @@ -595,12 +596,16 @@ int cpu_down(unsigned int cpu) goto out; } - err = _cpu_down(cpu, 0); + err = _cpu_down(cpu, 0, target); out: cpu_maps_update_done(); return err; } +int cpu_down(unsigned int cpu) +{ + return do_cpu_down(cpu, CPUHP_OFFLINE); +} EXPORT_SYMBOL(cpu_down); #endif /*CONFIG_HOTPLUG_CPU*/ @@ -669,7 +674,7 @@ static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) } /* Requires cpu_add_remove_lock to be held */ -static int _cpu_up(unsigned int cpu, int tasks_frozen) +static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); struct task_struct *idle; @@ -692,7 +697,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) cpuhp_tasks_frozen = tasks_frozen; prev_state = st->state; - st->target = CPUHP_ONLINE; + st->target = target; while (st->state < st->target) { struct cpuhp_step *step; @@ -710,7 +715,7 @@ out: return ret; } -int cpu_up(unsigned int cpu) +static int do_cpu_up(unsigned int cpu, enum cpuhp_state target) { int err = 0; @@ -734,12 +739,16 @@ int cpu_up(unsigned int cpu) goto out; } - err = _cpu_up(cpu, 0); - + err = _cpu_up(cpu, 0, target); out: cpu_maps_update_done(); return err; } + +int cpu_up(unsigned int cpu) +{ + return do_cpu_up(cpu, CPUHP_ONLINE); +} EXPORT_SYMBOL_GPL(cpu_up); #ifdef CONFIG_PM_SLEEP_SMP @@ -762,7 +771,7 @@ int disable_nonboot_cpus(void) if (cpu == first_cpu) continue; trace_suspend_resume(TPS("CPU_OFF"), cpu, true); - error = _cpu_down(cpu, 1); + error = _cpu_down(cpu, 1, CPUHP_OFFLINE); trace_suspend_resume(TPS("CPU_OFF"), cpu, false); if (!error) cpumask_set_cpu(cpu, frozen_cpus); @@ -812,7 +821,7 @@ void enable_nonboot_cpus(void) for_each_cpu(cpu, frozen_cpus) { trace_suspend_resume(TPS("CPU_ON"), cpu, true); - error = _cpu_up(cpu, 1); + error = _cpu_up(cpu, 1, CPUHP_ONLINE); trace_suspend_resume(TPS("CPU_ON"), cpu, false); if (!error) { pr_info("CPU%d is up\n", cpu); -- cgit From 98f8cdce1db580b99fce823a48eea2cb2bdb261e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:31 +0000 Subject: cpu/hotplug: Add sysfs state interface Add a sysfs interface so we can actually see in which state the cpus are in. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182340.942257522@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 100 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index a00f8f67e4ad..1979b8927b86 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -56,6 +56,7 @@ struct cpuhp_step { bool skip_onerr; }; +static DEFINE_MUTEX(cpuhp_state_mutex); static struct cpuhp_step cpuhp_bp_states[]; static struct cpuhp_step cpuhp_ap_states[]; @@ -955,6 +956,105 @@ static struct cpuhp_step cpuhp_ap_states[] = { }, }; +static bool cpuhp_is_ap_state(enum cpuhp_state state) +{ + return (state > CPUHP_AP_OFFLINE && state < CPUHP_AP_ONLINE); +} + +static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) +{ + struct cpuhp_step *sp; + + sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states; + return sp + state; +} + +#if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU) +static ssize_t show_cpuhp_state(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); + + return sprintf(buf, "%d\n", st->state); +} +static DEVICE_ATTR(state, 0444, show_cpuhp_state, NULL); + +static ssize_t show_cpuhp_target(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); + + return sprintf(buf, "%d\n", st->target); +} +static DEVICE_ATTR(target, 0444, show_cpuhp_target, NULL); + +static struct attribute *cpuhp_cpu_attrs[] = { + &dev_attr_state.attr, + &dev_attr_target.attr, + NULL +}; + +static struct attribute_group cpuhp_cpu_attr_group = { + .attrs = cpuhp_cpu_attrs, + .name = "hotplug", + NULL +}; + +static ssize_t show_cpuhp_states(struct device *dev, + struct device_attribute *attr, char *buf) +{ + ssize_t cur, res = 0; + int i; + + mutex_lock(&cpuhp_state_mutex); + for (i = 0; i <= CPUHP_ONLINE; i++) { + struct cpuhp_step *sp = cpuhp_get_step(i); + + if (sp->name) { + cur = sprintf(buf, "%3d: %s\n", i, sp->name); + buf += cur; + res += cur; + } + } + mutex_unlock(&cpuhp_state_mutex); + return res; +} +static DEVICE_ATTR(states, 0444, show_cpuhp_states, NULL); + +static struct attribute *cpuhp_cpu_root_attrs[] = { + &dev_attr_states.attr, + NULL +}; + +static struct attribute_group cpuhp_cpu_root_attr_group = { + .attrs = cpuhp_cpu_root_attrs, + .name = "hotplug", + NULL +}; + +static int __init cpuhp_sysfs_init(void) +{ + int cpu, ret; + + ret = sysfs_create_group(&cpu_subsys.dev_root->kobj, + &cpuhp_cpu_root_attr_group); + if (ret) + return ret; + + for_each_possible_cpu(cpu) { + struct device *dev = get_cpu_device(cpu); + + if (!dev) + continue; + ret = sysfs_create_group(&dev->kobj, &cpuhp_cpu_attr_group); + if (ret) + return ret; + } + return 0; +} +device_initcall(cpuhp_sysfs_init); +#endif + /* * cpu_bit_bitmap[] is a special, "compressed" data structure that * represents all NR_CPUS bits binary values of 1< Date: Fri, 26 Feb 2016 18:43:32 +0000 Subject: cpu/hotplug: Make target state writeable Make it possible to write a target state to the per cpu state file, so we can switch between states. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.022814799@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++------ lib/Kconfig.debug | 13 ++++++++++ 2 files changed, 78 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 1979b8927b86..be9335da82f1 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -48,12 +48,14 @@ static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); * @teardown: Teardown function of the step * @skip_onerr: Do not invoke the functions on error rollback * Will go away once the notifiers are gone + * @cant_stop: Bringup/teardown can't be stopped at this step */ struct cpuhp_step { const char *name; int (*startup)(unsigned int cpu); int (*teardown)(unsigned int cpu); bool skip_onerr; + bool cant_stop; }; static DEFINE_MUTEX(cpuhp_state_mutex); @@ -558,7 +560,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, if (num_online_cpus() == 1) return -EBUSY; - if (!cpu_online(cpu)) + if (!cpu_present(cpu)) return -EINVAL; cpu_hotplug_begin(); @@ -683,16 +685,25 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) cpu_hotplug_begin(); - if (cpu_online(cpu) || !cpu_present(cpu)) { + if (!cpu_present(cpu)) { ret = -EINVAL; goto out; } - /* Let it fail before we try to bring the cpu up */ - idle = idle_thread_get(cpu); - if (IS_ERR(idle)) { - ret = PTR_ERR(idle); + /* + * The caller of do_cpu_up might have raced with another + * caller. Ignore it for now. + */ + if (st->state >= target) goto out; + + if (st->state == CPUHP_OFFLINE) { + /* Let it fail before we try to bring the cpu up */ + idle = idle_thread_get(cpu); + if (IS_ERR(idle)) { + ret = PTR_ERR(idle); + goto out; + } } cpuhp_tasks_frozen = tasks_frozen; @@ -909,27 +920,32 @@ static struct cpuhp_step cpuhp_bp_states[] = { .name = "threads:create", .startup = smpboot_create_threads, .teardown = NULL, + .cant_stop = true, }, [CPUHP_NOTIFY_PREPARE] = { .name = "notify:prepare", .startup = notify_prepare, .teardown = notify_dead, .skip_onerr = true, + .cant_stop = true, }, [CPUHP_BRINGUP_CPU] = { .name = "cpu:bringup", .startup = bringup_cpu, .teardown = NULL, + .cant_stop = true, }, [CPUHP_TEARDOWN_CPU] = { .name = "cpu:teardown", .startup = NULL, .teardown = takedown_cpu, + .cant_stop = true, }, [CPUHP_NOTIFY_ONLINE] = { .name = "notify:online", .startup = notify_online, .teardown = notify_down_prepare, + .cant_stop = true, }, #endif [CPUHP_ONLINE] = { @@ -947,6 +963,7 @@ static struct cpuhp_step cpuhp_ap_states[] = { .startup = notify_starting, .teardown = notify_dying, .skip_onerr = true, + .cant_stop = true, }, #endif [CPUHP_ONLINE] = { @@ -979,6 +996,46 @@ static ssize_t show_cpuhp_state(struct device *dev, } static DEVICE_ATTR(state, 0444, show_cpuhp_state, NULL); +static ssize_t write_cpuhp_target(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); + struct cpuhp_step *sp; + int target, ret; + + ret = kstrtoint(buf, 10, &target); + if (ret) + return ret; + +#ifdef CONFIG_CPU_HOTPLUG_STATE_CONTROL + if (target < CPUHP_OFFLINE || target > CPUHP_ONLINE) + return -EINVAL; +#else + if (target != CPUHP_OFFLINE && target != CPUHP_ONLINE) + return -EINVAL; +#endif + + ret = lock_device_hotplug_sysfs(); + if (ret) + return ret; + + mutex_lock(&cpuhp_state_mutex); + sp = cpuhp_get_step(target); + ret = !sp->name || sp->cant_stop ? -EINVAL : 0; + mutex_unlock(&cpuhp_state_mutex); + if (ret) + return ret; + + if (st->state < target) + ret = do_cpu_up(dev->id, target); + else + ret = do_cpu_down(dev->id, target); + + unlock_device_hotplug(); + return ret ? ret : count; +} + static ssize_t show_cpuhp_target(struct device *dev, struct device_attribute *attr, char *buf) { @@ -986,7 +1043,7 @@ static ssize_t show_cpuhp_target(struct device *dev, return sprintf(buf, "%d\n", st->target); } -static DEVICE_ATTR(target, 0444, show_cpuhp_target, NULL); +static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target); static struct attribute *cpuhp_cpu_attrs[] = { &dev_attr_state.attr, @@ -1007,7 +1064,7 @@ static ssize_t show_cpuhp_states(struct device *dev, int i; mutex_lock(&cpuhp_state_mutex); - for (i = 0; i <= CPUHP_ONLINE; i++) { + for (i = CPUHP_OFFLINE; i <= CPUHP_ONLINE; i++) { struct cpuhp_step *sp = cpuhp_get_step(i); if (sp->name) { diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 8bfd1aca7a3d..f28f7fad452f 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1442,6 +1442,19 @@ config DEBUG_BLOCK_EXT_DEVT Say N if you are unsure. +config CPU_HOTPLUG_STATE_CONTROL + bool "Enable CPU hotplug state control" + depends on DEBUG_KERNEL + depends on HOTPLUG_CPU + default n + help + Allows to write steps between "offline" and "online" to the CPUs + sysfs target file so states can be stepped granular. This is a debug + option for now as the hotplug machinery cannot be stopped and + restarted at arbitrary points yet. + + Say N if your are unsure. + config NOTIFIER_ERROR_INJECTION tristate "Notifier error injection" depends on DEBUG_KERNEL -- cgit From 5b7aa87e0482be768486e0c2277aa4122487eb9d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:33 +0000 Subject: cpu/hotplug: Implement setup/removal interface Implement function which allow to setup/remove hotplug state callbacks. The default behaviour for setup is to call the startup function for this state for (or on) all cpus which have a hotplug state >= the installed state. The default behaviour for removal is to call the teardown function for this state for (or on) all cpus which have a hotplug state >= the installed state. This includes rollback to the previous state in case of failure. A special state is CPUHP_ONLINE_DYN. Its for dynamically registering a hotplug callback pair. This is for drivers which have no dependencies to avoid that we need to allocate CPUHP states for each of them For both setup and remove helper functions are provided, which prevent the core to issue the callbacks. This simplifies the conversion of existing hotplug notifiers. [ Dynamic registering implemented by Sebastian Siewior ] Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.103464877@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpuhotplug.h | 67 ++++++++++++++ kernel/cpu.c | 224 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 291 insertions(+) (limited to 'kernel') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index d9303cca83d3..29935261b26d 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -11,7 +11,74 @@ enum cpuhp_state { CPUHP_AP_ONLINE, CPUHP_TEARDOWN_CPU, CPUHP_NOTIFY_ONLINE, + CPUHP_ONLINE_DYN, + CPUHP_ONLINE_DYN_END = CPUHP_ONLINE_DYN + 30, CPUHP_ONLINE, }; +int __cpuhp_setup_state(enum cpuhp_state state, const char *name, bool invoke, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)); + +/** + * cpuhp_setup_state - Setup hotplug state callbacks with calling the callbacks + * @state: The state for which the calls are installed + * @name: Name of the callback (will be used in debug output) + * @startup: startup callback function + * @teardown: teardown callback function + * + * Installs the callback functions and invokes the startup callback on + * the present cpus which have already reached the @state. + */ +static inline int cpuhp_setup_state(enum cpuhp_state state, + const char *name, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)) +{ + return __cpuhp_setup_state(state, name, true, startup, teardown); +} + +/** + * cpuhp_setup_state_nocalls - Setup hotplug state callbacks without calling the + * callbacks + * @state: The state for which the calls are installed + * @name: Name of the callback. + * @startup: startup callback function + * @teardown: teardown callback function + * + * Same as @cpuhp_setup_state except that no calls are executed are invoked + * during installation of this callback. NOP if SMP=n or HOTPLUG_CPU=n. + */ +static inline int cpuhp_setup_state_nocalls(enum cpuhp_state state, + const char *name, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)) +{ + return __cpuhp_setup_state(state, name, false, startup, teardown); +} + +void __cpuhp_remove_state(enum cpuhp_state state, bool invoke); + +/** + * cpuhp_remove_state - Remove hotplug state callbacks and invoke the teardown + * @state: The state for which the calls are removed + * + * Removes the callback functions and invokes the teardown callback on + * the present cpus which have already reached the @state. + */ +static inline void cpuhp_remove_state(enum cpuhp_state state) +{ + __cpuhp_remove_state(state, true); +} + +/** + * cpuhp_remove_state_nocalls - Remove hotplug state callbacks without invoking + * teardown + * @state: The state for which the calls are removed + */ +static inline void cpuhp_remove_state_nocalls(enum cpuhp_state state) +{ + __cpuhp_remove_state(state, false); +} + #endif diff --git a/kernel/cpu.c b/kernel/cpu.c index be9335da82f1..b5eacb9587af 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -973,6 +973,14 @@ static struct cpuhp_step cpuhp_ap_states[] = { }, }; +/* Sanity check for callbacks */ +static int cpuhp_cb_check(enum cpuhp_state state) +{ + if (state <= CPUHP_OFFLINE || state >= CPUHP_ONLINE) + return -EINVAL; + return 0; +} + static bool cpuhp_is_ap_state(enum cpuhp_state state) { return (state > CPUHP_AP_OFFLINE && state < CPUHP_AP_ONLINE); @@ -986,6 +994,222 @@ static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) return sp + state; } +static void cpuhp_store_callbacks(enum cpuhp_state state, + const char *name, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)) +{ + /* (Un)Install the callbacks for further cpu hotplug operations */ + struct cpuhp_step *sp; + + mutex_lock(&cpuhp_state_mutex); + sp = cpuhp_get_step(state); + sp->startup = startup; + sp->teardown = teardown; + sp->name = name; + mutex_unlock(&cpuhp_state_mutex); +} + +static void *cpuhp_get_teardown_cb(enum cpuhp_state state) +{ + return cpuhp_get_step(state)->teardown; +} + +/* Helper function to run callback on the target cpu */ +static void cpuhp_on_cpu_cb(void *__cb) +{ + int (*cb)(unsigned int cpu) = __cb; + + BUG_ON(cb(smp_processor_id())); +} + +/* + * Call the startup/teardown function for a step either on the AP or + * on the current CPU. + */ +static int cpuhp_issue_call(int cpu, enum cpuhp_state state, + int (*cb)(unsigned int), bool bringup) +{ + int ret; + + if (!cb) + return 0; + + /* + * This invokes the callback directly for now. In a later step we + * convert that to use cpuhp_invoke_callback(). + */ + if (cpuhp_is_ap_state(state)) { + /* + * Note, that a function called on the AP is not + * allowed to fail. + */ + if (cpu_online(cpu)) + smp_call_function_single(cpu, cpuhp_on_cpu_cb, cb, 1); + return 0; + } + + /* + * The non AP bound callbacks can fail on bringup. On teardown + * e.g. module removal we crash for now. + */ + ret = cb(cpu); + BUG_ON(ret && !bringup); + return ret; +} + +/* + * Called from __cpuhp_setup_state on a recoverable failure. + * + * Note: The teardown callbacks for rollback are not allowed to fail! + */ +static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, + int (*teardown)(unsigned int cpu)) +{ + int cpu; + + if (!teardown) + return; + + /* Roll back the already executed steps on the other cpus */ + for_each_present_cpu(cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int cpustate = st->state; + + if (cpu >= failedcpu) + break; + + /* Did we invoke the startup call on that cpu ? */ + if (cpustate >= state) + cpuhp_issue_call(cpu, state, teardown, false); + } +} + +/* + * Returns a free for dynamic slot assignment of the Online state. The states + * are protected by the cpuhp_slot_states mutex and an empty slot is identified + * by having no name assigned. + */ +static int cpuhp_reserve_state(enum cpuhp_state state) +{ + enum cpuhp_state i; + + mutex_lock(&cpuhp_state_mutex); + for (i = CPUHP_ONLINE_DYN; i <= CPUHP_ONLINE_DYN_END; i++) { + if (cpuhp_bp_states[i].name) + continue; + + cpuhp_bp_states[i].name = "Reserved"; + mutex_unlock(&cpuhp_state_mutex); + return i; + } + mutex_unlock(&cpuhp_state_mutex); + WARN(1, "No more dynamic states available for CPU hotplug\n"); + return -ENOSPC; +} + +/** + * __cpuhp_setup_state - Setup the callbacks for an hotplug machine state + * @state: The state to setup + * @invoke: If true, the startup function is invoked for cpus where + * cpu state >= @state + * @startup: startup callback function + * @teardown: teardown callback function + * + * Returns 0 if successful, otherwise a proper error code + */ +int __cpuhp_setup_state(enum cpuhp_state state, + const char *name, bool invoke, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)) +{ + int cpu, ret = 0; + int dyn_state = 0; + + if (cpuhp_cb_check(state) || !name) + return -EINVAL; + + get_online_cpus(); + + /* currently assignments for the ONLINE state are possible */ + if (state == CPUHP_ONLINE_DYN) { + dyn_state = 1; + ret = cpuhp_reserve_state(state); + if (ret < 0) + goto out; + state = ret; + } + + cpuhp_store_callbacks(state, name, startup, teardown); + + if (!invoke || !startup) + goto out; + + /* + * Try to call the startup callback for each present cpu + * depending on the hotplug state of the cpu. + */ + for_each_present_cpu(cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int cpustate = st->state; + + if (cpustate < state) + continue; + + ret = cpuhp_issue_call(cpu, state, startup, true); + if (ret) { + cpuhp_rollback_install(cpu, state, teardown); + cpuhp_store_callbacks(state, NULL, NULL, NULL); + goto out; + } + } +out: + put_online_cpus(); + if (!ret && dyn_state) + return state; + return ret; +} +EXPORT_SYMBOL(__cpuhp_setup_state); + +/** + * __cpuhp_remove_state - Remove the callbacks for an hotplug machine state + * @state: The state to remove + * @invoke: If true, the teardown function is invoked for cpus where + * cpu state >= @state + * + * The teardown callback is currently not allowed to fail. Think + * about module removal! + */ +void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) +{ + int (*teardown)(unsigned int cpu) = cpuhp_get_teardown_cb(state); + int cpu; + + BUG_ON(cpuhp_cb_check(state)); + + get_online_cpus(); + + if (!invoke || !teardown) + goto remove; + + /* + * Call the teardown callback for each present cpu depending + * on the hotplug state of the cpu. This function is not + * allowed to fail currently! + */ + for_each_present_cpu(cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int cpustate = st->state; + + if (cpustate >= state) + cpuhp_issue_call(cpu, state, teardown, false); + } +remove: + cpuhp_store_callbacks(state, NULL, NULL, NULL); + put_online_cpus(); +} +EXPORT_SYMBOL(__cpuhp_remove_state); + #if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU) static ssize_t show_cpuhp_state(struct device *dev, struct device_attribute *attr, char *buf) -- cgit From 949338e35131c551f7bf54f48a2e3a227af6721b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:35 +0000 Subject: cpu/hotplug: Move scheduler cpu_online notifier to hotplug core Move the scheduler cpu online notifier part to the hotplug core. This is anyway the highest priority callback and we need that functionality right now for the next changes. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.200791046@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpuhotplug.h | 1 + kernel/cpu.c | 18 ++++++++++++++++++ kernel/sched/core.c | 10 ---------- 3 files changed, 19 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 29935261b26d..2f2e5d9711c4 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -10,6 +10,7 @@ enum cpuhp_state { CPUHP_AP_NOTIFY_STARTING, CPUHP_AP_ONLINE, CPUHP_TEARDOWN_CPU, + CPUHP_CPU_SET_ACTIVE, CPUHP_NOTIFY_ONLINE, CPUHP_ONLINE_DYN, CPUHP_ONLINE_DYN_END = CPUHP_ONLINE_DYN + 30, diff --git a/kernel/cpu.c b/kernel/cpu.c index b5eacb9587af..65e34d34ca93 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -666,6 +666,19 @@ void notify_cpu_starting(unsigned int cpu) } } +/* + * Called from the idle task. We need to set active here, so we can kick off + * the stopper thread. + */ +static int cpuhp_set_cpu_active(unsigned int cpu) +{ + /* The cpu is marked online, set it active now */ + set_cpu_active(cpu, true); + /* Unpark the stopper thread */ + stop_machine_unpark(cpu); + return 0; +} + static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) { for (st->state--; st->state > st->target; st->state--) { @@ -941,6 +954,11 @@ static struct cpuhp_step cpuhp_bp_states[] = { .teardown = takedown_cpu, .cant_stop = true, }, + [CPUHP_CPU_SET_ACTIVE] = { + .name = "cpu:active", + .startup = cpuhp_set_cpu_active, + .teardown = NULL, + }, [CPUHP_NOTIFY_ONLINE] = { .name = "notify:online", .startup = notify_online, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9503d590e5ef..626646396ca0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5692,16 +5692,6 @@ static int sched_cpu_active(struct notifier_block *nfb, set_cpu_rq_start_time(); return NOTIFY_OK; - case CPU_ONLINE: - /* - * At this point a starting CPU has marked itself as online via - * set_cpu_online(). But it might not yet have marked itself - * as active, which is essential from here on. - */ - set_cpu_active(cpu, true); - stop_machine_unpark(cpu); - return NOTIFY_OK; - case CPU_DOWN_FAILED: set_cpu_active(cpu, true); return NOTIFY_OK; -- cgit From 931ef163309ee955611f287dc65248b39a65fc9d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:36 +0000 Subject: cpu/hotplug: Unpark smpboot threads from the state machine Handle the smpboot threads in the state machine. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.295777684@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpu.h | 7 +------ include/linux/cpuhotplug.h | 1 + init/main.c | 1 - kernel/cpu.c | 39 +++++---------------------------------- kernel/smpboot.c | 6 ++++-- kernel/smpboot.h | 4 ++-- 6 files changed, 13 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 78989f20420f..83f35767016d 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -78,7 +78,7 @@ enum { /* migration should happen before other stuff but after perf */ CPU_PRI_PERF = 20, CPU_PRI_MIGRATION = 10, - CPU_PRI_SMPBOOT = 9, + /* bring up workqueues before normal notifiers and down after */ CPU_PRI_WORKQUEUE_UP = 5, CPU_PRI_WORKQUEUE_DOWN = -5, @@ -172,7 +172,6 @@ static inline void __unregister_cpu_notifier(struct notifier_block *nb) } #endif -void smpboot_thread_init(void); int cpu_up(unsigned int cpu); void notify_cpu_starting(unsigned int cpu); extern void cpu_maps_update_begin(void); @@ -221,10 +220,6 @@ static inline void cpu_notifier_register_done(void) { } -static inline void smpboot_thread_init(void) -{ -} - #endif /* CONFIG_SMP */ extern struct bus_type cpu_subsys; diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 2f2e5d9711c4..38679106fddd 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -11,6 +11,7 @@ enum cpuhp_state { CPUHP_AP_ONLINE, CPUHP_TEARDOWN_CPU, CPUHP_CPU_SET_ACTIVE, + CPUHP_SMPBOOT_THREADS, CPUHP_NOTIFY_ONLINE, CPUHP_ONLINE_DYN, CPUHP_ONLINE_DYN_END = CPUHP_ONLINE_DYN + 30, diff --git a/init/main.c b/init/main.c index c2ea72362ee3..55563fd36be3 100644 --- a/init/main.c +++ b/init/main.c @@ -388,7 +388,6 @@ static noinline void __init_refok rest_init(void) int pid; rcu_scheduler_starting(); - smpboot_thread_init(); /* * We need to spawn init first so that it obtains pid 1, however * the init task will end up wanting to create kthreads, which, if diff --git a/kernel/cpu.c b/kernel/cpu.c index 65e34d34ca93..3ec86bc414b7 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -481,8 +481,6 @@ static int takedown_cpu(unsigned int cpu) else synchronize_rcu(); - smpboot_park_threads(cpu); - /* * Prevent irq alloc/free while the dying cpu reorganizes the * interrupt affinities. @@ -612,38 +610,6 @@ int cpu_down(unsigned int cpu) EXPORT_SYMBOL(cpu_down); #endif /*CONFIG_HOTPLUG_CPU*/ -/* - * Unpark per-CPU smpboot kthreads at CPU-online time. - */ -static int smpboot_thread_call(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - - case CPU_DOWN_FAILED: - case CPU_ONLINE: - smpboot_unpark_threads(cpu); - break; - - default: - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block smpboot_thread_notifier = { - .notifier_call = smpboot_thread_call, - .priority = CPU_PRI_SMPBOOT, -}; - -void smpboot_thread_init(void) -{ - register_cpu_notifier(&smpboot_thread_notifier); -} - /** * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers * @cpu: cpu that just started @@ -959,6 +925,11 @@ static struct cpuhp_step cpuhp_bp_states[] = { .startup = cpuhp_set_cpu_active, .teardown = NULL, }, + [CPUHP_SMPBOOT_THREADS] = { + .name = "smpboot:threads", + .startup = smpboot_unpark_threads, + .teardown = smpboot_park_threads, + }, [CPUHP_NOTIFY_ONLINE] = { .name = "notify:online", .startup = notify_online, diff --git a/kernel/smpboot.c b/kernel/smpboot.c index d264f59bff56..13bc43d1fb22 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -226,7 +226,7 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp kthread_unpark(tsk); } -void smpboot_unpark_threads(unsigned int cpu) +int smpboot_unpark_threads(unsigned int cpu) { struct smp_hotplug_thread *cur; @@ -235,6 +235,7 @@ void smpboot_unpark_threads(unsigned int cpu) if (cpumask_test_cpu(cpu, cur->cpumask)) smpboot_unpark_thread(cur, cpu); mutex_unlock(&smpboot_threads_lock); + return 0; } static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu) @@ -245,7 +246,7 @@ static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu) kthread_park(tsk); } -void smpboot_park_threads(unsigned int cpu) +int smpboot_park_threads(unsigned int cpu) { struct smp_hotplug_thread *cur; @@ -253,6 +254,7 @@ void smpboot_park_threads(unsigned int cpu) list_for_each_entry_reverse(cur, &hotplug_threads, list) smpboot_park_thread(cur, cpu); mutex_unlock(&smpboot_threads_lock); + return 0; } static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) diff --git a/kernel/smpboot.h b/kernel/smpboot.h index 72415a0eb955..6b5f02017be3 100644 --- a/kernel/smpboot.h +++ b/kernel/smpboot.h @@ -14,7 +14,7 @@ static inline void idle_threads_init(void) { } #endif int smpboot_create_threads(unsigned int cpu); -void smpboot_park_threads(unsigned int cpu); -void smpboot_unpark_threads(unsigned int cpu); +int smpboot_park_threads(unsigned int cpu); +int smpboot_unpark_threads(unsigned int cpu); #endif -- cgit From 2e1a3483ce74d197876e58281925dd4e378a7f28 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:37 +0000 Subject: cpu/hotplug: Split out the state walk into functions We need that for running callbacks on the AP and the BP. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.374946234@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 111 ++++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 68 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 3ec86bc414b7..9572ca043727 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -329,10 +329,74 @@ static int bringup_cpu(unsigned int cpu) return 0; } +/* + * Hotplug state machine related functions + */ +static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st, + struct cpuhp_step *steps) +{ + for (st->state++; st->state < st->target; st->state++) { + struct cpuhp_step *step = steps + st->state; + + if (!step->skip_onerr) + cpuhp_invoke_callback(cpu, st->state, step->startup); + } +} + +static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, + struct cpuhp_step *steps, enum cpuhp_state target) +{ + enum cpuhp_state prev_state = st->state; + int ret = 0; + + for (; st->state > target; st->state--) { + struct cpuhp_step *step = steps + st->state; + + ret = cpuhp_invoke_callback(cpu, st->state, step->teardown); + if (ret) { + st->target = prev_state; + undo_cpu_down(cpu, st, steps); + break; + } + } + return ret; +} + +static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st, + struct cpuhp_step *steps) +{ + for (st->state--; st->state > st->target; st->state--) { + struct cpuhp_step *step = steps + st->state; + + if (!step->skip_onerr) + cpuhp_invoke_callback(cpu, st->state, step->teardown); + } +} + +static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, + struct cpuhp_step *steps, enum cpuhp_state target) +{ + enum cpuhp_state prev_state = st->state; + int ret = 0; + + while (st->state < target) { + struct cpuhp_step *step; + + st->state++; + step = steps + st->state; + ret = cpuhp_invoke_callback(cpu, st->state, step->startup); + if (ret) { + st->target = prev_state; + undo_cpu_up(cpu, st, steps); + break; + } + } + return ret; +} + #ifdef CONFIG_HOTPLUG_CPU EXPORT_SYMBOL(register_cpu_notifier); EXPORT_SYMBOL(__register_cpu_notifier); - void unregister_cpu_notifier(struct notifier_block *nb) { cpu_maps_update_begin(); @@ -537,15 +601,6 @@ static int notify_dead(unsigned int cpu) #endif #ifdef CONFIG_HOTPLUG_CPU -static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) -{ - for (st->state++; st->state < st->target; st->state++) { - struct cpuhp_step *step = cpuhp_bp_states + st->state; - - if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, step->startup); - } -} /* Requires cpu_add_remove_lock to be held */ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, @@ -567,16 +622,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, prev_state = st->state; st->target = target; - for (; st->state > st->target; st->state--) { - struct cpuhp_step *step = cpuhp_bp_states + st->state; + ret = cpuhp_down_callbacks(cpu, st, cpuhp_bp_states, target); - ret = cpuhp_invoke_callback(cpu, st->state, step->teardown); - if (ret) { - st->target = prev_state; - undo_cpu_down(cpu, st); - break; - } - } hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE; cpu_hotplug_done(); @@ -645,22 +692,12 @@ static int cpuhp_set_cpu_active(unsigned int cpu) return 0; } -static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) -{ - for (st->state--; st->state > st->target; st->state--) { - struct cpuhp_step *step = cpuhp_bp_states + st->state; - - if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, step->teardown); - } -} - /* Requires cpu_add_remove_lock to be held */ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); struct task_struct *idle; - int prev_state, ret = 0; + int ret = 0; cpu_hotplug_begin(); @@ -687,20 +724,8 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) cpuhp_tasks_frozen = tasks_frozen; - prev_state = st->state; st->target = target; - while (st->state < st->target) { - struct cpuhp_step *step; - - st->state++; - step = cpuhp_bp_states + st->state; - ret = cpuhp_invoke_callback(cpu, st->state, step->startup); - if (ret) { - st->target = prev_state; - undo_cpu_up(cpu, st); - break; - } - } + ret = cpuhp_up_callbacks(cpu, st, cpuhp_bp_states, target); out: cpu_hotplug_done(); return ret; -- cgit From 4cb28ced23c4f222ff4e3f39898017e52161a9c9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:38 +0000 Subject: cpu/hotplug: Create hotplug threads In order to let the hotplugged cpu take care of the setup/teardown, we need a seperate hotplug thread. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.454541272@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 145 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- kernel/smp.c | 1 + kernel/smpboot.h | 2 + 3 files changed, 147 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 9572ca043727..9048c33689ac 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS @@ -33,10 +34,24 @@ * cpuhp_cpu_state - Per cpu hotplug state storage * @state: The current cpu state * @target: The target state + * @thread: Pointer to the hotplug thread + * @should_run: Thread should execute + * @cb_stat: The state for a single callback (install/uninstall) + * @cb: Single callback function (install/uninstall) + * @result: Result of the operation + * @done: Signal completion to the issuer of the task */ struct cpuhp_cpu_state { enum cpuhp_state state; enum cpuhp_state target; +#ifdef CONFIG_SMP + struct task_struct *thread; + bool should_run; + enum cpuhp_state cb_state; + int (*cb)(unsigned int cpu); + int result; + struct completion done; +#endif }; static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); @@ -394,6 +409,134 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, return ret; } +/* + * The cpu hotplug threads manage the bringup and teardown of the cpus + */ +static void cpuhp_create(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + + init_completion(&st->done); +} + +static int cpuhp_should_run(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); + + return st->should_run; +} + +/* Execute the teardown callbacks. Used to be CPU_DOWN_PREPARE */ +static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st) +{ + enum cpuhp_state target = max((int)st->target, CPUHP_AP_ONLINE); + + return cpuhp_down_callbacks(cpu, st, cpuhp_ap_states, target); +} + +/* Execute the online startup callbacks. Used to be CPU_ONLINE */ +static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st) +{ + return cpuhp_up_callbacks(cpu, st, cpuhp_ap_states, st->target); +} + +/* + * Execute teardown/startup callbacks on the plugged cpu. Also used to invoke + * callbacks when a state gets [un]installed at runtime. + */ +static void cpuhp_thread_fun(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); + int ret = 0; + + /* + * Paired with the mb() in cpuhp_kick_ap_work and + * cpuhp_invoke_ap_callback, so the work set is consistent visible. + */ + smp_mb(); + if (!st->should_run) + return; + + st->should_run = false; + + /* Single callback invocation for [un]install ? */ + if (st->cb) { + if (st->cb_state < CPUHP_AP_ONLINE) { + local_irq_disable(); + ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb); + local_irq_enable(); + } else { + ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb); + } + } else { + /* Regular hotplug work */ + if (st->state < st->target) + ret = cpuhp_ap_online(cpu, st); + else if (st->state > st->target) + ret = cpuhp_ap_offline(cpu, st); + } + st->result = ret; + complete(&st->done); +} + +/* Invoke a single callback on a remote cpu */ +static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, + int (*cb)(unsigned int)) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + + if (!cpu_online(cpu)) + return 0; + + st->cb_state = state; + st->cb = cb; + /* + * Make sure the above stores are visible before should_run becomes + * true. Paired with the mb() above in cpuhp_thread_fun() + */ + smp_mb(); + st->should_run = true; + wake_up_process(st->thread); + wait_for_completion(&st->done); + return st->result; +} + +/* Regular hotplug invocation of the AP hotplug thread */ +static int cpuhp_kick_ap_work(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + enum cpuhp_state state = st->state; + + trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work); + st->result = 0; + st->cb = NULL; + /* + * Make sure the above stores are visible before should_run becomes + * true. Paired with the mb() above in cpuhp_thread_fun() + */ + smp_mb(); + st->should_run = true; + wake_up_process(st->thread); + wait_for_completion(&st->done); + trace_cpuhp_exit(cpu, st->state, state, st->result); + return st->result; +} + +static struct smp_hotplug_thread cpuhp_threads = { + .store = &cpuhp_state.thread, + .create = &cpuhp_create, + .thread_should_run = cpuhp_should_run, + .thread_fn = cpuhp_thread_fun, + .thread_comm = "cpuhp/%u", + .selfparking = true, +}; + +void __init cpuhp_threads_init(void) +{ + BUG_ON(smpboot_register_percpu_thread(&cpuhp_threads)); + kthread_unpark(this_cpu_read(cpuhp_state.thread)); +} + #ifdef CONFIG_HOTPLUG_CPU EXPORT_SYMBOL(register_cpu_notifier); EXPORT_SYMBOL(__register_cpu_notifier); @@ -997,7 +1140,7 @@ static int cpuhp_cb_check(enum cpuhp_state state) static bool cpuhp_is_ap_state(enum cpuhp_state state) { - return (state > CPUHP_AP_OFFLINE && state < CPUHP_AP_ONLINE); + return (state >= CPUHP_AP_OFFLINE && state <= CPUHP_AP_ONLINE); } static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) diff --git a/kernel/smp.c b/kernel/smp.c index d903c02223af..822ffb1ada3f 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -569,6 +569,7 @@ void __init smp_init(void) unsigned int cpu; idle_threads_init(); + cpuhp_threads_init(); /* FIXME: This should be done in userspace --RR */ for_each_present_cpu(cpu) { diff --git a/kernel/smpboot.h b/kernel/smpboot.h index 6b5f02017be3..485b81cfab34 100644 --- a/kernel/smpboot.h +++ b/kernel/smpboot.h @@ -17,4 +17,6 @@ int smpboot_create_threads(unsigned int cpu); int smpboot_park_threads(unsigned int cpu); int smpboot_unpark_threads(unsigned int cpu); +void __init cpuhp_threads_init(void); + #endif -- cgit From 1cf4f629d9d246519a1e76c021806f2a51ddba4d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:39 +0000 Subject: cpu/hotplug: Move online calls to hotplugged cpu Let the hotplugged cpu invoke the setup/teardown callbacks (CPU_ONLINE/CPU_DOWN_PREPARE) itself. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.536364371@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpuhotplug.h | 10 ++-- kernel/cpu.c | 144 ++++++++++++++++++++++++++++++--------------- 2 files changed, 102 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 38679106fddd..8a715bb1e192 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -11,10 +11,12 @@ enum cpuhp_state { CPUHP_AP_ONLINE, CPUHP_TEARDOWN_CPU, CPUHP_CPU_SET_ACTIVE, - CPUHP_SMPBOOT_THREADS, - CPUHP_NOTIFY_ONLINE, - CPUHP_ONLINE_DYN, - CPUHP_ONLINE_DYN_END = CPUHP_ONLINE_DYN + 30, + CPUHP_KICK_AP_THREAD, + CPUHP_BP_ONLINE, + CPUHP_AP_SMPBOOT_THREADS, + CPUHP_AP_NOTIFY_ONLINE, + CPUHP_AP_ONLINE_DYN, + CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 30, CPUHP_ONLINE, }; diff --git a/kernel/cpu.c b/kernel/cpu.c index 9048c33689ac..e220e565ea98 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -429,7 +429,7 @@ static int cpuhp_should_run(unsigned int cpu) /* Execute the teardown callbacks. Used to be CPU_DOWN_PREPARE */ static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st) { - enum cpuhp_state target = max((int)st->target, CPUHP_AP_ONLINE); + enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU); return cpuhp_down_callbacks(cpu, st, cpuhp_ap_states, target); } @@ -469,6 +469,9 @@ static void cpuhp_thread_fun(unsigned int cpu) ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb); } } else { + /* Cannot happen .... */ + BUG_ON(st->state < CPUHP_KICK_AP_THREAD); + /* Regular hotplug work */ if (st->state < st->target) ret = cpuhp_ap_online(cpu, st); @@ -502,12 +505,8 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, } /* Regular hotplug invocation of the AP hotplug thread */ -static int cpuhp_kick_ap_work(unsigned int cpu) +static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st) { - struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); - enum cpuhp_state state = st->state; - - trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work); st->result = 0; st->cb = NULL; /* @@ -517,6 +516,15 @@ static int cpuhp_kick_ap_work(unsigned int cpu) smp_mb(); st->should_run = true; wake_up_process(st->thread); +} + +static int cpuhp_kick_ap_work(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + enum cpuhp_state state = st->state; + + trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work); + __cpuhp_kick_ap_work(st); wait_for_completion(&st->done); trace_cpuhp_exit(cpu, st->state, state, st->result); return st->result; @@ -688,6 +696,9 @@ static int takedown_cpu(unsigned int cpu) else synchronize_rcu(); + /* Park the hotplug thread */ + kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread); + /* * Prevent irq alloc/free while the dying cpu reorganizes the * interrupt affinities. @@ -765,10 +776,34 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, prev_state = st->state; st->target = target; + /* + * If the current CPU state is in the range of the AP hotplug thread, + * then we need to kick the thread. + */ + if (st->state >= CPUHP_KICK_AP_THREAD) { + ret = cpuhp_kick_ap_work(cpu); + /* + * The AP side has done the error rollback already. Just + * return the error code.. + */ + if (ret) + goto out; + + /* + * We might have stopped still in the range of the AP hotplug + * thread. Nothing to do anymore. + */ + if (st->state >= CPUHP_KICK_AP_THREAD) + goto out; + } + /* + * The AP brought itself down below CPUHP_KICK_AP_THREAD. So we need + * to do the further cleanups. + */ ret = cpuhp_down_callbacks(cpu, st, cpuhp_bp_states, target); hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE; - +out: cpu_hotplug_done(); /* This post dead nonsense must die */ if (!ret && hasdied) @@ -828,10 +863,13 @@ void notify_cpu_starting(unsigned int cpu) */ static int cpuhp_set_cpu_active(unsigned int cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + /* The cpu is marked online, set it active now */ set_cpu_active(cpu, true); - /* Unpark the stopper thread */ + /* Unpark the stopper thread and the hotplug thread */ stop_machine_unpark(cpu); + kthread_unpark(st->thread); return 0; } @@ -868,6 +906,26 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) cpuhp_tasks_frozen = tasks_frozen; st->target = target; + /* + * If the current CPU state is in the range of the AP hotplug thread, + * then we need to kick the thread once more. + */ + if (st->state >= CPUHP_KICK_AP_THREAD) { + ret = cpuhp_kick_ap_work(cpu); + /* + * The AP side has done the error rollback already. Just + * return the error code.. + */ + if (ret) + goto out; + } + + /* + * Try to reach the target state. We max out on the BP at + * CPUHP_KICK_AP_THREAD. After that the AP hotplug thread is + * responsible for bringing it up to the target state. + */ + target = min((int)target, CPUHP_KICK_AP_THREAD); ret = cpuhp_up_callbacks(cpu, st, cpuhp_bp_states, target); out: cpu_hotplug_done(); @@ -1093,19 +1151,13 @@ static struct cpuhp_step cpuhp_bp_states[] = { .startup = cpuhp_set_cpu_active, .teardown = NULL, }, - [CPUHP_SMPBOOT_THREADS] = { - .name = "smpboot:threads", - .startup = smpboot_unpark_threads, - .teardown = smpboot_park_threads, - }, - [CPUHP_NOTIFY_ONLINE] = { - .name = "notify:online", - .startup = notify_online, - .teardown = notify_down_prepare, - .cant_stop = true, + [CPUHP_KICK_AP_THREAD] = { + .name = "cpuhp:kickthread", + .startup = cpuhp_kick_ap_work, + .teardown = cpuhp_kick_ap_work, }, #endif - [CPUHP_ONLINE] = { + [CPUHP_BP_ONLINE] = { .name = "online", .startup = NULL, .teardown = NULL, @@ -1122,6 +1174,16 @@ static struct cpuhp_step cpuhp_ap_states[] = { .skip_onerr = true, .cant_stop = true, }, + [CPUHP_AP_SMPBOOT_THREADS] = { + .name = "smpboot:threads", + .startup = smpboot_unpark_threads, + .teardown = smpboot_park_threads, + }, + [CPUHP_AP_NOTIFY_ONLINE] = { + .name = "notify:online", + .startup = notify_online, + .teardown = notify_down_prepare, + }, #endif [CPUHP_ONLINE] = { .name = "online", @@ -1140,7 +1202,9 @@ static int cpuhp_cb_check(enum cpuhp_state state) static bool cpuhp_is_ap_state(enum cpuhp_state state) { - return (state >= CPUHP_AP_OFFLINE && state <= CPUHP_AP_ONLINE); + if (state >= CPUHP_AP_OFFLINE && state <= CPUHP_AP_ONLINE) + return true; + return state > CPUHP_BP_ONLINE; } static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) @@ -1172,14 +1236,6 @@ static void *cpuhp_get_teardown_cb(enum cpuhp_state state) return cpuhp_get_step(state)->teardown; } -/* Helper function to run callback on the target cpu */ -static void cpuhp_on_cpu_cb(void *__cb) -{ - int (*cb)(unsigned int cpu) = __cb; - - BUG_ON(cb(smp_processor_id())); -} - /* * Call the startup/teardown function for a step either on the AP or * on the current CPU. @@ -1191,26 +1247,18 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, if (!cb) return 0; - - /* - * This invokes the callback directly for now. In a later step we - * convert that to use cpuhp_invoke_callback(). - */ - if (cpuhp_is_ap_state(state)) { - /* - * Note, that a function called on the AP is not - * allowed to fail. - */ - if (cpu_online(cpu)) - smp_call_function_single(cpu, cpuhp_on_cpu_cb, cb, 1); - return 0; - } - /* * The non AP bound callbacks can fail on bringup. On teardown * e.g. module removal we crash for now. */ - ret = cb(cpu); +#ifdef CONFIG_SMP + if (cpuhp_is_ap_state(state)) + ret = cpuhp_invoke_ap_callback(cpu, state, cb); + else + ret = cpuhp_invoke_callback(cpu, state, cb); +#else + ret = cpuhp_invoke_callback(cpu, state, cb); +#endif BUG_ON(ret && !bringup); return ret; } @@ -1252,11 +1300,11 @@ static int cpuhp_reserve_state(enum cpuhp_state state) enum cpuhp_state i; mutex_lock(&cpuhp_state_mutex); - for (i = CPUHP_ONLINE_DYN; i <= CPUHP_ONLINE_DYN_END; i++) { - if (cpuhp_bp_states[i].name) + for (i = CPUHP_AP_ONLINE_DYN; i <= CPUHP_AP_ONLINE_DYN_END; i++) { + if (cpuhp_ap_states[i].name) continue; - cpuhp_bp_states[i].name = "Reserved"; + cpuhp_ap_states[i].name = "Reserved"; mutex_unlock(&cpuhp_state_mutex); return i; } @@ -1289,7 +1337,7 @@ int __cpuhp_setup_state(enum cpuhp_state state, get_online_cpus(); /* currently assignments for the ONLINE state are possible */ - if (state == CPUHP_ONLINE_DYN) { + if (state == CPUHP_AP_ONLINE_DYN) { dyn_state = 1; ret = cpuhp_reserve_state(state); if (ret < 0) -- cgit From 8df3e07e7f21f2ed8d001e6fabf9505946b438aa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:41 +0000 Subject: cpu/hotplug: Let upcoming cpu bring itself fully up Let the upcoming cpu kick the hotplug thread and let itself complete the bringup. That way the controll side can just wait for the completion or later when we made the hotplug machinery async not care at all. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.697655464@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpuhotplug.h | 9 ++++--- kernel/cpu.c | 66 ++++++++++++++++++++++++++-------------------- kernel/sched/idle.c | 2 ++ 3 files changed, 45 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 4aa263adc536..ad5d7fcb0130 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -10,9 +10,6 @@ enum cpuhp_state { CPUHP_AP_NOTIFY_STARTING, CPUHP_AP_ONLINE, CPUHP_TEARDOWN_CPU, - CPUHP_CPU_SET_ACTIVE, - CPUHP_KICK_AP_THREAD, - CPUHP_BP_ONLINE, CPUHP_AP_ONLINE_IDLE, CPUHP_AP_SMPBOOT_THREADS, CPUHP_AP_NOTIFY_ONLINE, @@ -86,4 +83,10 @@ static inline void cpuhp_remove_state_nocalls(enum cpuhp_state state) __cpuhp_remove_state(state, false); } +#ifdef CONFIG_SMP +void cpuhp_online_idle(enum cpuhp_state state); +#else +static inline void cpuhp_online_idle(enum cpuhp_state state) { } +#endif + #endif diff --git a/kernel/cpu.c b/kernel/cpu.c index e220e565ea98..f1f880fac832 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -329,6 +329,14 @@ static int notify_starting(unsigned int cpu) return 0; } +static int bringup_wait_for_ap(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + + wait_for_completion(&st->done); + return st->result; +} + static int bringup_cpu(unsigned int cpu) { struct task_struct *idle = idle_thread_get(cpu); @@ -340,8 +348,9 @@ static int bringup_cpu(unsigned int cpu) cpu_notify(CPU_UP_CANCELED, cpu); return ret; } + ret = bringup_wait_for_ap(cpu); BUG_ON(!cpu_online(cpu)); - return 0; + return ret; } /* @@ -470,7 +479,7 @@ static void cpuhp_thread_fun(unsigned int cpu) } } else { /* Cannot happen .... */ - BUG_ON(st->state < CPUHP_KICK_AP_THREAD); + BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); /* Regular hotplug work */ if (st->state < st->target) @@ -780,7 +789,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, * If the current CPU state is in the range of the AP hotplug thread, * then we need to kick the thread. */ - if (st->state >= CPUHP_KICK_AP_THREAD) { + if (st->state > CPUHP_TEARDOWN_CPU) { ret = cpuhp_kick_ap_work(cpu); /* * The AP side has done the error rollback already. Just @@ -793,11 +802,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, * We might have stopped still in the range of the AP hotplug * thread. Nothing to do anymore. */ - if (st->state >= CPUHP_KICK_AP_THREAD) + if (st->state > CPUHP_TEARDOWN_CPU) goto out; } /* - * The AP brought itself down below CPUHP_KICK_AP_THREAD. So we need + * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need * to do the further cleanups. */ ret = cpuhp_down_callbacks(cpu, st, cpuhp_bp_states, target); @@ -859,18 +868,32 @@ void notify_cpu_starting(unsigned int cpu) /* * Called from the idle task. We need to set active here, so we can kick off - * the stopper thread. + * the stopper thread and unpark the smpboot threads. If the target state is + * beyond CPUHP_AP_ONLINE_IDLE we kick cpuhp thread and let it bring up the + * cpu further. */ -static int cpuhp_set_cpu_active(unsigned int cpu) +void cpuhp_online_idle(enum cpuhp_state state) { - struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); + unsigned int cpu = smp_processor_id(); + + /* Happens for the boot cpu */ + if (state != CPUHP_AP_ONLINE_IDLE) + return; + + st->state = CPUHP_AP_ONLINE_IDLE; /* The cpu is marked online, set it active now */ set_cpu_active(cpu, true); - /* Unpark the stopper thread and the hotplug thread */ + /* Unpark the stopper thread and the hotplug thread of this cpu */ stop_machine_unpark(cpu); kthread_unpark(st->thread); - return 0; + + /* Should we go further up ? */ + if (st->target > CPUHP_AP_ONLINE_IDLE) + __cpuhp_kick_ap_work(st); + else + complete(&st->done); } /* Requires cpu_add_remove_lock to be held */ @@ -910,7 +933,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) * If the current CPU state is in the range of the AP hotplug thread, * then we need to kick the thread once more. */ - if (st->state >= CPUHP_KICK_AP_THREAD) { + if (st->state > CPUHP_BRINGUP_CPU) { ret = cpuhp_kick_ap_work(cpu); /* * The AP side has done the error rollback already. Just @@ -922,10 +945,10 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) /* * Try to reach the target state. We max out on the BP at - * CPUHP_KICK_AP_THREAD. After that the AP hotplug thread is + * CPUHP_BRINGUP_CPU. After that the AP hotplug thread is * responsible for bringing it up to the target state. */ - target = min((int)target, CPUHP_KICK_AP_THREAD); + target = min((int)target, CPUHP_BRINGUP_CPU); ret = cpuhp_up_callbacks(cpu, st, cpuhp_bp_states, target); out: cpu_hotplug_done(); @@ -1146,22 +1169,7 @@ static struct cpuhp_step cpuhp_bp_states[] = { .teardown = takedown_cpu, .cant_stop = true, }, - [CPUHP_CPU_SET_ACTIVE] = { - .name = "cpu:active", - .startup = cpuhp_set_cpu_active, - .teardown = NULL, - }, - [CPUHP_KICK_AP_THREAD] = { - .name = "cpuhp:kickthread", - .startup = cpuhp_kick_ap_work, - .teardown = cpuhp_kick_ap_work, - }, #endif - [CPUHP_BP_ONLINE] = { - .name = "online", - .startup = NULL, - .teardown = NULL, - }, }; /* Application processor state steps */ @@ -1204,7 +1212,7 @@ static bool cpuhp_is_ap_state(enum cpuhp_state state) { if (state >= CPUHP_AP_OFFLINE && state <= CPUHP_AP_ONLINE) return true; - return state > CPUHP_BP_ONLINE; + return state > CPUHP_BRINGUP_CPU; } static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 544a7133cbd1..a4b9813afc96 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -291,5 +292,6 @@ void cpu_startup_entry(enum cpuhp_state state) boot_init_stack_canary(); #endif arch_cpu_idle_prepare(); + cpuhp_online_idle(state); cpu_idle_loop(); } -- cgit From e69aab13117efc1987620090e539b4ebeb33a04c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:43 +0000 Subject: cpu/hotplug: Make wait for dead cpu completion based Kill the busy spinning on the control side and just wait for the hotplugged cpu to tell that it reached the dead state. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.776157858@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpu.h | 5 +++-- include/linux/cpuhotplug.h | 1 + kernel/cpu.c | 16 ++++++++++++---- kernel/sched/idle.c | 5 +---- 4 files changed, 17 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 83f35767016d..91a48d1b4ca0 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -276,14 +276,15 @@ void arch_cpu_idle_enter(void); void arch_cpu_idle_exit(void); void arch_cpu_idle_dead(void); -DECLARE_PER_CPU(bool, cpu_dead_idle); - int cpu_report_state(int cpu); int cpu_check_up_prepare(int cpu); void cpu_set_state_online(int cpu); #ifdef CONFIG_HOTPLUG_CPU bool cpu_wait_death(unsigned int cpu, int seconds); bool cpu_report_death(void); +void cpuhp_report_idle_dead(void); +#else +static inline void cpuhp_report_idle_dead(void) { } #endif /* #ifdef CONFIG_HOTPLUG_CPU */ #endif /* _LINUX_CPU_H_ */ diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index ad5d7fcb0130..5d68e15e46b7 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -6,6 +6,7 @@ enum cpuhp_state { CPUHP_CREATE_THREADS, CPUHP_NOTIFY_PREPARE, CPUHP_BRINGUP_CPU, + CPUHP_AP_IDLE_DEAD, CPUHP_AP_OFFLINE, CPUHP_AP_NOTIFY_STARTING, CPUHP_AP_ONLINE, diff --git a/kernel/cpu.c b/kernel/cpu.c index f1f880fac832..0e8c07f2566e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -688,6 +688,7 @@ static int take_cpu_down(void *_param) static int takedown_cpu(unsigned int cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int err; /* @@ -733,10 +734,8 @@ static int takedown_cpu(unsigned int cpu) * * Wait for the stop thread to go away. */ - while (!per_cpu(cpu_dead_idle, cpu)) - cpu_relax(); - smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */ - per_cpu(cpu_dead_idle, cpu) = false; + wait_for_completion(&st->done); + BUG_ON(st->state != CPUHP_AP_IDLE_DEAD); /* Interrupts are moved away from the dying cpu, reenable alloc/free */ irq_unlock_sparse(); @@ -756,6 +755,15 @@ static int notify_dead(unsigned int cpu) return 0; } +void cpuhp_report_idle_dead(void) +{ + struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); + + BUG_ON(st->state != CPUHP_AP_OFFLINE); + st->state = CPUHP_AP_IDLE_DEAD; + complete(&st->done); +} + #else #define notify_down_prepare NULL #define takedown_cpu NULL diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index a4b9813afc96..8abbe89e9114 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -194,8 +194,6 @@ exit_idle: rcu_idle_exit(); } -DEFINE_PER_CPU(bool, cpu_dead_idle); - /* * Generic idle loop implementation * @@ -224,8 +222,7 @@ static void cpu_idle_loop(void) if (cpu_is_offline(smp_processor_id())) { rcu_cpu_notify(NULL, CPU_DYING_IDLE, (void *)(long)smp_processor_id()); - smp_mb(); /* all activity before dead. */ - this_cpu_write(cpu_dead_idle, true); + cpuhp_report_idle_dead(); arch_cpu_idle_dead(); } -- cgit From 27d50c7eeb0f03c3d3ca72aac4d2dd487ca1f3f0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:44 +0000 Subject: rcu: Make CPU_DYING_IDLE an explicit call Make the RCU CPU_DYING_IDLE callback an explicit function call, so it gets invoked at the proper place. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.870167933@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpu.h | 4 +-- include/linux/notifier.h | 2 ++ include/linux/rcupdate.h | 4 +-- kernel/cpu.c | 1 + kernel/rcu/tree.c | 70 +++++++++++++++++++++++++----------------------- kernel/sched/idle.c | 2 -- 6 files changed, 42 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 91a48d1b4ca0..f9b1fab4388a 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -101,9 +101,7 @@ enum { * Called on the new cpu, just before * enabling interrupts. Must not sleep, * must not fail */ -#define CPU_DYING_IDLE 0x000B /* CPU (unsigned)v dying, reached - * idle loop. */ -#define CPU_BROKEN 0x000C /* CPU (unsigned)v did not die properly, +#define CPU_BROKEN 0x000B /* CPU (unsigned)v did not die properly, * perhaps due to preemption. */ /* Used for CPU hotplug events occurring while tasks are frozen due to a suspend diff --git a/include/linux/notifier.h b/include/linux/notifier.h index d14a4c362465..4149868de4e6 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -47,6 +47,8 @@ * runtime initialization. */ +struct notifier_block; + typedef int (*notifier_fn_t)(struct notifier_block *nb, unsigned long action, void *data); diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 14e6f47ee16f..fc46fe3ea259 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -332,9 +332,7 @@ void rcu_init(void); void rcu_sched_qs(void); void rcu_bh_qs(void); void rcu_check_callbacks(int user); -struct notifier_block; -int rcu_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu); +void rcu_report_dead(unsigned int cpu); #ifndef CONFIG_TINY_RCU void rcu_end_inkernel_boot(void); diff --git a/kernel/cpu.c b/kernel/cpu.c index 0e8c07f2566e..ff8059b76a85 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -762,6 +762,7 @@ void cpuhp_report_idle_dead(void) BUG_ON(st->state != CPUHP_AP_OFFLINE); st->state = CPUHP_AP_IDLE_DEAD; complete(&st->done); + rcu_report_dead(smp_processor_id()); } #else diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e41dd4131f7a..85b41341272e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2606,28 +2606,6 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) } } -/* - * The CPU is exiting the idle loop into the arch_cpu_idle_dead() - * function. We now remove it from the rcu_node tree's ->qsmaskinit - * bit masks. - */ -static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) -{ - unsigned long flags; - unsigned long mask; - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ - - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) - return; - - /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ - mask = rdp->grpmask; - raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ - rnp->qsmaskinitnext &= ~mask; - raw_spin_unlock_irqrestore(&rnp->lock, flags); -} - /* * The CPU has been completely removed, and some other CPU is reporting * this fact from process context. Do the remainder of the cleanup, @@ -4247,6 +4225,43 @@ static void rcu_prepare_cpu(int cpu) rcu_init_percpu_data(cpu, rsp); } +#ifdef CONFIG_HOTPLUG_CPU +/* + * The CPU is exiting the idle loop into the arch_cpu_idle_dead() + * function. We now remove it from the rcu_node tree's ->qsmaskinit + * bit masks. + */ +static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) +{ + unsigned long flags; + unsigned long mask; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ + + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) + return; + + /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ + mask = rdp->grpmask; + raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ + rnp->qsmaskinitnext &= ~mask; + raw_spin_unlock_irqrestore(&rnp->lock, flags); +} + +void rcu_report_dead(unsigned int cpu) +{ + struct rcu_state *rsp; + + /* QS for any half-done expedited RCU-sched GP. */ + preempt_disable(); + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(rcu_sched_state.rda), true); + preempt_enable(); + for_each_rcu_flavor(rsp) + rcu_cleanup_dying_idle_cpu(cpu, rsp); +} +#endif + /* * Handle CPU online/offline notification events. */ @@ -4278,17 +4293,6 @@ int rcu_cpu_notify(struct notifier_block *self, for_each_rcu_flavor(rsp) rcu_cleanup_dying_cpu(rsp); break; - case CPU_DYING_IDLE: - /* QS for any half-done expedited RCU-sched GP. */ - preempt_disable(); - rcu_report_exp_rdp(&rcu_sched_state, - this_cpu_ptr(rcu_sched_state.rda), true); - preempt_enable(); - - for_each_rcu_flavor(rsp) { - rcu_cleanup_dying_idle_cpu(cpu, rsp); - } - break; case CPU_DEAD: case CPU_DEAD_FROZEN: case CPU_UP_CANCELED: diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 8abbe89e9114..bd12c6c714ec 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -220,8 +220,6 @@ static void cpu_idle_loop(void) rmb(); if (cpu_is_offline(smp_processor_id())) { - rcu_cpu_notify(NULL, CPU_DYING_IDLE, - (void *)(long)smp_processor_id()); cpuhp_report_idle_dead(); arch_cpu_idle_dead(); } -- cgit From 9b7f6597f013d449d6700d11820faf91ee0ec985 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 2 Mar 2016 12:53:31 +0100 Subject: sched/core: Get rid of 'cpu' argument in wq_worker_sleeping() Given that wq_worker_sleeping() could only be called for a CPU it is running on, we do not need passing a CPU ID as an argument. Suggested-by: Oleg Nesterov Cc: Oleg Nesterov Cc: Peter Zijlstra Signed-off-by: Alexander Gordeev Signed-off-by: Tejun Heo --- kernel/sched/core.c | 2 +- kernel/workqueue.c | 5 ++--- kernel/workqueue_internal.h | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 63d3a24e081a..81ff7f2ad37a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3257,7 +3257,7 @@ static void __sched notrace __schedule(bool preempt) if (prev->flags & PF_WQ_WORKER) { struct task_struct *to_wakeup; - to_wakeup = wq_worker_sleeping(prev, cpu); + to_wakeup = wq_worker_sleeping(prev); if (to_wakeup) try_to_wake_up_local(to_wakeup); } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3a1c99b0c1b3..16f4986205e9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -858,7 +858,6 @@ void wq_worker_waking_up(struct task_struct *task, int cpu) /** * wq_worker_sleeping - a worker is going to sleep * @task: task going to sleep - * @cpu: CPU in question, must be the current CPU number * * This function is called during schedule() when a busy worker is * going to sleep. Worker on the same cpu can be woken up by @@ -870,7 +869,7 @@ void wq_worker_waking_up(struct task_struct *task, int cpu) * Return: * Worker task on @cpu to wake up, %NULL if none. */ -struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) +struct task_struct *wq_worker_sleeping(struct task_struct *task) { struct worker *worker = kthread_data(task), *to_wakeup = NULL; struct worker_pool *pool; @@ -886,7 +885,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) pool = worker->pool; /* this can only happen on the local cpu */ - if (WARN_ON_ONCE(cpu != raw_smp_processor_id() || pool->cpu != cpu)) + if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id())) return NULL; /* diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 45215870ac6c..8635417c587b 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -69,6 +69,6 @@ static inline struct worker *current_wq_worker(void) * sched/core.c and workqueue.c. */ void wq_worker_waking_up(struct task_struct *task, int cpu); -struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu); +struct task_struct *wq_worker_sleeping(struct task_struct *task); #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ -- cgit From d027d45d8a17a4145eab2d841140e9acbb7feb59 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 7 Jun 2015 15:54:30 +0200 Subject: nohz: New tick dependency mask The tick dependency is evaluated on every IRQ and context switch. This consists is a batch of checks which determine whether it is safe to stop the tick or not. These checks are often split in many details: posix cpu timers, scheduler, sched clock, perf events.... each of which are made of smaller details: posix cpu timer involves checking process wide timers then thread wide timers. Perf involves checking freq events then more per cpu details. Checking these informations asynchronously every time we update the full dynticks state bring avoidable overhead and a messy layout. Let's introduce instead tick dependency masks: one for system wide dependency (unstable sched clock, freq based perf events), one for CPU wide dependency (sched, throttling perf events), and task/signal level dependencies (posix cpu timers). The subsystems are responsible for setting and clearing their dependency through a set of APIs that will take care of concurrent dependency mask modifications and kick targets to restart the relevant CPU tick whenever needed. This new dependency engine stays beside the old one until all subsystems having a tick dependency are converted to it. Suggested-by: Thomas Gleixner Suggested-by: Peter Zijlstra Reviewed-by: Chris Metcalf Cc: Christoph Lameter Cc: Chris Metcalf Cc: Ingo Molnar Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker --- include/linux/sched.h | 8 +++ include/linux/tick.h | 92 +++++++++++++++++++++++++++++ kernel/time/tick-sched.c | 150 ++++++++++++++++++++++++++++++++++++++++++++--- kernel/time/tick-sched.h | 1 + 4 files changed, 244 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index a10494a94cc3..307e48375f21 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -719,6 +719,10 @@ struct signal_struct { /* Earliest-expiration cache. */ struct task_cputime cputime_expires; +#ifdef CONFIG_NO_HZ_FULL + unsigned long tick_dep_mask; +#endif + struct list_head cpu_timers[3]; struct pid *tty_old_pgrp; @@ -1542,6 +1546,10 @@ struct task_struct { VTIME_SYS, } vtime_snap_whence; #endif + +#ifdef CONFIG_NO_HZ_FULL + unsigned long tick_dep_mask; +#endif unsigned long nvcsw, nivcsw; /* context switch counts */ u64 start_time; /* monotonic time in nsec */ u64 real_start_time; /* boot based time in nsec */ diff --git a/include/linux/tick.h b/include/linux/tick.h index 97fd4e543846..d60e5df8ec28 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -97,6 +97,18 @@ static inline void tick_broadcast_exit(void) tick_broadcast_oneshot_control(TICK_BROADCAST_EXIT); } +enum tick_dep_bits { + TICK_DEP_BIT_POSIX_TIMER = 0, + TICK_DEP_BIT_PERF_EVENTS = 1, + TICK_DEP_BIT_SCHED = 2, + TICK_DEP_BIT_CLOCK_UNSTABLE = 3 +}; + +#define TICK_DEP_MASK_POSIX_TIMER (1 << TICK_DEP_BIT_POSIX_TIMER) +#define TICK_DEP_MASK_PERF_EVENTS (1 << TICK_DEP_BIT_PERF_EVENTS) +#define TICK_DEP_MASK_SCHED (1 << TICK_DEP_BIT_SCHED) +#define TICK_DEP_MASK_CLOCK_UNSTABLE (1 << TICK_DEP_BIT_CLOCK_UNSTABLE) + #ifdef CONFIG_NO_HZ_COMMON extern int tick_nohz_enabled; extern int tick_nohz_tick_stopped(void); @@ -154,6 +166,72 @@ static inline int housekeeping_any_cpu(void) return cpumask_any_and(housekeeping_mask, cpu_online_mask); } +extern void tick_nohz_dep_set(enum tick_dep_bits bit); +extern void tick_nohz_dep_clear(enum tick_dep_bits bit); +extern void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit); +extern void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit); +extern void tick_nohz_dep_set_task(struct task_struct *tsk, + enum tick_dep_bits bit); +extern void tick_nohz_dep_clear_task(struct task_struct *tsk, + enum tick_dep_bits bit); +extern void tick_nohz_dep_set_signal(struct signal_struct *signal, + enum tick_dep_bits bit); +extern void tick_nohz_dep_clear_signal(struct signal_struct *signal, + enum tick_dep_bits bit); + +/* + * The below are tick_nohz_[set,clear]_dep() wrappers that optimize off-cases + * on top of static keys. + */ +static inline void tick_dep_set(enum tick_dep_bits bit) +{ + if (tick_nohz_full_enabled()) + tick_nohz_dep_set(bit); +} + +static inline void tick_dep_clear(enum tick_dep_bits bit) +{ + if (tick_nohz_full_enabled()) + tick_nohz_dep_clear(bit); +} + +static inline void tick_dep_set_cpu(int cpu, enum tick_dep_bits bit) +{ + if (tick_nohz_full_cpu(cpu)) + tick_nohz_dep_set_cpu(cpu, bit); +} + +static inline void tick_dep_clear_cpu(int cpu, enum tick_dep_bits bit) +{ + if (tick_nohz_full_cpu(cpu)) + tick_nohz_dep_clear_cpu(cpu, bit); +} + +static inline void tick_dep_set_task(struct task_struct *tsk, + enum tick_dep_bits bit) +{ + if (tick_nohz_full_enabled()) + tick_nohz_dep_set_task(tsk, bit); +} +static inline void tick_dep_clear_task(struct task_struct *tsk, + enum tick_dep_bits bit) +{ + if (tick_nohz_full_enabled()) + tick_nohz_dep_clear_task(tsk, bit); +} +static inline void tick_dep_set_signal(struct signal_struct *signal, + enum tick_dep_bits bit) +{ + if (tick_nohz_full_enabled()) + tick_nohz_dep_set_signal(signal, bit); +} +static inline void tick_dep_clear_signal(struct signal_struct *signal, + enum tick_dep_bits bit) +{ + if (tick_nohz_full_enabled()) + tick_nohz_dep_clear_signal(signal, bit); +} + extern void tick_nohz_full_kick(void); extern void tick_nohz_full_kick_cpu(int cpu); extern void tick_nohz_full_kick_all(void); @@ -166,6 +244,20 @@ static inline int housekeeping_any_cpu(void) static inline bool tick_nohz_full_enabled(void) { return false; } static inline bool tick_nohz_full_cpu(int cpu) { return false; } static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { } + +static inline void tick_dep_set(enum tick_dep_bits bit) { } +static inline void tick_dep_clear(enum tick_dep_bits bit) { } +static inline void tick_dep_set_cpu(int cpu, enum tick_dep_bits bit) { } +static inline void tick_dep_clear_cpu(int cpu, enum tick_dep_bits bit) { } +static inline void tick_dep_set_task(struct task_struct *tsk, + enum tick_dep_bits bit) { } +static inline void tick_dep_clear_task(struct task_struct *tsk, + enum tick_dep_bits bit) { } +static inline void tick_dep_set_signal(struct signal_struct *signal, + enum tick_dep_bits bit) { } +static inline void tick_dep_clear_signal(struct signal_struct *signal, + enum tick_dep_bits bit) { } + static inline void tick_nohz_full_kick_cpu(int cpu) { } static inline void tick_nohz_full_kick(void) { } static inline void tick_nohz_full_kick_all(void) { } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 548a4e2551a9..74ab7dbdc023 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -158,11 +158,53 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) cpumask_var_t tick_nohz_full_mask; cpumask_var_t housekeeping_mask; bool tick_nohz_full_running; +static unsigned long tick_dep_mask; -static bool can_stop_full_tick(void) +static void trace_tick_dependency(unsigned long dep) +{ + if (dep & TICK_DEP_MASK_POSIX_TIMER) { + trace_tick_stop(0, "posix timers running\n"); + return; + } + + if (dep & TICK_DEP_MASK_PERF_EVENTS) { + trace_tick_stop(0, "perf events running\n"); + return; + } + + if (dep & TICK_DEP_MASK_SCHED) { + trace_tick_stop(0, "more than 1 task in runqueue\n"); + return; + } + + if (dep & TICK_DEP_MASK_CLOCK_UNSTABLE) + trace_tick_stop(0, "unstable sched clock\n"); +} + +static bool can_stop_full_tick(struct tick_sched *ts) { WARN_ON_ONCE(!irqs_disabled()); + if (tick_dep_mask) { + trace_tick_dependency(tick_dep_mask); + return false; + } + + if (ts->tick_dep_mask) { + trace_tick_dependency(ts->tick_dep_mask); + return false; + } + + if (current->tick_dep_mask) { + trace_tick_dependency(current->tick_dep_mask); + return false; + } + + if (current->signal->tick_dep_mask) { + trace_tick_dependency(current->signal->tick_dep_mask); + return false; + } + if (!sched_can_stop_tick()) { trace_tick_stop(0, "more than 1 task in runqueue\n"); return false; @@ -178,9 +220,10 @@ static bool can_stop_full_tick(void) return false; } - /* sched_clock_tick() needs us? */ #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK /* + * sched_clock_tick() needs us? + * * TODO: kick full dynticks CPUs when * sched_clock_stable is set. */ @@ -199,13 +242,13 @@ static bool can_stop_full_tick(void) return true; } -static void nohz_full_kick_work_func(struct irq_work *work) +static void nohz_full_kick_func(struct irq_work *work) { /* Empty, the tick restart happens on tick_nohz_irq_exit() */ } static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { - .func = nohz_full_kick_work_func, + .func = nohz_full_kick_func, }; /* @@ -251,6 +294,95 @@ void tick_nohz_full_kick_all(void) preempt_enable(); } +static void tick_nohz_dep_set_all(unsigned long *dep, + enum tick_dep_bits bit) +{ + unsigned long prev; + + prev = fetch_or(dep, BIT_MASK(bit)); + if (!prev) + tick_nohz_full_kick_all(); +} + +/* + * Set a global tick dependency. Used by perf events that rely on freq and + * by unstable clock. + */ +void tick_nohz_dep_set(enum tick_dep_bits bit) +{ + tick_nohz_dep_set_all(&tick_dep_mask, bit); +} + +void tick_nohz_dep_clear(enum tick_dep_bits bit) +{ + clear_bit(bit, &tick_dep_mask); +} + +/* + * Set per-CPU tick dependency. Used by scheduler and perf events in order to + * manage events throttling. + */ +void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) +{ + unsigned long prev; + struct tick_sched *ts; + + ts = per_cpu_ptr(&tick_cpu_sched, cpu); + + prev = fetch_or(&ts->tick_dep_mask, BIT_MASK(bit)); + if (!prev) { + preempt_disable(); + /* Perf needs local kick that is NMI safe */ + if (cpu == smp_processor_id()) { + tick_nohz_full_kick(); + } else { + /* Remote irq work not NMI-safe */ + if (!WARN_ON_ONCE(in_nmi())) + tick_nohz_full_kick_cpu(cpu); + } + preempt_enable(); + } +} + +void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) +{ + struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); + + clear_bit(bit, &ts->tick_dep_mask); +} + +/* + * Set a per-task tick dependency. Posix CPU timers need this in order to elapse + * per task timers. + */ +void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) +{ + /* + * We could optimize this with just kicking the target running the task + * if that noise matters for nohz full users. + */ + tick_nohz_dep_set_all(&tsk->tick_dep_mask, bit); +} + +void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit) +{ + clear_bit(bit, &tsk->tick_dep_mask); +} + +/* + * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse + * per process timers. + */ +void tick_nohz_dep_set_signal(struct signal_struct *sig, enum tick_dep_bits bit) +{ + tick_nohz_dep_set_all(&sig->tick_dep_mask, bit); +} + +void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit) +{ + clear_bit(bit, &sig->tick_dep_mask); +} + /* * Re-evaluate the need for the tick as we switch the current task. * It might need the tick due to per task/process properties: @@ -259,15 +391,19 @@ void tick_nohz_full_kick_all(void) void __tick_nohz_task_switch(void) { unsigned long flags; + struct tick_sched *ts; local_irq_save(flags); if (!tick_nohz_full_cpu(smp_processor_id())) goto out; - if (tick_nohz_tick_stopped() && !can_stop_full_tick()) - tick_nohz_full_kick(); + ts = this_cpu_ptr(&tick_cpu_sched); + if (ts->tick_stopped) { + if (current->tick_dep_mask || current->signal->tick_dep_mask) + tick_nohz_full_kick(); + } out: local_irq_restore(flags); } @@ -736,7 +872,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) return; - if (can_stop_full_tick()) + if (can_stop_full_tick(ts)) tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); else if (ts->tick_stopped) tick_nohz_restart_sched_tick(ts, ktime_get(), 1); diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index a4a8d4e9baa1..eb4e32566a83 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -60,6 +60,7 @@ struct tick_sched { u64 next_timer; ktime_t idle_expires; int do_timer_last; + unsigned long tick_dep_mask; }; extern struct tick_sched *tick_get_tick_sched(int cpu); -- cgit From e6e6cc22e067a6f44449aa8fd0328404079c3ba5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 11 Dec 2015 03:27:25 +0100 Subject: nohz: Use enum code for tick stop failure tracing message It makes nohz tracing more lightweight, standard and easier to parse. Examples: user_loop-2904 [007] d..1 517.701126: tick_stop: success=1 dependency=NONE user_loop-2904 [007] dn.1 518.021181: tick_stop: success=0 dependency=SCHED posix_timers-6142 [007] d..1 1739.027400: tick_stop: success=0 dependency=POSIX_TIMER user_loop-5463 [007] dN.1 1185.931939: tick_stop: success=0 dependency=PERF_EVENTS Suggested-by: Peter Zijlstra Reviewed-by: Chris Metcalf Cc: Christoph Lameter Cc: Chris Metcalf Cc: Ingo Molnar Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker --- include/linux/tick.h | 1 + include/trace/events/timer.h | 36 +++++++++++++++++++++++++++++++----- kernel/time/tick-sched.c | 18 +++++++++--------- 3 files changed, 41 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/tick.h b/include/linux/tick.h index d60e5df8ec28..96d276a923bf 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -104,6 +104,7 @@ enum tick_dep_bits { TICK_DEP_BIT_CLOCK_UNSTABLE = 3 }; +#define TICK_DEP_MASK_NONE 0 #define TICK_DEP_MASK_POSIX_TIMER (1 << TICK_DEP_BIT_POSIX_TIMER) #define TICK_DEP_MASK_PERF_EVENTS (1 << TICK_DEP_BIT_PERF_EVENTS) #define TICK_DEP_MASK_SCHED (1 << TICK_DEP_BIT_SCHED) diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index 073b9ac245ba..51440131d337 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -328,23 +328,49 @@ TRACE_EVENT(itimer_expire, ); #ifdef CONFIG_NO_HZ_COMMON + +#define TICK_DEP_NAMES \ + tick_dep_name(NONE) \ + tick_dep_name(POSIX_TIMER) \ + tick_dep_name(PERF_EVENTS) \ + tick_dep_name(SCHED) \ + tick_dep_name_end(CLOCK_UNSTABLE) + +#undef tick_dep_name +#undef tick_dep_name_end + +#define tick_dep_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep); +#define tick_dep_name_end(sdep) TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep); + +TICK_DEP_NAMES + +#undef tick_dep_name +#undef tick_dep_name_end + +#define tick_dep_name(sdep) { TICK_DEP_MASK_##sdep, #sdep }, +#define tick_dep_name_end(sdep) { TICK_DEP_MASK_##sdep, #sdep } + +#define show_tick_dep_name(val) \ + __print_symbolic(val, TICK_DEP_NAMES) + TRACE_EVENT(tick_stop, - TP_PROTO(int success, char *error_msg), + TP_PROTO(int success, int dependency), - TP_ARGS(success, error_msg), + TP_ARGS(success, dependency), TP_STRUCT__entry( __field( int , success ) - __string( msg, error_msg ) + __field( int , dependency ) ), TP_fast_assign( __entry->success = success; - __assign_str(msg, error_msg); + __entry->dependency = dependency; ), - TP_printk("success=%s msg=%s", __entry->success ? "yes" : "no", __get_str(msg)) + TP_printk("success=%d dependency=%s", __entry->success, \ + show_tick_dep_name(__entry->dependency)) ); #endif diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 74ab7dbdc023..47e5ac45ce69 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -163,22 +163,22 @@ static unsigned long tick_dep_mask; static void trace_tick_dependency(unsigned long dep) { if (dep & TICK_DEP_MASK_POSIX_TIMER) { - trace_tick_stop(0, "posix timers running\n"); + trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER); return; } if (dep & TICK_DEP_MASK_PERF_EVENTS) { - trace_tick_stop(0, "perf events running\n"); + trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS); return; } if (dep & TICK_DEP_MASK_SCHED) { - trace_tick_stop(0, "more than 1 task in runqueue\n"); + trace_tick_stop(0, TICK_DEP_MASK_SCHED); return; } if (dep & TICK_DEP_MASK_CLOCK_UNSTABLE) - trace_tick_stop(0, "unstable sched clock\n"); + trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE); } static bool can_stop_full_tick(struct tick_sched *ts) @@ -206,17 +206,17 @@ static bool can_stop_full_tick(struct tick_sched *ts) } if (!sched_can_stop_tick()) { - trace_tick_stop(0, "more than 1 task in runqueue\n"); + trace_tick_stop(0, TICK_DEP_MASK_SCHED); return false; } if (!posix_cpu_timers_can_stop_tick(current)) { - trace_tick_stop(0, "posix timers running\n"); + trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER); return false; } if (!perf_event_can_stop_tick()) { - trace_tick_stop(0, "perf events running\n"); + trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS); return false; } @@ -228,7 +228,7 @@ static bool can_stop_full_tick(struct tick_sched *ts) * sched_clock_stable is set. */ if (!sched_clock_stable()) { - trace_tick_stop(0, "unstable sched clock\n"); + trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE); /* * Don't allow the user to think they can get * full NO_HZ with this machine. @@ -821,7 +821,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, ts->last_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; - trace_tick_stop(1, " "); + trace_tick_stop(1, TICK_DEP_MASK_NONE); } /* -- cgit From 555e0c1ef7ff49ee5ac3a1eb12de4a2e4722f63d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Jul 2015 17:42:29 +0200 Subject: perf: Migrate perf to use new tick dependency mask model Instead of providing asynchronous checks for the nohz subsystem to verify perf event tick dependency, migrate perf to the new mask. Perf needs the tick for two situations: 1) Freq events. We could set the tick dependency when those are installed on a CPU context. But setting a global dependency on top of the global freq events accounting is much easier. If people want that to be optimized, we can still refine that on the per-CPU tick dependency level. This patch dooesn't change the current behaviour anyway. 2) Throttled events: this is a per-cpu dependency. Reviewed-by: Chris Metcalf Cc: Christoph Lameter Cc: Chris Metcalf Cc: Ingo Molnar Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker --- include/linux/perf_event.h | 6 ----- include/linux/tick.h | 2 -- kernel/events/core.c | 65 ++++++++++++++++++++++++++++++++++------------ kernel/time/tick-sched.c | 8 +----- 4 files changed, 49 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b35a61a481fa..d3ff88c13632 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1108,12 +1108,6 @@ static inline void perf_event_task_tick(void) { } static inline int perf_event_release_kernel(struct perf_event *event) { return 0; } #endif -#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_NO_HZ_FULL) -extern bool perf_event_can_stop_tick(void); -#else -static inline bool perf_event_can_stop_tick(void) { return true; } -#endif - #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) extern void perf_restore_debug_store(void); #else diff --git a/include/linux/tick.h b/include/linux/tick.h index 96d276a923bf..0e63db4bc662 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -233,7 +233,6 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal, tick_nohz_dep_clear_signal(signal, bit); } -extern void tick_nohz_full_kick(void); extern void tick_nohz_full_kick_cpu(int cpu); extern void tick_nohz_full_kick_all(void); extern void __tick_nohz_task_switch(void); @@ -260,7 +259,6 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal, enum tick_dep_bits bit) { } static inline void tick_nohz_full_kick_cpu(int cpu) { } -static inline void tick_nohz_full_kick(void) { } static inline void tick_nohz_full_kick_all(void) { } static inline void __tick_nohz_task_switch(void) { } #endif diff --git a/kernel/events/core.c b/kernel/events/core.c index 5946460b2425..f23d480052e3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3060,17 +3060,6 @@ done: return rotate; } -#ifdef CONFIG_NO_HZ_FULL -bool perf_event_can_stop_tick(void) -{ - if (atomic_read(&nr_freq_events) || - __this_cpu_read(perf_throttled_count)) - return false; - else - return true; -} -#endif - void perf_event_task_tick(void) { struct list_head *head = this_cpu_ptr(&active_ctx_list); @@ -3081,6 +3070,7 @@ void perf_event_task_tick(void) __this_cpu_inc(perf_throttled_seq); throttled = __this_cpu_xchg(perf_throttled_count, 0); + tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); list_for_each_entry_safe(ctx, tmp, head, active_ctx_list) perf_adjust_freq_unthr_context(ctx, throttled); @@ -3511,6 +3501,28 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu) atomic_dec(&per_cpu(perf_cgroup_events, cpu)); } +#ifdef CONFIG_NO_HZ_FULL +static DEFINE_SPINLOCK(nr_freq_lock); +#endif + +static void unaccount_freq_event_nohz(void) +{ +#ifdef CONFIG_NO_HZ_FULL + spin_lock(&nr_freq_lock); + if (atomic_dec_and_test(&nr_freq_events)) + tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS); + spin_unlock(&nr_freq_lock); +#endif +} + +static void unaccount_freq_event(void) +{ + if (tick_nohz_full_enabled()) + unaccount_freq_event_nohz(); + else + atomic_dec(&nr_freq_events); +} + static void unaccount_event(struct perf_event *event) { bool dec = false; @@ -3527,7 +3539,7 @@ static void unaccount_event(struct perf_event *event) if (event->attr.task) atomic_dec(&nr_task_events); if (event->attr.freq) - atomic_dec(&nr_freq_events); + unaccount_freq_event(); if (event->attr.context_switch) { dec = true; atomic_dec(&nr_switch_events); @@ -6349,9 +6361,9 @@ static int __perf_event_overflow(struct perf_event *event, if (unlikely(throttle && hwc->interrupts >= max_samples_per_tick)) { __this_cpu_inc(perf_throttled_count); + tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); hwc->interrupts = MAX_INTERRUPTS; perf_log_throttle(event, 0); - tick_nohz_full_kick(); ret = 1; } } @@ -7741,6 +7753,27 @@ static void account_event_cpu(struct perf_event *event, int cpu) atomic_inc(&per_cpu(perf_cgroup_events, cpu)); } +/* Freq events need the tick to stay alive (see perf_event_task_tick). */ +static void account_freq_event_nohz(void) +{ +#ifdef CONFIG_NO_HZ_FULL + /* Lock so we don't race with concurrent unaccount */ + spin_lock(&nr_freq_lock); + if (atomic_inc_return(&nr_freq_events) == 1) + tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS); + spin_unlock(&nr_freq_lock); +#endif +} + +static void account_freq_event(void) +{ + if (tick_nohz_full_enabled()) + account_freq_event_nohz(); + else + atomic_inc(&nr_freq_events); +} + + static void account_event(struct perf_event *event) { bool inc = false; @@ -7756,10 +7789,8 @@ static void account_event(struct perf_event *event) atomic_inc(&nr_comm_events); if (event->attr.task) atomic_inc(&nr_task_events); - if (event->attr.freq) { - if (atomic_inc_return(&nr_freq_events) == 1) - tick_nohz_full_kick_all(); - } + if (event->attr.freq) + account_freq_event(); if (event->attr.context_switch) { atomic_inc(&nr_switch_events); inc = true; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 47e5ac45ce69..3f79def37ca9 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -215,11 +214,6 @@ static bool can_stop_full_tick(struct tick_sched *ts) return false; } - if (!perf_event_can_stop_tick()) { - trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS); - return false; - } - #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK /* * sched_clock_tick() needs us? @@ -257,7 +251,7 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(), * is NMI safe. */ -void tick_nohz_full_kick(void) +static void tick_nohz_full_kick(void) { if (!tick_nohz_full_cpu(smp_processor_id())) return; -- cgit From 01d36d0ac390895e719d0dd8ab91ebbbf506d28e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 4 Nov 2015 18:17:10 +0100 Subject: sched: Account rr tasks In order to evaluate the scheduler tick dependency without probing context switches, we need to know how much SCHED_RR and SCHED_FIFO tasks are enqueued as those policies don't have the same preemption requirements. To prepare for that, let's account SCHED_RR tasks, we'll be able to deduce SCHED_FIFO tasks as well from it and the total RT tasks in the runqueue. Reviewed-by: Chris Metcalf Cc: Christoph Lameter Cc: Chris Metcalf Cc: Ingo Molnar Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker --- kernel/sched/rt.c | 16 ++++++++++++++++ kernel/sched/sched.h | 1 + 2 files changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 8ec86abe0ea1..3f1fcffbb18f 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1141,6 +1141,20 @@ unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se) return 1; } +static inline +unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se) +{ + struct rt_rq *group_rq = group_rt_rq(rt_se); + struct task_struct *tsk; + + if (group_rq) + return group_rq->rr_nr_running; + + tsk = rt_task_of(rt_se); + + return (tsk->policy == SCHED_RR) ? 1 : 0; +} + static inline void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { @@ -1148,6 +1162,7 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) WARN_ON(!rt_prio(prio)); rt_rq->rt_nr_running += rt_se_nr_running(rt_se); + rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se); inc_rt_prio(rt_rq, prio); inc_rt_migration(rt_se, rt_rq); @@ -1160,6 +1175,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) WARN_ON(!rt_prio(rt_se_prio(rt_se))); WARN_ON(!rt_rq->rt_nr_running); rt_rq->rt_nr_running -= rt_se_nr_running(rt_se); + rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se); dec_rt_prio(rt_rq, rt_se_prio(rt_se)); dec_rt_migration(rt_se, rt_rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 10f16374df7f..f0abfce14044 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -450,6 +450,7 @@ static inline int rt_bandwidth_enabled(void) struct rt_rq { struct rt_prio_array active; unsigned int rt_nr_running; + unsigned int rr_nr_running; #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED struct { int curr; /* highest queued rt task prio */ -- cgit From 76d92ac305f23cada3a9b3c48a7ccea5f71019cb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2015 22:25:49 +0200 Subject: sched: Migrate sched to use new tick dependency mask model Instead of providing asynchronous checks for the nohz subsystem to verify sched tick dependency, migrate sched to the new mask. Everytime a task is enqueued or dequeued, we evaluate the state of the tick dependency on top of the policy of the tasks in the runqueue, by order of priority: SCHED_DEADLINE: Need the tick in order to periodically check for runtime SCHED_FIFO : Don't need the tick (no round-robin) SCHED_RR : Need the tick if more than 1 task of the same priority for round robin (simplified with checking if more than one SCHED_RR task no matter what priority). SCHED_NORMAL : Need the tick if more than 1 task for round-robin. We could optimize that further with one flag per sched policy on the tick dependency mask and perform only the checks relevant to the policy concerned by an enqueue/dequeue operation. Since the checks aren't based on the current task anymore, we could get rid of the task switch hook but it's still needed for posix cpu timers. Reviewed-by: Chris Metcalf Cc: Christoph Lameter Cc: Chris Metcalf Cc: Ingo Molnar Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker --- include/linux/sched.h | 3 --- kernel/sched/core.c | 35 ++++++++++++++++++++--------------- kernel/sched/sched.h | 47 +++++++++++++++++++++++++++++++++-------------- kernel/time/tick-sched.c | 5 ----- 4 files changed, 53 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 307e48375f21..2b10348806d8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2364,10 +2364,7 @@ static inline void wake_up_nohz_cpu(int cpu) { } #endif #ifdef CONFIG_NO_HZ_FULL -extern bool sched_can_stop_tick(void); extern u64 scheduler_tick_max_deferment(void); -#else -static inline bool sched_can_stop_tick(void) { return false; } #endif #ifdef CONFIG_SCHED_AUTOGROUP diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7142feb4b92d..1fad82364ffe 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -701,31 +701,36 @@ static inline bool got_nohz_idle_kick(void) #endif /* CONFIG_NO_HZ_COMMON */ #ifdef CONFIG_NO_HZ_FULL -bool sched_can_stop_tick(void) +bool sched_can_stop_tick(struct rq *rq) { + int fifo_nr_running; + + /* Deadline tasks, even if single, need the tick */ + if (rq->dl.dl_nr_running) + return false; + /* - * FIFO realtime policy runs the highest priority task. Other runnable - * tasks are of a lower priority. The scheduler tick does nothing. + * FIFO realtime policy runs the highest priority task (after DEADLINE). + * Other runnable tasks are of a lower priority. The scheduler tick + * isn't needed. */ - if (current->policy == SCHED_FIFO) + fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running; + if (fifo_nr_running) return true; /* * Round-robin realtime tasks time slice with other tasks at the same - * realtime priority. Is this task the only one at this priority? + * realtime priority. */ - if (current->policy == SCHED_RR) { - struct sched_rt_entity *rt_se = ¤t->rt; - - return list_is_singular(&rt_se->run_list); + if (rq->rt.rr_nr_running) { + if (rq->rt.rr_nr_running == 1) + return true; + else + return false; } - /* - * More than one running task need preemption. - * nr_running update is assumed to be visible - * after IPI is sent from wakers. - */ - if (this_rq()->nr_running > 1) + /* Normal multitasking need periodic preemption checks */ + if (rq->cfs.nr_running > 1) return false; return true; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f0abfce14044..4f0bca770108 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1279,6 +1279,35 @@ unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); +#ifdef CONFIG_NO_HZ_FULL +extern bool sched_can_stop_tick(struct rq *rq); + +/* + * Tick may be needed by tasks in the runqueue depending on their policy and + * requirements. If tick is needed, lets send the target an IPI to kick it out of + * nohz mode if necessary. + */ +static inline void sched_update_tick_dependency(struct rq *rq) +{ + int cpu; + + if (!tick_nohz_full_enabled()) + return; + + cpu = cpu_of(rq); + + if (!tick_nohz_full_cpu(cpu)) + return; + + if (sched_can_stop_tick(rq)) + tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); + else + tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); +} +#else +static inline void sched_update_tick_dependency(struct rq *rq) { } +#endif + static inline void add_nr_running(struct rq *rq, unsigned count) { unsigned prev_nr = rq->nr_running; @@ -1290,26 +1319,16 @@ static inline void add_nr_running(struct rq *rq, unsigned count) if (!rq->rd->overload) rq->rd->overload = true; #endif - -#ifdef CONFIG_NO_HZ_FULL - if (tick_nohz_full_cpu(rq->cpu)) { - /* - * Tick is needed if more than one task runs on a CPU. - * Send the target an IPI to kick it out of nohz mode. - * - * We assume that IPI implies full memory barrier and the - * new value of rq->nr_running is visible on reception - * from the target. - */ - tick_nohz_full_kick_cpu(rq->cpu); - } -#endif } + + sched_update_tick_dependency(rq); } static inline void sub_nr_running(struct rq *rq, unsigned count) { rq->nr_running -= count; + /* Check if we still need preemption */ + sched_update_tick_dependency(rq); } static inline void rq_last_tick_reset(struct rq *rq) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3f79def37ca9..f312c60c3ed2 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -204,11 +204,6 @@ static bool can_stop_full_tick(struct tick_sched *ts) return false; } - if (!sched_can_stop_tick()) { - trace_tick_stop(0, TICK_DEP_MASK_SCHED); - return false; - } - if (!posix_cpu_timers_can_stop_tick(current)) { trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER); return false; -- cgit From b78783000d5cb7c5994e6742e1d1ce594bfea15b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2015 22:25:49 +0200 Subject: posix-cpu-timers: Migrate to use new tick dependency mask model Instead of providing asynchronous checks for the nohz subsystem to verify posix cpu timers tick dependency, migrate the latter to the new mask. In order to keep track of the running timers and expose the tick dependency accordingly, we must probe the timers queuing and dequeuing on threads and process lists. Unfortunately it implies both task and signal level dependencies. We should be able to further optimize this and merge all that on the task level dependency, at the cost of a bit of complexity and may be overhead. Reviewed-by: Chris Metcalf Cc: Christoph Lameter Cc: Chris Metcalf Cc: Ingo Molnar Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker --- include/linux/posix-timers.h | 3 --- include/linux/tick.h | 2 -- kernel/time/posix-cpu-timers.c | 52 +++++++++--------------------------------- kernel/time/tick-sched.c | 7 +----- 4 files changed, 12 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 907f3fd191ac..62d44c176071 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -128,9 +128,6 @@ void posix_cpu_timer_schedule(struct k_itimer *timer); void run_posix_cpu_timers(struct task_struct *task); void posix_cpu_timers_exit(struct task_struct *task); void posix_cpu_timers_exit_group(struct task_struct *task); - -bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk); - void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx, cputime_t *newval, cputime_t *oldval); diff --git a/include/linux/tick.h b/include/linux/tick.h index 0e63db4bc662..21f73649a4dc 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -234,7 +234,6 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal, } extern void tick_nohz_full_kick_cpu(int cpu); -extern void tick_nohz_full_kick_all(void); extern void __tick_nohz_task_switch(void); #else static inline int housekeeping_any_cpu(void) @@ -259,7 +258,6 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal, enum tick_dep_bits bit) { } static inline void tick_nohz_full_kick_cpu(int cpu) { } -static inline void tick_nohz_full_kick_all(void) { } static inline void __tick_nohz_task_switch(void) { } #endif diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index f5e86d282d52..1cafba860b08 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -333,7 +333,6 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) return err; } - /* * Validate the clockid_t for a new CPU-clock timer, and initialize the timer. * This is called from sys_timer_create() and do_cpu_nanosleep() with the @@ -517,6 +516,10 @@ static void arm_timer(struct k_itimer *timer) cputime_expires->sched_exp = exp; break; } + if (CPUCLOCK_PERTHREAD(timer->it_clock)) + tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER); + else + tick_dep_set_signal(p->signal, TICK_DEP_BIT_POSIX_TIMER); } } @@ -582,39 +585,6 @@ static int cpu_timer_sample_group(const clockid_t which_clock, return 0; } -#ifdef CONFIG_NO_HZ_FULL -static void nohz_kick_work_fn(struct work_struct *work) -{ - tick_nohz_full_kick_all(); -} - -static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn); - -/* - * We need the IPIs to be sent from sane process context. - * The posix cpu timers are always set with irqs disabled. - */ -static void posix_cpu_timer_kick_nohz(void) -{ - if (context_tracking_is_enabled()) - schedule_work(&nohz_kick_work); -} - -bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk) -{ - if (!task_cputime_zero(&tsk->cputime_expires)) - return false; - - /* Check if cputimer is running. This is accessed without locking. */ - if (READ_ONCE(tsk->signal->cputimer.running)) - return false; - - return true; -} -#else -static inline void posix_cpu_timer_kick_nohz(void) { } -#endif - /* * Guts of sys_timer_settime for CPU timers. * This is called with the timer locked and interrupts disabled. @@ -761,8 +731,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, sample_to_timespec(timer->it_clock, old_incr, &old->it_interval); } - if (!ret) - posix_cpu_timer_kick_nohz(); + return ret; } @@ -911,6 +880,8 @@ static void check_thread_timers(struct task_struct *tsk, __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); } } + if (task_cputime_zero(tsk_expires)) + tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER); } static inline void stop_process_timers(struct signal_struct *sig) @@ -919,6 +890,7 @@ static inline void stop_process_timers(struct signal_struct *sig) /* Turn off cputimer->running. This is done without locking. */ WRITE_ONCE(cputimer->running, false); + tick_dep_clear_signal(sig, TICK_DEP_BIT_POSIX_TIMER); } static u32 onecputick; @@ -1095,8 +1067,6 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) arm_timer(timer); unlock_task_sighand(p, &flags); - /* Kick full dynticks CPUs in case they need to tick on the new timer */ - posix_cpu_timer_kick_nohz(); out: timer->it_overrun_last = timer->it_overrun; timer->it_overrun = -1; @@ -1270,7 +1240,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, } if (!*newval) - goto out; + return; *newval += now; } @@ -1288,8 +1258,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, tsk->signal->cputime_expires.virt_exp = *newval; break; } -out: - posix_cpu_timer_kick_nohz(); + + tick_dep_set_signal(tsk->signal, TICK_DEP_BIT_POSIX_TIMER); } static int do_cpu_nanosleep(const clockid_t which_clock, int flags, diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f312c60c3ed2..e4f2916e66a9 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -204,11 +204,6 @@ static bool can_stop_full_tick(struct tick_sched *ts) return false; } - if (!posix_cpu_timers_can_stop_tick(current)) { - trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER); - return false; - } - #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK /* * sched_clock_tick() needs us? @@ -270,7 +265,7 @@ void tick_nohz_full_kick_cpu(int cpu) * Kick all full dynticks CPUs in order to force these to re-evaluate * their dependency on the tick and restart it if necessary. */ -void tick_nohz_full_kick_all(void) +static void tick_nohz_full_kick_all(void) { int cpu; -- cgit From 4f49b90abb4aca6fe677c95fc352fd0674d489bd Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 22 Jul 2015 17:03:52 +0200 Subject: sched-clock: Migrate to use new tick dependency mask model Instead of checking sched_clock_stable from the nohz subsystem to verify its tick dependency, migrate it to the new mask in order to include it to the all-in-one check. Reviewed-by: Chris Metcalf Cc: Christoph Lameter Cc: Chris Metcalf Cc: Ingo Molnar Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker --- kernel/sched/clock.c | 5 +++++ kernel/time/tick-sched.c | 19 ------------------- 2 files changed, 5 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index bc54e84675da..fedb967a9841 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -61,6 +61,7 @@ #include #include #include +#include /* * Scheduler clock - returns current time in nanosec units. @@ -89,6 +90,8 @@ static void __set_sched_clock_stable(void) { if (!sched_clock_stable()) static_key_slow_inc(&__sched_clock_stable); + + tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE); } void set_sched_clock_stable(void) @@ -108,6 +111,8 @@ static void __clear_sched_clock_stable(struct work_struct *work) /* XXX worry about clock continuity */ if (sched_clock_stable()) static_key_slow_dec(&__sched_clock_stable); + + tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE); } static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index e4f2916e66a9..969e6704c3c9 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -204,25 +204,6 @@ static bool can_stop_full_tick(struct tick_sched *ts) return false; } -#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK - /* - * sched_clock_tick() needs us? - * - * TODO: kick full dynticks CPUs when - * sched_clock_stable is set. - */ - if (!sched_clock_stable()) { - trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE); - /* - * Don't allow the user to think they can get - * full NO_HZ with this machine. - */ - WARN_ONCE(tick_nohz_full_running, - "NO_HZ FULL will not work with unstable sched clock"); - return false; - } -#endif - return true; } -- cgit From 6bd58f09e1d8cc6c50a824c00bf0d617919986a1 Mon Sep 17 00:00:00 2001 From: "Christopher S. Hall" Date: Mon, 22 Feb 2016 03:15:19 -0800 Subject: time: Add cycles to nanoseconds translation The timekeeping code does not currently provide a way to translate externally provided clocksource cycles to system time. The cycle count is always provided by the result clocksource read() method internal to the timekeeping code. The added function timekeeping_cycles_to_ns() calculated a nanosecond value from a cycle count that can be added to tk_read_base.base value yielding the current system time. This allows clocksource cycle values external to the timekeeping code to provide a cycle count that can be transformed to system time. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andy Lutomirski Cc: kevin.b.stanton@intel.com Cc: kevin.j.clarke@intel.com Cc: hpa@zytor.com Cc: jeffrey.t.kirsher@intel.com Cc: netdev@vger.kernel.org Reviewed-by: Thomas Gleixner Signed-off-by: Christopher S. Hall Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 34b4cedfa80d..4243d28177ac 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -298,17 +298,34 @@ u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset; static inline u32 arch_gettimeoffset(void) { return 0; } #endif +static inline s64 timekeeping_delta_to_ns(struct tk_read_base *tkr, + cycle_t delta) +{ + s64 nsec; + + nsec = delta * tkr->mult + tkr->xtime_nsec; + nsec >>= tkr->shift; + + /* If arch requires, add in get_arch_timeoffset() */ + return nsec + arch_gettimeoffset(); +} + static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) { cycle_t delta; - s64 nsec; delta = timekeeping_get_delta(tkr); + return timekeeping_delta_to_ns(tkr, delta); +} - nsec = (delta * tkr->mult + tkr->xtime_nsec) >> tkr->shift; +static inline s64 timekeeping_cycles_to_ns(struct tk_read_base *tkr, + cycle_t cycles) +{ + cycle_t delta; - /* If arch requires, add in get_arch_timeoffset() */ - return nsec + arch_gettimeoffset(); + /* calculate the delta since the last update_wall_time */ + delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask); + return timekeeping_delta_to_ns(tkr, delta); } /** -- cgit From 9da0f49c8767cc0ef6101cb21156cf4380ed50dd Mon Sep 17 00:00:00 2001 From: "Christopher S. Hall" Date: Mon, 22 Feb 2016 03:15:20 -0800 Subject: time: Add timekeeping snapshot code capturing system time and counter In the current timekeeping code there isn't any interface to atomically capture the current relationship between the system counter and system time. ktime_get_snapshot() returns this triple (counter, monotonic raw, realtime) in the system_time_snapshot struct. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andy Lutomirski Cc: kevin.b.stanton@intel.com Cc: kevin.j.clarke@intel.com Cc: hpa@zytor.com Cc: jeffrey.t.kirsher@intel.com Cc: netdev@vger.kernel.org Reviewed-by: Thomas Gleixner Signed-off-by: Christopher S. Hall [jstultz: Moved structure definitions around to clean things up, fixed cycles_t/cycle_t confusion.] Signed-off-by: John Stultz --- include/linux/timekeeping.h | 18 ++++++++++++++++++ kernel/time/timekeeping.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) (limited to 'kernel') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index ec89d846324c..7817591af46f 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -266,6 +266,24 @@ extern void timekeeping_inject_sleeptime64(struct timespec64 *delta); extern void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw, struct timespec64 *ts_real); +/* + * struct system_time_snapshot - simultaneous raw/real time capture with + * counter value + * @cycles: Clocksource counter value to produce the system times + * @real: Realtime system time + * @raw: Monotonic raw system time + */ +struct system_time_snapshot { + cycle_t cycles; + ktime_t real; + ktime_t raw; +}; + +/* + * Simultaneously snapshot realtime and monotonic raw clocks + */ +extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot); + /* * Persistent clock related interfaces */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4243d28177ac..89b4695bd083 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -874,6 +874,36 @@ time64_t __ktime_get_real_seconds(void) return tk->xtime_sec; } +/** + * ktime_get_snapshot - snapshots the realtime/monotonic raw clocks with counter + * @systime_snapshot: pointer to struct receiving the system time snapshot + */ +void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned long seq; + ktime_t base_raw; + ktime_t base_real; + s64 nsec_raw; + s64 nsec_real; + cycle_t now; + + do { + seq = read_seqcount_begin(&tk_core.seq); + + now = tk->tkr_mono.read(tk->tkr_mono.clock); + base_real = ktime_add(tk->tkr_mono.base, + tk_core.timekeeper.offs_real); + base_raw = tk->tkr_raw.base; + nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now); + nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now); + } while (read_seqcount_retry(&tk_core.seq, seq)); + + systime_snapshot->cycles = now; + systime_snapshot->real = ktime_add_ns(base_real, nsec_real); + systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw); +} +EXPORT_SYMBOL_GPL(ktime_get_snapshot); #ifdef CONFIG_NTP_PPS -- cgit From ba26621e63ce6dc481d90ab9f6902e058d4ea39a Mon Sep 17 00:00:00 2001 From: "Christopher S. Hall" Date: Mon, 22 Feb 2016 03:15:21 -0800 Subject: time: Remove duplicated code in ktime_get_raw_and_real() The code in ktime_get_snapshot() is a superset of the code in ktime_get_raw_and_real() code. Further, ktime_get_raw_and_real() is called only by the PPS code, pps_get_ts(). Consolidate the pps_get_ts() code into a single function calling ktime_get_snapshot() and eliminate ktime_get_raw_and_real(). A side effect of this is that the raw and real results of pps_get_ts() correspond to exactly the same clock cycle. Previously these values represented separate reads of the system clock. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andy Lutomirski Cc: kevin.b.stanton@intel.com Cc: kevin.j.clarke@intel.com Cc: hpa@zytor.com Cc: jeffrey.t.kirsher@intel.com Cc: netdev@vger.kernel.org Reviewed-by: Thomas Gleixner Signed-off-by: Christopher S. Hall Signed-off-by: John Stultz --- include/linux/pps_kernel.h | 17 ++++++----------- kernel/time/timekeeping.c | 40 ++-------------------------------------- 2 files changed, 8 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/include/linux/pps_kernel.h b/include/linux/pps_kernel.h index 54bf1484d41f..35ac903956c7 100644 --- a/include/linux/pps_kernel.h +++ b/include/linux/pps_kernel.h @@ -111,22 +111,17 @@ static inline void timespec_to_pps_ktime(struct pps_ktime *kt, kt->nsec = ts.tv_nsec; } -#ifdef CONFIG_NTP_PPS - static inline void pps_get_ts(struct pps_event_time *ts) { - ktime_get_raw_and_real_ts64(&ts->ts_raw, &ts->ts_real); -} + struct system_time_snapshot snap; -#else /* CONFIG_NTP_PPS */ - -static inline void pps_get_ts(struct pps_event_time *ts) -{ - ktime_get_real_ts64(&ts->ts_real); + ktime_get_snapshot(&snap); + ts->ts_real = ktime_to_timespec64(snap.real); +#ifdef CONFIG_NTP_PPS + ts->ts_raw = ktime_to_timespec64(snap.raw); +#endif } -#endif /* CONFIG_NTP_PPS */ - /* Subtract known time delay from PPS event time(s) */ static inline void pps_sub_ts(struct pps_event_time *ts, struct timespec64 delta) { diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 89b4695bd083..af19a49d5223 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -888,6 +888,8 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) s64 nsec_real; cycle_t now; + WARN_ON_ONCE(timekeeping_suspended); + do { seq = read_seqcount_begin(&tk_core.seq); @@ -905,44 +907,6 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) } EXPORT_SYMBOL_GPL(ktime_get_snapshot); -#ifdef CONFIG_NTP_PPS - -/** - * ktime_get_raw_and_real_ts64 - get day and raw monotonic time in timespec format - * @ts_raw: pointer to the timespec to be set to raw monotonic time - * @ts_real: pointer to the timespec to be set to the time of day - * - * This function reads both the time of day and raw monotonic time at the - * same time atomically and stores the resulting timestamps in timespec - * format. - */ -void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw, struct timespec64 *ts_real) -{ - struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; - s64 nsecs_raw, nsecs_real; - - WARN_ON_ONCE(timekeeping_suspended); - - do { - seq = read_seqcount_begin(&tk_core.seq); - - *ts_raw = tk->raw_time; - ts_real->tv_sec = tk->xtime_sec; - ts_real->tv_nsec = 0; - - nsecs_raw = timekeeping_get_ns(&tk->tkr_raw); - nsecs_real = timekeeping_get_ns(&tk->tkr_mono); - - } while (read_seqcount_retry(&tk_core.seq, seq)); - - timespec64_add_ns(ts_raw, nsecs_raw); - timespec64_add_ns(ts_real, nsecs_real); -} -EXPORT_SYMBOL(ktime_get_raw_and_real_ts64); - -#endif /* CONFIG_NTP_PPS */ - /** * do_gettimeofday - Returns the time of day in a timeval * @tv: pointer to the timeval to be set -- cgit From 8006c24595cab106bcb9da12d35e32e14ff492df Mon Sep 17 00:00:00 2001 From: "Christopher S. Hall" Date: Mon, 22 Feb 2016 03:15:22 -0800 Subject: time: Add driver cross timestamp interface for higher precision time synchronization ACKNOWLEDGMENT: cross timestamp code was developed by Thomas Gleixner . It has changed considerably and any mistakes are mine. The precision with which events on multiple networked systems can be synchronized using, as an example, PTP (IEEE 1588, 802.1AS) is limited by the precision of the cross timestamps between the system clock and the device (timestamp) clock. Precision here is the degree of simultaneity when capturing the cross timestamp. Currently the PTP cross timestamp is captured in software using the PTP device driver ioctl PTP_SYS_OFFSET. Reads of the device clock are interleaved with reads of the realtime clock. At best, the precision of this cross timestamp is on the order of several microseconds due to software latencies. Sub-microsecond precision is required for industrial control and some media applications. To achieve this level of precision hardware supported cross timestamping is needed. The function get_device_system_crosstimestamp() allows device drivers to return a cross timestamp with system time properly scaled to nanoseconds. The realtime value is needed to discipline that clock using PTP and the monotonic raw value is used for applications that don't require a "real" time, but need an unadjusted clock time. The get_device_system_crosstimestamp() code calls back into the driver to ensure that the system counter is within the current timekeeping update interval. Modern Intel hardware provides an Always Running Timer (ART) which is exactly related to TSC through a known frequency ratio. The ART is routed to devices on the system and is used to precisely and simultaneously capture the device clock with the ART. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andy Lutomirski Cc: kevin.b.stanton@intel.com Cc: kevin.j.clarke@intel.com Cc: hpa@zytor.com Cc: jeffrey.t.kirsher@intel.com Cc: netdev@vger.kernel.org Reviewed-by: Thomas Gleixner Signed-off-by: Christopher S. Hall [jstultz: Reworked to remove extra structures and simplify calling] Signed-off-by: John Stultz --- include/linux/timekeeping.h | 35 ++++++++++++++++++++++++++++ kernel/time/timekeeping.c | 56 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+) (limited to 'kernel') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 7817591af46f..4a2ca65fc778 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -279,6 +279,41 @@ struct system_time_snapshot { ktime_t raw; }; +/* + * struct system_device_crosststamp - system/device cross-timestamp + * (syncronized capture) + * @device: Device time + * @sys_realtime: Realtime simultaneous with device time + * @sys_monoraw: Monotonic raw simultaneous with device time + */ +struct system_device_crosststamp { + ktime_t device; + ktime_t sys_realtime; + ktime_t sys_monoraw; +}; + +/* + * struct system_counterval_t - system counter value with the pointer to the + * corresponding clocksource + * @cycles: System counter value + * @cs: Clocksource corresponding to system counter value. Used by + * timekeeping code to verify comparibility of two cycle values + */ +struct system_counterval_t { + cycle_t cycles; + struct clocksource *cs; +}; + +/* + * Get cross timestamp between system clock and device clock + */ +extern int get_device_system_crosststamp( + int (*get_time_fn)(ktime_t *device_time, + struct system_counterval_t *system_counterval, + void *ctx), + void *ctx, + struct system_device_crosststamp *xtstamp); + /* * Simultaneously snapshot realtime and monotonic raw clocks */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index af19a49d5223..dba595cdb200 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -907,6 +907,62 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) } EXPORT_SYMBOL_GPL(ktime_get_snapshot); +/** + * get_device_system_crosststamp - Synchronously capture system/device timestamp + * @sync_devicetime: Callback to get simultaneous device time and + * system counter from the device driver + * @xtstamp: Receives simultaneously captured system and device time + * + * Reads a timestamp from a device and correlates it to system time + */ +int get_device_system_crosststamp(int (*get_time_fn) + (ktime_t *device_time, + struct system_counterval_t *sys_counterval, + void *ctx), + void *ctx, + struct system_device_crosststamp *xtstamp) +{ + struct system_counterval_t system_counterval; + struct timekeeper *tk = &tk_core.timekeeper; + ktime_t base_real, base_raw; + s64 nsec_real, nsec_raw; + unsigned long seq; + int ret; + + do { + seq = read_seqcount_begin(&tk_core.seq); + /* + * Try to synchronously capture device time and a system + * counter value calling back into the device driver + */ + ret = get_time_fn(&xtstamp->device, &system_counterval, ctx); + if (ret) + return ret; + + /* + * Verify that the clocksource associated with the captured + * system counter value is the same as the currently installed + * timekeeper clocksource + */ + if (tk->tkr_mono.clock != system_counterval.cs) + return -ENODEV; + + base_real = ktime_add(tk->tkr_mono.base, + tk_core.timekeeper.offs_real); + base_raw = tk->tkr_raw.base; + + nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, + system_counterval.cycles); + nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, + system_counterval.cycles); + } while (read_seqcount_retry(&tk_core.seq, seq)); + + xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real); + xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw); + return 0; +} +EXPORT_SYMBOL_GPL(get_device_system_crosststamp); + /** * do_gettimeofday - Returns the time of day in a timeval * @tv: pointer to the timeval to be set -- cgit From 2c756feb18d9ec258dbb3a3d11c47e28820690d7 Mon Sep 17 00:00:00 2001 From: "Christopher S. Hall" Date: Mon, 22 Feb 2016 03:15:23 -0800 Subject: time: Add history to cross timestamp interface supporting slower devices Another representative use case of time sync and the correlated clocksource (in addition to PTP noted above) is PTP synchronized audio. In a streaming application, as an example, samples will be sent and/or received by multiple devices with a presentation time that is in terms of the PTP master clock. Synchronizing the audio output on these devices requires correlating the audio clock with the PTP master clock. The more precise this correlation is, the better the audio quality (i.e. out of sync audio sounds bad). From an application standpoint, to correlate the PTP master clock with the audio device clock, the system clock is used as a intermediate timebase. The transforms such an application would perform are: System Clock <-> Audio clock System Clock <-> Network Device Clock [<-> PTP Master Clock] Modern Intel platforms can perform a more accurate cross timestamp in hardware (ART,audio device clock). The audio driver requires ART->system time transforms -- the same as required for the network driver. These platforms offload audio processing (including cross-timestamps) to a DSP which to ensure uninterrupted audio processing, communicates and response to the host only once every millsecond. As a result is takes up to a millisecond for the DSP to receive a request, the request is processed by the DSP, the audio output hardware is polled for completion, the result is copied into shared memory, and the host is notified. All of these operation occur on a millisecond cadence. This transaction requires about 2 ms, but under heavier workloads it may take up to 4 ms. Adding a history allows these slow devices the option of providing an ART value outside of the current interval. In this case, the callback provided is an accessor function for the previously obtained counter value. If get_system_device_crosststamp() receives a counter value previous to cycle_last, it consults the history provided as an argument in history_ref and interpolates the realtime and monotonic raw system time using the provided counter value. If there are any clock discontinuities, e.g. from calling settimeofday(), the monotonic raw time is interpolated in the usual way, but the realtime clock time is adjusted by scaling the monotonic raw adjustment. When an accessor function is used a history argument *must* be provided. The history is initialized using ktime_get_snapshot() and must be called before the counter values are read. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andy Lutomirski Cc: kevin.b.stanton@intel.com Cc: kevin.j.clarke@intel.com Cc: hpa@zytor.com Cc: jeffrey.t.kirsher@intel.com Cc: netdev@vger.kernel.org Reviewed-by: Thomas Gleixner Signed-off-by: Christopher S. Hall [jstultz: Fixed up cycles_t/cycle_t type confusion] Signed-off-by: John Stultz --- include/linux/timekeeper_internal.h | 2 + include/linux/timekeeping.h | 5 ++ kernel/time/timekeeping.c | 171 +++++++++++++++++++++++++++++++++++- 3 files changed, 177 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index 25247220b4b7..e88005459035 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -50,6 +50,7 @@ struct tk_read_base { * @offs_tai: Offset clock monotonic -> clock tai * @tai_offset: The current UTC to TAI offset in seconds * @clock_was_set_seq: The sequence number of clock was set events + * @cs_was_changed_seq: The sequence number of clocksource change events * @next_leap_ktime: CLOCK_MONOTONIC time value of a pending leap-second * @raw_time: Monotonic raw base time in timespec64 format * @cycle_interval: Number of clock cycles in one NTP interval @@ -91,6 +92,7 @@ struct timekeeper { ktime_t offs_tai; s32 tai_offset; unsigned int clock_was_set_seq; + u8 cs_was_changed_seq; ktime_t next_leap_ktime; struct timespec64 raw_time; diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 4a2ca65fc778..96f37bee3bc1 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -272,11 +272,15 @@ extern void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw, * @cycles: Clocksource counter value to produce the system times * @real: Realtime system time * @raw: Monotonic raw system time + * @clock_was_set_seq: The sequence number of clock was set events + * @cs_was_changed_seq: The sequence number of clocksource change events */ struct system_time_snapshot { cycle_t cycles; ktime_t real; ktime_t raw; + unsigned int clock_was_set_seq; + u8 cs_was_changed_seq; }; /* @@ -312,6 +316,7 @@ extern int get_device_system_crosststamp( struct system_counterval_t *system_counterval, void *ctx), void *ctx, + struct system_time_snapshot *history, struct system_device_crosststamp *xtstamp); /* diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index dba595cdb200..931b0b1a71e9 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -233,6 +233,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) u64 tmp, ntpinterval; struct clocksource *old_clock; + ++tk->cs_was_changed_seq; old_clock = tk->tkr_mono.clock; tk->tkr_mono.clock = clock; tk->tkr_mono.read = clock->read; @@ -894,6 +895,8 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) seq = read_seqcount_begin(&tk_core.seq); now = tk->tkr_mono.read(tk->tkr_mono.clock); + systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq; + systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq; base_real = ktime_add(tk->tkr_mono.base, tk_core.timekeeper.offs_real); base_raw = tk->tkr_raw.base; @@ -907,10 +910,123 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) } EXPORT_SYMBOL_GPL(ktime_get_snapshot); +/* Scale base by mult/div checking for overflow */ +static int scale64_check_overflow(u64 mult, u64 div, u64 *base) +{ + u64 tmp, rem; + + tmp = div64_u64_rem(*base, div, &rem); + + if (((int)sizeof(u64)*8 - fls64(mult) < fls64(tmp)) || + ((int)sizeof(u64)*8 - fls64(mult) < fls64(rem))) + return -EOVERFLOW; + tmp *= mult; + rem *= mult; + + do_div(rem, div); + *base = tmp + rem; + return 0; +} + +/** + * adjust_historical_crosststamp - adjust crosstimestamp previous to current interval + * @history: Snapshot representing start of history + * @partial_history_cycles: Cycle offset into history (fractional part) + * @total_history_cycles: Total history length in cycles + * @discontinuity: True indicates clock was set on history period + * @ts: Cross timestamp that should be adjusted using + * partial/total ratio + * + * Helper function used by get_device_system_crosststamp() to correct the + * crosstimestamp corresponding to the start of the current interval to the + * system counter value (timestamp point) provided by the driver. The + * total_history_* quantities are the total history starting at the provided + * reference point and ending at the start of the current interval. The cycle + * count between the driver timestamp point and the start of the current + * interval is partial_history_cycles. + */ +static int adjust_historical_crosststamp(struct system_time_snapshot *history, + cycle_t partial_history_cycles, + cycle_t total_history_cycles, + bool discontinuity, + struct system_device_crosststamp *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + u64 corr_raw, corr_real; + bool interp_forward; + int ret; + + if (total_history_cycles == 0 || partial_history_cycles == 0) + return 0; + + /* Interpolate shortest distance from beginning or end of history */ + interp_forward = partial_history_cycles > total_history_cycles/2 ? + true : false; + partial_history_cycles = interp_forward ? + total_history_cycles - partial_history_cycles : + partial_history_cycles; + + /* + * Scale the monotonic raw time delta by: + * partial_history_cycles / total_history_cycles + */ + corr_raw = (u64)ktime_to_ns( + ktime_sub(ts->sys_monoraw, history->raw)); + ret = scale64_check_overflow(partial_history_cycles, + total_history_cycles, &corr_raw); + if (ret) + return ret; + + /* + * If there is a discontinuity in the history, scale monotonic raw + * correction by: + * mult(real)/mult(raw) yielding the realtime correction + * Otherwise, calculate the realtime correction similar to monotonic + * raw calculation + */ + if (discontinuity) { + corr_real = mul_u64_u32_div + (corr_raw, tk->tkr_mono.mult, tk->tkr_raw.mult); + } else { + corr_real = (u64)ktime_to_ns( + ktime_sub(ts->sys_realtime, history->real)); + ret = scale64_check_overflow(partial_history_cycles, + total_history_cycles, &corr_real); + if (ret) + return ret; + } + + /* Fixup monotonic raw and real time time values */ + if (interp_forward) { + ts->sys_monoraw = ktime_add_ns(history->raw, corr_raw); + ts->sys_realtime = ktime_add_ns(history->real, corr_real); + } else { + ts->sys_monoraw = ktime_sub_ns(ts->sys_monoraw, corr_raw); + ts->sys_realtime = ktime_sub_ns(ts->sys_realtime, corr_real); + } + + return 0; +} + +/* + * cycle_between - true if test occurs chronologically between before and after + */ +static bool cycle_between(cycle_t before, cycle_t test, cycle_t after) +{ + if (test > before && test < after) + return true; + if (test < before && before > after) + return true; + return false; +} + /** * get_device_system_crosststamp - Synchronously capture system/device timestamp - * @sync_devicetime: Callback to get simultaneous device time and + * @get_time_fn: Callback to get simultaneous device time and * system counter from the device driver + * @ctx: Context passed to get_time_fn() + * @history_begin: Historical reference point used to interpolate system + * time when counter provided by the driver is before the current interval * @xtstamp: Receives simultaneously captured system and device time * * Reads a timestamp from a device and correlates it to system time @@ -920,13 +1036,18 @@ int get_device_system_crosststamp(int (*get_time_fn) struct system_counterval_t *sys_counterval, void *ctx), void *ctx, + struct system_time_snapshot *history_begin, struct system_device_crosststamp *xtstamp) { struct system_counterval_t system_counterval; struct timekeeper *tk = &tk_core.timekeeper; + cycle_t cycles, now, interval_start; + unsigned int clock_was_set_seq; ktime_t base_real, base_raw; s64 nsec_real, nsec_raw; + u8 cs_was_changed_seq; unsigned long seq; + bool do_interp; int ret; do { @@ -946,6 +1067,22 @@ int get_device_system_crosststamp(int (*get_time_fn) */ if (tk->tkr_mono.clock != system_counterval.cs) return -ENODEV; + cycles = system_counterval.cycles; + + /* + * Check whether the system counter value provided by the + * device driver is on the current timekeeping interval. + */ + now = tk->tkr_mono.read(tk->tkr_mono.clock); + interval_start = tk->tkr_mono.cycle_last; + if (!cycle_between(interval_start, cycles, now)) { + clock_was_set_seq = tk->clock_was_set_seq; + cs_was_changed_seq = tk->cs_was_changed_seq; + cycles = interval_start; + do_interp = true; + } else { + do_interp = false; + } base_real = ktime_add(tk->tkr_mono.base, tk_core.timekeeper.offs_real); @@ -959,6 +1096,38 @@ int get_device_system_crosststamp(int (*get_time_fn) xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real); xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw); + + /* + * Interpolate if necessary, adjusting back from the start of the + * current interval + */ + if (do_interp) { + cycle_t partial_history_cycles, total_history_cycles; + bool discontinuity; + + /* + * Check that the counter value occurs after the provided + * history reference and that the history doesn't cross a + * clocksource change + */ + if (!history_begin || + !cycle_between(history_begin->cycles, + system_counterval.cycles, cycles) || + history_begin->cs_was_changed_seq != cs_was_changed_seq) + return -EINVAL; + partial_history_cycles = cycles - system_counterval.cycles; + total_history_cycles = cycles - history_begin->cycles; + discontinuity = + history_begin->clock_was_set_seq != clock_was_set_seq; + + ret = adjust_historical_crosststamp(history_begin, + partial_history_cycles, + total_history_cycles, + discontinuity, xtstamp); + if (ret) + return ret; + } + return 0; } EXPORT_SYMBOL_GPL(get_device_system_crosststamp); -- cgit From 71f87b2fc64c2e9b6d53cb817f28711b959d3dfe Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 3 Mar 2016 10:52:10 +0100 Subject: cpu/hotplug: Plug death reporting race Paul noticed that the conversion of the death reporting introduced a race where the outgoing cpu might be delayed after waking the controll processor, so it might not be able to call rcu_report_dead() before being physically removed, leading to RCU stalls. We cant call complete after rcu_report_dead(), so instead of going back to busy polling, simply issue a function call to do the completion. Fixes: 27d50c7eeb0f "rcu: Make CPU_DYING_IDLE an explicit call" Reported-by: Paul E. McKenney Link: http://lkml.kernel.org/r/20160302201127.GA23440@linux.vnet.ibm.com Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra --- kernel/cpu.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index ff8059b76a85..93e9d89bb0ab 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -755,14 +755,26 @@ static int notify_dead(unsigned int cpu) return 0; } +static void cpuhp_complete_idle_dead(void *arg) +{ + struct cpuhp_cpu_state *st = arg; + + complete(&st->done); +} + void cpuhp_report_idle_dead(void) { struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); BUG_ON(st->state != CPUHP_AP_OFFLINE); - st->state = CPUHP_AP_IDLE_DEAD; - complete(&st->done); rcu_report_dead(smp_processor_id()); + st->state = CPUHP_AP_IDLE_DEAD; + /* + * We cannot call complete after rcu_report_dead() so we delegate it + * to an online cpu. + */ + smp_call_function_single(cpumask_first(cpu_online_mask), + cpuhp_complete_idle_dead, st, 0); } #else -- cgit From 82e88ff1ea948d83125a8aaa7c9809f03ccc500f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 3 Mar 2016 11:11:12 +0100 Subject: hrtimer: Revert CLOCK_MONOTONIC_RAW support Revert commits: a6e707ddbdf1: KVM: arm/arm64: timer: Switch to CLOCK_MONOTONIC_RAW 9006a01829a5: hrtimer: Catch illegal clockids 9c808765e88e: hrtimer: Add support for CLOCK_MONOTONIC_RAW Marc found out, that there are fundamental issues with that patch series because __hrtimer_get_next_event() and hrtimer_forward() need support for CLOCK_MONOTONIC_RAW. Nothing which is easily fixed, so revert the whole lot. Reported-by: Marc Zyngier Link: http://lkml.kernel.org/r/56D6CEF0.8060607@arm.com Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 1 - kernel/time/hrtimer.c | 18 ++---------------- virt/kvm/arm/arch_timer.c | 4 ++-- 3 files changed, 4 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index a6d64af5e73f..76dd4f0da5ca 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -151,7 +151,6 @@ enum hrtimer_base_type { HRTIMER_BASE_REALTIME, HRTIMER_BASE_BOOTTIME, HRTIMER_BASE_TAI, - HRTIMER_BASE_MONOTONIC_RAW, HRTIMER_MAX_CLOCK_BASES, }; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index cb0fe70f3c51..435b8850dd80 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -90,30 +90,19 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, }, - { - .index = HRTIMER_BASE_MONOTONIC_RAW, - .clockid = CLOCK_MONOTONIC_RAW, - .get_time = &ktime_get_raw, - }, } }; static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { - /* Make sure we catch unsupported clockids */ - [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES, - [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, - [CLOCK_MONOTONIC_RAW] = HRTIMER_BASE_MONOTONIC_RAW, [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, [CLOCK_TAI] = HRTIMER_BASE_TAI, }; static inline int hrtimer_clockid_to_base(clockid_t clock_id) { - int base = hrtimer_clock_to_base_table[clock_id]; - BUG_ON(base == HRTIMER_MAX_CLOCK_BASES); - return base; + return hrtimer_clock_to_base_table[clock_id]; } /* @@ -1279,10 +1268,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now) if (!(active & 0x01)) continue; - if (unlikely(base->index == HRTIMER_BASE_MONOTONIC_RAW)) - basenow = ktime_get_raw(); - else - basenow = ktime_add(now, base->offset); + basenow = ktime_add(now, base->offset); while ((node = timerqueue_getnext(&base->active))) { struct hrtimer *timer; diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 97c58153f923..69bca185c471 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -48,7 +48,7 @@ static bool timer_is_armed(struct arch_timer_cpu *timer) static void timer_arm(struct arch_timer_cpu *timer, u64 ns) { timer->armed = true; - hrtimer_start(&timer->timer, ktime_add_ns(ktime_get_raw(), ns), + hrtimer_start(&timer->timer, ktime_add_ns(ktime_get(), ns), HRTIMER_MODE_ABS); } @@ -308,7 +308,7 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; INIT_WORK(&timer->expired, kvm_timer_inject_irq_work); - hrtimer_init(&timer->timer, CLOCK_MONOTONIC_RAW, HRTIMER_MODE_ABS); + hrtimer_init(&timer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); timer->timer.function = kvm_timer_expire; } -- cgit From 2378d8b8ba3d2adffb4f98c0c60ee2f448b3be69 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:57:57 -0500 Subject: cgroup: re-hash init_css_set after subsystems are initialized css_sets are hashed by their subsys[] contents and in cgroup_init() init_css_set is hashed early, before subsystem inits, when all entries in its subsys[] are NULL, so that cgroup_dfl_root initialization can find and link to it. As subsystems are initialized, init_css_set.subsys[] is filled up but the hashing is never updated making init_css_set hashed in the wrong place. While incorrect, this doesn't cause a critical failure as css_set management code would create an identical css_set dynamically. Fix it by rehashing init_css_set after subsystems are initialized. While at it, drop unnecessary @key local variable. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 46529502e9d5..e97772b42dfb 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5266,7 +5266,6 @@ static u16 cgroup_disable_mask __initdata; int __init cgroup_init(void) { struct cgroup_subsys *ss; - unsigned long key; int ssid; BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); @@ -5276,9 +5275,12 @@ int __init cgroup_init(void) mutex_lock(&cgroup_mutex); - /* Add init_css_set to the hash table */ - key = css_set_hash(init_css_set.subsys); - hash_add(css_set_table, &init_css_set.hlist, key); + /* + * Add init_css_set to the hash table so that dfl_root can link to + * it during init. + */ + hash_add(css_set_table, &init_css_set.hlist, + css_set_hash(init_css_set.subsys)); BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); @@ -5331,6 +5333,11 @@ int __init cgroup_init(void) ss->bind(init_css_set.subsys[ssid]); } + /* init_css_set.subsys[] has been updated, re-hash */ + hash_del(&init_css_set.hlist); + hash_add(css_set_table, &init_css_set.hlist, + css_set_hash(init_css_set.subsys)); + WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup")); WARN_ON(register_filesystem(&cgroup_fs_type)); WARN_ON(register_filesystem(&cgroup2_fs_type)); -- cgit From 20b454a61fba59be13de52b4493898583ea26d20 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:57:57 -0500 Subject: cgroup: suppress spurious de-populated events During task migration, tasks may transfer between two css_sets which are associated with the same cgroup. If those tasks are the only tasks in the cgroup, this currently triggers a spurious de-populated event on the cgroup. Fix it by bumping up populated count before bumping it down during migration to ensure that it doesn't reach zero spuriously. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e97772b42dfb..5d452e7fcb4f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -678,6 +678,9 @@ static void css_set_move_task(struct task_struct *task, { lockdep_assert_held(&css_set_lock); + if (to_cset && !css_set_populated(to_cset)) + css_set_update_populated(to_cset, true); + if (from_cset) { struct css_task_iter *it, *pos; @@ -711,8 +714,6 @@ static void css_set_move_task(struct task_struct *task, */ WARN_ON_ONCE(task->flags & PF_EXITING); - if (!css_set_populated(to_cset)) - css_set_update_populated(to_cset, true); rcu_assign_pointer(task->cgroups, to_cset); list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks : &to_cset->tasks); -- cgit From 6cd0f5bbaf594f40a97d01dbc565dda41f30d37c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:57:58 -0500 Subject: cgroup: separate out interface file creation from css creation Currently, interface files are created when a css is created depending on whether @visible is set. This patch separates out the two into separate steps to help code refactoring and eventually allow cgroups which aren't visible through cgroup fs. Move css_populate_dir() out of create_css() and drop @visible. While at it, rename the function to css_create() for consistency. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 72 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 43 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5d452e7fcb4f..4178e45becb4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -222,8 +222,8 @@ static struct cftype cgroup_legacy_base_files[]; static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); static void css_task_iter_advance(struct css_task_iter *it); static int cgroup_destroy_locked(struct cgroup *cgrp); -static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, - bool visible); +static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, + struct cgroup_subsys *ss); static void css_release(struct percpu_ref *ref); static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup_subsys_state *css, @@ -3082,14 +3082,26 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, */ do_each_subsys_mask(ss, ssid, enable) { cgroup_for_each_live_child(child, cgrp) { - if (css_enable & (1 << ssid)) - ret = create_css(child, ss, - cgrp->subtree_control & (1 << ssid)); - else + if (css_enable & (1 << ssid)) { + struct cgroup_subsys_state *css; + + css = css_create(child, ss); + if (IS_ERR(css)) { + ret = PTR_ERR(css); + goto err_undo_css; + } + + if (cgrp->subtree_control & (1 << ssid)) { + ret = css_populate_dir(css, NULL); + if (ret) + goto err_undo_css; + } + } else { ret = css_populate_dir(cgroup_css(child, ss), NULL); - if (ret) - goto err_undo_css; + if (ret) + goto err_undo_css; + } } } while_each_subsys_mask(); @@ -4717,7 +4729,9 @@ static void css_release_work_fn(struct work_struct *work) * Those are supported by RCU protecting clearing of * cgrp->kn->priv backpointer. */ - RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); + if (cgrp->kn) + RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, + NULL); } mutex_unlock(&cgroup_mutex); @@ -4801,17 +4815,16 @@ static void offline_css(struct cgroup_subsys_state *css) } /** - * create_css - create a cgroup_subsys_state + * css_create - create a cgroup_subsys_state * @cgrp: the cgroup new css will be associated with * @ss: the subsys of new css - * @visible: whether to create control knobs for the new css or not * * Create a new css associated with @cgrp - @ss pair. On success, the new - * css is online and installed in @cgrp with all interface files created if - * @visible. Returns 0 on success, -errno on failure. + * css is online and installed in @cgrp. This function doesn't create the + * interface files. Returns 0 on success, -errno on failure. */ -static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, - bool visible) +static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, + struct cgroup_subsys *ss) { struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); @@ -4822,7 +4835,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, css = ss->css_alloc(parent_css); if (IS_ERR(css)) - return PTR_ERR(css); + return css; init_and_link_css(css, ss, cgrp); @@ -4835,12 +4848,6 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, goto err_free_percpu_ref; css->id = err; - if (visible) { - err = css_populate_dir(css, NULL); - if (err) - goto err_free_id; - } - /* @css is ready to be brought online now, make it visible */ list_add_tail_rcu(&css->sibling, &parent_css->children); cgroup_idr_replace(&ss->css_idr, css, css->id); @@ -4858,18 +4865,16 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, ss->warned_broken_hierarchy = true; } - return 0; + return css; err_list_del: list_del_rcu(&css->sibling); - css_clear_dir(css, NULL); -err_free_id: cgroup_idr_remove(&ss->css_idr, css->id); err_free_percpu_ref: percpu_ref_exit(&css->refcnt); err_free_css: call_rcu(&css->rcu_head, css_free_rcu_fn); - return err; + return ERR_PTR(err); } static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, @@ -4966,10 +4971,19 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, /* let's create and online css's */ do_each_subsys_mask(ss, ssid, parent->subtree_ss_mask) { - ret = create_css(cgrp, ss, - parent->subtree_control & (1 << ssid)); - if (ret) + struct cgroup_subsys_state *css; + + css = css_create(cgrp, ss); + if (IS_ERR(css)) { + ret = PTR_ERR(css); goto out_destroy; + } + + if (parent->subtree_control & (1 << ssid)) { + ret = css_populate_dir(css, NULL); + if (ret) + goto out_destroy; + } } while_each_subsys_mask(); /* -- cgit From 88cb04b96a1934ecbfd1d324e7cde55890c1a576 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:57:58 -0500 Subject: cgroup: explicitly track whether a cgroup_subsys_state is visible to userland Currently, whether a css (cgroup_subsys_state) has its interface files created is not tracked and assumed to change together with the owning cgroup's lifecycle. cgroup directory and interface creation is being separated out from internal object creation to help refactoring and eventually allow cgroups which are not visible through cgroupfs. This patch adds CSS_VISIBLE to track whether a css has its interface files created and perform management operations only when necessary which helps decoupling interface file handling from internal object lifecycle. After this patch, all css interface file management functions can be called regardless of the current state and will achieve the expected result. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- include/linux/cgroup-defs.h | 1 + kernel/cgroup.c | 13 ++++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 8fc3f0452289..7593c1a46786 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -45,6 +45,7 @@ enum { CSS_NO_REF = (1 << 0), /* no reference counting for this css */ CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */ CSS_RELEASED = (1 << 2), /* refcnt reached zero, released */ + CSS_VISIBLE = (1 << 3), /* css is visible to userland */ }; /* bits in struct cgroup flags field */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4178e45becb4..a9a53ca942f3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1421,6 +1421,11 @@ static void css_clear_dir(struct cgroup_subsys_state *css, struct cgroup *cgrp = cgrp_override ?: css->cgroup; struct cftype *cfts; + if (!(css->flags & CSS_VISIBLE)) + return; + + css->flags &= ~CSS_VISIBLE; + list_for_each_entry(cfts, &css->ss->cfts, node) cgroup_addrm_files(css, cgrp, cfts, false); } @@ -1439,6 +1444,9 @@ static int css_populate_dir(struct cgroup_subsys_state *css, struct cftype *cfts, *failed_cfts; int ret; + if (css->flags & CSS_VISIBLE) + return 0; + if (!css->ss) { if (cgroup_on_dfl(cgrp)) cfts = cgroup_dfl_base_files; @@ -1455,6 +1463,9 @@ static int css_populate_dir(struct cgroup_subsys_state *css, goto err; } } + + css->flags |= CSS_VISIBLE; + return 0; err: list_for_each_entry(cfts, &css->ss->cfts, node) { @@ -3403,7 +3414,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) css_for_each_descendant_pre(css, cgroup_css(root, ss)) { struct cgroup *cgrp = css->cgroup; - if (cgroup_is_dead(cgrp)) + if (!(css->flags & CSS_VISIBLE)) continue; ret = cgroup_addrm_files(css, cgrp, cfts, is_add); -- cgit From 195e9b6c4b09434dad6ec3c163fdf037e16b3c96 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:57:58 -0500 Subject: cgroup: reorder operations in cgroup_mkdir() Currently, operations to initialize internal objects and create interface directory and files are intermixed in cgroup_mkdir(). We're in the process of refactoring cgroup and css management paths to separate them out to eventually allow cgroups which aren't visible through cgroup fs. This patch reorders operations inside cgroup_mkdir() so that interface directory and file handling comes after internal object initialization. This will enable further refactoring. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 61 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 30 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a9a53ca942f3..a6d484a667aa 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4945,20 +4945,6 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); - /* create the directory */ - kn = kernfs_create_dir(parent->kn, name, mode, cgrp); - if (IS_ERR(kn)) { - ret = PTR_ERR(kn); - goto out_free_id; - } - cgrp->kn = kn; - - /* - * This extra ref will be put in cgroup_free_fn() and guarantees - * that @cgrp->kn is always accessible. - */ - kernfs_get(kn); - cgrp->self.serial_nr = css_serial_nr_next++; /* allocation complete, commit to creation */ @@ -4972,15 +4958,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, */ cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); - ret = cgroup_kn_set_ugid(kn); - if (ret) - goto out_destroy; - - ret = css_populate_dir(&cgrp->self, NULL); - if (ret) - goto out_destroy; - - /* let's create and online css's */ + /* create the csses */ do_each_subsys_mask(ss, ssid, parent->subtree_ss_mask) { struct cgroup_subsys_state *css; @@ -4989,12 +4967,6 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, ret = PTR_ERR(css); goto out_destroy; } - - if (parent->subtree_control & (1 << ssid)) { - ret = css_populate_dir(css, NULL); - if (ret) - goto out_destroy; - } } while_each_subsys_mask(); /* @@ -5006,13 +4978,40 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, cgroup_refresh_subtree_ss_mask(cgrp); } + /* create the directory */ + kn = kernfs_create_dir(parent->kn, name, mode, cgrp); + if (IS_ERR(kn)) { + ret = PTR_ERR(kn); + goto out_destroy; + } + cgrp->kn = kn; + + /* + * This extra ref will be put in cgroup_free_fn() and guarantees + * that @cgrp->kn is always accessible. + */ + kernfs_get(kn); + + ret = cgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + + ret = css_populate_dir(&cgrp->self, NULL); + if (ret) + goto out_destroy; + + do_each_subsys_mask(ss, ssid, parent->subtree_control) { + ret = css_populate_dir(cgroup_css(cgrp, ss), NULL); + if (ret) + goto out_destroy; + } while_each_subsys_mask(); + + /* let's create and online css's */ kernfs_activate(kn); ret = 0; goto out_unlock; -out_free_id: - cgroup_idr_remove(&root->cgroup_idr, cgrp->id); out_cancel_ref: percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: -- cgit From a5bca2152036de826595723437c5cbe8f6c13983 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:57:58 -0500 Subject: cgroup: factor out cgroup_create() out of cgroup_mkdir() We're in the process of refactoring cgroup and css management paths to separate them out to eventually allow cgroups which aren't visible through cgroup fs. This patch factors out cgroup_create() out of cgroup_mkdir(). cgroup_create() contains all internal object creation and initialization. cgroup_mkdir() uses cgroup_create() to create the internal cgroup and adds interface directory and file creation. This patch doesn't cause any behavior differences. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 72 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 43 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a6d484a667aa..e1b3d0fead05 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4888,33 +4888,19 @@ err_free_css: return ERR_PTR(err); } -static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, - umode_t mode) +static struct cgroup *cgroup_create(struct cgroup *parent) { - struct cgroup *parent, *cgrp, *tcgrp; - struct cgroup_root *root; + struct cgroup_root *root = parent->root; struct cgroup_subsys *ss; - struct kernfs_node *kn; - int level, ssid, ret; - - /* Do not accept '\n' to prevent making /proc//cgroup unparsable. - */ - if (strchr(name, '\n')) - return -EINVAL; - - parent = cgroup_kn_lock_live(parent_kn); - if (!parent) - return -ENODEV; - root = parent->root; - level = parent->level + 1; + struct cgroup *cgrp, *tcgrp; + int level = parent->level + 1; + int ssid, ret; /* allocate the cgroup and its ID, 0 is reserved for the root */ cgrp = kzalloc(sizeof(*cgrp) + sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL); - if (!cgrp) { - ret = -ENOMEM; - goto out_unlock; - } + if (!cgrp) + return ERR_PTR(-ENOMEM); ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL); if (ret) @@ -4978,6 +4964,40 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, cgroup_refresh_subtree_ss_mask(cgrp); } + return cgrp; + +out_cancel_ref: + percpu_ref_exit(&cgrp->self.refcnt); +out_free_cgrp: + kfree(cgrp); + return ERR_PTR(ret); +out_destroy: + cgroup_destroy_locked(cgrp); + return ERR_PTR(ret); +} + +static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) +{ + struct cgroup *parent, *cgrp; + struct cgroup_subsys *ss; + struct kernfs_node *kn; + int ssid, ret; + + /* do not accept '\n' to prevent making /proc//cgroup unparsable */ + if (strchr(name, '\n')) + return -EINVAL; + + parent = cgroup_kn_lock_live(parent_kn); + if (!parent) + return -ENODEV; + + cgrp = cgroup_create(parent); + if (IS_ERR(cgrp)) { + ret = PTR_ERR(cgrp); + goto out_unlock; + } + /* create the directory */ kn = kernfs_create_dir(parent->kn, name, mode, cgrp); if (IS_ERR(kn)) { @@ -5012,17 +5032,11 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, ret = 0; goto out_unlock; -out_cancel_ref: - percpu_ref_exit(&cgrp->self.refcnt); -out_free_cgrp: - kfree(cgrp); +out_destroy: + cgroup_destroy_locked(cgrp); out_unlock: cgroup_kn_unlock(parent_kn); return ret; - -out_destroy: - cgroup_destroy_locked(cgrp); - goto out_unlock; } /* -- cgit From 5531dc915ba10b78a80dcffa48d7360d3c0bab61 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:57:58 -0500 Subject: cgroup: introduce cgroup_control() and cgroup_ss_mask() When a controller is enabled and visible on a non-root cgroup is determined by subtree_control and subtree_ss_mask of the parent cgroup. For a root cgroup, by the type of the hierarchy and which controllers are attached to it. Deciding the above on each usage is fragile and unnecessarily complicates the users. This patch introduces cgroup_control() and cgroup_ss_mask() which calculate and return the [visibly] enabled subsyste mask for the specified cgroup and conver the existing usages. * cgroup_e_css() is restructured for simplicity. * cgroup_calc_subtree_ss_mask() and cgroup_subtree_control_write() no longer need to distinguish root and non-root cases. * With cgroup_control(), cgroup_controllers_show() can now handle both root and non-root cases. cgroup_root_controllers_show() is removed. v2: cgroup_control() updated to yield the correct result on v1 hierarchies too. cgroup_subtree_control_write() converted. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 72 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 37 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e1b3d0fead05..2cb4b5419852 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -346,6 +346,32 @@ static struct cgroup *cgroup_parent(struct cgroup *cgrp) return NULL; } +/* subsystems visibly enabled on a cgroup */ +static u16 cgroup_control(struct cgroup *cgrp) +{ + struct cgroup *parent = cgroup_parent(cgrp); + u16 root_ss_mask = cgrp->root->subsys_mask; + + if (parent) + return parent->subtree_control; + + if (cgroup_on_dfl(cgrp)) + root_ss_mask &= ~cgrp_dfl_inhibit_ss_mask; + + return root_ss_mask; +} + +/* subsystems enabled on a cgroup */ +static u16 cgroup_ss_mask(struct cgroup *cgrp) +{ + struct cgroup *parent = cgroup_parent(cgrp); + + if (parent) + return parent->subtree_ss_mask; + + return cgrp->root->subsys_mask; +} + /** * cgroup_css - obtain a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest @@ -385,16 +411,15 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, if (!ss) return &cgrp->self; - if (!(cgrp->root->subsys_mask & (1 << ss->id))) - return NULL; - /* * This function is used while updating css associations and thus - * can't test the csses directly. Use ->subtree_ss_mask. + * can't test the csses directly. Test ss_mask. */ - while (cgroup_parent(cgrp) && - !(cgroup_parent(cgrp)->subtree_ss_mask & (1 << ss->id))) + while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) { cgrp = cgroup_parent(cgrp); + if (!cgrp) + return NULL; + } return cgroup_css(cgrp, ss); } @@ -1276,7 +1301,6 @@ static umode_t cgroup_file_mode(const struct cftype *cft) */ static u16 cgroup_calc_subtree_ss_mask(struct cgroup *cgrp, u16 subtree_control) { - struct cgroup *parent = cgroup_parent(cgrp); u16 cur_ss_mask = subtree_control; struct cgroup_subsys *ss; int ssid; @@ -1298,10 +1322,7 @@ static u16 cgroup_calc_subtree_ss_mask(struct cgroup *cgrp, u16 subtree_control) * happen only if some depended-upon subsystems were bound * to non-default hierarchies. */ - if (parent) - new_ss_mask &= parent->subtree_ss_mask; - else - new_ss_mask &= cgrp->root->subsys_mask; + new_ss_mask &= cgroup_ss_mask(cgrp); if (new_ss_mask == cur_ss_mask) break; @@ -2864,22 +2885,12 @@ static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) seq_putc(seq, '\n'); } -/* show controllers which are currently attached to the default hierarchy */ -static int cgroup_root_controllers_show(struct seq_file *seq, void *v) -{ - struct cgroup *cgrp = seq_css(seq)->cgroup; - - cgroup_print_ss_mask(seq, cgrp->root->subsys_mask & - ~cgrp_dfl_inhibit_ss_mask); - return 0; -} - /* show controllers which are enabled from the parent */ static int cgroup_controllers_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control); + cgroup_print_ss_mask(seq, cgroup_control(cgrp)); return 0; } @@ -3005,10 +3016,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, continue; } - /* unavailable or not enabled on the parent? */ - if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || - (cgroup_parent(cgrp) && - !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) { + if (!(cgroup_control(cgrp) & (1 << ssid))) { ret = -ENOENT; goto out_unlock; } @@ -4566,12 +4574,6 @@ static struct cftype cgroup_dfl_base_files[] = { }, { .name = "cgroup.controllers", - .flags = CFTYPE_ONLY_ON_ROOT, - .seq_show = cgroup_root_controllers_show, - }, - { - .name = "cgroup.controllers", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_controllers_show, }, { @@ -4945,7 +4947,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); /* create the csses */ - do_each_subsys_mask(ss, ssid, parent->subtree_ss_mask) { + do_each_subsys_mask(ss, ssid, cgroup_ss_mask(cgrp)) { struct cgroup_subsys_state *css; css = css_create(cgrp, ss); @@ -4960,7 +4962,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) * subtree_control from the parent. Each is configured manually. */ if (!cgroup_on_dfl(cgrp)) { - cgrp->subtree_control = parent->subtree_control; + cgrp->subtree_control = cgroup_control(cgrp); cgroup_refresh_subtree_ss_mask(cgrp); } @@ -5020,7 +5022,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (ret) goto out_destroy; - do_each_subsys_mask(ss, ssid, parent->subtree_control) { + do_each_subsys_mask(ss, ssid, cgroup_control(cgrp)) { ret = css_populate_dir(cgroup_css(cgrp, ss), NULL); if (ret) goto out_destroy; -- cgit From 1b9b96a12b5433ccc477265111122720ccb4965e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:57:59 -0500 Subject: cgroup: factor out cgroup_drain_offline() from cgroup_subtree_control_write() Factor out async css offline draining into cgroup_drain_offline(). * Nest subsystem walk inside child walk. The child walk will later be converted to subtree walk which is a bit more expensive. * Relocate the draining above subsystem mask preparation, which doesn't create any behavior differences but helps further refactoring. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 77 ++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 52 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2cb4b5419852..d295e6a91cdc 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2965,6 +2965,53 @@ out_finish: return ret; } +/** + * cgroup_drain_offline - wait for previously offlined csses to go away + * @cgrp: parent of the target cgroups + * + * Because css offlining is asynchronous, userland may try to re-enable a + * controller while the previous css is still around. This function drains + * the previous css instances of @cgrp's children. + * + * Must be called with cgroup_mutex held. Returns %false if there were no + * dying css instances. Returns %true if there were one or more and this + * function waited. On %true return, cgroup_mutex has been dropped and + * re-acquired inbetween which anything could have happened. The caller + * typically would have to start over. + */ +static bool cgroup_drain_offline(struct cgroup *cgrp) +{ + struct cgroup *dsct; + struct cgroup_subsys *ss; + int ssid; + + lockdep_assert_held(&cgroup_mutex); + + cgroup_for_each_live_child(dsct, cgrp) { + for_each_subsys(ss, ssid) { + struct cgroup_subsys_state *css = cgroup_css(dsct, ss); + DEFINE_WAIT(wait); + + if (!css) + continue; + + cgroup_get(dsct); + prepare_to_wait(&dsct->offline_waitq, &wait, + TASK_UNINTERRUPTIBLE); + + mutex_unlock(&cgroup_mutex); + schedule(); + finish_wait(&dsct->offline_waitq, &wait); + mutex_lock(&cgroup_mutex); + + cgroup_put(dsct); + return true; + } + } + + return false; +} + /* change the enabled child controllers for a cgroup in the default hierarchy */ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes, @@ -3050,6 +3097,11 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, goto out_unlock; } + if (cgroup_drain_offline(cgrp)) { + cgroup_kn_unlock(of->kn); + return restart_syscall(); + } + /* * Update subsys masks and calculate what needs to be done. More * subsystems than specified may need to be enabled or disabled @@ -3065,31 +3117,6 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, enable |= css_enable; disable |= css_disable; - /* - * Because css offlining is asynchronous, userland might try to - * re-enable the same controller while the previous instance is - * still around. In such cases, wait till it's gone using - * offline_waitq. - */ - do_each_subsys_mask(ss, ssid, css_enable) { - cgroup_for_each_live_child(child, cgrp) { - DEFINE_WAIT(wait); - - if (!cgroup_css(child, ss)) - continue; - - cgroup_get(child); - prepare_to_wait(&child->offline_waitq, &wait, - TASK_UNINTERRUPTIBLE); - cgroup_kn_unlock(of->kn); - schedule(); - finish_wait(&child->offline_waitq, &wait); - cgroup_put(child); - - return restart_syscall(); - } - } while_each_subsys_mask(); - cgrp->subtree_control = new_sc; cgrp->subtree_ss_mask = new_ss; -- cgit From 12b3bb6af862477f96e1adac51b201a143a8f3c4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:57:59 -0500 Subject: cgroup: factor out cgroup_apply_control_disable() from cgroup_subtree_control_write() Factor out css disabling and hiding into cgroup_apply_control_disable(). * Nest subsystem walk inside child walk. The child walk will later be converted to subtree walk which is a bit more expensive. * Instead of operating on the differential masks @css_enable and @css_disable, simply disable or hide csses according to the current cgroup_control() and cgroup_ss_mask(). This leads to the same result and is simpler and more robust. * This allows error handling path to share the same code. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 74 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 41 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d295e6a91cdc..97cb1315bcac 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3012,6 +3012,43 @@ static bool cgroup_drain_offline(struct cgroup *cgrp) return false; } +/** + * cgroup_apply_control_disable - kill or hide csses according to control + * @cgrp: parent of the target cgroups + * + * Walk @cgrp's children and kill and hide csses so that they match + * cgroup_ss_mask() and cgroup_visible_mask(). + * + * A css is hidden when the userland requests it to be disabled while other + * subsystems are still depending on it. The css must not actively control + * resources and be in the vanilla state if it's made visible again later. + * Controllers which may be depended upon should provide ->css_reset() for + * this purpose. + */ +static void cgroup_apply_control_disable(struct cgroup *cgrp) +{ + struct cgroup *dsct; + struct cgroup_subsys *ss; + int ssid; + + cgroup_for_each_live_child(dsct, cgrp) { + for_each_subsys(ss, ssid) { + struct cgroup_subsys_state *css = cgroup_css(dsct, ss); + + if (!css) + continue; + + if (!(cgroup_ss_mask(dsct) & (1 << ss->id))) { + kill_css(css); + } else if (!(cgroup_control(dsct) & (1 << ss->id))) { + css_clear_dir(css, NULL); + if (ss->css_reset) + ss->css_reset(css); + } + } + } +} + /* change the enabled child controllers for a cgroup in the default hierarchy */ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes, @@ -3160,27 +3197,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, if (ret) goto err_undo_css; - /* - * All tasks are migrated out of disabled csses. Kill or hide - * them. A css is hidden when the userland requests it to be - * disabled while other subsystems are still depending on it. The - * css must not actively control resources and be in the vanilla - * state if it's made visible again later. Controllers which may - * be depended upon should provide ->css_reset() for this purpose. - */ - do_each_subsys_mask(ss, ssid, disable) { - cgroup_for_each_live_child(child, cgrp) { - struct cgroup_subsys_state *css = cgroup_css(child, ss); - - if (css_disable & (1 << ssid)) { - kill_css(css); - } else { - css_clear_dir(css, NULL); - if (ss->css_reset) - ss->css_reset(css); - } - } - } while_each_subsys_mask(); + /* all tasks are migrated out of disabled csses, commit disable */ + cgroup_apply_control_disable(cgrp); kernfs_activate(cgrp->kn); ret = 0; @@ -3189,22 +3207,12 @@ out_unlock: return ret ?: nbytes; err_undo_css: + /* restore masks and shoot down new csses */ cgrp->subtree_control = old_sc; cgrp->subtree_ss_mask = old_ss; - do_each_subsys_mask(ss, ssid, enable) { - cgroup_for_each_live_child(child, cgrp) { - struct cgroup_subsys_state *css = cgroup_css(child, ss); - - if (!css) - continue; + cgroup_apply_control_disable(cgrp); - if (css_enable & (1 << ssid)) - kill_css(css); - else - css_clear_dir(css, NULL); - } - } while_each_subsys_mask(); goto out_unlock; } -- cgit From bdb53bd797dcef46d1a252b9529f8fd511bf714c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:57:59 -0500 Subject: cgroup: factor out cgroup_apply_control_enable() from cgroup_subtree_control_write() Factor out css enabling and showing into cgroup_apply_control_enable(). * Nest subsystem walk inside child walk. The child walk will later be converted to subtree walk which is a bit more expensive. * Instead of operating on the differential masks @css_enable, simply enable or show csses according to the current cgroup_control() and cgroup_ss_mask(). This leads to the same result and is simpler and more robust. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 77 +++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 97cb1315bcac..1193038d0729 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3012,6 +3012,49 @@ static bool cgroup_drain_offline(struct cgroup *cgrp) return false; } +/** + * cgroup_apply_control_enable - enable or show csses according to control + * @cgrp: parent of the target cgroups + * + * Walk @cgrp's children and create new csses or make the existing ones + * visible. A css is created invisible if it's being implicitly enabled + * through dependency. An invisible css is made visible when the userland + * explicitly enables it. + * + * Returns 0 on success, -errno on failure. On failure, csses which have + * been processed already aren't cleaned up. The caller is responsible for + * cleaning up with cgroup_apply_control_disble(). + */ +static int cgroup_apply_control_enable(struct cgroup *cgrp) +{ + struct cgroup *dsct; + struct cgroup_subsys *ss; + int ssid, ret; + + cgroup_for_each_live_child(dsct, cgrp) { + for_each_subsys(ss, ssid) { + struct cgroup_subsys_state *css = cgroup_css(dsct, ss); + + if (!(cgroup_ss_mask(dsct) & (1 << ss->id))) + continue; + + if (!css) { + css = css_create(dsct, ss); + if (IS_ERR(css)) + return PTR_ERR(css); + } + + if (cgroup_control(dsct) & (1 << ss->id)) { + ret = css_populate_dir(css, NULL); + if (ret) + return ret; + } + } + } + + return 0; +} + /** * cgroup_apply_control_disable - kill or hide csses according to control * @cgrp: parent of the target cgroups @@ -3157,36 +3200,10 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, cgrp->subtree_control = new_sc; cgrp->subtree_ss_mask = new_ss; - /* - * Create new csses or make the existing ones visible. A css is - * created invisible if it's being implicitly enabled through - * dependency. An invisible css is made visible when the userland - * explicitly enables it. - */ - do_each_subsys_mask(ss, ssid, enable) { - cgroup_for_each_live_child(child, cgrp) { - if (css_enable & (1 << ssid)) { - struct cgroup_subsys_state *css; - - css = css_create(child, ss); - if (IS_ERR(css)) { - ret = PTR_ERR(css); - goto err_undo_css; - } - - if (cgrp->subtree_control & (1 << ssid)) { - ret = css_populate_dir(css, NULL); - if (ret) - goto err_undo_css; - } - } else { - ret = css_populate_dir(cgroup_css(child, ss), - NULL); - if (ret) - goto err_undo_css; - } - } - } while_each_subsys_mask(); + /* prepare csses */ + ret = cgroup_apply_control_enable(cgrp); + if (ret) + goto err_undo_css; /* * At this point, cgroup_e_css() results reflect the new csses -- cgit From ce3f1d9d19371045981a64815227bab822554878 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:57:59 -0500 Subject: cgroup: make cgroup_drain_offline() and cgroup_apply_control_{disable|enable}() recursive The three factored out css management operations - cgroup_drain_offline() and cgroup_apply_control_{disable|enable}() - only depend on the current state of the target cgroups and idempotent and thus can be easily made to operate on the subtree instead of the immediate children. This patch introduces the iterators which walk live subtree and converts the three functions to operate on the subtree including self instead of the children. While this leads to spurious walking and be slightly more expensive, it will allow them to be used for wider scope of operations. Note that cgroup_drain_offline() now tests for whether a css is dying before trying to drain it. This is to avoid trying to drain live csses as there can be mix of live and dying csses in a subtree unlike children of the same parent. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 41 +++++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1193038d0729..0398f2a6673b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -573,6 +573,24 @@ static int notify_on_release(const struct cgroup *cgrp) ; \ else +/* walk live descendants in preorder */ +#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \ + css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \ + if (({ lockdep_assert_held(&cgroup_mutex); \ + (dsct) = (d_css)->cgroup; \ + cgroup_is_dead(dsct); })) \ + ; \ + else + +/* walk live descendants in postorder */ +#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \ + css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \ + if (({ lockdep_assert_held(&cgroup_mutex); \ + (dsct) = (d_css)->cgroup; \ + cgroup_is_dead(dsct); })) \ + ; \ + else + static void cgroup_release_agent(struct work_struct *work); static void check_for_release(struct cgroup *cgrp); @@ -2967,11 +2985,11 @@ out_finish: /** * cgroup_drain_offline - wait for previously offlined csses to go away - * @cgrp: parent of the target cgroups + * @cgrp: root of the target subtree * * Because css offlining is asynchronous, userland may try to re-enable a * controller while the previous css is still around. This function drains - * the previous css instances of @cgrp's children. + * the previous css instances of @cgrp's subtree. * * Must be called with cgroup_mutex held. Returns %false if there were no * dying css instances. Returns %true if there were one or more and this @@ -2982,17 +3000,18 @@ out_finish: static bool cgroup_drain_offline(struct cgroup *cgrp) { struct cgroup *dsct; + struct cgroup_subsys_state *d_css; struct cgroup_subsys *ss; int ssid; lockdep_assert_held(&cgroup_mutex); - cgroup_for_each_live_child(dsct, cgrp) { + cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); DEFINE_WAIT(wait); - if (!css) + if (!css || !percpu_ref_is_dying(&css->refcnt)) continue; cgroup_get(dsct); @@ -3014,9 +3033,9 @@ static bool cgroup_drain_offline(struct cgroup *cgrp) /** * cgroup_apply_control_enable - enable or show csses according to control - * @cgrp: parent of the target cgroups + * @cgrp: root of the target subtree * - * Walk @cgrp's children and create new csses or make the existing ones + * Walk @cgrp's subtree and create new csses or make the existing ones * visible. A css is created invisible if it's being implicitly enabled * through dependency. An invisible css is made visible when the userland * explicitly enables it. @@ -3028,10 +3047,11 @@ static bool cgroup_drain_offline(struct cgroup *cgrp) static int cgroup_apply_control_enable(struct cgroup *cgrp) { struct cgroup *dsct; + struct cgroup_subsys_state *d_css; struct cgroup_subsys *ss; int ssid, ret; - cgroup_for_each_live_child(dsct, cgrp) { + cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); @@ -3057,9 +3077,9 @@ static int cgroup_apply_control_enable(struct cgroup *cgrp) /** * cgroup_apply_control_disable - kill or hide csses according to control - * @cgrp: parent of the target cgroups + * @cgrp: root of the target subtree * - * Walk @cgrp's children and kill and hide csses so that they match + * Walk @cgrp's subtree and kill and hide csses so that they match * cgroup_ss_mask() and cgroup_visible_mask(). * * A css is hidden when the userland requests it to be disabled while other @@ -3071,10 +3091,11 @@ static int cgroup_apply_control_enable(struct cgroup *cgrp) static void cgroup_apply_control_disable(struct cgroup *cgrp) { struct cgroup *dsct; + struct cgroup_subsys_state *d_css; struct cgroup_subsys *ss; int ssid; - cgroup_for_each_live_child(dsct, cgrp) { + cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); -- cgit From 15a27c362d54378f17ec078579b2f6af88495a3f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:57:59 -0500 Subject: cgroup: introduce cgroup_{save|propagate|restore}_control() While controllers are being enabled and disabled in cgroup_subtree_control_write(), the original subsystem masks are stashed in local variables so that they can be restored if the operation fails in the middle. This patch adds dedicated fields to struct cgroup to be used instead of the local variables and implements functions to stash the current values, propagate the changes and restore them recursively. Combined with the previous changes, this makes subsystem management operations fully recursive and modularlized. This will be used to expand cgroup core functionalities. While at it, remove now unused @css_enable and @css_disable from cgroup_subtree_control_write(). Signed-off-by: Tejun Heo Acked-by: Zefan Li --- include/linux/cgroup-defs.h | 2 ++ kernel/cgroup.c | 82 ++++++++++++++++++++++++++++++++++----------- 2 files changed, 64 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 7593c1a46786..aae8c94de4b3 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -260,6 +260,8 @@ struct cgroup { */ u16 subtree_control; u16 subtree_ss_mask; + u16 old_subtree_control; + u16 old_subtree_ss_mask; /* Private pointers for each registered subsystem */ struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0398f2a6673b..452a90e455fa 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3031,6 +3031,62 @@ static bool cgroup_drain_offline(struct cgroup *cgrp) return false; } +/** + * cgroup_save_control - save control masks of a subtree + * @cgrp: root of the target subtree + * + * Save ->subtree_control and ->subtree_ss_mask to the respective old_ + * prefixed fields for @cgrp's subtree including @cgrp itself. + */ +static void cgroup_save_control(struct cgroup *cgrp) +{ + struct cgroup *dsct; + struct cgroup_subsys_state *d_css; + + cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { + dsct->old_subtree_control = dsct->subtree_control; + dsct->old_subtree_ss_mask = dsct->subtree_ss_mask; + } +} + +/** + * cgroup_propagate_control - refresh control masks of a subtree + * @cgrp: root of the target subtree + * + * For @cgrp and its subtree, ensure ->subtree_ss_mask matches + * ->subtree_control and propagate controller availability through the + * subtree so that descendants don't have unavailable controllers enabled. + */ +static void cgroup_propagate_control(struct cgroup *cgrp) +{ + struct cgroup *dsct; + struct cgroup_subsys_state *d_css; + + cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { + dsct->subtree_control &= cgroup_control(dsct); + dsct->subtree_ss_mask = cgroup_calc_subtree_ss_mask(dsct, + dsct->subtree_control); + } +} + +/** + * cgroup_restore_control - restore control masks of a subtree + * @cgrp: root of the target subtree + * + * Restore ->subtree_control and ->subtree_ss_mask from the respective old_ + * prefixed fields for @cgrp's subtree including @cgrp itself. + */ +static void cgroup_restore_control(struct cgroup *cgrp) +{ + struct cgroup *dsct; + struct cgroup_subsys_state *d_css; + + cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { + dsct->subtree_control = dsct->old_subtree_control; + dsct->subtree_ss_mask = dsct->old_subtree_ss_mask; + } +} + /** * cgroup_apply_control_enable - enable or show csses according to control * @cgrp: root of the target subtree @@ -3119,7 +3175,6 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, loff_t off) { u16 enable = 0, disable = 0; - u16 css_enable, css_disable, old_sc, new_sc, old_ss, new_ss; struct cgroup *cgrp, *child; struct cgroup_subsys *ss; char *tok; @@ -3203,25 +3258,14 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, return restart_syscall(); } - /* - * Update subsys masks and calculate what needs to be done. More - * subsystems than specified may need to be enabled or disabled - * depending on subsystem dependencies. - */ - old_sc = cgrp->subtree_control; - old_ss = cgrp->subtree_ss_mask; - new_sc = (old_sc | enable) & ~disable; - new_ss = cgroup_calc_subtree_ss_mask(cgrp, new_sc); + /* save and update control masks and prepare csses */ + cgroup_save_control(cgrp); - css_enable = ~old_ss & new_ss; - css_disable = old_ss & ~new_ss; - enable |= css_enable; - disable |= css_disable; + cgrp->subtree_control |= enable; + cgrp->subtree_control &= ~disable; - cgrp->subtree_control = new_sc; - cgrp->subtree_ss_mask = new_ss; + cgroup_propagate_control(cgrp); - /* prepare csses */ ret = cgroup_apply_control_enable(cgrp); if (ret) goto err_undo_css; @@ -3246,9 +3290,7 @@ out_unlock: err_undo_css: /* restore masks and shoot down new csses */ - cgrp->subtree_control = old_sc; - cgrp->subtree_ss_mask = old_ss; - + cgroup_restore_control(cgrp); cgroup_apply_control_disable(cgrp); goto out_unlock; -- cgit From f7b2814bb9b6cb1d69333e1592c702260fcb4184 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:58:00 -0500 Subject: cgroup: factor out cgroup_{apply|finalize}_control() from cgroup_subtree_control_write() Factor out cgroup_{apply|finalize}_control() so that control mask update can be done in several simple steps. This patch doesn't introduce behavior changes. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 81 +++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 58 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 452a90e455fa..2adf0433a3cf 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3169,6 +3169,62 @@ static void cgroup_apply_control_disable(struct cgroup *cgrp) } } +/** + * cgroup_apply_control - apply control mask updates to the subtree + * @cgrp: root of the target subtree + * + * subsystems can be enabled and disabled in a subtree using the following + * steps. + * + * 1. Call cgroup_save_control() to stash the current state. + * 2. Update ->subtree_control masks in the subtree as desired. + * 3. Call cgroup_apply_control() to apply the changes. + * 4. Optionally perform other related operations. + * 5. Call cgroup_finalize_control() to finish up. + * + * This function implements step 3 and propagates the mask changes + * throughout @cgrp's subtree, updates csses accordingly and perform + * process migrations. + */ +static int cgroup_apply_control(struct cgroup *cgrp) +{ + int ret; + + cgroup_propagate_control(cgrp); + + ret = cgroup_apply_control_enable(cgrp); + if (ret) + return ret; + + /* + * At this point, cgroup_e_css() results reflect the new csses + * making the following cgroup_update_dfl_csses() properly update + * css associations of all tasks in the subtree. + */ + ret = cgroup_update_dfl_csses(cgrp); + if (ret) + return ret; + + return 0; +} + +/** + * cgroup_finalize_control - finalize control mask update + * @cgrp: root of the target subtree + * @ret: the result of the update + * + * Finalize control mask update. See cgroup_apply_control() for more info. + */ +static void cgroup_finalize_control(struct cgroup *cgrp, int ret) +{ + if (ret) { + cgroup_restore_control(cgrp); + cgroup_propagate_control(cgrp); + } + + cgroup_apply_control_disable(cgrp); +} + /* change the enabled child controllers for a cgroup in the default hierarchy */ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes, @@ -3264,36 +3320,15 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, cgrp->subtree_control |= enable; cgrp->subtree_control &= ~disable; - cgroup_propagate_control(cgrp); + ret = cgroup_apply_control(cgrp); - ret = cgroup_apply_control_enable(cgrp); - if (ret) - goto err_undo_css; - - /* - * At this point, cgroup_e_css() results reflect the new csses - * making the following cgroup_update_dfl_csses() properly update - * css associations of all tasks in the subtree. - */ - ret = cgroup_update_dfl_csses(cgrp); - if (ret) - goto err_undo_css; - - /* all tasks are migrated out of disabled csses, commit disable */ - cgroup_apply_control_disable(cgrp); + cgroup_finalize_control(cgrp, ret); kernfs_activate(cgrp->kn); ret = 0; out_unlock: cgroup_kn_unlock(of->kn); return ret ?: nbytes; - -err_undo_css: - /* restore masks and shoot down new csses */ - cgroup_restore_control(cgrp); - cgroup_apply_control_disable(cgrp); - - goto out_unlock; } static int cgroup_events_show(struct seq_file *seq, void *v) -- cgit From 945ba1996888809cf510a8da000a9c20a9fab5ad Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:58:00 -0500 Subject: cgroup: combine cgroup_mutex locking and offline css draining cgroup_drain_offline() is used to wait for csses being offlined to uninstall itself from cgroup->subsys[] array so that new csses can be installed. The function's only user, cgroup_subtree_control_write(), calls it after performing some checks and restarts the whole process via restart_syscall() if draining has to release cgroup_mutex to wait. This can be simplified by draining before other synchronized operations so that there's nothing to restart. This patch converts cgroup_drain_offline() to cgroup_lock_and_drain_offline() which performs both locking and draining and updates cgroup_kn_lock_live() use it instead of cgroup_mutex() if requested. This combined locking and draining operations are easier to use and less error-prone. While at it, add WARNs in control_apply functions which triggers if the subtree isn't properly drained. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 55 +++++++++++++++++++++++++++---------------------------- 1 file changed, 27 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2adf0433a3cf..bbeb35f14eda 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -220,6 +220,7 @@ static struct cftype cgroup_dfl_base_files[]; static struct cftype cgroup_legacy_base_files[]; static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); +static void cgroup_lock_and_drain_offline(struct cgroup *cgrp); static void css_task_iter_advance(struct css_task_iter *it); static int cgroup_destroy_locked(struct cgroup *cgrp); static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, @@ -1391,19 +1392,22 @@ static void cgroup_kn_unlock(struct kernfs_node *kn) /** * cgroup_kn_lock_live - locking helper for cgroup kernfs methods * @kn: the kernfs_node being serviced + * @drain_offline: perform offline draining on the cgroup * * This helper is to be used by a cgroup kernfs method currently servicing * @kn. It breaks the active protection, performs cgroup locking and * verifies that the associated cgroup is alive. Returns the cgroup if * alive; otherwise, %NULL. A successful return should be undone by a - * matching cgroup_kn_unlock() invocation. + * matching cgroup_kn_unlock() invocation. If @drain_offline is %true, the + * cgroup is drained of offlining csses before return. * * Any cgroup kernfs method implementation which requires locking the * associated cgroup should use this helper. It avoids nesting cgroup * locking under kernfs active protection and allows all kernfs operations * including self-removal. */ -static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn) +static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, + bool drain_offline) { struct cgroup *cgrp; @@ -1422,7 +1426,10 @@ static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn) return NULL; kernfs_break_active_protection(kn); - mutex_lock(&cgroup_mutex); + if (drain_offline) + cgroup_lock_and_drain_offline(cgrp); + else + mutex_lock(&cgroup_mutex); if (!cgroup_is_dead(cgrp)) return cgrp; @@ -2761,7 +2768,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return -EINVAL; - cgrp = cgroup_kn_lock_live(of->kn); + cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; @@ -2859,7 +2866,7 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); - cgrp = cgroup_kn_lock_live(of->kn); + cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; spin_lock(&release_agent_path_lock); @@ -2984,27 +2991,23 @@ out_finish: } /** - * cgroup_drain_offline - wait for previously offlined csses to go away + * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses * @cgrp: root of the target subtree * * Because css offlining is asynchronous, userland may try to re-enable a - * controller while the previous css is still around. This function drains - * the previous css instances of @cgrp's subtree. - * - * Must be called with cgroup_mutex held. Returns %false if there were no - * dying css instances. Returns %true if there were one or more and this - * function waited. On %true return, cgroup_mutex has been dropped and - * re-acquired inbetween which anything could have happened. The caller - * typically would have to start over. + * controller while the previous css is still around. This function grabs + * cgroup_mutex and drains the previous css instances of @cgrp's subtree. */ -static bool cgroup_drain_offline(struct cgroup *cgrp) +static void cgroup_lock_and_drain_offline(struct cgroup *cgrp) + __acquires(&cgroup_mutex) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; struct cgroup_subsys *ss; int ssid; - lockdep_assert_held(&cgroup_mutex); +restart: + mutex_lock(&cgroup_mutex); cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { for_each_subsys(ss, ssid) { @@ -3021,14 +3024,11 @@ static bool cgroup_drain_offline(struct cgroup *cgrp) mutex_unlock(&cgroup_mutex); schedule(); finish_wait(&dsct->offline_waitq, &wait); - mutex_lock(&cgroup_mutex); cgroup_put(dsct); - return true; + goto restart; } } - - return false; } /** @@ -3111,6 +3111,8 @@ static int cgroup_apply_control_enable(struct cgroup *cgrp) for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); + WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt)); + if (!(cgroup_ss_mask(dsct) & (1 << ss->id))) continue; @@ -3155,6 +3157,8 @@ static void cgroup_apply_control_disable(struct cgroup *cgrp) for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); + WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt)); + if (!css) continue; @@ -3264,7 +3268,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, return -EINVAL; } - cgrp = cgroup_kn_lock_live(of->kn); + cgrp = cgroup_kn_lock_live(of->kn, true); if (!cgrp) return -ENODEV; @@ -3309,11 +3313,6 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, goto out_unlock; } - if (cgroup_drain_offline(cgrp)) { - cgroup_kn_unlock(of->kn); - return restart_syscall(); - } - /* save and update control masks and prepare csses */ cgroup_save_control(cgrp); @@ -5140,7 +5139,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (strchr(name, '\n')) return -EINVAL; - parent = cgroup_kn_lock_live(parent_kn); + parent = cgroup_kn_lock_live(parent_kn, false); if (!parent) return -ENODEV; @@ -5339,7 +5338,7 @@ static int cgroup_rmdir(struct kernfs_node *kn) struct cgroup *cgrp; int ret = 0; - cgrp = cgroup_kn_lock_live(kn); + cgrp = cgroup_kn_lock_live(kn, false); if (!cgrp) return 0; -- cgit From 03970d3c11faf870dc5126bb2e84fd1d692af1b7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:58:00 -0500 Subject: cgroup: use cgroup_apply_enable_control() in cgroup creation path cgroup_create() manually updates control masks and creates child csses which cgroup_mkdir() then manually populates. Both can be simplified by using cgroup_apply_enable_control() and friends. The only catch is that it calls css_populate_dir() with NULL cgroup->kn during cgroup_create(). This is worked around by making the function noop on NULL kn. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 38 ++++++++++++++------------------------ 1 file changed, 14 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index bbeb35f14eda..ee6951b1e35d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1490,7 +1490,7 @@ static int css_populate_dir(struct cgroup_subsys_state *css, struct cftype *cfts, *failed_cfts; int ret; - if (css->flags & CSS_VISIBLE) + if ((css->flags & CSS_VISIBLE) || !cgrp->kn) return 0; if (!css->ss) { @@ -5042,10 +5042,9 @@ err_free_css: static struct cgroup *cgroup_create(struct cgroup *parent) { struct cgroup_root *root = parent->root; - struct cgroup_subsys *ss; struct cgroup *cgrp, *tcgrp; int level = parent->level + 1; - int ssid, ret; + int ret; /* allocate the cgroup and its ID, 0 is reserved for the root */ cgrp = kzalloc(sizeof(*cgrp) + @@ -5095,25 +5094,19 @@ static struct cgroup *cgroup_create(struct cgroup *parent) */ cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); - /* create the csses */ - do_each_subsys_mask(ss, ssid, cgroup_ss_mask(cgrp)) { - struct cgroup_subsys_state *css; - - css = css_create(cgrp, ss); - if (IS_ERR(css)) { - ret = PTR_ERR(css); - goto out_destroy; - } - } while_each_subsys_mask(); - /* * On the default hierarchy, a child doesn't automatically inherit * subtree_control from the parent. Each is configured manually. */ - if (!cgroup_on_dfl(cgrp)) { + if (!cgroup_on_dfl(cgrp)) cgrp->subtree_control = cgroup_control(cgrp); - cgroup_refresh_subtree_ss_mask(cgrp); - } + + cgroup_propagate_control(cgrp); + + /* @cgrp doesn't have dir yet so the following will only create csses */ + ret = cgroup_apply_control_enable(cgrp); + if (ret) + goto out_destroy; return cgrp; @@ -5131,9 +5124,8 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { struct cgroup *parent, *cgrp; - struct cgroup_subsys *ss; struct kernfs_node *kn; - int ssid, ret; + int ret; /* do not accept '\n' to prevent making /proc//cgroup unparsable */ if (strchr(name, '\n')) @@ -5171,11 +5163,9 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (ret) goto out_destroy; - do_each_subsys_mask(ss, ssid, cgroup_control(cgrp)) { - ret = css_populate_dir(cgroup_css(cgrp, ss), NULL); - if (ret) - goto out_destroy; - } while_each_subsys_mask(); + ret = cgroup_apply_control_enable(cgrp); + if (ret) + goto out_destroy; /* let's create and online css's */ kernfs_activate(kn); -- cgit From 334c3679ec4b2b113c35ebe37d2018b112dd5013 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:58:01 -0500 Subject: cgroup: reimplement rebind_subsystems() using cgroup_apply_control() and friends rebind_subsystem() open codes quite a bit of css and interface file manipulations. It tries to be fail-safe but doesn't quite achieve it. It can be greatly simplified by using the new css management helpers. This patch reimplements rebind_subsytsems() using cgroup_apply_control() and friends. * The half-baked rollback on file creation failure is dropped. It is an extremely cold path, failure isn't critical, and, aside from kernel bugs, the only reason it can fail is memory allocation failure which pretty much doesn't happen for small allocations. * As cgroup_apply_control_disable() is now used to clean up root cgroup on rebind, make sure that it doesn't end up killing root csses. * All callers of rebind_subsystems() are updated to use cgroup_lock_and_drain_offline() as the apply_control functions require drained subtree. * This leaves cgroup_refresh_subtree_ss_mask() without any user. Removed. * css_populate_dir() and css_clear_dir() no longer needs @cgrp_override parameter. Dropped. * While at it, add WARN_ON() to rebind_subsystem() calls which are expected to always succeed just in case. While the rules visible to userland aren't changed, this reimplementation not only simplifies rebind_subsystems() but also allows it to disable and enable csses recursively. This can be used to implement more flexible rebinding. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 107 +++++++++++++++----------------------------------------- 1 file changed, 28 insertions(+), 79 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ee6951b1e35d..98e644b0a532 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -221,6 +221,8 @@ static struct cftype cgroup_legacy_base_files[]; static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); static void cgroup_lock_and_drain_offline(struct cgroup *cgrp); +static int cgroup_apply_control(struct cgroup *cgrp); +static void cgroup_finalize_control(struct cgroup *cgrp, int ret); static void css_task_iter_advance(struct css_task_iter *it); static int cgroup_destroy_locked(struct cgroup *cgrp); static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, @@ -1160,13 +1162,13 @@ static void cgroup_destroy_root(struct cgroup_root *root) struct cgroup *cgrp = &root->cgrp; struct cgrp_cset_link *link, *tmp_link; - mutex_lock(&cgroup_mutex); + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); BUG_ON(atomic_read(&root->nr_cgrps)); BUG_ON(!list_empty(&cgrp->self.children)); /* Rebind all subsystems back to the default hierarchy */ - rebind_subsystems(&cgrp_dfl_root, root->subsys_mask); + WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask)); /* * Release all the links from cset_links to this hierarchy's @@ -1351,19 +1353,6 @@ static u16 cgroup_calc_subtree_ss_mask(struct cgroup *cgrp, u16 subtree_control) return cur_ss_mask; } -/** - * cgroup_refresh_subtree_ss_mask - update subtree_ss_mask - * @cgrp: the target cgroup - * - * Update @cgrp->subtree_ss_mask according to the current - * @cgrp->subtree_control using cgroup_calc_subtree_ss_mask(). - */ -static void cgroup_refresh_subtree_ss_mask(struct cgroup *cgrp) -{ - cgrp->subtree_ss_mask = - cgroup_calc_subtree_ss_mask(cgrp, cgrp->subtree_control); -} - /** * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods * @kn: the kernfs_node being serviced @@ -1459,12 +1448,10 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) /** * css_clear_dir - remove subsys files in a cgroup directory * @css: taget css - * @cgrp_override: specify if target cgroup is different from css->cgroup */ -static void css_clear_dir(struct cgroup_subsys_state *css, - struct cgroup *cgrp_override) +static void css_clear_dir(struct cgroup_subsys_state *css) { - struct cgroup *cgrp = cgrp_override ?: css->cgroup; + struct cgroup *cgrp = css->cgroup; struct cftype *cfts; if (!(css->flags & CSS_VISIBLE)) @@ -1479,14 +1466,12 @@ static void css_clear_dir(struct cgroup_subsys_state *css, /** * css_populate_dir - create subsys files in a cgroup directory * @css: target css - * @cgrp_overried: specify if target cgroup is different from css->cgroup * * On failure, no file is added. */ -static int css_populate_dir(struct cgroup_subsys_state *css, - struct cgroup *cgrp_override) +static int css_populate_dir(struct cgroup_subsys_state *css) { - struct cgroup *cgrp = cgrp_override ?: css->cgroup; + struct cgroup *cgrp = css->cgroup; struct cftype *cfts, *failed_cfts; int ret; @@ -1526,7 +1511,6 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) { struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; - u16 tmp_ss_mask; int ssid, i, ret; lockdep_assert_held(&cgroup_mutex); @@ -1541,46 +1525,6 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) return -EBUSY; } while_each_subsys_mask(); - /* skip creating root files on dfl_root for inhibited subsystems */ - tmp_ss_mask = ss_mask; - if (dst_root == &cgrp_dfl_root) - tmp_ss_mask &= ~cgrp_dfl_inhibit_ss_mask; - - do_each_subsys_mask(ss, ssid, tmp_ss_mask) { - struct cgroup *scgrp = &ss->root->cgrp; - int tssid; - - ret = css_populate_dir(cgroup_css(scgrp, ss), dcgrp); - if (!ret) - continue; - - /* - * Rebinding back to the default root is not allowed to - * fail. Using both default and non-default roots should - * be rare. Moving subsystems back and forth even more so. - * Just warn about it and continue. - */ - if (dst_root == &cgrp_dfl_root) { - if (cgrp_dfl_visible) { - pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n", - ret, ss_mask); - pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); - } - continue; - } - - do_each_subsys_mask(ss, tssid, tmp_ss_mask) { - if (tssid == ssid) - break; - css_clear_dir(cgroup_css(scgrp, ss), dcgrp); - } while_each_subsys_mask(); - return ret; - } while_each_subsys_mask(); - - /* - * Nothing can fail from this point on. Remove files for the - * removed subsystems and rebind each subsystem. - */ do_each_subsys_mask(ss, ssid, ss_mask) { struct cgroup_root *src_root = ss->root; struct cgroup *scgrp = &src_root->cgrp; @@ -1589,8 +1533,12 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) WARN_ON(!css || cgroup_css(dcgrp, ss)); - css_clear_dir(css, NULL); + /* disable from the source */ + src_root->subsys_mask &= ~(1 << ssid); + WARN_ON(cgroup_apply_control(scgrp)); + cgroup_finalize_control(scgrp, 0); + /* rebind */ RCU_INIT_POINTER(scgrp->subsys[ssid], NULL); rcu_assign_pointer(dcgrp->subsys[ssid], css); ss->root = dst_root; @@ -1602,20 +1550,20 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) &dcgrp->e_csets[ss->id]); spin_unlock_bh(&css_set_lock); - src_root->subsys_mask &= ~(1 << ssid); - scgrp->subtree_control &= ~(1 << ssid); - cgroup_refresh_subtree_ss_mask(scgrp); - /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; if (dst_root == &cgrp_dfl_root) { static_branch_enable(cgroup_subsys_on_dfl_key[ssid]); } else { dcgrp->subtree_control |= 1 << ssid; - cgroup_refresh_subtree_ss_mask(dcgrp); static_branch_disable(cgroup_subsys_on_dfl_key[ssid]); } + ret = cgroup_apply_control(dcgrp); + if (ret) + pr_warn("partial failure to rebind %s controller (err=%d)\n", + ss->name, ret); + if (ss->bind) ss->bind(css); } while_each_subsys_mask(); @@ -1807,7 +1755,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) return -EINVAL; } - mutex_lock(&cgroup_mutex); + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); /* See what subsystems are wanted */ ret = parse_cgroupfs_options(data, &opts); @@ -1840,7 +1788,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) if (ret) goto out_unlock; - rebind_subsystems(&cgrp_dfl_root, removed_mask); + WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask)); if (opts.release_agent) { spin_lock(&release_agent_path_lock); @@ -1991,7 +1939,7 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) } root_cgrp->kn = root->kf_root->kn; - ret = css_populate_dir(&root_cgrp->self, NULL); + ret = css_populate_dir(&root_cgrp->self); if (ret) goto destroy_root; @@ -2070,7 +2018,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, goto out_mount; } - mutex_lock(&cgroup_mutex); + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); /* First find the desired set of subsystems */ ret = parse_cgroupfs_options(data, &opts); @@ -3123,7 +3071,7 @@ static int cgroup_apply_control_enable(struct cgroup *cgrp) } if (cgroup_control(dsct) & (1 << ss->id)) { - ret = css_populate_dir(css, NULL); + ret = css_populate_dir(css); if (ret) return ret; } @@ -3162,10 +3110,11 @@ static void cgroup_apply_control_disable(struct cgroup *cgrp) if (!css) continue; - if (!(cgroup_ss_mask(dsct) & (1 << ss->id))) { + if (css->parent && + !(cgroup_ss_mask(dsct) & (1 << ss->id))) { kill_css(css); } else if (!(cgroup_control(dsct) & (1 << ss->id))) { - css_clear_dir(css, NULL); + css_clear_dir(css); if (ss->css_reset) ss->css_reset(css); } @@ -5159,7 +5108,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (ret) goto out_destroy; - ret = css_populate_dir(&cgrp->self, NULL); + ret = css_populate_dir(&cgrp->self); if (ret) goto out_destroy; @@ -5231,7 +5180,7 @@ static void kill_css(struct cgroup_subsys_state *css) * This must happen before css is disassociated with its cgroup. * See seq_css() for details. */ - css_clear_dir(css, NULL); + css_clear_dir(css); /* * Killing would put the base ref, but we need to keep it alive -- cgit From 5ced2518bd3e3a4f01e2122122211f217cd99f4f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:58:01 -0500 Subject: cgroup: make cgroup_calc_subtree_ss_mask() take @this_ss_mask cgroup_calc_subtree_ss_mask() currently takes @cgrp and @subtree_control. @cgrp is used for two purposes - to decide whether it's for default hierarchy and the mask of available subsystems. The former doesn't matter as the results are the same regardless. The latter can be specified directly through a subsystem mask. This patch makes cgroup_calc_subtree_ss_mask() perform the same calculations for both default and legacy hierarchies and take @this_ss_mask for available subsystems. @cgrp is no longer used and dropped. This is to allow using the function in contexts where available controllers can't be decided from the cgroup. v2: cgroup_refres_subtree_ss_mask() is removed by a previous patch. Updated accordingly. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 98e644b0a532..58e02e9aa970 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1309,18 +1309,17 @@ static umode_t cgroup_file_mode(const struct cftype *cft) /** * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask - * @cgrp: the target cgroup * @subtree_control: the new subtree_control mask to consider + * @this_ss_mask: available subsystems * * On the default hierarchy, a subsystem may request other subsystems to be * enabled together through its ->depends_on mask. In such cases, more * subsystems than specified in "cgroup.subtree_control" may be enabled. * * This function calculates which subsystems need to be enabled if - * @subtree_control is to be applied to @cgrp. The returned mask is always - * a superset of @subtree_control and follows the usual hierarchy rules. + * @subtree_control is to be applied while restricted to @this_ss_mask. */ -static u16 cgroup_calc_subtree_ss_mask(struct cgroup *cgrp, u16 subtree_control) +static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask) { u16 cur_ss_mask = subtree_control; struct cgroup_subsys *ss; @@ -1328,9 +1327,6 @@ static u16 cgroup_calc_subtree_ss_mask(struct cgroup *cgrp, u16 subtree_control) lockdep_assert_held(&cgroup_mutex); - if (!cgroup_on_dfl(cgrp)) - return cur_ss_mask; - while (true) { u16 new_ss_mask = cur_ss_mask; @@ -1343,7 +1339,7 @@ static u16 cgroup_calc_subtree_ss_mask(struct cgroup *cgrp, u16 subtree_control) * happen only if some depended-upon subsystems were bound * to non-default hierarchies. */ - new_ss_mask &= cgroup_ss_mask(cgrp); + new_ss_mask &= this_ss_mask; if (new_ss_mask == cur_ss_mask) break; @@ -3012,8 +3008,9 @@ static void cgroup_propagate_control(struct cgroup *cgrp) cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { dsct->subtree_control &= cgroup_control(dsct); - dsct->subtree_ss_mask = cgroup_calc_subtree_ss_mask(dsct, - dsct->subtree_control); + dsct->subtree_ss_mask = + cgroup_calc_subtree_ss_mask(dsct->subtree_control, + cgroup_ss_mask(dsct)); } } -- cgit From 04313591ae487da8b5781a0d8d444073a3fdee0d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:58:01 -0500 Subject: cgroup: allocate 2x cgrp_cset_links when setting up a new root During prep, cgroup_setup_root() allocates cgrp_cset_links matching the number of existing css_sets to later link the new root. This is fine for now as the only operation which can happen inbetween is rebind_subsystems() and rebinding of empty subsystems doesn't create new css_sets. However, while not yet allowed, with the recent reimplementation, rebind_subsystems() can rebind subsystems with descendant csses and thus can create new css_sets. This patch makes cgroup_setup_root() allocate 2x of the existing css_sets so that later use of live subsystem rebinding doesn't blow up. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 58e02e9aa970..40ed329482dd 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1915,10 +1915,11 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) /* * We're accessing css_set_count without locking css_set_lock here, * but that's OK - it can only be increased by someone holding - * cgroup_lock, and that's us. The worst that can happen is that we - * have some link structures left over + * cgroup_lock, and that's us. Later rebinding may disable + * controllers on the default hierarchy and thus create new csets, + * which can't be more than the existing ones. Allocate 2x. */ - ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); + ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links); if (ret) goto cancel_ref; -- cgit From 549626047df99f1129d4e742cce741055bdc2dcb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:58:01 -0500 Subject: cgroup: update css iteration in cgroup_update_dfl_csses() The existing sequences of operations ensure that the offlining csses are drained before cgroup_update_dfl_csses(), so even though cgroup_update_dfl_csses() uses css_for_each_descendant_pre() to walk the target cgroups, it doesn't end up operating on dead cgroups. Also, the function explicitly excludes the subtree root from operation. This is fragile and inconsistent with the rest of css update operations. This patch updates cgroup_update_dfl_csses() to use cgroup_for_each_live_descendant_pre() instead and include the subtree root. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 40ed329482dd..c63fce0c5b24 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2877,16 +2877,17 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v) * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy * @cgrp: root of the subtree to update csses for * - * @cgrp's subtree_ss_mask has changed and its subtree's (self excluded) - * css associations need to be updated accordingly. This function looks up - * all css_sets which are attached to the subtree, creates the matching - * updated css_sets and migrates the tasks to the new ones. + * @cgrp's control masks have changed and its subtree's css associations + * need to be updated accordingly. This function looks up all css_sets + * which are attached to the subtree, creates the matching updated css_sets + * and migrates the tasks to the new ones. */ static int cgroup_update_dfl_csses(struct cgroup *cgrp) { LIST_HEAD(preloaded_csets); struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); - struct cgroup_subsys_state *css; + struct cgroup_subsys_state *d_css; + struct cgroup *dsct; struct css_set *src_cset; int ret; @@ -2896,14 +2897,10 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) /* look up all csses currently attached to @cgrp's subtree */ spin_lock_bh(&css_set_lock); - css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { + cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { struct cgrp_cset_link *link; - /* self is not affected by subtree_ss_mask change */ - if (css->cgroup == cgrp) - continue; - - list_for_each_entry(link, &css->cgroup->cset_links, cset_link) + list_for_each_entry(link, &dsct->cset_links, cset_link) cgroup_migrate_add_src(link->cset, cgrp, &preloaded_csets); } -- cgit From 4e8ae72a75aae285ec5b93518b9680da198afd0d Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 3 Mar 2016 21:49:27 +0000 Subject: X.509: Make algo identifiers text instead of enum Make the identifier public key and digest algorithm fields text instead of enum. Signed-off-by: David Howells Acked-by: Herbert Xu --- crypto/asymmetric_keys/mscode_parser.c | 14 +++++++------- crypto/asymmetric_keys/pkcs7_parser.c | 18 ++++++++---------- crypto/asymmetric_keys/pkcs7_verify.c | 8 +++----- crypto/asymmetric_keys/public_key.c | 24 ++++-------------------- crypto/asymmetric_keys/verify_pefile.c | 4 ++-- crypto/asymmetric_keys/verify_pefile.h | 2 +- crypto/asymmetric_keys/x509_cert_parser.c | 26 +++++++++++++------------- crypto/asymmetric_keys/x509_public_key.c | 17 ++++++++--------- include/crypto/public_key.h | 30 ++++-------------------------- kernel/module_signing.c | 6 ++++++ security/integrity/digsig_asymmetric.c | 7 ++++--- security/integrity/integrity.h | 2 +- 12 files changed, 61 insertions(+), 97 deletions(-) (limited to 'kernel') diff --git a/crypto/asymmetric_keys/mscode_parser.c b/crypto/asymmetric_keys/mscode_parser.c index adcef59eec0b..3242cbfaeaa2 100644 --- a/crypto/asymmetric_keys/mscode_parser.c +++ b/crypto/asymmetric_keys/mscode_parser.c @@ -86,25 +86,25 @@ int mscode_note_digest_algo(void *context, size_t hdrlen, oid = look_up_OID(value, vlen); switch (oid) { case OID_md4: - ctx->digest_algo = HASH_ALGO_MD4; + ctx->digest_algo = "md4"; break; case OID_md5: - ctx->digest_algo = HASH_ALGO_MD5; + ctx->digest_algo = "md5"; break; case OID_sha1: - ctx->digest_algo = HASH_ALGO_SHA1; + ctx->digest_algo = "sha1"; break; case OID_sha256: - ctx->digest_algo = HASH_ALGO_SHA256; + ctx->digest_algo = "sha256"; break; case OID_sha384: - ctx->digest_algo = HASH_ALGO_SHA384; + ctx->digest_algo = "sha384"; break; case OID_sha512: - ctx->digest_algo = HASH_ALGO_SHA512; + ctx->digest_algo = "sha512"; break; case OID_sha224: - ctx->digest_algo = HASH_ALGO_SHA224; + ctx->digest_algo = "sha224"; break; case OID__NR: diff --git a/crypto/asymmetric_keys/pkcs7_parser.c b/crypto/asymmetric_keys/pkcs7_parser.c index cbbd03fd94f8..40de03f49ff8 100644 --- a/crypto/asymmetric_keys/pkcs7_parser.c +++ b/crypto/asymmetric_keys/pkcs7_parser.c @@ -218,25 +218,25 @@ int pkcs7_sig_note_digest_algo(void *context, size_t hdrlen, switch (ctx->last_oid) { case OID_md4: - ctx->sinfo->sig.pkey_hash_algo = HASH_ALGO_MD4; + ctx->sinfo->sig.hash_algo = "md4"; break; case OID_md5: - ctx->sinfo->sig.pkey_hash_algo = HASH_ALGO_MD5; + ctx->sinfo->sig.hash_algo = "md5"; break; case OID_sha1: - ctx->sinfo->sig.pkey_hash_algo = HASH_ALGO_SHA1; + ctx->sinfo->sig.hash_algo = "sha1"; break; case OID_sha256: - ctx->sinfo->sig.pkey_hash_algo = HASH_ALGO_SHA256; + ctx->sinfo->sig.hash_algo = "sha256"; break; case OID_sha384: - ctx->sinfo->sig.pkey_hash_algo = HASH_ALGO_SHA384; + ctx->sinfo->sig.hash_algo = "sha384"; break; case OID_sha512: - ctx->sinfo->sig.pkey_hash_algo = HASH_ALGO_SHA512; + ctx->sinfo->sig.hash_algo = "sha512"; break; case OID_sha224: - ctx->sinfo->sig.pkey_hash_algo = HASH_ALGO_SHA224; + ctx->sinfo->sig.hash_algo = "sha224"; default: printk("Unsupported digest algo: %u\n", ctx->last_oid); return -ENOPKG; @@ -255,7 +255,7 @@ int pkcs7_sig_note_pkey_algo(void *context, size_t hdrlen, switch (ctx->last_oid) { case OID_rsaEncryption: - ctx->sinfo->sig.pkey_algo = PKEY_ALGO_RSA; + ctx->sinfo->sig.pkey_algo = "rsa"; break; default: printk("Unsupported pkey algo: %u\n", ctx->last_oid); @@ -615,8 +615,6 @@ int pkcs7_sig_note_signature(void *context, size_t hdrlen, { struct pkcs7_parse_context *ctx = context; - BUG_ON(ctx->sinfo->sig.pkey_algo != PKEY_ALGO_RSA); - ctx->sinfo->sig.s = kmemdup(value, vlen, GFP_KERNEL); if (!ctx->sinfo->sig.s) return -ENOMEM; diff --git a/crypto/asymmetric_keys/pkcs7_verify.c b/crypto/asymmetric_keys/pkcs7_verify.c index f5db1378c096..50be2a15e531 100644 --- a/crypto/asymmetric_keys/pkcs7_verify.c +++ b/crypto/asymmetric_keys/pkcs7_verify.c @@ -31,17 +31,15 @@ static int pkcs7_digest(struct pkcs7_message *pkcs7, void *digest; int ret; - kenter(",%u,%u", sinfo->index, sinfo->sig.pkey_hash_algo); + kenter(",%u,%s", sinfo->index, sinfo->sig.hash_algo); - if (sinfo->sig.pkey_hash_algo >= PKEY_HASH__LAST || - !hash_algo_name[sinfo->sig.pkey_hash_algo]) + if (!sinfo->sig.hash_algo) return -ENOPKG; /* Allocate the hashing algorithm we're going to need and find out how * big the hash operational data will be. */ - tfm = crypto_alloc_shash(hash_algo_name[sinfo->sig.pkey_hash_algo], - 0, 0); + tfm = crypto_alloc_shash(sinfo->sig.hash_algo, 0, 0); if (IS_ERR(tfm)) return (PTR_ERR(tfm) == -ENOENT) ? -ENOPKG : PTR_ERR(tfm); diff --git a/crypto/asymmetric_keys/public_key.c b/crypto/asymmetric_keys/public_key.c index 27ebc2f44394..0f8b264b3961 100644 --- a/crypto/asymmetric_keys/public_key.c +++ b/crypto/asymmetric_keys/public_key.c @@ -24,19 +24,6 @@ MODULE_LICENSE("GPL"); -const char *const pkey_algo_name[PKEY_ALGO__LAST] = { - [PKEY_ALGO_DSA] = "dsa", - [PKEY_ALGO_RSA] = "rsa", -}; -EXPORT_SYMBOL_GPL(pkey_algo_name); - -const char *const pkey_id_type_name[PKEY_ID_TYPE__LAST] = { - [PKEY_ID_PGP] = "PGP", - [PKEY_ID_X509] = "X509", - [PKEY_ID_PKCS7] = "PKCS#7", -}; -EXPORT_SYMBOL_GPL(pkey_id_type_name); - /* * Provide a part of a description of the key for /proc/keys. */ @@ -46,9 +33,7 @@ static void public_key_describe(const struct key *asymmetric_key, struct public_key *key = asymmetric_key->payload.data[asym_crypto]; if (key) - seq_printf(m, "%s.%s", - pkey_id_type_name[key->id_type], - pkey_algo_name[key->pkey_algo]); + seq_printf(m, "%s.%s", key->id_type, key->pkey_algo); } /* @@ -103,15 +88,14 @@ int public_key_verify_signature(const struct public_key *pkey, BUG_ON(!sig->digest); BUG_ON(!sig->s); - alg_name = pkey_algo_name[sig->pkey_algo]; - if (sig->pkey_algo == PKEY_ALGO_RSA) { + alg_name = sig->pkey_algo; + if (strcmp(sig->pkey_algo, "rsa") == 0) { /* The data wangled by the RSA algorithm is typically padded * and encoded in some manner, such as EMSA-PKCS1-1_5 [RFC3447 * sec 8.2]. */ if (snprintf(alg_name_buf, CRYPTO_MAX_ALG_NAME, - "pkcs1pad(rsa,%s)", - hash_algo_name[sig->pkey_hash_algo] + "pkcs1pad(rsa,%s)", sig->hash_algo ) >= CRYPTO_MAX_ALG_NAME) return -EINVAL; alg_name = alg_name_buf; diff --git a/crypto/asymmetric_keys/verify_pefile.c b/crypto/asymmetric_keys/verify_pefile.c index 897b734dabf9..7e8c2338ae25 100644 --- a/crypto/asymmetric_keys/verify_pefile.c +++ b/crypto/asymmetric_keys/verify_pefile.c @@ -328,12 +328,12 @@ static int pefile_digest_pe(const void *pebuf, unsigned int pelen, void *digest; int ret; - kenter(",%u", ctx->digest_algo); + kenter(",%s", ctx->digest_algo); /* Allocate the hashing algorithm we're going to need and find out how * big the hash operational data will be. */ - tfm = crypto_alloc_shash(hash_algo_name[ctx->digest_algo], 0, 0); + tfm = crypto_alloc_shash(ctx->digest_algo, 0, 0); if (IS_ERR(tfm)) return (PTR_ERR(tfm) == -ENOENT) ? -ENOPKG : PTR_ERR(tfm); diff --git a/crypto/asymmetric_keys/verify_pefile.h b/crypto/asymmetric_keys/verify_pefile.h index 55d5f7ebc45a..a133eb81a492 100644 --- a/crypto/asymmetric_keys/verify_pefile.h +++ b/crypto/asymmetric_keys/verify_pefile.h @@ -28,7 +28,7 @@ struct pefile_context { /* PKCS#7 MS Individual Code Signing content */ const void *digest; /* Digest */ unsigned digest_len; /* Digest length */ - enum hash_algo digest_algo; /* Digest algorithm */ + const char *digest_algo; /* Digest algorithm */ }; #define kenter(FMT, ...) \ diff --git a/crypto/asymmetric_keys/x509_cert_parser.c b/crypto/asymmetric_keys/x509_cert_parser.c index c02c200a7136..4a29bac70060 100644 --- a/crypto/asymmetric_keys/x509_cert_parser.c +++ b/crypto/asymmetric_keys/x509_cert_parser.c @@ -188,33 +188,33 @@ int x509_note_pkey_algo(void *context, size_t hdrlen, return -ENOPKG; /* Unsupported combination */ case OID_md4WithRSAEncryption: - ctx->cert->sig.pkey_hash_algo = HASH_ALGO_MD5; - ctx->cert->sig.pkey_algo = PKEY_ALGO_RSA; + ctx->cert->sig.hash_algo = "md4"; + ctx->cert->sig.pkey_algo = "rsa"; break; case OID_sha1WithRSAEncryption: - ctx->cert->sig.pkey_hash_algo = HASH_ALGO_SHA1; - ctx->cert->sig.pkey_algo = PKEY_ALGO_RSA; + ctx->cert->sig.hash_algo = "sha1"; + ctx->cert->sig.pkey_algo = "rsa"; break; case OID_sha256WithRSAEncryption: - ctx->cert->sig.pkey_hash_algo = HASH_ALGO_SHA256; - ctx->cert->sig.pkey_algo = PKEY_ALGO_RSA; + ctx->cert->sig.hash_algo = "sha256"; + ctx->cert->sig.pkey_algo = "rsa"; break; case OID_sha384WithRSAEncryption: - ctx->cert->sig.pkey_hash_algo = HASH_ALGO_SHA384; - ctx->cert->sig.pkey_algo = PKEY_ALGO_RSA; + ctx->cert->sig.hash_algo = "sha384"; + ctx->cert->sig.pkey_algo = "rsa"; break; case OID_sha512WithRSAEncryption: - ctx->cert->sig.pkey_hash_algo = HASH_ALGO_SHA512; - ctx->cert->sig.pkey_algo = PKEY_ALGO_RSA; + ctx->cert->sig.hash_algo = "sha512"; + ctx->cert->sig.pkey_algo = "rsa"; break; case OID_sha224WithRSAEncryption: - ctx->cert->sig.pkey_hash_algo = HASH_ALGO_SHA224; - ctx->cert->sig.pkey_algo = PKEY_ALGO_RSA; + ctx->cert->sig.hash_algo = "sha224"; + ctx->cert->sig.pkey_algo = "rsa"; break; } @@ -396,7 +396,7 @@ int x509_extract_key_data(void *context, size_t hdrlen, if (ctx->last_oid != OID_rsaEncryption) return -ENOPKG; - ctx->cert->pub->pkey_algo = PKEY_ALGO_RSA; + ctx->cert->pub->pkey_algo = "rsa"; /* Discard the BIT STRING metadata */ ctx->key = value + 1; diff --git a/crypto/asymmetric_keys/x509_public_key.c b/crypto/asymmetric_keys/x509_public_key.c index 7092d5cbb5d3..733c046aacc6 100644 --- a/crypto/asymmetric_keys/x509_public_key.c +++ b/crypto/asymmetric_keys/x509_public_key.c @@ -176,7 +176,7 @@ int x509_get_sig_params(struct x509_certificate *cert) /* Allocate the hashing algorithm we're going to need and find out how * big the hash operational data will be. */ - tfm = crypto_alloc_shash(hash_algo_name[cert->sig.pkey_hash_algo], 0, 0); + tfm = crypto_alloc_shash(cert->sig.hash_algo, 0, 0); if (IS_ERR(tfm)) { if (PTR_ERR(tfm) == -ENOENT) { cert->unsupported_crypto = true; @@ -291,21 +291,20 @@ static int x509_key_preparse(struct key_preparsed_payload *prep) pr_devel("Cert Issuer: %s\n", cert->issuer); pr_devel("Cert Subject: %s\n", cert->subject); - if (cert->pub->pkey_algo >= PKEY_ALGO__LAST || - cert->sig.pkey_algo >= PKEY_ALGO__LAST || - cert->sig.pkey_hash_algo >= PKEY_HASH__LAST || - !hash_algo_name[cert->sig.pkey_hash_algo]) { + if (!cert->pub->pkey_algo || + !cert->sig.pkey_algo || + !cert->sig.hash_algo) { ret = -ENOPKG; goto error_free_cert; } - pr_devel("Cert Key Algo: %s\n", pkey_algo_name[cert->pub->pkey_algo]); + pr_devel("Cert Key Algo: %s\n", cert->pub->pkey_algo); pr_devel("Cert Valid period: %lld-%lld\n", cert->valid_from, cert->valid_to); pr_devel("Cert Signature: %s + %s\n", - pkey_algo_name[cert->sig.pkey_algo], - hash_algo_name[cert->sig.pkey_hash_algo]); + cert->sig.pkey_algo, + cert->sig.hash_algo); - cert->pub->id_type = PKEY_ID_X509; + cert->pub->id_type = "X509"; /* Check the signature on the key if it appears to be self-signed */ if ((!cert->akid_skid && !cert->akid_id) || diff --git a/include/crypto/public_key.h b/include/crypto/public_key.h index 80ab099a3edf..aa730ea7faf8 100644 --- a/include/crypto/public_key.h +++ b/include/crypto/public_key.h @@ -14,28 +14,6 @@ #ifndef _LINUX_PUBLIC_KEY_H #define _LINUX_PUBLIC_KEY_H -#include - -enum pkey_algo { - PKEY_ALGO_DSA, - PKEY_ALGO_RSA, - PKEY_ALGO__LAST -}; - -extern const char *const pkey_algo_name[PKEY_ALGO__LAST]; - -/* asymmetric key implementation supports only up to SHA224 */ -#define PKEY_HASH__LAST (HASH_ALGO_SHA224 + 1) - -enum pkey_id_type { - PKEY_ID_PGP, /* OpenPGP generated key ID */ - PKEY_ID_X509, /* X.509 arbitrary subjectKeyIdentifier */ - PKEY_ID_PKCS7, /* Signature in PKCS#7 message */ - PKEY_ID_TYPE__LAST -}; - -extern const char *const pkey_id_type_name[PKEY_ID_TYPE__LAST]; - /* * The use to which an asymmetric key is being put. */ @@ -59,8 +37,8 @@ extern const char *const key_being_used_for[NR__KEY_BEING_USED_FOR]; struct public_key { void *key; u32 keylen; - enum pkey_algo pkey_algo : 8; - enum pkey_id_type id_type : 8; + const char *id_type; + const char *pkey_algo; }; extern void public_key_destroy(void *payload); @@ -73,8 +51,8 @@ struct public_key_signature { u32 s_size; /* Number of bytes in signature */ u8 *digest; u8 digest_size; /* Number of bytes in digest */ - enum pkey_algo pkey_algo : 8; - enum hash_algo pkey_hash_algo : 8; + const char *pkey_algo; + const char *hash_algo; }; extern struct asymmetric_key_subtype public_key_subtype; diff --git a/kernel/module_signing.c b/kernel/module_signing.c index 9cfa46d8d14f..64b9dead4a07 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -16,6 +16,12 @@ #include #include "module-internal.h" +enum pkey_id_type { + PKEY_ID_PGP, /* OpenPGP generated key ID */ + PKEY_ID_X509, /* X.509 arbitrary subjectKeyIdentifier */ + PKEY_ID_PKCS7, /* Signature in PKCS#7 message */ +}; + /* * Module signature information block. * diff --git a/security/integrity/digsig_asymmetric.c b/security/integrity/digsig_asymmetric.c index 69a92e6db23d..80052ed8d467 100644 --- a/security/integrity/digsig_asymmetric.c +++ b/security/integrity/digsig_asymmetric.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -94,7 +95,7 @@ int asymmetric_verify(struct key *keyring, const char *sig, if (siglen != __be16_to_cpu(hdr->sig_size)) return -EBADMSG; - if (hdr->hash_algo >= PKEY_HASH__LAST) + if (hdr->hash_algo >= HASH_ALGO__LAST) return -ENOPKG; key = request_asymmetric_key(keyring, __be32_to_cpu(hdr->keyid)); @@ -103,8 +104,8 @@ int asymmetric_verify(struct key *keyring, const char *sig, memset(&pks, 0, sizeof(pks)); - pks.pkey_algo = PKEY_ALGO_RSA; - pks.pkey_hash_algo = hdr->hash_algo; + pks.pkey_algo = "rsa"; + pks.hash_algo = hash_algo_name[hdr->hash_algo]; pks.digest = (u8 *)data; pks.digest_size = datalen; pks.s = hdr->sig; diff --git a/security/integrity/integrity.h b/security/integrity/integrity.h index 5efe2ecc538d..c7708d9a1b41 100644 --- a/security/integrity/integrity.h +++ b/security/integrity/integrity.h @@ -94,7 +94,7 @@ struct ima_digest_data { struct signature_v2_hdr { uint8_t type; /* xattr type */ uint8_t version; /* signature format version */ - uint8_t hash_algo; /* Digest algorithm [enum pkey_hash_algo] */ + uint8_t hash_algo; /* Digest algorithm [enum hash_algo] */ uint32_t keyid; /* IMA key identifier - not X509/PGP specific */ uint16_t sig_size; /* signature size */ uint8_t sig[0]; /* signature payload */ -- cgit From e57cbaf0eb006eaa207395f3bfd7ce52c1b5539c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 3 Mar 2016 17:18:20 -0500 Subject: tracing: Do not have 'comm' filter override event 'comm' field Commit 9f61668073a8d "tracing: Allow triggers to filter for CPU ids and process names" added a 'comm' filter that will filter events based on the current tasks struct 'comm'. But this now hides the ability to filter events that have a 'comm' field too. For example, sched_migrate_task trace event. That has a 'comm' field of the task to be migrated. echo 'comm == "bash"' > events/sched_migrate_task/filter will now filter all sched_migrate_task events for tasks named "bash" that migrates other tasks (in interrupt context), instead of seeing when "bash" itself gets migrated. This fix requires a couple of changes. 1) Change the look up order for filter predicates to look at the events fields before looking at the generic filters. 2) Instead of basing the filter function off of the "comm" name, have the generic "comm" filter have its own filter_type (FILTER_COMM). Test against the type instead of the name to assign the filter function. 3) Add a new "COMM" filter that works just like "comm" but will filter based on the current task, even if the trace event contains a "comm" field. Do the same for "cpu" field, adding a FILTER_CPU and a filter "CPU". Cc: stable@vger.kernel.org # v4.3+ Fixes: 9f61668073a8d "tracing: Allow triggers to filter for CPU ids and process names" Reported-by: Matt Fleming Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 2 ++ kernel/trace/trace_events.c | 14 ++++++++------ kernel/trace/trace_events_filter.c | 13 +++++++------ 3 files changed, 17 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 429fdfc3baf5..925730bc9fc1 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -568,6 +568,8 @@ enum { FILTER_DYN_STRING, FILTER_PTR_STRING, FILTER_TRACE_FN, + FILTER_COMM, + FILTER_CPU, }; extern int trace_event_raw_init(struct trace_event_call *call); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ab09829d3b97..05ddc0820771 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -97,16 +97,16 @@ trace_find_event_field(struct trace_event_call *call, char *name) struct ftrace_event_field *field; struct list_head *head; - field = __find_event_field(&ftrace_generic_fields, name); + head = trace_get_fields(call); + field = __find_event_field(head, name); if (field) return field; - field = __find_event_field(&ftrace_common_fields, name); + field = __find_event_field(&ftrace_generic_fields, name); if (field) return field; - head = trace_get_fields(call); - return __find_event_field(head, name); + return __find_event_field(&ftrace_common_fields, name); } static int __trace_define_field(struct list_head *head, const char *type, @@ -171,8 +171,10 @@ static int trace_define_generic_fields(void) { int ret; - __generic_field(int, cpu, FILTER_OTHER); - __generic_field(char *, comm, FILTER_PTR_STRING); + __generic_field(int, CPU, FILTER_CPU); + __generic_field(int, cpu, FILTER_CPU); + __generic_field(char *, COMM, FILTER_COMM); + __generic_field(char *, comm, FILTER_COMM); return ret; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index f93a219b18da..6816302542b2 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1043,13 +1043,14 @@ static int init_pred(struct filter_parse_state *ps, return -EINVAL; } - if (is_string_field(field)) { + if (field->filter_type == FILTER_COMM) { + filter_build_regex(pred); + fn = filter_pred_comm; + pred->regex.field_len = TASK_COMM_LEN; + } else if (is_string_field(field)) { filter_build_regex(pred); - if (!strcmp(field->name, "comm")) { - fn = filter_pred_comm; - pred->regex.field_len = TASK_COMM_LEN; - } else if (field->filter_type == FILTER_STATIC_STRING) { + if (field->filter_type == FILTER_STATIC_STRING) { fn = filter_pred_string; pred->regex.field_len = field->size; } else if (field->filter_type == FILTER_DYN_STRING) @@ -1072,7 +1073,7 @@ static int init_pred(struct filter_parse_state *ps, } pred->val = val; - if (!strcmp(field->name, "cpu")) + if (field->filter_type == FILTER_CPU) fn = filter_pred_cpu; else fn = select_comparison_fn(pred->op, field->size, -- cgit From e9532e69b8d1d1284e8ecf8d2586de34aec61244 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 4 Mar 2016 15:59:42 +0100 Subject: sched/cputime: Fix steal time accounting vs. CPU hotplug On CPU hotplug the steal time accounting can keep a stale rq->prev_steal_time value over CPU down and up. So after the CPU comes up again the delta calculation in steal_account_process_tick() wreckages itself due to the unsigned math: u64 steal = paravirt_steal_clock(smp_processor_id()); steal -= this_rq()->prev_steal_time; So if steal is smaller than rq->prev_steal_time we end up with an insane large value which then gets added to rq->prev_steal_time, resulting in a permanent wreckage of the accounting. As a consequence the per CPU stats in /proc/stat become stale. Nice trick to tell the world how idle the system is (100%) while the CPU is 100% busy running tasks. Though we prefer realistic numbers. None of the accounting values which use a previous value to account for fractions is reset at CPU hotplug time. update_rq_clock_task() has a sanity check for prev_irq_time and prev_steal_time_rq, but that sanity check solely deals with clock warps and limits the /proc/stat visible wreckage. The prev_time values are still wrong. Solution is simple: Reset rq->prev_*_time when the CPU is plugged in again. Signed-off-by: Thomas Gleixner Acked-by: Rik van Riel Cc: Cc: Frederic Weisbecker Cc: Glauber Costa Cc: Linus Torvalds Cc: Peter Zijlstra Fixes: commit 095c0aa83e52 "sched: adjust scheduler cpu power for stolen time" Fixes: commit aa483808516c "sched: Remove irq time from available CPU power" Fixes: commit e6e6685accfa "KVM guest: Steal time accounting" Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1603041539490.3686@nanos Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 + kernel/sched/sched.h | 13 +++++++++++++ 2 files changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ab814bf100e1..406182af99ac 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5627,6 +5627,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_UP_PREPARE: rq->calc_load_update = calc_load_update; + account_reset_rq(rq); break; case CPU_ONLINE: diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 30ea2d871ba7..4f6598ae4c31 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1738,3 +1738,16 @@ static inline u64 irq_time_read(int cpu) } #endif /* CONFIG_64BIT */ #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +static inline void account_reset_rq(struct rq *rq) +{ +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + rq->prev_irq_time = 0; +#endif +#ifdef CONFIG_PARAVIRT + rq->prev_steal_time = 0; +#endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + rq->prev_steal_time_rq = 0; +#endif +} -- cgit From 5d8eb84253333f8f63ec704276e3f0a8ec8f3189 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 3 Mar 2016 12:24:10 +0100 Subject: cpu/hotplug: Remove redundant state check The check for the AP range in cpuhp_is_ap_state() is redundant after commit 8df3e07e7f21 "cpu/hotplug: Let upcoming cpu bring itself fully up" because all states above CPUHP_BRINGUP_CPU are invoked on the hotplugged cpu. Remove it. Reported-by: Richard Cochran Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 93e9d89bb0ab..373e831e0faa 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1231,8 +1231,6 @@ static int cpuhp_cb_check(enum cpuhp_state state) static bool cpuhp_is_ap_state(enum cpuhp_state state) { - if (state >= CPUHP_AP_OFFLINE && state <= CPUHP_AP_ONLINE) - return true; return state > CPUHP_BRINGUP_CPU; } -- cgit From 6436257b491cc0d456c39330dfc22126148d5ed7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 8 Mar 2016 11:09:53 +0100 Subject: time/timekeeping: Work around false positive GCC warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Newer GCC versions trigger the following warning: kernel/time/timekeeping.c: In function ‘get_device_system_crosststamp’: kernel/time/timekeeping.c:987:5: warning: ‘clock_was_set_seq’ may be used uninitialized in this function [-Wmaybe-uninitialized] if (discontinuity) { ^ kernel/time/timekeeping.c:1045:15: note: ‘clock_was_set_seq’ was declared here unsigned int clock_was_set_seq; ^ GCC clearly is unable to recognize that the 'do_interp' boolean tracks the initialization status of 'clock_was_set_seq'. The GCC version used was: gcc version 5.3.1 20151207 (Red Hat 5.3.1-2) (GCC) Work it around by initializing clock_was_set_seq to 0. Compilers that are able to recognize the code flow will eliminate the unnecessary initialization. Acked-by: Thomas Gleixner Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/time/timekeeping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 931b0b1a71e9..9c629bbed572 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1042,7 +1042,7 @@ int get_device_system_crosststamp(int (*get_time_fn) struct system_counterval_t system_counterval; struct timekeeper *tk = &tk_core.timekeeper; cycle_t cycles, now, interval_start; - unsigned int clock_was_set_seq; + unsigned int clock_was_set_seq = 0; ktime_t base_real, base_raw; s64 nsec_real, nsec_raw; u8 cs_was_changed_seq; -- cgit From 927a5570855836e5d5859a80ce7e91e963545e8f Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 2 Mar 2016 13:24:14 +0200 Subject: perf/core: Fix perf_sched_count derailment The error path in perf_event_open() is such that asking for a sampling event on a PMU that doesn't generate interrupts will end up in dropping the perf_sched_count even though it hasn't been incremented for this event yet. Given a sufficient amount of these calls, we'll end up disabling scheduler's jump label even though we'd still have active events in the system, thereby facilitating the arrival of the infernal regions upon us. I'm fixing this by moving account_event() inside perf_event_alloc(). Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: vince@deater.net Link: http://lkml.kernel.org/r/1456917854-29427-1-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 5dcc0bd08d11..b7231498de47 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8000,6 +8000,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, } } + /* symmetric to unaccount_event() in _free_event() */ + account_event(event); + return event; err_per_task: @@ -8363,8 +8366,6 @@ SYSCALL_DEFINE5(perf_event_open, } } - account_event(event); - /* * Special case software events and allow them to be part of * any hardware group. @@ -8661,8 +8662,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, /* Mark owner so we could distinguish it from user events. */ event->owner = TASK_TOMBSTONE; - account_event(event); - ctx = find_get_context(event->pmu, task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); -- cgit From 72f9f3fdc928dc3ecd223e801b32d930b662b6ed Mon Sep 17 00:00:00 2001 From: Luca Abeni Date: Mon, 7 Mar 2016 12:27:04 +0100 Subject: sched/deadline: Remove dl_new from struct sched_dl_entity The dl_new field of struct sched_dl_entity is currently used to identify new deadline tasks, so that their deadline and runtime can be properly initialised. However, these tasks can be easily identified by checking if their deadline is smaller than the current time when they switch to SCHED_DEADLINE. So, dl_new can be removed by introducing this check in switched_to_dl(); this allows to simplify the SCHED_DEADLINE code. Signed-off-by: Luca Abeni Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1457350024-7825-2-git-send-email-luca.abeni@unitn.it Signed-off-by: Ingo Molnar --- include/linux/sched.h | 6 +----- kernel/sched/core.c | 1 - kernel/sched/deadline.c | 35 +++++++++++++---------------------- 3 files changed, 14 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 9519b66fc21f..838a89a78332 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1333,10 +1333,6 @@ struct sched_dl_entity { * task has to wait for a replenishment to be performed at the * next firing of dl_timer. * - * @dl_new tells if a new instance arrived. If so we must - * start executing it with full runtime and reset its absolute - * deadline; - * * @dl_boosted tells if we are boosted due to DI. If so we are * outside bandwidth enforcement mechanism (but only until we * exit the critical section); @@ -1344,7 +1340,7 @@ struct sched_dl_entity { * @dl_yielded tells if task gave up the cpu before consuming * all its available runtime during the last job. */ - int dl_throttled, dl_new, dl_boosted, dl_yielded; + int dl_throttled, dl_boosted, dl_yielded; /* * Bandwidth enforcement timer. Each -deadline task has its diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 423452cae5cf..249e37dc02c8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2051,7 +2051,6 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_bw = 0; dl_se->dl_throttled = 0; - dl_se->dl_new = 1; dl_se->dl_yielded = 0; } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 15abf04bf0b8..c7a036facbe1 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -352,7 +352,15 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - WARN_ON(!dl_se->dl_new || dl_se->dl_throttled); + WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); + + /* + * We are racing with the deadline timer. So, do nothing because + * the deadline timer handler will take care of properly recharging + * the runtime and postponing the deadline + */ + if (dl_se->dl_throttled) + return; /* * We use the regular wall clock time to set deadlines in the @@ -361,7 +369,6 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, */ dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; - dl_se->dl_new = 0; } /* @@ -503,15 +510,6 @@ static void update_dl_entity(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - /* - * The arrival of a new instance needs special treatment, i.e., - * the actual scheduling parameters have to be "renewed". - */ - if (dl_se->dl_new) { - setup_new_dl_entity(dl_se, pi_se); - return; - } - if (dl_time_before(dl_se->deadline, rq_clock(rq)) || dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; @@ -607,16 +605,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) goto unlock; } - /* - * This is possible if switched_from_dl() raced against a running - * callback that took the above !dl_task() path and we've since then - * switched back into SCHED_DEADLINE. - * - * There's nothing to do except drop our task reference. - */ - if (dl_se->dl_new) - goto unlock; - /* * The task might have been boosted by someone else and might be in the * boosting/deboosting path, its not throttled. @@ -925,7 +913,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, * parameters of the task might need updating. Otherwise, * we want a replenishment of its runtime. */ - if (dl_se->dl_new || flags & ENQUEUE_WAKEUP) + if (flags & ENQUEUE_WAKEUP) update_dl_entity(dl_se, pi_se); else if (flags & ENQUEUE_REPLENISH) replenish_dl_entity(dl_se, pi_se); @@ -1726,6 +1714,9 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) */ static void switched_to_dl(struct rq *rq, struct task_struct *p) { + if (dl_time_before(p->dl.deadline, rq_clock(rq))) + setup_new_dl_entity(&p->dl, &p->dl); + if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) -- cgit From f9c904b7613b8b4c85b10cd6b33ad41b2843fa9d Mon Sep 17 00:00:00 2001 From: Chris Friesen Date: Sat, 5 Mar 2016 23:18:48 -0600 Subject: sched/cputime: Fix steal_account_process_tick() to always return jiffies The callers of steal_account_process_tick() expect it to return whether a jiffy should be considered stolen or not. Currently the return value of steal_account_process_tick() is in units of cputime, which vary between either jiffies or nsecs depending on CONFIG_VIRT_CPU_ACCOUNTING_GEN. If cputime has nsecs granularity and there is a tiny amount of stolen time (a few nsecs, say) then we will consider the entire tick stolen and will not account the tick on user/system/idle, causing /proc/stats to show invalid data. The fix is to change steal_account_process_tick() to accumulate the stolen time and only account it once it's worth a jiffy. (Thanks to Frederic Weisbecker for suggestions to fix a bug in my first version of the patch.) Signed-off-by: Chris Friesen Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Cc: Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/56DBBDB8.40305@mail.usask.ca Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 01d9898bc9a2..75f98c5498d5 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -262,21 +262,21 @@ static __always_inline bool steal_account_process_tick(void) #ifdef CONFIG_PARAVIRT if (static_key_false(¶virt_steal_enabled)) { u64 steal; - cputime_t steal_ct; + unsigned long steal_jiffies; steal = paravirt_steal_clock(smp_processor_id()); steal -= this_rq()->prev_steal_time; /* - * cputime_t may be less precise than nsecs (eg: if it's - * based on jiffies). Lets cast the result to cputime + * steal is in nsecs but our caller is expecting steal + * time in jiffies. Lets cast the result to jiffies * granularity and account the rest on the next rounds. */ - steal_ct = nsecs_to_cputime(steal); - this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct); + steal_jiffies = nsecs_to_jiffies(steal); + this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies); - account_steal_time(steal_ct); - return steal_ct; + account_steal_time(jiffies_to_cputime(steal_jiffies)); + return steal_jiffies; } #endif return false; -- cgit From 29b75eb2d56a714190a93d7be4525e617591077a Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Mon, 7 Mar 2016 09:32:24 +0800 Subject: futex: Replace barrier() in unqueue_me() with READ_ONCE() Commit e91467ecd1ef ("bug in futex unqueue_me") introduced a barrier() in unqueue_me() to prevent the compiler from rereading the lock pointer which might change after a check for NULL. Replace the barrier() with a READ_ONCE() for the following reasons: 1) READ_ONCE() is a weaker form of barrier() that affects only the specific load operation, while barrier() is a general compiler level memory barrier. READ_ONCE() was not available at the time when the barrier was added. 2) Aside of that READ_ONCE() is descriptive and self explainatory while a barrier without comment is not clear to the casual reader. No functional change. [ tglx: Massaged changelog ] Signed-off-by: Jianyu Zhan Acked-by: Christian Borntraeger Acked-by: Darren Hart Cc: dave@stgolabs.net Cc: peterz@infradead.org Cc: linux@rasmusvillemoes.dk Cc: akpm@linux-foundation.org Cc: fengguang.wu@intel.com Cc: bigeasy@linutronix.de Link: http://lkml.kernel.org/r/1457314344-5685-1-git-send-email-nasa4836@gmail.com Signed-off-by: Thomas Gleixner --- kernel/futex.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index bae542e4b2e9..a5d2e74c89e0 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2010,8 +2010,12 @@ static int unqueue_me(struct futex_q *q) /* In the common case we don't take the spinlock, which is nice. */ retry: - lock_ptr = q->lock_ptr; - barrier(); + /* + * q->lock_ptr can change between this read and the following spin_lock. + * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and + * optimizing lock_ptr out of the logic below. + */ + lock_ptr = READ_ONCE(q->lock_ptr); if (lock_ptr != NULL) { spin_lock(lock_ptr); /* -- cgit From d39cdd2036a63eef17a14efbd969405ca5612886 Mon Sep 17 00:00:00 2001 From: Chunyu Hu Date: Tue, 8 Mar 2016 21:37:01 +0800 Subject: tracing: Make tracer_flags use the right set_flag callback When I was updating the ftrace_stress test of ltp. I encountered a strange phenomemon, excute following steps: echo nop > /sys/kernel/debug/tracing/current_tracer echo 0 > /sys/kernel/debug/tracing/options/funcgraph-cpu bash: echo: write error: Invalid argument check dmesg: [ 1024.903855] nop_test_refuse flag set to 0: we refuse.Now cat trace_options to see the result The reason is that the trace option test will randomly setup trace option under tracing/options no matter what the current_tracer is. but the set_tracer_option is always using the set_flag callback from the current_tracer. This patch adds a pointer to tracer_flags and make it point to the tracer it belongs to. When the option is setup, the set_flag of the right tracer will be used no matter what the the current_tracer is. And the old dummy_tracer_flags is used for all the tracers which doesn't have a tracer_flags, having issue to use it to save the pointer of a tracer. So remove it and use dynamic dummy tracer_flags for tracers needing a dummy tracer_flags, as a result, there are no tracers sharing tracer_flags, so remove the check code. And save the current tracer to trace_option_dentry seems not good as it may waste mem space when mount the debug/trace fs more than one time. Link: http://lkml.kernel.org/r/1457444222-8654-1-git-send-email-chuhu@redhat.com Signed-off-by: Chunyu Hu [ Fixed up function tracer options to work with the change ] Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 28 ++++++++++++++-------------- kernel/trace/trace.h | 1 + kernel/trace/trace_functions.c | 6 ++++++ 3 files changed, 21 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d9293402ee68..b401a1892dc6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -74,11 +74,6 @@ static struct tracer_opt dummy_tracer_opt[] = { { } }; -static struct tracer_flags dummy_tracer_flags = { - .val = 0, - .opts = dummy_tracer_opt -}; - static int dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { @@ -1258,12 +1253,20 @@ int __init register_tracer(struct tracer *type) if (!type->set_flag) type->set_flag = &dummy_set_flag; - if (!type->flags) - type->flags = &dummy_tracer_flags; - else + if (!type->flags) { + /*allocate a dummy tracer_flags*/ + type->flags = kmalloc(sizeof(*type->flags), GFP_KERNEL); + if (!type->flags) + return -ENOMEM; + type->flags->val = 0; + type->flags->opts = dummy_tracer_opt; + } else if (!type->flags->opts) type->flags->opts = dummy_tracer_opt; + /* store the tracer for __set_tracer_option */ + type->flags->trace = type; + ret = run_tracer_selftest(type); if (ret < 0) goto out; @@ -3505,7 +3508,7 @@ static int __set_tracer_option(struct trace_array *tr, struct tracer_flags *tracer_flags, struct tracer_opt *opts, int neg) { - struct tracer *trace = tr->current_trace; + struct tracer *trace = tracer_flags->trace; int ret; ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg); @@ -6391,11 +6394,8 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer) return; for (i = 0; i < tr->nr_topts; i++) { - /* - * Check if these flags have already been added. - * Some tracers share flags. - */ - if (tr->topts[i].tracer->flags == tracer->flags) + /* Make sure there's no duplicate flags. */ + if (WARN_ON_ONCE(tr->topts[i].tracer->flags == tracer->flags)) return; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8414fa40bf27..b4cae47f283e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -345,6 +345,7 @@ struct tracer_opt { struct tracer_flags { u32 val; struct tracer_opt *opts; + struct tracer *trace; }; /* Makes more easy to define a tracer opt */ diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index fcd41a166405..5a095c2e4b69 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -219,6 +219,8 @@ static void tracing_stop_function_trace(struct trace_array *tr) unregister_ftrace_function(tr->ops); } +static struct tracer function_trace; + static int func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { @@ -228,6 +230,10 @@ func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) break; + /* We can change this flag when not running. */ + if (tr->current_trace != &function_trace) + break; + unregister_ftrace_function(tr->ops); if (set) { -- cgit From 4ef56902fba4d9949918c6266e67ba7d05fba7a4 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 10 Dec 2015 12:50:43 -0600 Subject: tracing: Make ftrace_event_field checking functions available Make is_string_field() and is_function_field() accessible outside of trace_event_filters.c for other users of ftrace_event_fields. Link: http://lkml.kernel.org/r/2d3f00d3311702e556e82eed7754bae6f017939f.1449767187.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Reviewed-by: Masami Hiramatsu Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 12 ++++++++++++ kernel/trace/trace_events_filter.c | 12 ------------ 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b4cae47f283e..81a8359e9c29 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1112,6 +1112,18 @@ struct filter_pred { unsigned short right; }; +static inline bool is_string_field(struct ftrace_event_field *field) +{ + return field->filter_type == FILTER_DYN_STRING || + field->filter_type == FILTER_STATIC_STRING || + field->filter_type == FILTER_PTR_STRING; +} + +static inline bool is_function_field(struct ftrace_event_field *field) +{ + return field->filter_type == FILTER_TRACE_FN; +} + extern enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not); extern void print_event_filter(struct trace_event_file *file, diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 6816302542b2..b3f5051cd4e9 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -961,18 +961,6 @@ int filter_assign_type(const char *type) return FILTER_OTHER; } -static bool is_function_field(struct ftrace_event_field *field) -{ - return field->filter_type == FILTER_TRACE_FN; -} - -static bool is_string_field(struct ftrace_event_field *field) -{ - return field->filter_type == FILTER_DYN_STRING || - field->filter_type == FILTER_STATIC_STRING || - field->filter_type == FILTER_PTR_STRING; -} - static bool is_legal_op(struct ftrace_event_field *field, int op) { if (is_string_field(field) && -- cgit From ab4bf008928e8fc73fe1cbaa9249792d36845345 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 10 Dec 2015 12:50:44 -0600 Subject: tracing: Make event trigger functions available Make various event trigger utility functions available outside of trace_events_trigger.c so that new triggers can be defined outside of that file. Link: http://lkml.kernel.org/r/4a40c1695dd43cac6cd475d72e13ffe30ba84bff.1449767187.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 14 ++++++++++++++ kernel/trace/trace_events_trigger.c | 28 +++++++++++++--------------- 2 files changed, 27 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 81a8359e9c29..b2bc956e2b0d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1175,6 +1175,20 @@ struct event_trigger_data { struct list_head list; }; +extern void trigger_data_free(struct event_trigger_data *data); +extern int event_trigger_init(struct event_trigger_ops *ops, + struct event_trigger_data *data); +extern int trace_event_trigger_enable_disable(struct trace_event_file *file, + int trigger_enable); +extern void update_cond_flag(struct trace_event_file *file); +extern void unregister_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *test, + struct trace_event_file *file); +extern int set_trigger_filter(char *filter_str, + struct event_trigger_data *trigger_data, + struct trace_event_file *file); +extern int register_event_command(struct event_command *cmd); + /** * struct event_trigger_ops - callbacks for trace event triggers * diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index b38f617b6181..f40424f35dcb 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -28,8 +28,7 @@ static LIST_HEAD(trigger_commands); static DEFINE_MUTEX(trigger_cmd_mutex); -static void -trigger_data_free(struct event_trigger_data *data) +void trigger_data_free(struct event_trigger_data *data) { if (data->cmd_ops->set_filter) data->cmd_ops->set_filter(NULL, data, NULL); @@ -306,7 +305,7 @@ const struct file_operations event_trigger_fops = { * Currently we only register event commands from __init, so mark this * __init too. */ -static __init int register_event_command(struct event_command *cmd) +__init int register_event_command(struct event_command *cmd) { struct event_command *p; int ret = 0; @@ -395,9 +394,8 @@ event_trigger_print(const char *name, struct seq_file *m, * * Return: 0 on success, errno otherwise */ -static int -event_trigger_init(struct event_trigger_ops *ops, - struct event_trigger_data *data) +int event_trigger_init(struct event_trigger_ops *ops, + struct event_trigger_data *data) { data->ref++; return 0; @@ -425,8 +423,8 @@ event_trigger_free(struct event_trigger_ops *ops, trigger_data_free(data); } -static int trace_event_trigger_enable_disable(struct trace_event_file *file, - int trigger_enable) +int trace_event_trigger_enable_disable(struct trace_event_file *file, + int trigger_enable) { int ret = 0; @@ -483,7 +481,7 @@ clear_event_triggers(struct trace_array *tr) * its TRIGGER_COND bit set, otherwise the TRIGGER_COND bit should be * cleared. */ -static void update_cond_flag(struct trace_event_file *file) +void update_cond_flag(struct trace_event_file *file) { struct event_trigger_data *data; bool set_cond = false; @@ -560,9 +558,9 @@ out: * Usually used directly as the @unreg method in event command * implementations. */ -static void unregister_trigger(char *glob, struct event_trigger_ops *ops, - struct event_trigger_data *test, - struct trace_event_file *file) +void unregister_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *test, + struct trace_event_file *file) { struct event_trigger_data *data; bool unregistered = false; @@ -696,9 +694,9 @@ event_trigger_callback(struct event_command *cmd_ops, * * Return: 0 on success, errno otherwise */ -static int set_trigger_filter(char *filter_str, - struct event_trigger_data *trigger_data, - struct trace_event_file *file) +int set_trigger_filter(char *filter_str, + struct event_trigger_data *trigger_data, + struct trace_event_file *file) { struct event_trigger_data *data = trigger_data; struct event_filter *filter = NULL, *tmp; -- cgit From c4a5923055c9e0c87dfc0387f7cda5ee2bbac3c1 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 10 Dec 2015 12:50:45 -0600 Subject: tracing: Add event record param to trigger_ops.func() Some triggers may need access to the trace event, so pass it in. Also fix up the existing trigger funcs and their callers. Link: http://lkml.kernel.org/r/543e31e9fc445ef61077421ab219033401c39846.1449767187.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 7 ++++--- kernel/trace/trace.h | 6 ++++-- kernel/trace/trace_events_trigger.c | 36 +++++++++++++++++++----------------- 3 files changed, 27 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 925730bc9fc1..0d6930e941e8 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -430,7 +430,8 @@ extern int call_filter_check_discard(struct trace_event_call *call, void *rec, extern enum event_trigger_type event_triggers_call(struct trace_event_file *file, void *rec); extern void event_triggers_post_call(struct trace_event_file *file, - enum event_trigger_type tt); + enum event_trigger_type tt, + void *rec); bool trace_event_ignore_this_pid(struct trace_event_file *trace_file); @@ -517,7 +518,7 @@ event_trigger_unlock_commit(struct trace_event_file *file, trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc); if (tt) - event_triggers_post_call(file, tt); + event_triggers_post_call(file, tt, entry); } /** @@ -550,7 +551,7 @@ event_trigger_unlock_commit_regs(struct trace_event_file *file, irq_flags, pc, regs); if (tt) - event_triggers_post_call(file, tt); + event_triggers_post_call(file, tt, entry); } #ifdef CONFIG_BPF_EVENTS diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b2bc956e2b0d..c10456e72106 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1201,7 +1201,8 @@ extern int register_event_command(struct event_command *cmd); * @func: The trigger 'probe' function called when the triggering * event occurs. The data passed into this callback is the data * that was supplied to the event_command @reg() function that - * registered the trigger (see struct event_command). + * registered the trigger (see struct event_command) along with + * the trace record, rec. * * @init: An optional initialization function called for the trigger * when the trigger is registered (via the event_command reg() @@ -1226,7 +1227,8 @@ extern int register_event_command(struct event_command *cmd); * (see trace_event_triggers.c). */ struct event_trigger_ops { - void (*func)(struct event_trigger_data *data); + void (*func)(struct event_trigger_data *data, + void *rec); int (*init)(struct event_trigger_ops *ops, struct event_trigger_data *data); void (*free)(struct event_trigger_ops *ops, diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index f40424f35dcb..0a62887c63c0 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -73,7 +73,7 @@ event_triggers_call(struct trace_event_file *file, void *rec) list_for_each_entry_rcu(data, &file->triggers, list) { if (!rec) { - data->ops->func(data); + data->ops->func(data, rec); continue; } filter = rcu_dereference_sched(data->filter); @@ -83,7 +83,7 @@ event_triggers_call(struct trace_event_file *file, void *rec) tt |= data->cmd_ops->trigger_type; continue; } - data->ops->func(data); + data->ops->func(data, rec); } return tt; } @@ -93,6 +93,7 @@ EXPORT_SYMBOL_GPL(event_triggers_call); * event_triggers_post_call - Call 'post_triggers' for a trace event * @file: The trace_event_file associated with the event * @tt: enum event_trigger_type containing a set bit for each trigger to invoke + * @rec: The trace entry for the event * * For each trigger associated with an event, invoke the trigger * function registered with the associated trigger command, if the @@ -103,13 +104,14 @@ EXPORT_SYMBOL_GPL(event_triggers_call); */ void event_triggers_post_call(struct trace_event_file *file, - enum event_trigger_type tt) + enum event_trigger_type tt, + void *rec) { struct event_trigger_data *data; list_for_each_entry_rcu(data, &file->triggers, list) { if (data->cmd_ops->trigger_type & tt) - data->ops->func(data); + data->ops->func(data, rec); } } EXPORT_SYMBOL_GPL(event_triggers_post_call); @@ -745,7 +747,7 @@ int set_trigger_filter(char *filter_str, } static void -traceon_trigger(struct event_trigger_data *data) +traceon_trigger(struct event_trigger_data *data, void *rec) { if (tracing_is_on()) return; @@ -754,7 +756,7 @@ traceon_trigger(struct event_trigger_data *data) } static void -traceon_count_trigger(struct event_trigger_data *data) +traceon_count_trigger(struct event_trigger_data *data, void *rec) { if (tracing_is_on()) return; @@ -769,7 +771,7 @@ traceon_count_trigger(struct event_trigger_data *data) } static void -traceoff_trigger(struct event_trigger_data *data) +traceoff_trigger(struct event_trigger_data *data, void *rec) { if (!tracing_is_on()) return; @@ -778,7 +780,7 @@ traceoff_trigger(struct event_trigger_data *data) } static void -traceoff_count_trigger(struct event_trigger_data *data) +traceoff_count_trigger(struct event_trigger_data *data, void *rec) { if (!tracing_is_on()) return; @@ -874,13 +876,13 @@ static struct event_command trigger_traceoff_cmd = { #ifdef CONFIG_TRACER_SNAPSHOT static void -snapshot_trigger(struct event_trigger_data *data) +snapshot_trigger(struct event_trigger_data *data, void *rec) { tracing_snapshot(); } static void -snapshot_count_trigger(struct event_trigger_data *data) +snapshot_count_trigger(struct event_trigger_data *data, void *rec) { if (!data->count) return; @@ -888,7 +890,7 @@ snapshot_count_trigger(struct event_trigger_data *data) if (data->count != -1) (data->count)--; - snapshot_trigger(data); + snapshot_trigger(data, rec); } static int @@ -967,13 +969,13 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; } #define STACK_SKIP 3 static void -stacktrace_trigger(struct event_trigger_data *data) +stacktrace_trigger(struct event_trigger_data *data, void *rec) { trace_dump_stack(STACK_SKIP); } static void -stacktrace_count_trigger(struct event_trigger_data *data) +stacktrace_count_trigger(struct event_trigger_data *data, void *rec) { if (!data->count) return; @@ -981,7 +983,7 @@ stacktrace_count_trigger(struct event_trigger_data *data) if (data->count != -1) (data->count)--; - stacktrace_trigger(data); + stacktrace_trigger(data, rec); } static int @@ -1052,7 +1054,7 @@ struct enable_trigger_data { }; static void -event_enable_trigger(struct event_trigger_data *data) +event_enable_trigger(struct event_trigger_data *data, void *rec) { struct enable_trigger_data *enable_data = data->private_data; @@ -1063,7 +1065,7 @@ event_enable_trigger(struct event_trigger_data *data) } static void -event_enable_count_trigger(struct event_trigger_data *data) +event_enable_count_trigger(struct event_trigger_data *data, void *rec) { struct enable_trigger_data *enable_data = data->private_data; @@ -1077,7 +1079,7 @@ event_enable_count_trigger(struct event_trigger_data *data) if (data->count != -1) (data->count)--; - event_enable_trigger(data); + event_enable_trigger(data, rec); } static int -- cgit From dbfeaa7abae4f105afdf8ed4f85b5879cff136ea Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 10 Dec 2015 12:50:46 -0600 Subject: tracing: Add get_syscall_name() Add a utility function to grab the syscall name from the syscall metadata, given a syscall id. Link: http://lkml.kernel.org/r/be26a8dfe3f15e16a837799f1c1e2b4d62742843.1449767187.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 5 +++++ kernel/trace/trace_syscalls.c | 11 +++++++++++ 2 files changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c10456e72106..0044b91d5469 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1394,8 +1394,13 @@ int perf_ftrace_event_register(struct trace_event_call *call, #ifdef CONFIG_FTRACE_SYSCALLS void init_ftrace_syscalls(void); +const char *get_syscall_name(int syscall); #else static inline void init_ftrace_syscalls(void) { } +static inline const char *get_syscall_name(int syscall) +{ + return NULL; +} #endif #ifdef CONFIG_EVENT_TRACING diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 0655afbea83f..50be5602217c 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -106,6 +106,17 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr) return syscalls_metadata[nr]; } +const char *get_syscall_name(int syscall) +{ + struct syscall_metadata *entry; + + entry = syscall_nr_to_meta(syscall); + if (!entry) + return NULL; + + return entry->name; +} + static enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags, struct trace_event *event) -- cgit From 104f281044a9c2ac86b851bbebbf74500172b625 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 10 Dec 2015 12:50:47 -0600 Subject: tracing: Add a per-event-trigger 'paused' field Add a simple per-trigger 'paused' flag, allowing individual triggers to pause. We could leave it to individual triggers that need this functionality to do it themselves, but we also want to allow other events to control pausing, so add it to the trigger data. Link: http://lkml.kernel.org/r/fed37e4879684d7dcc57fe00ce0cbf170032b06d.1449767187.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 1 + kernel/trace/trace_events_trigger.c | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0044b91d5469..f1868677f856 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1172,6 +1172,7 @@ struct event_trigger_data { struct event_filter __rcu *filter; char *filter_str; void *private_data; + bool paused; struct list_head list; }; diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 0a62887c63c0..e4d8b3763175 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -72,6 +72,8 @@ event_triggers_call(struct trace_event_file *file, void *rec) return tt; list_for_each_entry_rcu(data, &file->triggers, list) { + if (data->paused) + continue; if (!rec) { data->ops->func(data, rec); continue; @@ -110,6 +112,8 @@ event_triggers_post_call(struct trace_event_file *file, struct event_trigger_data *data; list_for_each_entry_rcu(data, &file->triggers, list) { + if (data->paused) + continue; if (data->cmd_ops->trigger_type & tt) data->ops->func(data, rec); } -- cgit From a5863dae84e2da83a1e5de485a7f150d0c28f08e Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 10 Dec 2015 12:50:48 -0600 Subject: tracing: Add needs_rec flag to event triggers Add a new needs_rec flag for triggers that require unconditional access to trace records in order to function. Normally a trigger requires access to the contents of a trace record only if it has a filter associated with it (since filters need the contents of a record in order to make a filtering decision). Some types of triggers, such as 'hist' triggers, require access to trace record contents independent of the presence of filters, so add a new flag for those triggers. Link: http://lkml.kernel.org/r/7be8fa38f9b90fdb6c47ca0f98d20a07b9fd512b.1449767187.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 7 +++++++ kernel/trace/trace_events_trigger.c | 3 ++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f1868677f856..8c6aefbb24d2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1292,6 +1292,12 @@ struct event_trigger_ops { * itself logs to the trace buffer, this flag should be set, * otherwise it can be left unspecified. * + * @needs_rec: A flag that says whether or not this command needs + * access to the trace record in order to perform its function, + * regardless of whether or not it has a filter associated with + * it (filters make a trigger require access to the trace record + * but are not always present). + * * All the methods below, except for @set_filter(), must be * implemented. * @@ -1332,6 +1338,7 @@ struct event_command { char *name; enum event_trigger_type trigger_type; bool post_trigger; + bool needs_rec; int (*func)(struct event_command *cmd_ops, struct trace_event_file *file, char *glob, char *cmd, char *params); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index e4d8b3763175..a11bb4780f82 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -493,7 +493,8 @@ void update_cond_flag(struct trace_event_file *file) bool set_cond = false; list_for_each_entry_rcu(data, &file->triggers, list) { - if (data->filter || data->cmd_ops->post_trigger) { + if (data->filter || data->cmd_ops->post_trigger || + data->cmd_ops->needs_rec) { set_cond = true; break; } -- cgit From a88e1cfb1d3081ffb34864d9cf8a5c289630f48e Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 10 Dec 2015 12:50:49 -0600 Subject: tracing: Add an unreg_all() callback to trigger commands Add a new unreg_all() callback that can be used to remove all command-specific triggers from an event and arrange to have it called whenever a trigger file is opened with O_TRUNC set. Commands that don't want truncate semantics, or existing commands that don't implement this function simply do nothing and their triggers remain intact. Link: http://lkml.kernel.org/r/2b7d62854d01f28c19185e1bbb8f826f385edfba.1449767187.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 9 +++++++-- kernel/trace/trace_events_trigger.c | 13 +++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8c6aefbb24d2..f4dd0adf71df 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1298,8 +1298,8 @@ struct event_trigger_ops { * it (filters make a trigger require access to the trace record * but are not always present). * - * All the methods below, except for @set_filter(), must be - * implemented. + * All the methods below, except for @set_filter() and @unreg_all(), + * must be implemented. * * @func: The callback function responsible for parsing and * registering the trigger written to the 'trigger' file by the @@ -1324,6 +1324,10 @@ struct event_trigger_ops { * This is usually implemented by the generic utility function * @unregister_trigger() (see trace_event_triggers.c). * + * @unreg_all: An optional function called to remove all the triggers + * from the list of triggers associated with the event. Called + * when a trigger file is opened in truncate mode. + * * @set_filter: An optional function called to parse and set a filter * for the trigger. If no @set_filter() method is set for the * event command, filters set by the user for the command will be @@ -1350,6 +1354,7 @@ struct event_command { struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file); + void (*unreg_all)(struct trace_event_file *file); int (*set_filter)(char *filter_str, struct event_trigger_data *data, struct trace_event_file *file); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index a11bb4780f82..cbb7ee531983 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -193,6 +193,19 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file) return -ENODEV; } + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) { + struct trace_event_file *event_file; + struct event_command *p; + + event_file = event_file_data(file); + + list_for_each_entry(p, &trigger_commands, list) { + if (p->unreg_all) + p->unreg_all(event_file); + } + } + if (file->f_mode & FMODE_READ) { ret = seq_open(file, &event_triggers_seq_ops); if (!ret) { -- cgit From 353206f5ca05eb65704b2b3ec9a331b4fdfd3257 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 22 Feb 2016 15:55:09 -0500 Subject: tracing: Use flags instead of bool in trigger structure gcc isn't known for handling bool in structures. Instead of using bool, use an integer mask and use bit flags instead. Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 71 +++++++++++++++++++++++-------------- kernel/trace/trace_events_trigger.c | 8 ++--- 2 files changed, 49 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f4dd0adf71df..39588c23dd8b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1273,30 +1273,7 @@ struct event_trigger_ops { * values are defined by adding new values to the trigger_type * enum in include/linux/trace_events.h. * - * @post_trigger: A flag that says whether or not this command needs - * to have its action delayed until after the current event has - * been closed. Some triggers need to avoid being invoked while - * an event is currently in the process of being logged, since - * the trigger may itself log data into the trace buffer. Thus - * we make sure the current event is committed before invoking - * those triggers. To do that, the trigger invocation is split - * in two - the first part checks the filter using the current - * trace record; if a command has the @post_trigger flag set, it - * sets a bit for itself in the return value, otherwise it - * directly invokes the trigger. Once all commands have been - * either invoked or set their return flag, the current record is - * either committed or discarded. At that point, if any commands - * have deferred their triggers, those commands are finally - * invoked following the close of the current event. In other - * words, if the event_trigger_ops @func() probe implementation - * itself logs to the trace buffer, this flag should be set, - * otherwise it can be left unspecified. - * - * @needs_rec: A flag that says whether or not this command needs - * access to the trace record in order to perform its function, - * regardless of whether or not it has a filter associated with - * it (filters make a trigger require access to the trace record - * but are not always present). + * @flags: See the enum event_command_flags below. * * All the methods below, except for @set_filter() and @unreg_all(), * must be implemented. @@ -1341,8 +1318,7 @@ struct event_command { struct list_head list; char *name; enum event_trigger_type trigger_type; - bool post_trigger; - bool needs_rec; + int flags; int (*func)(struct event_command *cmd_ops, struct trace_event_file *file, char *glob, char *cmd, char *params); @@ -1361,6 +1337,49 @@ struct event_command { struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param); }; +/** + * enum event_command_flags - flags for struct event_command + * + * @POST_TRIGGER: A flag that says whether or not this command needs + * to have its action delayed until after the current event has + * been closed. Some triggers need to avoid being invoked while + * an event is currently in the process of being logged, since + * the trigger may itself log data into the trace buffer. Thus + * we make sure the current event is committed before invoking + * those triggers. To do that, the trigger invocation is split + * in two - the first part checks the filter using the current + * trace record; if a command has the @post_trigger flag set, it + * sets a bit for itself in the return value, otherwise it + * directly invokes the trigger. Once all commands have been + * either invoked or set their return flag, the current record is + * either committed or discarded. At that point, if any commands + * have deferred their triggers, those commands are finally + * invoked following the close of the current event. In other + * words, if the event_trigger_ops @func() probe implementation + * itself logs to the trace buffer, this flag should be set, + * otherwise it can be left unspecified. + * + * @NEEDS_REC: A flag that says whether or not this command needs + * access to the trace record in order to perform its function, + * regardless of whether or not it has a filter associated with + * it (filters make a trigger require access to the trace record + * but are not always present). + */ +enum event_command_flags { + EVENT_CMD_FL_POST_TRIGGER = 1, + EVENT_CMD_FL_NEEDS_REC = 2, +}; + +static inline bool event_command_post_trigger(struct event_command *cmd_ops) +{ + return cmd_ops->flags & EVENT_CMD_FL_POST_TRIGGER; +} + +static inline bool event_command_needs_rec(struct event_command *cmd_ops) +{ + return cmd_ops->flags & EVENT_CMD_FL_NEEDS_REC; +} + extern int trace_event_enable_disable(struct trace_event_file *file, int enable, int soft_disable); extern int tracing_alloc_snapshot(void); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index cbb7ee531983..d67992f3bb0e 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -81,7 +81,7 @@ event_triggers_call(struct trace_event_file *file, void *rec) filter = rcu_dereference_sched(data->filter); if (filter && !filter_match_preds(filter, rec)) continue; - if (data->cmd_ops->post_trigger) { + if (event_command_post_trigger(data->cmd_ops)) { tt |= data->cmd_ops->trigger_type; continue; } @@ -506,8 +506,8 @@ void update_cond_flag(struct trace_event_file *file) bool set_cond = false; list_for_each_entry_rcu(data, &file->triggers, list) { - if (data->filter || data->cmd_ops->post_trigger || - data->cmd_ops->needs_rec) { + if (data->filter || event_command_post_trigger(data->cmd_ops) || + event_command_needs_rec(data->cmd_ops)) { set_cond = true; break; } @@ -1035,7 +1035,7 @@ stacktrace_get_trigger_ops(char *cmd, char *param) static struct event_command trigger_stacktrace_cmd = { .name = "stacktrace", .trigger_type = ETT_STACKTRACE, - .post_trigger = true, + .flags = EVENT_CMD_FL_POST_TRIGGER, .func = event_trigger_callback, .reg = register_trigger, .unreg = unregister_trigger, -- cgit From 1cf8067b541884366b7db3a328342073fed2f38f Mon Sep 17 00:00:00 2001 From: Chunyu Hu Date: Tue, 8 Mar 2016 21:37:02 +0800 Subject: tracing: Fix typoes in code comment and printk in trace_nop.c echo nop > /sys/kernel/debug/tracing/options/current_tracer echo 1 > /sys/kernel/debug/tracing/options/test_nop_accept echo 0 > /sys/kernel/debug/tracing/options/test_nop_accept echo 1 > /sys/kernel/debug/tracing/options/test_nop_refuse Before the fix, the dmesg is a bit ugly since a align issue. [ 191.973081] nop_test_accept flag set to 1: we accept. Now cat trace_options to see the result [ 195.156942] nop_test_refuse flag set to 1: we refuse.Now cat trace_options to see the result After the fix, the dmesg will show aligned log for nop_test_refuse and nop_test_accept. [ 2718.032413] nop_test_refuse flag set to 1: we refuse. Now cat trace_options to see the result [ 2734.253360] nop_test_accept flag set to 1: we accept. Now cat trace_options to see the result Link: http://lkml.kernel.org/r/1457444222-8654-2-git-send-email-chuhu@redhat.com Signed-off-by: Chunyu Hu Signed-off-by: Steven Rostedt --- kernel/trace/trace_nop.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index 8bb2071474dd..49f61fe96a6b 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -56,7 +56,7 @@ static void nop_trace_reset(struct trace_array *tr) } /* It only serves as a signal handler and a callback to - * accept or refuse tthe setting of a flag. + * accept or refuse the setting of a flag. * If you don't implement it, then the flag setting will be * automatically accepted. */ @@ -75,7 +75,7 @@ static int nop_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) if (bit == TRACE_NOP_OPT_REFUSE) { printk(KERN_DEBUG "nop_test_refuse flag set to %d: we refuse." - "Now cat trace_options to see the result\n", + " Now cat trace_options to see the result\n", set); return -EINVAL; } -- cgit From 58cdb1ceb15aab7b34719ad225ff023775d774e7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 8 Mar 2016 11:51:25 -0500 Subject: cgroup: fix incorrect destination cgroup in cgroup_update_dfl_csses() cgroup_update_dfl_csses() should move each task in the subtree to self; however, it was incorrectly calling cgroup_migrate_add_src() with the root of the subtree as @dst_cgrp. Fortunately, cgroup_migrate_add_src() currently uses @dst_cgrp only to determine the hierarchy and the bug doesn't cause any actual breakages. Fix it. Signed-off-by: Tejun Heo --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c63fce0c5b24..50879aadcbd0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2901,7 +2901,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) struct cgrp_cset_link *link; list_for_each_entry(link, &dsct->cset_links, cset_link) - cgroup_migrate_add_src(link->cset, cgrp, + cgroup_migrate_add_src(link->cset, dsct, &preloaded_csets); } spin_unlock_bh(&css_set_lock); -- cgit From 6c694c88255b2052d9922d62df6df7c9e152eeeb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 8 Mar 2016 11:51:25 -0500 Subject: cgroup: move migration destination verification out of cgroup_migrate_prepare_dst() cgroup_migrate_prepare_dst() verifies whether the destination cgroup is allowable; however, the test doesn't really belong there. It's too deep and common in the stack and as a result the test itself is gated by another test. Separate the test out into cgroup_may_migrate_to() and update cgroup_attach_task() and cgroup_transfer_tasks() to perform the test directly. This doesn't cause any behavior differences. Signed-off-by: Tejun Heo --- kernel/cgroup.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 50879aadcbd0..8a02076d4317 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2443,6 +2443,20 @@ out_release_tset: return ret; } +/** + * cgroup_may_migrate_to - verify whether a cgroup can be migration destination + * @dst_cgrp: destination cgroup to test + * + * On the default hierarchy, except for the root, subtree_control must be + * zero for migration destination cgroups with tasks so that child cgroups + * don't compete against tasks. + */ +static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp) +{ + return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) || + !dst_cgrp->subtree_control; +} + /** * cgroup_migrate_finish - cleanup after attach * @preloaded_csets: list of preloaded css_sets @@ -2529,14 +2543,6 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, lockdep_assert_held(&cgroup_mutex); - /* - * Except for the root, subtree_control must be zero for a cgroup - * with tasks so that child cgroups don't compete against tasks. - */ - if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) && - dst_cgrp->subtree_control) - return -EBUSY; - /* look up the dst cset for each src cset and link it to src */ list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) { struct css_set *dst_cset; @@ -2634,6 +2640,9 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *task; int ret; + if (!cgroup_may_migrate_to(dst_cgrp)) + return -EBUSY; + /* look up all src csets */ spin_lock_bh(&css_set_lock); rcu_read_lock(); @@ -4136,6 +4145,9 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) struct task_struct *task; int ret; + if (!cgroup_may_migrate_to(to)) + return -EBUSY; + mutex_lock(&cgroup_mutex); /* all tasks in @from are being moved, all csets are source */ -- cgit From 37ff9f8f474216d0cfca7565a4e0caa521ee6e7e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 8 Mar 2016 11:51:26 -0500 Subject: cgroup: make cgroup[_taskset]_migrate() take cgroup_root instead of cgroup On the default hierarchy, a migration can be multi-source and/or multi-destination. cgroup_taskest_migrate() used to incorrectly assume single destination cgroup but the bug has been fixed by 1f7dd3e5a6e4 ("cgroup: fix handling of multi-destination migration from subtree_control enabling"). Since the commit, @dst_cgrp to cgroup[_taskset]_migrate() is only used to determine which subsystems are affected or which cgroup_root the migration is taking place in. As such, @dst_cgrp is misleading. This patch replaces @dst_cgrp with @root. Signed-off-by: Tejun Heo --- kernel/cgroup.c | 70 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 35 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8a02076d4317..5dd761355033 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2355,38 +2355,38 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, } /** - * cgroup_taskset_migrate - migrate a taskset to a cgroup + * cgroup_taskset_migrate - migrate a taskset * @tset: taget taskset - * @dst_cgrp: destination cgroup + * @root: cgroup root the migration is taking place on * - * Migrate tasks in @tset to @dst_cgrp. This function fails iff one of the - * ->can_attach callbacks fails and guarantees that either all or none of - * the tasks in @tset are migrated. @tset is consumed regardless of - * success. + * Migrate tasks in @tset as setup by migration preparation functions. + * This function fails iff one of the ->can_attach callbacks fails and + * guarantees that either all or none of the tasks in @tset are migrated. + * @tset is consumed regardless of success. */ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, - struct cgroup *dst_cgrp) + struct cgroup_root *root) { - struct cgroup_subsys_state *css, *failed_css = NULL; + struct cgroup_subsys *ss; struct task_struct *task, *tmp_task; struct css_set *cset, *tmp_cset; - int i, ret; + int ssid, failed_ssid, ret; /* methods shouldn't be called if no task is actually migrating */ if (list_empty(&tset->src_csets)) return 0; /* check that we can legitimately attach to the cgroup */ - for_each_e_css(css, i, dst_cgrp) { - if (css->ss->can_attach) { - tset->ssid = i; - ret = css->ss->can_attach(tset); + do_each_subsys_mask(ss, ssid, root->subsys_mask) { + if (ss->can_attach) { + tset->ssid = ssid; + ret = ss->can_attach(tset); if (ret) { - failed_css = css; + failed_ssid = ssid; goto out_cancel_attach; } } - } + } while_each_subsys_mask(); /* * Now that we're guaranteed success, proceed to move all tasks to @@ -2413,25 +2413,25 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, */ tset->csets = &tset->dst_csets; - for_each_e_css(css, i, dst_cgrp) { - if (css->ss->attach) { - tset->ssid = i; - css->ss->attach(tset); + do_each_subsys_mask(ss, ssid, root->subsys_mask) { + if (ss->attach) { + tset->ssid = ssid; + ss->attach(tset); } - } + } while_each_subsys_mask(); ret = 0; goto out_release_tset; out_cancel_attach: - for_each_e_css(css, i, dst_cgrp) { - if (css == failed_css) + do_each_subsys_mask(ss, ssid, root->subsys_mask) { + if (ssid == failed_ssid) break; - if (css->ss->cancel_attach) { - tset->ssid = i; - css->ss->cancel_attach(tset); + if (ss->cancel_attach) { + tset->ssid = ssid; + ss->cancel_attach(tset); } - } + } while_each_subsys_mask(); out_release_tset: spin_lock_bh(&css_set_lock); list_splice_init(&tset->dst_csets, &tset->src_csets); @@ -2586,11 +2586,11 @@ err: * cgroup_migrate - migrate a process or task to a cgroup * @leader: the leader of the process or the task to migrate * @threadgroup: whether @leader points to the whole process or a single task - * @cgrp: the destination cgroup + * @root: cgroup root migration is taking place on * - * Migrate a process or task denoted by @leader to @cgrp. If migrating a - * process, the caller must be holding cgroup_threadgroup_rwsem. The - * caller is also responsible for invoking cgroup_migrate_add_src() and + * Migrate a process or task denoted by @leader. If migrating a process, + * the caller must be holding cgroup_threadgroup_rwsem. The caller is also + * responsible for invoking cgroup_migrate_add_src() and * cgroup_migrate_prepare_dst() on the targets before invoking this * function and following up with cgroup_migrate_finish(). * @@ -2601,7 +2601,7 @@ err: * actually starting migrating. */ static int cgroup_migrate(struct task_struct *leader, bool threadgroup, - struct cgroup *cgrp) + struct cgroup_root *root) { struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); struct task_struct *task; @@ -2622,7 +2622,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup, rcu_read_unlock(); spin_unlock_bh(&css_set_lock); - return cgroup_taskset_migrate(&tset, cgrp); + return cgroup_taskset_migrate(&tset, root); } /** @@ -2659,7 +2659,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, /* prepare dst csets and commit */ ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets); if (!ret) - ret = cgroup_migrate(leader, threadgroup, dst_cgrp); + ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root); cgroup_migrate_finish(&preloaded_csets); return ret; @@ -2934,7 +2934,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) } spin_unlock_bh(&css_set_lock); - ret = cgroup_taskset_migrate(&tset, cgrp); + ret = cgroup_taskset_migrate(&tset, cgrp->root); out_finish: cgroup_migrate_finish(&preloaded_csets); percpu_up_write(&cgroup_threadgroup_rwsem); @@ -4172,7 +4172,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) css_task_iter_end(&it); if (task) { - ret = cgroup_migrate(task, false, to); + ret = cgroup_migrate(task, false, to->root); put_task_struct(task); } } while (task && !ret); -- cgit From e4857982f49d21c05a84351b56724bf353022355 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 8 Mar 2016 11:51:26 -0500 Subject: cgroup: use css_set->mg_dst_cgrp for the migration target cgroup Migration can be multi-target on the default hierarchy when a controller is enabled - processes belonging to each child cgroup have to be moved to the child cgroup itself to refresh css association. This isn't a problem for cgroup_migrate_add_src() as each source css_set still maps to single source and target cgroups; however, cgroup_migrate_prepare_dst() is called once after all source css_sets are added and thus might not have a single destination cgroup. This is currently worked around by specifying NULL for @dst_cgrp and using the source's default cgroup as destination as the only multi-target migration in use is self-targetting. While this works, it's subtle and clunky. As all taget cgroups are already specified while preparing the source css_sets, this clunkiness can easily be removed by recording the target cgroup in each source css_set. This patch adds css_set->mg_dst_cgrp which is recorded on cgroup_migrate_src() and used by cgroup_migrate_prepare_dst(). This also makes migration code ready for arbitrary multi-target migration. Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 9 +++++---- kernel/cgroup.c | 26 +++++++++++++------------- 2 files changed, 18 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index aae8c94de4b3..590291d56049 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -191,12 +191,13 @@ struct css_set { /* * If this cset is acting as the source of migration the following - * two fields are set. mg_src_cgrp is the source cgroup of the - * on-going migration and mg_dst_cset is the destination cset the - * target tasks on this cset should be migrated to. Protected by - * cgroup_mutex. + * two fields are set. mg_src_cgrp and mg_dst_cgrp are + * respectively the source and destination cgroups of the on-going + * migration. mg_dst_cset is the destination cset the target tasks + * on this cset should be migrated to. Protected by cgroup_mutex. */ struct cgroup *mg_src_cgrp; + struct cgroup *mg_dst_cgrp; struct css_set *mg_dst_cset; /* diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5dd761355033..fbd3e99a4e98 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2473,6 +2473,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) spin_lock_bh(&css_set_lock); list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { cset->mg_src_cgrp = NULL; + cset->mg_dst_cgrp = NULL; cset->mg_dst_cset = NULL; list_del_init(&cset->mg_preload_node); put_css_set_locked(cset); @@ -2511,32 +2512,31 @@ static void cgroup_migrate_add_src(struct css_set *src_cset, return; WARN_ON(src_cset->mg_src_cgrp); + WARN_ON(src_cset->mg_dst_cgrp); WARN_ON(!list_empty(&src_cset->mg_tasks)); WARN_ON(!list_empty(&src_cset->mg_node)); src_cset->mg_src_cgrp = src_cgrp; + src_cset->mg_dst_cgrp = dst_cgrp; get_css_set(src_cset); list_add(&src_cset->mg_preload_node, preloaded_csets); } /** * cgroup_migrate_prepare_dst - prepare destination css_sets for migration - * @dst_cgrp: the destination cgroup (may be %NULL) * @preloaded_csets: list of preloaded source css_sets * - * Tasks are about to be moved to @dst_cgrp and all the source css_sets - * have been preloaded to @preloaded_csets. This function looks up and - * pins all destination css_sets, links each to its source, and append them - * to @preloaded_csets. If @dst_cgrp is %NULL, the destination of each - * source css_set is assumed to be its cgroup on the default hierarchy. + * Tasks are about to be moved and all the source css_sets have been + * preloaded to @preloaded_csets. This function looks up and pins all + * destination css_sets, links each to its source, and append them to + * @preloaded_csets. * * This function must be called after cgroup_migrate_add_src() has been * called on each migration source css_set. After migration is performed * using cgroup_migrate(), cgroup_migrate_finish() must be called on * @preloaded_csets. */ -static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, - struct list_head *preloaded_csets) +static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets) { LIST_HEAD(csets); struct css_set *src_cset, *tmp_cset; @@ -2547,8 +2547,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) { struct css_set *dst_cset; - dst_cset = find_css_set(src_cset, - dst_cgrp ?: src_cset->dfl_cgrp); + dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp); if (!dst_cset) goto err; @@ -2561,6 +2560,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, */ if (src_cset == dst_cset) { src_cset->mg_src_cgrp = NULL; + src_cset->mg_dst_cgrp = NULL; list_del_init(&src_cset->mg_preload_node); put_css_set(src_cset); put_css_set(dst_cset); @@ -2657,7 +2657,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, spin_unlock_bh(&css_set_lock); /* prepare dst csets and commit */ - ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets); + ret = cgroup_migrate_prepare_dst(&preloaded_csets); if (!ret) ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root); @@ -2916,7 +2916,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) spin_unlock_bh(&css_set_lock); /* NULL dst indicates self on default hierarchy */ - ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets); + ret = cgroup_migrate_prepare_dst(&preloaded_csets); if (ret) goto out_finish; @@ -4156,7 +4156,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) cgroup_migrate_add_src(link->cset, to, &preloaded_csets); spin_unlock_bh(&css_set_lock); - ret = cgroup_migrate_prepare_dst(to, &preloaded_csets); + ret = cgroup_migrate_prepare_dst(&preloaded_csets); if (ret) goto out_err; -- cgit From f6d635ad341d5cc0b9c7ab46adfbf3bf5886cee4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 8 Mar 2016 11:51:26 -0500 Subject: cgroup: implement cgroup_subsys->implicit_on_dfl Some controllers, perf_event for now and possibly freezer in the future, don't really make sense to control explicitly through "cgroup.subtree_control". For example, the primary role of perf_event is identifying the cgroups of tasks; however, because the controller also keeps a small amount of state per cgroup, it can't be replaced with simple cgroup membership tests. This patch implements cgroup_subsys->implicit_on_dfl flag. When set, the controller is implicitly enabled on all cgroups on the v2 hierarchy so that utility type controllers such as perf_event can be enabled and function transparently. An implicit controller doesn't show up in "cgroup.controllers" or "cgroup.subtree_control", is exempt from no internal process rule and can be stolen from the default hierarchy even if there are non-root csses. v2: Reimplemented on top of the recent updates to css handling and subsystem rebinding. Rebinding implicit subsystems is now a simple matter of exempting it from the busy subsystem check. Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 13 +++++++++++++ kernel/cgroup.c | 38 +++++++++++++++++++++++++++++++------- 2 files changed, 44 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 590291d56049..34b42f03fcd8 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -450,6 +450,19 @@ struct cgroup_subsys { bool early_init:1; + /* + * If %true, the controller, on the default hierarchy, doesn't show + * up in "cgroup.controllers" or "cgroup.subtree_control", is + * implicitly enabled on all cgroups on the default hierarchy, and + * bypasses the "no internal process" constraint. This is for + * utility type controllers which is transparent to userland. + * + * An implicit controller can be stolen from the default hierarchy + * anytime and thus must be okay with offline csses from previous + * hierarchies coexisting with csses for the current one. + */ + bool implicit_on_dfl:1; + /* * If %false, this subsystem is properly hierarchical - * configuration, resource accounting and restriction on a parent diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fbd3e99a4e98..e22df5d81e59 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -186,6 +186,9 @@ static u16 cgroup_no_v1_mask; /* some controllers are not supported in the default hierarchy */ static u16 cgrp_dfl_inhibit_ss_mask; +/* some controllers are implicitly enabled on the default hierarchy */ +static unsigned long cgrp_dfl_implicit_ss_mask; + /* The list of hierarchy roots */ static LIST_HEAD(cgroup_roots); @@ -359,8 +362,8 @@ static u16 cgroup_control(struct cgroup *cgrp) return parent->subtree_control; if (cgroup_on_dfl(cgrp)) - root_ss_mask &= ~cgrp_dfl_inhibit_ss_mask; - + root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask | + cgrp_dfl_implicit_ss_mask); return root_ss_mask; } @@ -1327,6 +1330,8 @@ static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask) lockdep_assert_held(&cgroup_mutex); + cur_ss_mask |= cgrp_dfl_implicit_ss_mask; + while (true) { u16 new_ss_mask = cur_ss_mask; @@ -1512,8 +1517,13 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) lockdep_assert_held(&cgroup_mutex); do_each_subsys_mask(ss, ssid, ss_mask) { - /* if @ss has non-root csses attached to it, can't move */ - if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss))) + /* + * If @ss has non-root csses attached to it, can't move. + * If @ss is an implicit controller, it is exempt from this + * rule and can be stolen. + */ + if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) && + !ss->implicit_on_dfl) return -EBUSY; /* can't move between two non-dummy roots either */ @@ -3039,6 +3049,18 @@ static void cgroup_restore_control(struct cgroup *cgrp) } } +static bool css_visible(struct cgroup_subsys_state *css) +{ + struct cgroup_subsys *ss = css->ss; + struct cgroup *cgrp = css->cgroup; + + if (cgroup_control(cgrp) & (1 << ss->id)) + return true; + if (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) + return false; + return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl; +} + /** * cgroup_apply_control_enable - enable or show csses according to control * @cgrp: root of the target subtree @@ -3074,7 +3096,7 @@ static int cgroup_apply_control_enable(struct cgroup *cgrp) return PTR_ERR(css); } - if (cgroup_control(dsct) & (1 << ss->id)) { + if (css_visible(css)) { ret = css_populate_dir(css); if (ret) return ret; @@ -3117,7 +3139,7 @@ static void cgroup_apply_control_disable(struct cgroup *cgrp) if (css->parent && !(cgroup_ss_mask(dsct) & (1 << ss->id))) { kill_css(css); - } else if (!(cgroup_control(dsct) & (1 << ss->id))) { + } else if (!css_visible(css)) { css_clear_dir(css); if (ss->css_reset) ss->css_reset(css); @@ -5455,7 +5477,9 @@ int __init cgroup_init(void) cgrp_dfl_root.subsys_mask |= 1 << ss->id; - if (!ss->dfl_cftypes) + if (ss->implicit_on_dfl) + cgrp_dfl_implicit_ss_mask |= 1 << ss->id; + else if (!ss->dfl_cftypes) cgrp_dfl_inhibit_ss_mask |= 1 << ss->id; if (ss->dfl_cftypes == ss->legacy_cftypes) { -- cgit From b121d1e74d1f24654bdc3165d3db1ca149501356 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 7 Mar 2016 21:57:13 -0800 Subject: bpf: prevent kprobe+bpf deadlocks if kprobe is placed within update or delete hash map helpers that hold bucket spin lock and triggered bpf program is trying to grab the spinlock for the same bucket on the same cpu, it will deadlock. Fix it by extending existing recursion prevention mechanism. Note, map_lookup and other tracing helpers don't have this problem, since they don't hold any locks and don't modify global data. bpf_trace_printk has its own recursive check and ok as well. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 3 +++ kernel/bpf/syscall.c | 13 +++++++++++++ kernel/trace/bpf_trace.c | 2 -- 3 files changed, 16 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 51e498e5470e..4b070827200d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -10,6 +10,7 @@ #include #include #include +#include struct bpf_map; @@ -163,6 +164,8 @@ bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *f const struct bpf_func_proto *bpf_get_trace_printk_proto(void); #ifdef CONFIG_BPF_SYSCALL +DECLARE_PER_CPU(int, bpf_prog_active); + void bpf_register_prog_type(struct bpf_prog_type_list *tl); void bpf_register_map_type(struct bpf_map_type_list *tl); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c95a753c2007..dc99f6a000f5 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -18,6 +18,8 @@ #include #include +DEFINE_PER_CPU(int, bpf_prog_active); + int sysctl_unprivileged_bpf_disabled __read_mostly; static LIST_HEAD(bpf_map_types); @@ -347,6 +349,11 @@ static int map_update_elem(union bpf_attr *attr) if (copy_from_user(value, uvalue, value_size) != 0) goto free_value; + /* must increment bpf_prog_active to avoid kprobe+bpf triggering from + * inside bpf map update or delete otherwise deadlocks are possible + */ + preempt_disable(); + __this_cpu_inc(bpf_prog_active); if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { err = bpf_percpu_hash_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { @@ -356,6 +363,8 @@ static int map_update_elem(union bpf_attr *attr) err = map->ops->map_update_elem(map, key, value, attr->flags); rcu_read_unlock(); } + __this_cpu_dec(bpf_prog_active); + preempt_enable(); free_value: kfree(value); @@ -394,9 +403,13 @@ static int map_delete_elem(union bpf_attr *attr) if (copy_from_user(key, ukey, map->key_size) != 0) goto free_key; + preempt_disable(); + __this_cpu_inc(bpf_prog_active); rcu_read_lock(); err = map->ops->map_delete_elem(map, key); rcu_read_unlock(); + __this_cpu_dec(bpf_prog_active); + preempt_enable(); free_key: kfree(key); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4b8caa392b86..3e4ffb3ace5f 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -13,8 +13,6 @@ #include #include "trace.h" -static DEFINE_PER_CPU(int, bpf_prog_active); - /** * trace_call_bpf - invoke BPF program * @prog: BPF program -- cgit From e19494edab82f55a633911f25094581891bdc351 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 7 Mar 2016 21:57:14 -0800 Subject: bpf: introduce percpu_freelist Introduce simple percpu_freelist to keep single list of elements spread across per-cpu singly linked lists. /* push element into the list */ void pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *); /* pop element from the list */ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *); The object is pushed to the current cpu list. Pop first trying to get the object from the current cpu list, if it's empty goes to the neigbour cpu list. For bpf program usage pattern the collision rate is very low, since programs push and pop the objects typically on the same cpu. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/Makefile | 2 +- kernel/bpf/percpu_freelist.c | 100 +++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/percpu_freelist.h | 31 ++++++++++++++ 3 files changed, 132 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/percpu_freelist.c create mode 100644 kernel/bpf/percpu_freelist.h (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 8a932d079c24..eed911d091da 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,7 +1,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o -obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c new file mode 100644 index 000000000000..5c51d1985b51 --- /dev/null +++ b/kernel/bpf/percpu_freelist.c @@ -0,0 +1,100 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include "percpu_freelist.h" + +int pcpu_freelist_init(struct pcpu_freelist *s) +{ + int cpu; + + s->freelist = alloc_percpu(struct pcpu_freelist_head); + if (!s->freelist) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct pcpu_freelist_head *head = per_cpu_ptr(s->freelist, cpu); + + raw_spin_lock_init(&head->lock); + head->first = NULL; + } + return 0; +} + +void pcpu_freelist_destroy(struct pcpu_freelist *s) +{ + free_percpu(s->freelist); +} + +static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head, + struct pcpu_freelist_node *node) +{ + raw_spin_lock(&head->lock); + node->next = head->first; + head->first = node; + raw_spin_unlock(&head->lock); +} + +void pcpu_freelist_push(struct pcpu_freelist *s, + struct pcpu_freelist_node *node) +{ + struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist); + + __pcpu_freelist_push(head, node); +} + +void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, + u32 nr_elems) +{ + struct pcpu_freelist_head *head; + unsigned long flags; + int i, cpu, pcpu_entries; + + pcpu_entries = nr_elems / num_possible_cpus() + 1; + i = 0; + + /* disable irq to workaround lockdep false positive + * in bpf usage pcpu_freelist_populate() will never race + * with pcpu_freelist_push() + */ + local_irq_save(flags); + for_each_possible_cpu(cpu) { +again: + head = per_cpu_ptr(s->freelist, cpu); + __pcpu_freelist_push(head, buf); + i++; + buf += elem_size; + if (i == nr_elems) + break; + if (i % pcpu_entries) + goto again; + } + local_irq_restore(flags); +} + +struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) +{ + struct pcpu_freelist_head *head; + struct pcpu_freelist_node *node; + int orig_cpu, cpu; + + orig_cpu = cpu = raw_smp_processor_id(); + while (1) { + head = per_cpu_ptr(s->freelist, cpu); + raw_spin_lock(&head->lock); + node = head->first; + if (node) { + head->first = node->next; + raw_spin_unlock(&head->lock); + return node; + } + raw_spin_unlock(&head->lock); + cpu = cpumask_next(cpu, cpu_possible_mask); + if (cpu >= nr_cpu_ids) + cpu = 0; + if (cpu == orig_cpu) + return NULL; + } +} diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h new file mode 100644 index 000000000000..3049aae8ea1e --- /dev/null +++ b/kernel/bpf/percpu_freelist.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#ifndef __PERCPU_FREELIST_H__ +#define __PERCPU_FREELIST_H__ +#include +#include + +struct pcpu_freelist_head { + struct pcpu_freelist_node *first; + raw_spinlock_t lock; +}; + +struct pcpu_freelist { + struct pcpu_freelist_head __percpu *freelist; +}; + +struct pcpu_freelist_node { + struct pcpu_freelist_node *next; +}; + +void pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *); +struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *); +void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, + u32 nr_elems); +int pcpu_freelist_init(struct pcpu_freelist *); +void pcpu_freelist_destroy(struct pcpu_freelist *s); +#endif -- cgit From 6c90598174322b8888029e40dd84a4eb01f56afe Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 7 Mar 2016 21:57:15 -0800 Subject: bpf: pre-allocate hash map elements If kprobe is placed on spin_unlock then calling kmalloc/kfree from bpf programs is not safe, since the following dead lock is possible: kfree->spin_lock(kmem_cache_node->lock)...spin_unlock->kprobe-> bpf_prog->map_update->kmalloc->spin_lock(of the same kmem_cache_node->lock) and deadlocks. The following solutions were considered and some implemented, but eventually discarded - kmem_cache_create for every map - add recursion check to slow-path of slub - use reserved memory in bpf_map_update for in_irq or in preempt_disabled - kmalloc via irq_work At the end pre-allocation of all map elements turned out to be the simplest solution and since the user is charged upfront for all the memory, such pre-allocation doesn't affect the user space visible behavior. Since it's impossible to tell whether kprobe is triggered in a safe location from kmalloc point of view, use pre-allocation by default and introduce new BPF_F_NO_PREALLOC flag. While testing of per-cpu hash maps it was discovered that alloc_percpu(GFP_ATOMIC) has odd corner cases and often fails to allocate memory even when 90% of it is free. The pre-allocation of per-cpu hash elements solves this problem as well. Turned out that bpf_map_update() quickly followed by bpf_map_lookup()+bpf_map_delete() is very common pattern used in many of iovisor/bcc/tools, so there is additional benefit of pre-allocation, since such use cases are must faster. Since all hash map elements are now pre-allocated we can remove atomic increment of htab->count and save few more cycles. Also add bpf_map_precharge_memlock() to check rlimit_memlock early to avoid large malloc/free done by users who don't have sufficient limits. Pre-allocation is done with vmalloc and alloc/free is done via percpu_freelist. Here are performance numbers for different pre-allocation algorithms that were implemented, but discarded in favor of percpu_freelist: 1 cpu: pcpu_ida 2.1M pcpu_ida nolock 2.3M bt 2.4M kmalloc 1.8M hlist+spinlock 2.3M pcpu_freelist 2.6M 4 cpu: pcpu_ida 1.5M pcpu_ida nolock 1.8M bt w/smp_align 1.7M bt no/smp_align 1.1M kmalloc 0.7M hlist+spinlock 0.2M pcpu_freelist 2.0M 8 cpu: pcpu_ida 0.7M bt w/smp_align 0.8M kmalloc 0.4M pcpu_freelist 1.5M 32 cpu: kmalloc 0.13M pcpu_freelist 0.49M pcpu_ida nolock is a modified percpu_ida algorithm without percpu_ida_cpu locks and without cross-cpu tag stealing. It's faster than existing percpu_ida, but not as fast as pcpu_freelist. bt is a variant of block/blk-mq-tag.c simlified and customized for bpf use case. bt w/smp_align is using cache line for every 'long' (similar to blk-mq-tag). bt no/smp_align allocates 'long' bitmasks continuously to save memory. It's comparable to percpu_ida and in some cases faster, but slower than percpu_freelist hlist+spinlock is the simplest free list with single spinlock. As expeceted it has very bad scaling in SMP. kmalloc is existing implementation which is still available via BPF_F_NO_PREALLOC flag. It's significantly slower in single cpu and in 8 cpu setup it's 3 times slower than pre-allocation with pcpu_freelist, but saves memory, so in cases where map->max_entries can be large and number of map update/delete per second is low, it may make sense to use it. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 2 + include/uapi/linux/bpf.h | 3 + kernel/bpf/hashtab.c | 240 +++++++++++++++++++++++++++++++++-------------- kernel/bpf/syscall.c | 15 ++- 4 files changed, 186 insertions(+), 74 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4b070827200d..efd1d4ca95c6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -37,6 +37,7 @@ struct bpf_map { u32 key_size; u32 value_size; u32 max_entries; + u32 map_flags; u32 pages; struct user_struct *user; const struct bpf_map_ops *ops; @@ -178,6 +179,7 @@ struct bpf_map *__bpf_map_get(struct fd f); void bpf_map_inc(struct bpf_map *map, bool uref); void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); +int bpf_map_precharge_memlock(u32 pages); extern int sysctl_unprivileged_bpf_disabled; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9221f653fee3..0e30b19012a5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -101,12 +101,15 @@ enum bpf_prog_type { #define BPF_NOEXIST 1 /* create new element if it didn't exist */ #define BPF_EXIST 2 /* update existing element */ +#define BPF_F_NO_PREALLOC (1U << 0) + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ __u32 key_size; /* size of key in bytes */ __u32 value_size; /* size of value in bytes */ __u32 max_entries; /* max number of entries in a map */ + __u32 map_flags; /* prealloc or not */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index a68e95133fcd..fff3650d52fc 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1,4 +1,5 @@ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -13,6 +14,7 @@ #include #include #include +#include "percpu_freelist.h" struct bucket { struct hlist_head head; @@ -22,6 +24,8 @@ struct bucket { struct bpf_htab { struct bpf_map map; struct bucket *buckets; + void *elems; + struct pcpu_freelist freelist; atomic_t count; /* number of elements in this hashtable */ u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ @@ -29,15 +33,86 @@ struct bpf_htab { /* each htab element is struct htab_elem + key + value */ struct htab_elem { - struct hlist_node hash_node; - struct rcu_head rcu; union { - u32 hash; - u32 key_size; + struct hlist_node hash_node; + struct bpf_htab *htab; + struct pcpu_freelist_node fnode; }; + struct rcu_head rcu; + u32 hash; char key[0] __aligned(8); }; +static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, + void __percpu *pptr) +{ + *(void __percpu **)(l->key + key_size) = pptr; +} + +static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) +{ + return *(void __percpu **)(l->key + key_size); +} + +static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) +{ + return (struct htab_elem *) (htab->elems + i * htab->elem_size); +} + +static void htab_free_elems(struct bpf_htab *htab) +{ + int i; + + if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH) + goto free_elems; + + for (i = 0; i < htab->map.max_entries; i++) { + void __percpu *pptr; + + pptr = htab_elem_get_ptr(get_htab_elem(htab, i), + htab->map.key_size); + free_percpu(pptr); + } +free_elems: + vfree(htab->elems); +} + +static int prealloc_elems_and_freelist(struct bpf_htab *htab) +{ + int err = -ENOMEM, i; + + htab->elems = vzalloc(htab->elem_size * htab->map.max_entries); + if (!htab->elems) + return -ENOMEM; + + if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH) + goto skip_percpu_elems; + + for (i = 0; i < htab->map.max_entries; i++) { + u32 size = round_up(htab->map.value_size, 8); + void __percpu *pptr; + + pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN); + if (!pptr) + goto free_elems; + htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size, + pptr); + } + +skip_percpu_elems: + err = pcpu_freelist_init(&htab->freelist); + if (err) + goto free_elems; + + pcpu_freelist_populate(&htab->freelist, htab->elems, htab->elem_size, + htab->map.max_entries); + return 0; + +free_elems: + htab_free_elems(htab); + return err; +} + /* Called from syscall */ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { @@ -46,6 +121,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) int err, i; u64 cost; + if (attr->map_flags & ~BPF_F_NO_PREALLOC) + /* reserved bits should not be used */ + return ERR_PTR(-EINVAL); + htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) return ERR_PTR(-ENOMEM); @@ -55,6 +134,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) htab->map.key_size = attr->key_size; htab->map.value_size = attr->value_size; htab->map.max_entries = attr->max_entries; + htab->map.map_flags = attr->map_flags; /* check sanity of attributes. * value_size == 0 may be allowed in the future to use map as a set @@ -92,7 +172,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (percpu) htab->elem_size += sizeof(void *); else - htab->elem_size += htab->map.value_size; + htab->elem_size += round_up(htab->map.value_size, 8); /* prevent zero size kmalloc and check for u32 overflow */ if (htab->n_buckets == 0 || @@ -112,6 +192,11 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + /* if map size is larger than memlock limit, reject it early */ + err = bpf_map_precharge_memlock(htab->map.pages); + if (err) + goto free_htab; + err = -ENOMEM; htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket), GFP_USER | __GFP_NOWARN); @@ -127,10 +212,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) raw_spin_lock_init(&htab->buckets[i].lock); } - atomic_set(&htab->count, 0); + if (!(attr->map_flags & BPF_F_NO_PREALLOC)) { + err = prealloc_elems_and_freelist(htab); + if (err) + goto free_buckets; + } return &htab->map; +free_buckets: + kvfree(htab->buckets); free_htab: kfree(htab); return ERR_PTR(err); @@ -249,42 +340,42 @@ find_first_elem: } } - /* itereated over all buckets and all elements */ + /* iterated over all buckets and all elements */ return -ENOENT; } - -static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, - void __percpu *pptr) -{ - *(void __percpu **)(l->key + key_size) = pptr; -} - -static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) -{ - return *(void __percpu **)(l->key + key_size); -} - -static void htab_percpu_elem_free(struct htab_elem *l) +static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) { - free_percpu(htab_elem_get_ptr(l, l->key_size)); + if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) + free_percpu(htab_elem_get_ptr(l, htab->map.key_size)); kfree(l); + } -static void htab_percpu_elem_free_rcu(struct rcu_head *head) +static void htab_elem_free_rcu(struct rcu_head *head) { struct htab_elem *l = container_of(head, struct htab_elem, rcu); + struct bpf_htab *htab = l->htab; - htab_percpu_elem_free(l); + /* must increment bpf_prog_active to avoid kprobe+bpf triggering while + * we're calling kfree, otherwise deadlock is possible if kprobes + * are placed somewhere inside of slub + */ + preempt_disable(); + __this_cpu_inc(bpf_prog_active); + htab_elem_free(htab, l); + __this_cpu_dec(bpf_prog_active); + preempt_enable(); } -static void free_htab_elem(struct htab_elem *l, bool percpu, u32 key_size) +static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) { - if (percpu) { - l->key_size = key_size; - call_rcu(&l->rcu, htab_percpu_elem_free_rcu); + if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) { + pcpu_freelist_push(&htab->freelist, &l->fnode); } else { - kfree_rcu(l, rcu); + atomic_dec(&htab->count); + l->htab = htab; + call_rcu(&l->rcu, htab_elem_free_rcu); } } @@ -293,23 +384,39 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, bool percpu, bool onallcpus) { u32 size = htab->map.value_size; + bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC); struct htab_elem *l_new; void __percpu *pptr; - l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); - if (!l_new) - return NULL; + if (prealloc) { + l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist); + if (!l_new) + return ERR_PTR(-E2BIG); + } else { + if (atomic_inc_return(&htab->count) > htab->map.max_entries) { + atomic_dec(&htab->count); + return ERR_PTR(-E2BIG); + } + l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); + if (!l_new) + return ERR_PTR(-ENOMEM); + } memcpy(l_new->key, key, key_size); if (percpu) { /* round up value_size to 8 bytes */ size = round_up(size, 8); - /* alloc_percpu zero-fills */ - pptr = __alloc_percpu_gfp(size, 8, GFP_ATOMIC | __GFP_NOWARN); - if (!pptr) { - kfree(l_new); - return NULL; + if (prealloc) { + pptr = htab_elem_get_ptr(l_new, key_size); + } else { + /* alloc_percpu zero-fills */ + pptr = __alloc_percpu_gfp(size, 8, + GFP_ATOMIC | __GFP_NOWARN); + if (!pptr) { + kfree(l_new); + return ERR_PTR(-ENOMEM); + } } if (!onallcpus) { @@ -324,7 +431,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, off += size; } } - htab_elem_set_ptr(l_new, key_size, pptr); + if (!prealloc) + htab_elem_set_ptr(l_new, key_size, pptr); } else { memcpy(l_new->key + round_up(key_size, 8), value, size); } @@ -336,12 +444,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, u64 map_flags) { - if (!l_old && unlikely(atomic_read(&htab->count) >= htab->map.max_entries)) - /* if elem with this 'key' doesn't exist and we've reached - * max_entries limit, fail insertion of new elem - */ - return -E2BIG; - if (l_old && map_flags == BPF_NOEXIST) /* elem already exists */ return -EEXIST; @@ -375,13 +477,6 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, hash = htab_map_hash(key, key_size); - /* allocate new element outside of the lock, since - * we're most likley going to insert it - */ - l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false); - if (!l_new) - return -ENOMEM; - b = __select_bucket(htab, hash); head = &b->head; @@ -394,21 +489,24 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, if (ret) goto err; + l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false); + if (IS_ERR(l_new)) { + /* all pre-allocated elements are in use or memory exhausted */ + ret = PTR_ERR(l_new); + goto err; + } + /* add new element to the head of the list, so that * concurrent search will find it before old elem */ hlist_add_head_rcu(&l_new->hash_node, head); if (l_old) { hlist_del_rcu(&l_old->hash_node); - kfree_rcu(l_old, rcu); - } else { - atomic_inc(&htab->count); + free_htab_elem(htab, l_old); } - raw_spin_unlock_irqrestore(&b->lock, flags); - return 0; + ret = 0; err: raw_spin_unlock_irqrestore(&b->lock, flags); - kfree(l_new); return ret; } @@ -466,12 +564,11 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, } else { l_new = alloc_htab_elem(htab, key, value, key_size, hash, true, onallcpus); - if (!l_new) { - ret = -ENOMEM; + if (IS_ERR(l_new)) { + ret = PTR_ERR(l_new); goto err; } hlist_add_head_rcu(&l_new->hash_node, head); - atomic_inc(&htab->count); } ret = 0; err: @@ -489,7 +586,6 @@ static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, static int htab_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - bool percpu = map->map_type == BPF_MAP_TYPE_PERCPU_HASH; struct hlist_head *head; struct bucket *b; struct htab_elem *l; @@ -511,8 +607,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) if (l) { hlist_del_rcu(&l->hash_node); - atomic_dec(&htab->count); - free_htab_elem(l, percpu, key_size); + free_htab_elem(htab, l); ret = 0; } @@ -531,17 +626,10 @@ static void delete_all_elements(struct bpf_htab *htab) hlist_for_each_entry_safe(l, n, head, hash_node) { hlist_del_rcu(&l->hash_node); - atomic_dec(&htab->count); - if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) { - l->key_size = htab->map.key_size; - htab_percpu_elem_free(l); - } else { - kfree(l); - } + htab_elem_free(htab, l); } } } - /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void htab_map_free(struct bpf_map *map) { @@ -554,10 +642,16 @@ static void htab_map_free(struct bpf_map *map) */ synchronize_rcu(); - /* some of kfree_rcu() callbacks for elements of this map may not have - * executed. It's ok. Proceed to free residual elements and map itself + /* some of free_htab_elem() callbacks for elements of this map may + * not have executed. Wait for them. */ - delete_all_elements(htab); + rcu_barrier(); + if (htab->map.map_flags & BPF_F_NO_PREALLOC) { + delete_all_elements(htab); + } else { + htab_free_elems(htab); + pcpu_freelist_destroy(&htab->freelist); + } kvfree(htab->buckets); kfree(htab); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index dc99f6a000f5..cbd94b2144ff 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -48,6 +48,19 @@ void bpf_register_map_type(struct bpf_map_type_list *tl) list_add(&tl->list_node, &bpf_map_types); } +int bpf_map_precharge_memlock(u32 pages) +{ + struct user_struct *user = get_current_user(); + unsigned long memlock_limit, cur; + + memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + cur = atomic_long_read(&user->locked_vm); + free_uid(user); + if (cur + pages > memlock_limit) + return -EPERM; + return 0; +} + static int bpf_map_charge_memlock(struct bpf_map *map) { struct user_struct *user = get_current_user(); @@ -153,7 +166,7 @@ int bpf_map_new_fd(struct bpf_map *map) offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL -#define BPF_MAP_CREATE_LAST_FIELD max_entries +#define BPF_MAP_CREATE_LAST_FIELD map_flags /* called via syscall */ static int map_create(union bpf_attr *attr) { -- cgit From 823707b68d6e6c4b1be619b039c7045fef1740e6 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 7 Mar 2016 21:57:16 -0800 Subject: bpf: check for reserved flag bits in array and stack maps Suggested-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/arraymap.c | 2 +- kernel/bpf/stackmap.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index bd3bdf2486a7..76d5a794e426 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -53,7 +53,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size == 0) + attr->value_size == 0 || attr->map_flags) return ERR_PTR(-EINVAL); if (attr->value_size >= 1 << (KMALLOC_SHIFT_MAX - 1)) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 8a60ee14a977..f0a02c344358 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -35,6 +35,9 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); + if (attr->map_flags) + return ERR_PTR(-EINVAL); + /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || value_size < 8 || value_size % 8 || -- cgit From 557c0c6e7df8e14a46bd7560d193fa5bbc00a858 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 7 Mar 2016 21:57:17 -0800 Subject: bpf: convert stackmap to pre-allocation It was observed that calling bpf_get_stackid() from a kprobe inside slub or from spin_unlock causes similar deadlock as with hashmap, therefore convert stackmap to use pre-allocated memory. The call_rcu is no longer feasible mechanism, since delayed freeing causes bpf_get_stackid() to fail unpredictably when number of actual stacks is significantly less than user requested max_entries. Since elements are no longer freed into slub, we can push elements into freelist immediately and let them be recycled. However the very unlikley race between user space map_lookup() and program-side recycling is possible: cpu0 cpu1 ---- ---- user does lookup(stackidX) starts copying ips into buffer delete(stackidX) calls bpf_get_stackid() which recyles the element and overwrites with new stack trace To avoid user space seeing a partial stack trace consisting of two merged stack traces, do bucket = xchg(, NULL); copy; xchg(,bucket); to preserve consistent stack trace delivery to user space. Now we can move memset(,0) of left-over element value from critical path of bpf_get_stackid() into slow-path of user space lookup. Also disallow lookup() from bpf program, since it's useless and program shouldn't be messing with collected stack trace. Note that similar race between user space lookup and kernel side updates is also present in hashmap, but it's not a new race. bpf programs were always allowed to modify hash and array map elements while user space is copying them. Fixes: d5a3b1f69186 ("bpf: introduce BPF_MAP_TYPE_STACK_TRACE") Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + kernel/bpf/stackmap.c | 86 ++++++++++++++++++++++++++++++++++++++++----------- kernel/bpf/syscall.c | 2 ++ 3 files changed, 71 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index efd1d4ca95c6..21ee41b92e8a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -195,6 +195,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, u64 flags); +int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value); /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and * forced to use 'long' read/writes to try to atomically copy long counters. diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index f0a02c344358..499d9e933f8e 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -10,9 +10,10 @@ #include #include #include +#include "percpu_freelist.h" struct stack_map_bucket { - struct rcu_head rcu; + struct pcpu_freelist_node fnode; u32 hash; u32 nr; u64 ip[]; @@ -20,10 +21,34 @@ struct stack_map_bucket { struct bpf_stack_map { struct bpf_map map; + void *elems; + struct pcpu_freelist freelist; u32 n_buckets; - struct stack_map_bucket __rcu *buckets[]; + struct stack_map_bucket *buckets[]; }; +static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) +{ + u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size; + int err; + + smap->elems = vzalloc(elem_size * smap->map.max_entries); + if (!smap->elems) + return -ENOMEM; + + err = pcpu_freelist_init(&smap->freelist); + if (err) + goto free_elems; + + pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size, + smap->map.max_entries); + return 0; + +free_elems: + vfree(smap->elems); + return err; +} + /* Called from syscall */ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) { @@ -70,12 +95,22 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) smap->n_buckets = n_buckets; smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + err = bpf_map_precharge_memlock(smap->map.pages); + if (err) + goto free_smap; + err = get_callchain_buffers(); if (err) goto free_smap; + err = prealloc_elems_and_freelist(smap); + if (err) + goto put_buffers; + return &smap->map; +put_buffers: + put_callchain_buffers(); free_smap: kvfree(smap); return ERR_PTR(err); @@ -121,7 +156,7 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) ips = trace->ip + skip + init_nr; hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0); id = hash & (smap->n_buckets - 1); - bucket = rcu_dereference(smap->buckets[id]); + bucket = READ_ONCE(smap->buckets[id]); if (bucket && bucket->hash == hash) { if (flags & BPF_F_FAST_STACK_CMP) @@ -135,19 +170,18 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) if (bucket && !(flags & BPF_F_REUSE_STACKID)) return -EEXIST; - new_bucket = kmalloc(sizeof(struct stack_map_bucket) + map->value_size, - GFP_ATOMIC | __GFP_NOWARN); + new_bucket = (struct stack_map_bucket *) + pcpu_freelist_pop(&smap->freelist); if (unlikely(!new_bucket)) return -ENOMEM; memcpy(new_bucket->ip, ips, trace_len); - memset(new_bucket->ip + trace_len / 8, 0, map->value_size - trace_len); new_bucket->hash = hash; new_bucket->nr = trace_nr; old_bucket = xchg(&smap->buckets[id], new_bucket); if (old_bucket) - kfree_rcu(old_bucket, rcu); + pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); return id; } @@ -160,17 +194,34 @@ const struct bpf_func_proto bpf_get_stackid_proto = { .arg3_type = ARG_ANYTHING, }; -/* Called from syscall or from eBPF program */ +/* Called from eBPF program */ static void *stack_map_lookup_elem(struct bpf_map *map, void *key) +{ + return NULL; +} + +/* Called from syscall */ +int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); - struct stack_map_bucket *bucket; - u32 id = *(u32 *)key; + struct stack_map_bucket *bucket, *old_bucket; + u32 id = *(u32 *)key, trace_len; if (unlikely(id >= smap->n_buckets)) - return NULL; - bucket = rcu_dereference(smap->buckets[id]); - return bucket ? bucket->ip : NULL; + return -ENOENT; + + bucket = xchg(&smap->buckets[id], NULL); + if (!bucket) + return -ENOENT; + + trace_len = bucket->nr * sizeof(u64); + memcpy(value, bucket->ip, trace_len); + memset(value + trace_len, 0, map->value_size - trace_len); + + old_bucket = xchg(&smap->buckets[id], bucket); + if (old_bucket) + pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); + return 0; } static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key) @@ -196,7 +247,7 @@ static int stack_map_delete_elem(struct bpf_map *map, void *key) old_bucket = xchg(&smap->buckets[id], NULL); if (old_bucket) { - kfree_rcu(old_bucket, rcu); + pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); return 0; } else { return -ENOENT; @@ -207,13 +258,12 @@ static int stack_map_delete_elem(struct bpf_map *map, void *key) static void stack_map_free(struct bpf_map *map) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); - int i; + /* wait for bpf programs to complete before freeing stack map */ synchronize_rcu(); - for (i = 0; i < smap->n_buckets; i++) - if (smap->buckets[i]) - kfree_rcu(smap->buckets[i], rcu); + vfree(smap->elems); + pcpu_freelist_destroy(&smap->freelist); kvfree(smap); put_callchain_buffers(); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cbd94b2144ff..2978d0d08869 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -290,6 +290,8 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_percpu_hash_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { + err = bpf_stackmap_copy(map, key, value); } else { rcu_read_lock(); ptr = map->ops->map_lookup_elem(map, key); -- cgit From e237a5518425155faa508a087f28269f58074b92 Mon Sep 17 00:00:00 2001 From: Chen Fan Date: Mon, 15 Feb 2016 12:52:01 +0800 Subject: x86/ACPI/PCI: Recognize that Interrupt Line 255 means "not connected" Per the x86-specific footnote to PCI spec r3.0, sec 6.2.4, the value 255 in the Interrupt Line register means "unknown" or "no connection." Previously, when we couldn't derive an IRQ from the _PRT, we fell back to using the value from Interrupt Line as an IRQ. It's questionable whether we should do that at all, but the spec clearly suggests we shouldn't do it for the value 255 on x86. Calling request_irq() with IRQ 255 may succeed, but the driver won't receive any interrupts. Or, if IRQ 255 is shared with another device, it may succeed, and the driver's ISR will be called at random times when the *other* device interrupts. Or it may fail if another device is using IRQ 255 with incompatible flags. What we *want* is for request_irq() to fail predictably so the driver can fall back to polling. On x86, assume 255 in the Interrupt Line means the INTx line is not connected. In that case, set dev->irq to IRQ_NOTCONNECTED so request_irq() will fail gracefully with -ENOTCONN. We found this problem on a system where Secure Boot firmware assigned Interrupt Line 255 to an i801_smbus device and another device was already using MSI-X IRQ 255. This was in v3.10, where i801_probe() fails if request_irq() fails: i801_smbus 0000:00:1f.3: enabling device (0140 -> 0143) i801_smbus 0000:00:1f.3: can't derive routing for PCI INT C i801_smbus 0000:00:1f.3: PCI INT C: no GSI genirq: Flags mismatch irq 255. 00000080 (i801_smbus) vs. 00000000 (megasa) CPU: 0 PID: 2487 Comm: kworker/0:1 Not tainted 3.10.0-229.el7.x86_64 #1 Hardware name: FUJITSU PRIMEQUEST 2800E2/D3736, BIOS PRIMEQUEST 2000 Serie5 Call Trace: dump_stack+0x19/0x1b __setup_irq+0x54a/0x570 request_threaded_irq+0xcc/0x170 i801_probe+0x32f/0x508 [i2c_i801] local_pci_probe+0x45/0xa0 i801_smbus 0000:00:1f.3: Failed to allocate irq 255: -16 i801_smbus: probe of 0000:00:1f.3 failed with error -16 After aeb8a3d16ae0 ("i2c: i801: Check if interrupts are disabled"), i801_probe() will fall back to polling if request_irq() fails. But we still need this patch because request_irq() may succeed or fail depending on other devices in the system. If request_irq() fails, i801_smbus will work by falling back to polling, but if it succeeds, i801_smbus won't work because it expects interrupts that it may not receive. Signed-off-by: Chen Fan Acked-by: Thomas Gleixner Acked-by: Bjorn Helgaas Signed-off-by: Rafael J. Wysocki --- drivers/acpi/pci_irq.c | 29 +++++++++++++++++++++++++---- include/linux/interrupt.h | 10 ++++++++++ kernel/irq/manage.c | 9 ++++++++- 3 files changed, 43 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c index c8e169e46673..2c45dd3acc17 100644 --- a/drivers/acpi/pci_irq.c +++ b/drivers/acpi/pci_irq.c @@ -33,6 +33,7 @@ #include #include #include +#include #define PREFIX "ACPI: " @@ -387,6 +388,23 @@ static inline int acpi_isa_register_gsi(struct pci_dev *dev) } #endif +static inline bool acpi_pci_irq_valid(struct pci_dev *dev, u8 pin) +{ +#ifdef CONFIG_X86 + /* + * On x86 irq line 0xff means "unknown" or "no connection" + * (PCI 3.0, Section 6.2.4, footnote on page 223). + */ + if (dev->irq == 0xff) { + dev->irq = IRQ_NOTCONNECTED; + dev_warn(&dev->dev, "PCI INT %c: not connected\n", + pin_name(pin)); + return false; + } +#endif + return true; +} + int acpi_pci_irq_enable(struct pci_dev *dev) { struct acpi_prt_entry *entry; @@ -431,11 +449,14 @@ int acpi_pci_irq_enable(struct pci_dev *dev) } else gsi = -1; - /* - * No IRQ known to the ACPI subsystem - maybe the BIOS / - * driver reported one, then use it. Exit in any case. - */ if (gsi < 0) { + /* + * No IRQ known to the ACPI subsystem - maybe the BIOS / + * driver reported one, then use it. Exit in any case. + */ + if (!acpi_pci_irq_valid(dev, pin)) + return 0; + if (acpi_isa_register_gsi(dev)) dev_warn(&dev->dev, "PCI INT %c: no GSI\n", pin_name(pin)); diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 0e95fcc75b2a..358076eda364 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -125,6 +125,16 @@ struct irqaction { extern irqreturn_t no_action(int cpl, void *dev_id); +/* + * If a (PCI) device interrupt is not connected we set dev->irq to + * IRQ_NOTCONNECTED. This causes request_irq() to fail with -ENOTCONN, so we + * can distingiush that case from other error returns. + * + * 0x80000000 is guaranteed to be outside the available range of interrupts + * and easy to distinguish from other possible incorrect values. + */ +#define IRQ_NOTCONNECTED (1U << 31) + extern int __must_check request_threaded_irq(unsigned int irq, irq_handler_t handler, irq_handler_t thread_fn, diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 841187239adc..e79e60f50bce 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1609,6 +1609,9 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, struct irq_desc *desc; int retval; + if (irq == IRQ_NOTCONNECTED) + return -ENOTCONN; + /* * Sanity-check: shared interrupts must pass in a real dev-ID, * otherwise we'll have trouble later trying to figure out @@ -1699,9 +1702,13 @@ EXPORT_SYMBOL(request_threaded_irq); int request_any_context_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev_id) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc; int ret; + if (irq == IRQ_NOTCONNECTED) + return -ENOTCONN; + + desc = irq_to_desc(irq); if (!desc) return -EINVAL; -- cgit From 34e2c555f3e13c90e9284e23d00f03be8a6e06c5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 15 Feb 2016 20:20:42 +0100 Subject: cpufreq: Add mechanism for registering utilization update callbacks Introduce a mechanism by which parts of the cpufreq subsystem ("setpolicy" drivers or the core) can register callbacks to be executed from cpufreq_update_util() which is invoked by the scheduler's update_load_avg() on CPU utilization changes. This allows the "setpolicy" drivers to dispense with their timers and do all of the computations they need and frequency/voltage adjustments in the update_load_avg() code path, among other things. The update_load_avg() changes were suggested by Peter Zijlstra. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar Acked-by: Peter Zijlstra (Intel) Acked-by: Ingo Molnar --- drivers/cpufreq/cpufreq.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/cpufreq.h | 34 ++++++++++++++++++++++++++++++++++ kernel/sched/deadline.c | 4 ++++ kernel/sched/fair.c | 26 +++++++++++++++++++++++++- kernel/sched/rt.c | 4 ++++ kernel/sched/sched.h | 1 + 6 files changed, 113 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 34b17447e0d1..e172b2a02c1d 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -102,6 +102,51 @@ static LIST_HEAD(cpufreq_governor_list); static struct cpufreq_driver *cpufreq_driver; static DEFINE_PER_CPU(struct cpufreq_policy *, cpufreq_cpu_data); static DEFINE_RWLOCK(cpufreq_driver_lock); + +static DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); + +/** + * cpufreq_set_update_util_data - Populate the CPU's update_util_data pointer. + * @cpu: The CPU to set the pointer for. + * @data: New pointer value. + * + * Set and publish the update_util_data pointer for the given CPU. That pointer + * points to a struct update_util_data object containing a callback function + * to call from cpufreq_update_util(). That function will be called from an RCU + * read-side critical section, so it must not sleep. + * + * Callers must use RCU callbacks to free any memory that might be accessed + * via the old update_util_data pointer or invoke synchronize_rcu() right after + * this function to avoid use-after-free. + */ +void cpufreq_set_update_util_data(int cpu, struct update_util_data *data) +{ + rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data); +} +EXPORT_SYMBOL_GPL(cpufreq_set_update_util_data); + +/** + * cpufreq_update_util - Take a note about CPU utilization changes. + * @time: Current time. + * @util: Current utilization. + * @max: Utilization ceiling. + * + * This function is called by the scheduler on every invocation of + * update_load_avg() on the CPU whose utilization is being updated. + */ +void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) +{ + struct update_util_data *data; + + rcu_read_lock(); + + data = rcu_dereference(*this_cpu_ptr(&cpufreq_update_util_data)); + if (data && data->func) + data->func(data, time, util, max); + + rcu_read_unlock(); +} + DEFINE_MUTEX(cpufreq_governor_lock); /* Flag to suspend/resume CPUFreq governors */ diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index d0bf555b6bbf..704d85bf7242 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -151,6 +151,36 @@ static inline bool policy_is_shared(struct cpufreq_policy *policy) extern struct kobject *cpufreq_global_kobject; #ifdef CONFIG_CPU_FREQ +void cpufreq_update_util(u64 time, unsigned long util, unsigned long max); + +/** + * cpufreq_trigger_update - Trigger CPU performance state evaluation if needed. + * @time: Current time. + * + * The way cpufreq is currently arranged requires it to evaluate the CPU + * performance state (frequency/voltage) on a regular basis to prevent it from + * being stuck in a completely inadequate performance level for too long. + * That is not guaranteed to happen if the updates are only triggered from CFS, + * though, because they may not be coming in if RT or deadline tasks are active + * all the time (or there are RT and DL tasks only). + * + * As a workaround for that issue, this function is called by the RT and DL + * sched classes to trigger extra cpufreq updates to prevent it from stalling, + * but that really is a band-aid. Going forward it should be replaced with + * solutions targeted more specifically at RT and DL tasks. + */ +static inline void cpufreq_trigger_update(u64 time) +{ + cpufreq_update_util(time, ULONG_MAX, 0); +} + +struct update_util_data { + void (*func)(struct update_util_data *data, + u64 time, unsigned long util, unsigned long max); +}; + +void cpufreq_set_update_util_data(int cpu, struct update_util_data *data); + unsigned int cpufreq_get(unsigned int cpu); unsigned int cpufreq_quick_get(unsigned int cpu); unsigned int cpufreq_quick_get_max(unsigned int cpu); @@ -162,6 +192,10 @@ int cpufreq_update_policy(unsigned int cpu); bool have_governor_per_policy(void); struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy); #else +static inline void cpufreq_update_util(u64 time, unsigned long util, + unsigned long max) {} +static inline void cpufreq_trigger_update(u64 time) {} + static inline unsigned int cpufreq_get(unsigned int cpu) { return 0; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index cd64c979d0e1..21a0aa6f810d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -726,6 +726,10 @@ static void update_curr_dl(struct rq *rq) if (!dl_task(curr) || !on_dl_rq(dl_se)) return; + /* Kick cpufreq (see the comment in linux/cpufreq.h). */ + if (cpu_of(rq) == smp_processor_id()) + cpufreq_trigger_update(rq_clock(rq)); + /* * Consumed budget is computed considering the time as * observed by schedulable tasks (excluding time spent diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 56b7d4b83947..e2987a7e489d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2824,7 +2824,8 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) { struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); - int cpu = cpu_of(rq_of(cfs_rq)); + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); /* * Track task load average for carrying it to new CPU after migrated, and @@ -2836,6 +2837,29 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) update_tg_load_avg(cfs_rq, 0); + + if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) { + unsigned long max = rq->cpu_capacity_orig; + + /* + * There are a few boundary cases this might miss but it should + * get called often enough that that should (hopefully) not be + * a real problem -- added to that it only calls on the local + * CPU, so if we enqueue remotely we'll miss an update, but + * the next tick/schedule should update. + * + * It will not get called when we go idle, because the idle + * thread is a different class (!fair), nor will the utilization + * number include things like RT tasks. + * + * As is, the util number is not freq-invariant (we'd have to + * implement arch_scale_freq_capacity() for that). + * + * See cpu_util(). + */ + cpufreq_update_util(rq_clock(rq), + min(cfs_rq->avg.util_avg, max), max); + } } static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 8ec86abe0ea1..27f5b03cbdbe 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -945,6 +945,10 @@ static void update_curr_rt(struct rq *rq) if (curr->sched_class != &rt_sched_class) return; + /* Kick cpufreq (see the comment in linux/cpufreq.h). */ + if (cpu_of(rq) == smp_processor_id()) + cpufreq_trigger_update(rq_clock(rq)); + delta_exec = rq_clock_task(rq) - curr->se.exec_start; if (unlikely((s64)delta_exec <= 0)) return; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 10f16374df7f..f042190c8002 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -9,6 +9,7 @@ #include #include #include +#include #include "cpupri.h" #include "cpudeadline.h" -- cgit From 4e0d8f7eff3fbfa3e3ac5782669c078f590dc9e2 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Wed, 9 Mar 2016 12:47:03 -0700 Subject: resource: Change __request_region to inherit from immediate parent __request_region() sets 'flags' of a new resource from @parent as it inherits the parent's attribute. When a target resource has a conflict, this function inserts the new resource entry under the conflicted entry by updating @parent. In this case, the new resource entry needs to inherit attribute from the updated parent. This conflict is a typical case since __request_region() is used to allocate a new resource from a specific resource range. For instance, request_mem_region() calls __request_region() with @parent set to &iomem_resource, which is the root entry of the whole iomem range. When this request results in inserting a new entry "DEV-A" under "BUS-1", "DEV-A" needs to inherit from the immediate parent "BUS-1" as it holds specific attribute for the range. root (&iomem_resource) : + "BUS-1" + "DEV-A" Change __request_region() to set 'flags' and 'desc' of a new entry from the immediate parent. Signed-off-by: Toshi Kani Cc: Ingo Molnar Cc: Borislav Petkov Cc: Andrew Morton Cc: Dan Williams Signed-off-by: Dan Williams --- kernel/resource.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 4d466052426b..5a56e8f24058 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1085,15 +1085,16 @@ struct resource * __request_region(struct resource *parent, res->name = name; res->start = start; res->end = start + n - 1; - res->flags = resource_type(parent) | resource_ext_type(parent); - res->flags |= IORESOURCE_BUSY | flags; - res->desc = IORES_DESC_NONE; write_lock(&resource_lock); for (;;) { struct resource *conflict; + res->flags = resource_type(parent) | resource_ext_type(parent); + res->flags |= IORESOURCE_BUSY | flags; + res->desc = parent->desc; + conflict = __request_resource(parent, res); if (!conflict) break; -- cgit From ff3cc952d3f009e6c376cc40651b87187ce364a6 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Wed, 9 Mar 2016 12:47:04 -0700 Subject: resource: Add remove_resource interface insert_resource() and insert_resource_conflict() are called by resource producers to insert a new resource. When there is any conflict, they move conflicting resources down to the children of the new resource. There is no destructor of these interfaces, however. Add remove_resource(), which removes a resource previously inserted by insert_resource() or insert_resource_conflict(), and moves the children up to where they were before. __release_resource() is changed to have @release_child, so that this function can be used for remove_resource() as well. Also add comments to clarify that these functions are intended for producers of resources to avoid any confusion with request/release_resource() for consumers. Signed-off-by: Toshi Kani Cc: Ingo Molnar Cc: Borislav Petkov Cc: Andrew Morton Cc: Dan Williams Signed-off-by: Dan Williams --- include/linux/ioport.h | 1 + kernel/resource.c | 51 +++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 47 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index afb45597fb5f..8017b8bf45fa 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -174,6 +174,7 @@ extern void reserve_region_with_split(struct resource *root, extern struct resource *insert_resource_conflict(struct resource *parent, struct resource *new); extern int insert_resource(struct resource *parent, struct resource *new); extern void insert_resource_expand_to_fit(struct resource *root, struct resource *new); +extern int remove_resource(struct resource *old); extern void arch_remove_reservations(struct resource *avail); extern int allocate_resource(struct resource *root, struct resource *new, resource_size_t size, resource_size_t min, diff --git a/kernel/resource.c b/kernel/resource.c index 5a56e8f24058..effb6ee2c3e8 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -233,9 +233,9 @@ static struct resource * __request_resource(struct resource *root, struct resour } } -static int __release_resource(struct resource *old) +static int __release_resource(struct resource *old, bool release_child) { - struct resource *tmp, **p; + struct resource *tmp, **p, *chd; p = &old->parent->child; for (;;) { @@ -243,7 +243,17 @@ static int __release_resource(struct resource *old) if (!tmp) break; if (tmp == old) { - *p = tmp->sibling; + if (release_child || !(tmp->child)) { + *p = tmp->sibling; + } else { + for (chd = tmp->child;; chd = chd->sibling) { + chd->parent = tmp->parent; + if (!(chd->sibling)) + break; + } + *p = tmp->child; + chd->sibling = tmp->sibling; + } old->parent = NULL; return 0; } @@ -325,7 +335,7 @@ int release_resource(struct resource *old) int retval; write_lock(&resource_lock); - retval = __release_resource(old); + retval = __release_resource(old, true); write_unlock(&resource_lock); return retval; } @@ -679,7 +689,7 @@ static int reallocate_resource(struct resource *root, struct resource *old, old->start = new.start; old->end = new.end; } else { - __release_resource(old); + __release_resource(old, true); *old = new; conflict = __request_resource(root, old); BUG_ON(conflict); @@ -825,6 +835,9 @@ static struct resource * __insert_resource(struct resource *parent, struct resou * entirely fit within the range of the new resource, then the new * resource is inserted and the conflicting resources become children of * the new resource. + * + * This function is intended for producers of resources, such as FW modules + * and bus drivers. */ struct resource *insert_resource_conflict(struct resource *parent, struct resource *new) { @@ -842,6 +855,9 @@ struct resource *insert_resource_conflict(struct resource *parent, struct resour * @new: new resource to insert * * Returns 0 on success, -EBUSY if the resource can't be inserted. + * + * This function is intended for producers of resources, such as FW modules + * and bus drivers. */ int insert_resource(struct resource *parent, struct resource *new) { @@ -885,6 +901,31 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new) write_unlock(&resource_lock); } +/** + * remove_resource - Remove a resource in the resource tree + * @old: resource to remove + * + * Returns 0 on success, -EINVAL if the resource is not valid. + * + * This function removes a resource previously inserted by insert_resource() + * or insert_resource_conflict(), and moves the children (if any) up to + * where they were before. insert_resource() and insert_resource_conflict() + * insert a new resource, and move any conflicting resources down to the + * children of the new resource. + * + * insert_resource(), insert_resource_conflict() and remove_resource() are + * intended for producers of resources, such as FW modules and bus drivers. + */ +int remove_resource(struct resource *old) +{ + int retval; + + write_lock(&resource_lock); + retval = __release_resource(old, false); + write_unlock(&resource_lock); + return retval; +} + static int __adjust_resource(struct resource *res, resource_size_t start, resource_size_t size) { -- cgit From 8095d0f225fe31eaac4a013177b77ed5283278f8 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Wed, 9 Mar 2016 12:47:05 -0700 Subject: resource: Export insert_resource and remove_resource insert_resource() and remove_resouce() are called by producers of resources, such as FW modules and bus drivers. These modules may be implemented as loadable modules. Export insert_resource() and remove_resouce() so that they can be called from such modules. link: https://lkml.org/lkml/2016/3/8/872 Signed-off-by: Toshi Kani Cc: Linus Torvalds Cc: Ingo Molnar Cc: Borislav Petkov Cc: Andrew Morton Cc: Dan Williams Signed-off-by: Dan Williams --- kernel/resource.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index effb6ee2c3e8..2e78ead30934 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -866,6 +866,7 @@ int insert_resource(struct resource *parent, struct resource *new) conflict = insert_resource_conflict(parent, new); return conflict ? -EBUSY : 0; } +EXPORT_SYMBOL_GPL(insert_resource); /** * insert_resource_expand_to_fit - Insert a resource into the resource tree @@ -925,6 +926,7 @@ int remove_resource(struct resource *old) write_unlock(&resource_lock); return retval; } +EXPORT_SYMBOL_GPL(remove_resource); static int __adjust_resource(struct resource *res, resource_size_t start, resource_size_t size) -- cgit From f995b5f720a72dfe7a1b33a43f2841b4e72d53b7 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 9 Mar 2016 15:20:59 +0100 Subject: livepatch: Fix the error message about unresolvable ambiguity klp_find_callback() stops the search when sympos is not defined and a second symbol of the same name is found. It means that the current error message about the unresolvable ambiguity always prints "(2 matches)". Let's remove this information. The total number of occurrences is not much helpful. The author of the patch still must put a non-trivial effort into searching the right position in the object file. [jkosina@suse.cz: fixed grammar as suggested by Josh] Signed-off-by: Petr Mladek Acked-by: Josh Poimboeuf Acked-by: Chris J Arges Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index bc2c85c064c1..780f00cdb7e5 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -190,8 +190,8 @@ static int klp_find_object_symbol(const char *objname, const char *name, if (args.addr == 0) pr_err("symbol '%s' not found in symbol table\n", name); else if (args.count > 1 && sympos == 0) { - pr_err("unresolvable ambiguity (%lu matches) on symbol '%s' in object '%s'\n", - args.count, name, objname); + pr_err("unresolvable ambiguity for symbol '%s' in object '%s'\n", + name, objname); } else if (sympos != args.count && sympos > 0) { pr_err("symbol position %lu for symbol '%s' in object '%s' not found\n", sympos, name, objname ? objname : "vmlinux"); -- cgit From d77a117e6871ff78a06def46583d23752593de60 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 9 Mar 2016 14:08:10 -0800 Subject: list: kill list_force_poison() Given we have uninitialized list_heads being passed to list_add() it will always be the case that those uninitialized values randomly trigger the poison value. Especially since a list_add() operation will seed the stack with the poison value for later stack allocations to trip over. For example, see these two false positive reports: list_add attempted on force-poisoned entry WARNING: at lib/list_debug.c:34 [..] NIP [c00000000043c390] __list_add+0xb0/0x150 LR [c00000000043c38c] __list_add+0xac/0x150 Call Trace: __list_add+0xac/0x150 (unreliable) __down+0x4c/0xf8 down+0x68/0x70 xfs_buf_lock+0x4c/0x150 [xfs] list_add attempted on force-poisoned entry(0000000000000500), new->next == d0000000059ecdb0, new->prev == 0000000000000500 WARNING: at lib/list_debug.c:33 [..] NIP [c00000000042db78] __list_add+0xa8/0x140 LR [c00000000042db74] __list_add+0xa4/0x140 Call Trace: __list_add+0xa4/0x140 (unreliable) rwsem_down_read_failed+0x6c/0x1a0 down_read+0x58/0x60 xfs_log_commit_cil+0x7c/0x600 [xfs] Fixes: commit 5c2c2587b132 ("mm, dax, pmem: introduce {get|put}_dev_pagemap() for dax-gup") Signed-off-by: Dan Williams Reported-by: Eryu Guan Tested-by: Eryu Guan Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/list.h | 11 ----------- kernel/memremap.c | 9 +++++++-- lib/list_debug.c | 9 --------- 3 files changed, 7 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/linux/list.h b/include/linux/list.h index 30cf4200ab40..5356f4d661a7 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -113,17 +113,6 @@ extern void __list_del_entry(struct list_head *entry); extern void list_del(struct list_head *entry); #endif -#ifdef CONFIG_DEBUG_LIST -/* - * See devm_memremap_pages() which wants DEBUG_LIST=y to assert if one - * of the pages it allocates is ever passed to list_add() - */ -extern void list_force_poison(struct list_head *entry); -#else -/* fallback to the less strict LIST_POISON* definitions */ -#define list_force_poison list_del -#endif - /** * list_replace - replace old entry by new one * @old : the element to be replaced diff --git a/kernel/memremap.c b/kernel/memremap.c index b981a7b023f0..778191e3e887 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -351,8 +351,13 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, for_each_device_pfn(pfn, page_map) { struct page *page = pfn_to_page(pfn); - /* ZONE_DEVICE pages must never appear on a slab lru */ - list_force_poison(&page->lru); + /* + * ZONE_DEVICE pages union ->lru with a ->pgmap back + * pointer. It is a bug if a ZONE_DEVICE page is ever + * freed or placed on a driver-private list. Seed the + * storage with LIST_POISON* values. + */ + list_del(&page->lru); page->pgmap = pgmap; } devres_add(dev, page_map); diff --git a/lib/list_debug.c b/lib/list_debug.c index 3345a089ef7b..3859bf63561c 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -12,13 +12,6 @@ #include #include -static struct list_head force_poison; -void list_force_poison(struct list_head *entry) -{ - entry->next = &force_poison; - entry->prev = &force_poison; -} - /* * Insert a new entry between two known consecutive entries. * @@ -30,8 +23,6 @@ void __list_add(struct list_head *new, struct list_head *prev, struct list_head *next) { - WARN(new->next == &force_poison || new->prev == &force_poison, - "list_add attempted on force-poisoned entry\n"); WARN(next->prev != prev, "list_add corruption. next->prev should be " "prev (%p), but was %p. (next=%p).\n", -- cgit From 5f29a77cd95771edebbf41e5800fdd557c69302a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 9 Mar 2016 14:08:13 -0800 Subject: mm: fix mixed zone detection in devm_memremap_pages The check for whether we overlap "System RAM" needs to be done at section granularity. For example a system with the following mapping: 100000000-37bffffff : System RAM 37c000000-837ffffff : Persistent Memory ...is unable to use devm_memremap_pages() as it would result in two zones colliding within a given section. Signed-off-by: Dan Williams Cc: Ross Zwisler Reviewed-by: Toshi Kani Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 778191e3e887..60baf4d3401e 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -270,13 +270,16 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys) void *devm_memremap_pages(struct device *dev, struct resource *res, struct percpu_ref *ref, struct vmem_altmap *altmap) { - int is_ram = region_intersects(res->start, resource_size(res), - "System RAM"); resource_size_t key, align_start, align_size, align_end; struct dev_pagemap *pgmap; struct page_map *page_map; + int error, nid, is_ram; unsigned long pfn; - int error, nid; + + align_start = res->start & ~(SECTION_SIZE - 1); + align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) + - align_start; + is_ram = region_intersects(align_start, align_size, "System RAM"); if (is_ram == REGION_MIXED) { WARN_ONCE(1, "%s attempted on mixed region %pr\n", @@ -314,8 +317,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, mutex_lock(&pgmap_lock); error = 0; - align_start = res->start & ~(SECTION_SIZE - 1); - align_size = ALIGN(resource_size(res), SECTION_SIZE); align_end = align_start + align_size - 1; for (key = align_start; key <= align_end; key += SECTION_SIZE) { struct dev_pagemap *dup; -- cgit From e1b77c92981a522223bd1ac118fdcade6b7ad086 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 9 Mar 2016 14:08:18 -0800 Subject: sched/kasan: remove stale KASAN poison after hotplug Functions which the compiler has instrumented for KASAN place poison on the stack shadow upon entry and remove this poision prior to returning. In the case of CPU hotplug, CPUs exit the kernel a number of levels deep in C code. Any instrumented functions on this critical path will leave portions of the stack shadow poisoned. When a CPU is subsequently brought back into the kernel via a different path, depending on stackframe, layout calls to instrumented functions may hit this stale poison, resulting in (spurious) KASAN splats to the console. To avoid this, clear any stale poison from the idle thread for a CPU prior to bringing a CPU online. Signed-off-by: Mark Rutland Acked-by: Catalin Marinas Reviewed-by: Andrey Ryabinin Reviewed-by: Ingo Molnar Cc: Alexander Potapenko Cc: Lorenzo Pieralisi Cc: Peter Zijlstra Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9503d590e5ef..41f6b2215aa8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -26,6 +26,7 @@ * Thomas Gleixner, Mike Kravetz */ +#include #include #include #include @@ -5096,6 +5097,8 @@ void init_idle(struct task_struct *idle, int cpu) idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); + kasan_unpoison_task_stack(idle); + #ifdef CONFIG_SMP /* * Its possible that init_idle() gets called multiple times on a task, -- cgit From ac343e882a8377caef5fa75d9093cb77e9d4bf6d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 9 Mar 2016 14:08:32 -0800 Subject: memremap: check pfn validity before passing to pfn_to_page() In memremap's helper function try_ram_remap(), we dereference a struct page pointer that was derived from a PFN that is known to be covered by a 'System RAM' iomem region, and is thus assumed to be a 'valid' PFN, i.e., a PFN that has a struct page associated with it and is covered by the kernel direct mapping. However, the assumption that there is a 1:1 relation between the System RAM iomem region and the kernel direct mapping is not universally valid on all architectures, and on ARM and arm64, 'System RAM' may include regions for which pfn_valid() returns false. Generally speaking, both __va() and pfn_to_page() should only ever be called on PFNs/physical addresses for which pfn_valid() returns true, so add that check to try_ram_remap(). Signed-off-by: Ard Biesheuvel Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 60baf4d3401e..6cf54615a9c4 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -29,10 +29,10 @@ __weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size) static void *try_ram_remap(resource_size_t offset, size_t size) { - struct page *page = pfn_to_page(offset >> PAGE_SHIFT); + unsigned long pfn = PHYS_PFN(offset); /* In the simple case just return the existing linear address */ - if (!PageHighMem(page)) + if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn))) return __va(offset); return NULL; /* fallback to ioremap_cache */ } -- cgit From b8cdc05173f05d212627b7aba7ec47fa334a79f2 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 9 Mar 2016 18:56:49 -0800 Subject: bpf: bpf_stackmap_copy depends on CONFIG_PERF_EVENTS 0-day bot reported build error: kernel/built-in.o: In function `map_lookup_elem': >> kernel/bpf/.tmp_syscall.o:(.text+0x329b3c): undefined reference to `bpf_stackmap_copy' when CONFIG_BPF_SYSCALL is set and CONFIG_PERF_EVENTS is not. Add weak definition to resolve it. This code path in map_lookup_elem() is never taken when CONFIG_PERF_EVENTS is not set. Fixes: 557c0c6e7df8 ("bpf: convert stackmap to pre-allocation") Reported-by: Fengguang Wu Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2978d0d08869..2a2efe1bc76c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -244,6 +244,11 @@ static void __user *u64_to_ptr(__u64 val) return (void __user *) (unsigned long) val; } +int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) +{ + return -ENOTSUPP; +} + /* last field in 'union bpf_attr' used by this command */ #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value -- cgit From cdc4e47da8f4c32eeb6b2061a8a834f4362a12b7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 9 Mar 2016 20:02:33 -0800 Subject: bpf: avoid copying junk bytes in bpf_get_current_comm() Lots of places in the kernel use memcpy(buf, comm, TASK_COMM_LEN); but the result is typically passed to print("%s", buf) and extra bytes after zero don't cause any harm. In bpf the result of bpf_get_current_comm() is used as the part of map key and was causing spurious hash map mismatches. Use strlcpy() to guarantee zero-terminated string. bpf verifier checks that output buffer is zero-initialized, so even for short task names the output buffer don't have junk bytes. Note it's not a security concern, since kprobe+bpf is root only. Fixes: ffeedafbf023 ("bpf: introduce current->pid, tgid, uid, gid, comm accessors") Reported-by: Tobias Waldekranz Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 4504ca66118d..50da680c479f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -166,7 +166,7 @@ static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5) if (!task) return -EINVAL; - memcpy(buf, task->comm, min_t(size_t, size, sizeof(task->comm))); + strlcpy(buf, task->comm, min_t(size_t, size, sizeof(task->comm))); return 0; } -- cgit From 90d1098478fb08a1ef166fe91622d8046869e17b Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 9 Mar 2016 17:55:35 -0800 Subject: locking/csd_lock: Explicitly inline csd_lock*() helpers While the compiler tends to already to it for us (except for csd_unlock), make it explicit. These helpers mainly deal with the ->flags, are short-lived and can be called, for example, from smp_call_function_many(). Signed-off-by: Davidlohr Bueso Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@stgolabs.net Link: http://lkml.kernel.org/r/1457574936-19065-2-git-send-email-dbueso@suse.de Signed-off-by: Ingo Molnar --- kernel/smp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index d903c02223af..5099db15c5fb 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -105,13 +105,13 @@ void __init call_function_init(void) * previous function call. For multi-cpu calls its even more interesting * as we'll have to ensure no other cpu is observing our csd. */ -static void csd_lock_wait(struct call_single_data *csd) +static __always_inline void csd_lock_wait(struct call_single_data *csd) { while (smp_load_acquire(&csd->flags) & CSD_FLAG_LOCK) cpu_relax(); } -static void csd_lock(struct call_single_data *csd) +static __always_inline void csd_lock(struct call_single_data *csd) { csd_lock_wait(csd); csd->flags |= CSD_FLAG_LOCK; @@ -124,7 +124,7 @@ static void csd_lock(struct call_single_data *csd) smp_wmb(); } -static void csd_unlock(struct call_single_data *csd) +static __always_inline void csd_unlock(struct call_single_data *csd) { WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); -- cgit From 38460a2178d225b39ade5ac66586c3733391cf86 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 9 Mar 2016 17:55:36 -0800 Subject: locking/csd_lock: Use smp_cond_acquire() in csd_lock_wait() We can micro-optimize this call and mildly relax the barrier requirements by relying on ctrl + rmb, keeping the acquire semantics. In addition, this is pretty much the now standard for busy-waiting under such restraints. Signed-off-by: Davidlohr Bueso Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@stgolabs.net Link: http://lkml.kernel.org/r/1457574936-19065-3-git-send-email-dbueso@suse.de Signed-off-by: Ingo Molnar --- kernel/smp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 5099db15c5fb..300d29391e07 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -107,8 +107,7 @@ void __init call_function_init(void) */ static __always_inline void csd_lock_wait(struct call_single_data *csd) { - while (smp_load_acquire(&csd->flags) & CSD_FLAG_LOCK) - cpu_relax(); + smp_cond_acquire(!(csd->flags & CSD_FLAG_LOCK)); } static __always_inline void csd_lock(struct call_single_data *csd) -- cgit From 52b2a05fa7c8cfceebb59117a95decd68cf7e465 Mon Sep 17 00:00:00 2001 From: Quan Nguyen Date: Thu, 3 Mar 2016 21:56:52 +0700 Subject: genirq: Export IRQ functions for module use Export irq_chip_*_parent(), irq_domain_create_hierarchy(), irq_domain_set_hwirq_and_chip(), irq_domain_reset_irq_data(), irq_domain_alloc/free_irqs_parent() So gpio drivers can be built as modules. First user: gpio-xgene-sb Signed-off-by: Quan Nguyen Acked-by: Linus Walleij Cc: Phong Vo Cc: Marc Zyngier Cc: patches@apm.com Cc: Loc Ho Cc: Keyur Chudgar Cc: Jiang Liu Link: https://lists.01.org/pipermail/kbuild-all/2016-February/017914.html Link: http://lkml.kernel.org/r/1457017012-10628-1-git-send-email-qnguyen@apm.com Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 4 ++++ kernel/irq/irqdomain.c | 5 +++++ 2 files changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 5797909f4e5b..2f9f2b0e79f2 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -961,6 +961,7 @@ void irq_chip_mask_parent(struct irq_data *data) data = data->parent_data; data->chip->irq_mask(data); } +EXPORT_SYMBOL_GPL(irq_chip_mask_parent); /** * irq_chip_unmask_parent - Unmask the parent interrupt @@ -971,6 +972,7 @@ void irq_chip_unmask_parent(struct irq_data *data) data = data->parent_data; data->chip->irq_unmask(data); } +EXPORT_SYMBOL_GPL(irq_chip_unmask_parent); /** * irq_chip_eoi_parent - Invoke EOI on the parent interrupt @@ -981,6 +983,7 @@ void irq_chip_eoi_parent(struct irq_data *data) data = data->parent_data; data->chip->irq_eoi(data); } +EXPORT_SYMBOL_GPL(irq_chip_eoi_parent); /** * irq_chip_set_affinity_parent - Set affinity on the parent interrupt @@ -1016,6 +1019,7 @@ int irq_chip_set_type_parent(struct irq_data *data, unsigned int type) return -ENOSYS; } +EXPORT_SYMBOL_GPL(irq_chip_set_type_parent); /** * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 86811541f073..3a519a01118b 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -893,6 +893,7 @@ struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent, return domain; } +EXPORT_SYMBOL_GPL(irq_domain_create_hierarchy); static void irq_domain_insert_irq(int virq) { @@ -1043,6 +1044,7 @@ int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq, return 0; } +EXPORT_SYMBOL_GPL(irq_domain_set_hwirq_and_chip); /** * irq_domain_set_info - Set the complete data for a @virq in @domain @@ -1076,6 +1078,7 @@ void irq_domain_reset_irq_data(struct irq_data *irq_data) irq_data->chip = &no_irq_chip; irq_data->chip_data = NULL; } +EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data); /** * irq_domain_free_irqs_common - Clear irq_data and free the parent @@ -1273,6 +1276,7 @@ int irq_domain_alloc_irqs_parent(struct irq_domain *domain, nr_irqs, arg); return -ENOSYS; } +EXPORT_SYMBOL_GPL(irq_domain_alloc_irqs_parent); /** * irq_domain_free_irqs_parent - Free interrupts from parent domain @@ -1290,6 +1294,7 @@ void irq_domain_free_irqs_parent(struct irq_domain *domain, irq_domain_free_irqs_recursive(domain->parent, irq_base, nr_irqs); } +EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent); /** * irq_domain_activate_irq - Call domain_ops->activate recursively to activate -- cgit From adaf9fcd136970e480d7ca834c0cf25ce922ea74 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 10 Mar 2016 20:44:47 +0100 Subject: cpufreq: Move scheduler-related code to the sched directory Create cpufreq.c under kernel/sched/ and move the cpufreq code related to the scheduler to that file and to sched.h. Redefine cpufreq_update_util() as a static inline function to avoid function calls at its call sites in the scheduler code (as suggested by Peter Zijlstra). Also move the definition of struct update_util_data and declaration of cpufreq_set_update_util_data() from include/linux/cpufreq.h to include/linux/sched.h. Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) --- drivers/cpufreq/cpufreq.c | 53 -------------------------------------- drivers/cpufreq/cpufreq_governor.c | 1 + include/linux/cpufreq.h | 34 ------------------------ include/linux/sched.h | 9 +++++++ kernel/sched/Makefile | 1 + kernel/sched/cpufreq.c | 37 ++++++++++++++++++++++++++ kernel/sched/sched.h | 49 ++++++++++++++++++++++++++++++++++- 7 files changed, 96 insertions(+), 88 deletions(-) create mode 100644 kernel/sched/cpufreq.c (limited to 'kernel') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 6eca12ab71d7..58e1a39b4d22 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -103,59 +103,6 @@ static struct cpufreq_driver *cpufreq_driver; static DEFINE_PER_CPU(struct cpufreq_policy *, cpufreq_cpu_data); static DEFINE_RWLOCK(cpufreq_driver_lock); -static DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); - -/** - * cpufreq_set_update_util_data - Populate the CPU's update_util_data pointer. - * @cpu: The CPU to set the pointer for. - * @data: New pointer value. - * - * Set and publish the update_util_data pointer for the given CPU. That pointer - * points to a struct update_util_data object containing a callback function - * to call from cpufreq_update_util(). That function will be called from an RCU - * read-side critical section, so it must not sleep. - * - * Callers must use RCU-sched callbacks to free any memory that might be - * accessed via the old update_util_data pointer or invoke synchronize_sched() - * right after this function to avoid use-after-free. - */ -void cpufreq_set_update_util_data(int cpu, struct update_util_data *data) -{ - if (WARN_ON(data && !data->func)) - return; - - rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data); -} -EXPORT_SYMBOL_GPL(cpufreq_set_update_util_data); - -/** - * cpufreq_update_util - Take a note about CPU utilization changes. - * @time: Current time. - * @util: Current utilization. - * @max: Utilization ceiling. - * - * This function is called by the scheduler on every invocation of - * update_load_avg() on the CPU whose utilization is being updated. - * - * It can only be called from RCU-sched read-side critical sections. - */ -void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) -{ - struct update_util_data *data; - -#ifdef CONFIG_LOCKDEP - WARN_ON(debug_locks && !rcu_read_lock_sched_held()); -#endif - - data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); - /* - * If this isn't inside of an RCU-sched read-side critical section, data - * may become NULL after the check below. - */ - if (data) - data->func(data, time, util, max); -} - /* Flag to suspend/resume CPUFreq governors */ static bool cpufreq_suspended; diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index db46190bb246..1c25ef405616 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -18,6 +18,7 @@ #include #include +#include #include #include "cpufreq_governor.h" diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index a50c5b2e3bf2..a5ea52f793f3 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -146,36 +146,6 @@ static inline bool policy_is_shared(struct cpufreq_policy *policy) extern struct kobject *cpufreq_global_kobject; #ifdef CONFIG_CPU_FREQ -void cpufreq_update_util(u64 time, unsigned long util, unsigned long max); - -/** - * cpufreq_trigger_update - Trigger CPU performance state evaluation if needed. - * @time: Current time. - * - * The way cpufreq is currently arranged requires it to evaluate the CPU - * performance state (frequency/voltage) on a regular basis to prevent it from - * being stuck in a completely inadequate performance level for too long. - * That is not guaranteed to happen if the updates are only triggered from CFS, - * though, because they may not be coming in if RT or deadline tasks are active - * all the time (or there are RT and DL tasks only). - * - * As a workaround for that issue, this function is called by the RT and DL - * sched classes to trigger extra cpufreq updates to prevent it from stalling, - * but that really is a band-aid. Going forward it should be replaced with - * solutions targeted more specifically at RT and DL tasks. - */ -static inline void cpufreq_trigger_update(u64 time) -{ - cpufreq_update_util(time, ULONG_MAX, 0); -} - -struct update_util_data { - void (*func)(struct update_util_data *data, - u64 time, unsigned long util, unsigned long max); -}; - -void cpufreq_set_update_util_data(int cpu, struct update_util_data *data); - unsigned int cpufreq_get(unsigned int cpu); unsigned int cpufreq_quick_get(unsigned int cpu); unsigned int cpufreq_quick_get_max(unsigned int cpu); @@ -187,10 +157,6 @@ int cpufreq_update_policy(unsigned int cpu); bool have_governor_per_policy(void); struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy); #else -static inline void cpufreq_update_util(u64 time, unsigned long util, - unsigned long max) {} -static inline void cpufreq_trigger_update(u64 time) {} - static inline unsigned int cpufreq_get(unsigned int cpu) { return 0; diff --git a/include/linux/sched.h b/include/linux/sched.h index a10494a94cc3..913e755ef7b8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -3207,4 +3207,13 @@ static inline unsigned long rlimit_max(unsigned int limit) return task_rlimit_max(current, limit); } +#ifdef CONFIG_CPU_FREQ +struct update_util_data { + void (*func)(struct update_util_data *data, + u64 time, unsigned long util, unsigned long max); +}; + +void cpufreq_set_update_util_data(int cpu, struct update_util_data *data); +#endif /* CONFIG_CPU_FREQ */ + #endif diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 67687973ce80..9507522164ac 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -19,3 +19,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_CPU_FREQ) += cpufreq.o diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c new file mode 100644 index 000000000000..928c4ba32f68 --- /dev/null +++ b/kernel/sched/cpufreq.c @@ -0,0 +1,37 @@ +/* + * Scheduler code and data structures related to cpufreq. + * + * Copyright (C) 2016, Intel Corporation + * Author: Rafael J. Wysocki + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "sched.h" + +DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); + +/** + * cpufreq_set_update_util_data - Populate the CPU's update_util_data pointer. + * @cpu: The CPU to set the pointer for. + * @data: New pointer value. + * + * Set and publish the update_util_data pointer for the given CPU. That pointer + * points to a struct update_util_data object containing a callback function + * to call from cpufreq_update_util(). That function will be called from an RCU + * read-side critical section, so it must not sleep. + * + * Callers must use RCU-sched callbacks to free any memory that might be + * accessed via the old update_util_data pointer or invoke synchronize_sched() + * right after this function to avoid use-after-free. + */ +void cpufreq_set_update_util_data(int cpu, struct update_util_data *data) +{ + if (WARN_ON(data && !data->func)) + return; + + rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data); +} +EXPORT_SYMBOL_GPL(cpufreq_set_update_util_data); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f042190c8002..faf7e2758dd0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -9,7 +9,6 @@ #include #include #include -#include #include "cpupri.h" #include "cpudeadline.h" @@ -1739,3 +1738,51 @@ static inline u64 irq_time_read(int cpu) } #endif /* CONFIG_64BIT */ #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#ifdef CONFIG_CPU_FREQ +DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); + +/** + * cpufreq_update_util - Take a note about CPU utilization changes. + * @time: Current time. + * @util: Current utilization. + * @max: Utilization ceiling. + * + * This function is called by the scheduler on every invocation of + * update_load_avg() on the CPU whose utilization is being updated. + * + * It can only be called from RCU-sched read-side critical sections. + */ +static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) +{ + struct update_util_data *data; + + data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); + if (data) + data->func(data, time, util, max); +} + +/** + * cpufreq_trigger_update - Trigger CPU performance state evaluation if needed. + * @time: Current time. + * + * The way cpufreq is currently arranged requires it to evaluate the CPU + * performance state (frequency/voltage) on a regular basis to prevent it from + * being stuck in a completely inadequate performance level for too long. + * That is not guaranteed to happen if the updates are only triggered from CFS, + * though, because they may not be coming in if RT or deadline tasks are active + * all the time (or there are RT and DL tasks only). + * + * As a workaround for that issue, this function is called by the RT and DL + * sched classes to trigger extra cpufreq updates to prevent it from stalling, + * but that really is a band-aid. Going forward it should be replaced with + * solutions targeted more specifically at RT and DL tasks. + */ +static inline void cpufreq_trigger_update(u64 time) +{ + cpufreq_update_util(time, ULONG_MAX, 0); +} +#else +static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) {} +static inline void cpufreq_trigger_update(u64 time) {} +#endif /* CONFIG_CPU_FREQ */ -- cgit From 2a58c527bb566b7abad96289fa5b973040c33c82 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Mar 2016 20:42:08 +0100 Subject: cpu/hotplug: Fix smpboot thread ordering Commit 931ef163309e moved the smpboot thread park/unpark invocation to the state machine. The move of the unpark invocation was premature as it depends on work in progress patches. As a result cpu down can fail, because rcu synchronization in takedown_cpu() eventually requires a functional softirq thread. I never encountered the problem in testing, but 0day testing managed to provide a reliable reproducer. Remove the smpboot_threads_park() call from the state machine for now and put it back into the original place after the rcu synchronization. I'm embarrassed as I knew about the dependency and still managed to get it wrong. Hotplug induced brain melt seems to be the only sensible explanation for that. Fixes: 931ef163309e "cpu/hotplug: Unpark smpboot threads from the state machine" Reported-by: Fengguang Wu Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra --- kernel/cpu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 373e831e0faa..bcee286b7c73 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -706,8 +706,9 @@ static int takedown_cpu(unsigned int cpu) else synchronize_rcu(); - /* Park the hotplug thread */ + /* Park the smpboot threads */ kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread); + smpboot_park_threads(cpu); /* * Prevent irq alloc/free while the dying cpu reorganizes the @@ -1206,7 +1207,7 @@ static struct cpuhp_step cpuhp_ap_states[] = { [CPUHP_AP_SMPBOOT_THREADS] = { .name = "smpboot:threads", .startup = smpboot_unpark_threads, - .teardown = smpboot_park_threads, + .teardown = NULL, }, [CPUHP_AP_NOTIFY_ONLINE] = { .name = "notify:online", -- cgit From 22aceb317678057dced5f1d6e3ac15acdb863e7b Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Thu, 10 Mar 2016 12:07:38 +0100 Subject: workqueue: Fix comment for work_on_cpu() Function is processed in thread context, not in user context. Cc: Tejun Heo Cc: Lai Jiangshan Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Anna-Maria Gleixner Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 16f4986205e9..c3692d9eda55 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4695,7 +4695,7 @@ static void work_for_cpu_fn(struct work_struct *work) } /** - * work_on_cpu - run a function in user context on a particular cpu + * work_on_cpu - run a function in thread context on a particular cpu * @cpu: the cpu to run on * @fn: the function to run * @arg: the function arg -- cgit From d10ef6f9380b8853c4b48eb104268fccfdc0b0c5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Mar 2016 10:36:13 +0100 Subject: cpu/hotplug: Document states better Requested-by: Peter Zijlstra Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 47 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index bcee286b7c73..6ea42e8da861 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1172,6 +1172,10 @@ static struct cpuhp_step cpuhp_bp_states[] = { .teardown = NULL, .cant_stop = true, }, + /* + * Preparatory and dead notifiers. Will be replaced once the notifiers + * are converted to states. + */ [CPUHP_NOTIFY_PREPARE] = { .name = "notify:prepare", .startup = notify_prepare, @@ -1179,12 +1183,17 @@ static struct cpuhp_step cpuhp_bp_states[] = { .skip_onerr = true, .cant_stop = true, }, + /* Kicks the plugged cpu into life */ [CPUHP_BRINGUP_CPU] = { .name = "cpu:bringup", .startup = bringup_cpu, .teardown = NULL, .cant_stop = true, }, + /* + * Handled on controll processor until the plugged processor manages + * this itself. + */ [CPUHP_TEARDOWN_CPU] = { .name = "cpu:teardown", .startup = NULL, @@ -1197,6 +1206,23 @@ static struct cpuhp_step cpuhp_bp_states[] = { /* Application processor state steps */ static struct cpuhp_step cpuhp_ap_states[] = { #ifdef CONFIG_SMP + /* Final state before CPU kills itself */ + [CPUHP_AP_IDLE_DEAD] = { + .name = "idle:dead", + }, + /* + * Last state before CPU enters the idle loop to die. Transient state + * for synchronization. + */ + [CPUHP_AP_OFFLINE] = { + .name = "ap:offline", + .cant_stop = true, + }, + /* + * Low level startup/teardown notifiers. Run with interrupts + * disabled. Will be removed once the notifiers are converted to + * states. + */ [CPUHP_AP_NOTIFY_STARTING] = { .name = "notify:starting", .startup = notify_starting, @@ -1204,17 +1230,32 @@ static struct cpuhp_step cpuhp_ap_states[] = { .skip_onerr = true, .cant_stop = true, }, + /* Entry state on starting. Interrupts enabled from here on. Transient + * state for synchronsization */ + [CPUHP_AP_ONLINE] = { + .name = "ap:online", + }, + /* Handle smpboot threads park/unpark */ [CPUHP_AP_SMPBOOT_THREADS] = { .name = "smpboot:threads", .startup = smpboot_unpark_threads, .teardown = NULL, }, + /* + * Online/down_prepare notifiers. Will be removed once the notifiers + * are converted to states. + */ [CPUHP_AP_NOTIFY_ONLINE] = { .name = "notify:online", .startup = notify_online, .teardown = notify_down_prepare, }, #endif + /* + * The dynamically registered state space is here + */ + + /* CPU is fully up and running. */ [CPUHP_ONLINE] = { .name = "online", .startup = NULL, @@ -1232,7 +1273,11 @@ static int cpuhp_cb_check(enum cpuhp_state state) static bool cpuhp_is_ap_state(enum cpuhp_state state) { - return state > CPUHP_BRINGUP_CPU; + /* + * The extra check for CPUHP_TEARDOWN_CPU is only for documentation + * purposes as that state is handled explicitely in cpu_down. + */ + return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; } static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) -- cgit From 25528213fe9f75f4e286f08d35a73ca2bb634a50 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Mar 2016 14:52:49 -0700 Subject: tags: Fix DEFINE_PER_CPU expansions $ make tags GEN tags ctags: Warning: drivers/acpi/processor_idle.c:64: null expansion of name pattern "\1" ctags: Warning: drivers/xen/events/events_2l.c:41: null expansion of name pattern "\1" ctags: Warning: kernel/locking/lockdep.c:151: null expansion of name pattern "\1" ctags: Warning: kernel/rcu/rcutorture.c:133: null expansion of name pattern "\1" ctags: Warning: kernel/rcu/rcutorture.c:135: null expansion of name pattern "\1" ctags: Warning: kernel/workqueue.c:323: null expansion of name pattern "\1" ctags: Warning: net/ipv4/syncookies.c:53: null expansion of name pattern "\1" ctags: Warning: net/ipv6/syncookies.c:44: null expansion of name pattern "\1" ctags: Warning: net/rds/page.c:45: null expansion of name pattern "\1" Which are all the result of the DEFINE_PER_CPU pattern: scripts/tags.sh:200: '/\ Acked-by: David S. Miller Acked-by: Rafael J. Wysocki Cc: Tejun Heo Cc: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/acpi/processor_idle.c | 4 ++-- drivers/xen/events/events_2l.c | 5 +++-- kernel/locking/lockdep.c | 3 +-- kernel/rcu/rcutorture.c | 6 ++---- kernel/workqueue.c | 3 +-- net/ipv4/syncookies.c | 3 +-- net/ipv6/syncookies.c | 3 +-- net/rds/page.c | 4 ++-- 8 files changed, 13 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 175c86bee3a9..9ca2b2fefd76 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -61,8 +61,8 @@ module_param(latency_factor, uint, 0644); static DEFINE_PER_CPU(struct cpuidle_device *, acpi_cpuidle_device); -static DEFINE_PER_CPU(struct acpi_processor_cx * [CPUIDLE_STATE_MAX], - acpi_cstate); +static +DEFINE_PER_CPU(struct acpi_processor_cx * [CPUIDLE_STATE_MAX], acpi_cstate); static int disabled_by_idle_boot_param(void) { diff --git a/drivers/xen/events/events_2l.c b/drivers/xen/events/events_2l.c index 7dd46312c180..403fe3955393 100644 --- a/drivers/xen/events/events_2l.c +++ b/drivers/xen/events/events_2l.c @@ -38,8 +38,9 @@ /* Find the first set bit in a evtchn mask */ #define EVTCHN_FIRST_BIT(w) find_first_bit(BM(&(w)), BITS_PER_EVTCHN_WORD) -static DEFINE_PER_CPU(xen_ulong_t [EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD], - cpu_evtchn_mask); +#define EVTCHN_MASK_SIZE (EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD) + +static DEFINE_PER_CPU(xen_ulong_t [EVTCHN_MASK_SIZE], cpu_evtchn_mask); static unsigned evtchn_2l_max_channels(void) { diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index f894a2cd9b2a..53ab2f85d77e 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -148,8 +148,7 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock) } #ifdef CONFIG_LOCK_STAT -static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], - cpu_lock_stats); +static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], cpu_lock_stats); static inline u64 lockstat_clock(void) { diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d2988d047d66..4d5cc6aa7e1e 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -130,10 +130,8 @@ static struct rcu_torture __rcu *rcu_torture_current; static unsigned long rcu_torture_current_version; static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; static DEFINE_SPINLOCK(rcu_torture_lock); -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], - rcu_torture_count) = { 0 }; -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], - rcu_torture_batch) = { 0 }; +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = { 0 }; +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = { 0 }; static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; static atomic_t n_rcu_torture_alloc; static atomic_t n_rcu_torture_alloc_fail; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7ff5dc7d2ac5..16e13d8628a3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -320,8 +320,7 @@ static bool wq_debug_force_rr_cpu = false; module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644); /* the per-cpu worker pools */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], - cpu_worker_pools); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools); static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */ diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 643a86c49020..2d5589b61e9f 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -50,8 +50,7 @@ static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly; #define TSBITS 6 #define TSMASK (((__u32)1 << TSBITS) - 1) -static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], - ipv4_cookie_scratch); +static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv4_cookie_scratch); static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, u32 count, int c) diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 2906ef20795e..aae3e5ca63ea 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -41,8 +41,7 @@ static __u16 const msstab[] = { 9000 - 60, }; -static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], - ipv6_cookie_scratch); +static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv6_cookie_scratch); static u32 cookie_hash(const struct in6_addr *saddr, const struct in6_addr *daddr, __be16 sport, __be16 dport, u32 count, int c) diff --git a/net/rds/page.c b/net/rds/page.c index 5a14e6d6a926..616f21f4e7d7 100644 --- a/net/rds/page.c +++ b/net/rds/page.c @@ -42,8 +42,8 @@ struct rds_page_remainder { unsigned long r_offset; }; -static DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, - rds_page_remainders); +static +DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders); /* * returns 0 on success or -errno on failure. -- cgit From 07061aab2f750bbf61337b922aa8a245b5da85e1 Mon Sep 17 00:00:00 2001 From: Andreas Ziegler Date: Tue, 15 Mar 2016 14:55:33 -0700 Subject: mm: fix two typos in comments for to_vmem_altmap() Commit 4b94ffdc4163 ("x86, mm: introduce vmem_altmap to augment vmemmap_populate()"), introduced the to_vmem_altmap() function. The comments in this function contain two typos (one misspelling of the Kconfig option CONFIG_SPARSEMEM_VMEMMAP, and one missing letter 'n'), let's fix them up. Signed-off-by: Andreas Ziegler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index fb9b88787ebc..584febd13e2e 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -391,7 +391,7 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) /* * 'memmap_start' is the virtual address for the first "struct * page" in this range of the vmemmap array. In the case of - * CONFIG_SPARSE_VMEMMAP a page_to_pfn conversion is simple + * CONFIG_SPARSEMEM_VMEMMAP a page_to_pfn conversion is simple * pointer arithmetic, so we can perform this to_vmem_altmap() * conversion without concern for the initialization state of * the struct page fields. @@ -400,7 +400,7 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) struct dev_pagemap *pgmap; /* - * Uncoditionally retrieve a dev_pagemap associated with the + * Unconditionally retrieve a dev_pagemap associated with the * given physical address, this is only for use in the * arch_{add|remove}_memory() for setting up and tearing down * the memmap. -- cgit From 1414c7f4f7d72d138fff35f00151d15749b5beda Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Tue, 15 Mar 2016 14:56:30 -0700 Subject: mm/page_poisoning.c: allow for zero poisoning By default, page poisoning uses a poison value (0xaa) on free. If this is changed to 0, the page is not only sanitized but zeroing on alloc with __GFP_ZERO can be skipped as well. The tradeoff is that detecting corruption from the poisoning is harder to detect. This feature also cannot be used with hibernation since pages are not guaranteed to be zeroed after hibernation. Credit to Grsecurity/PaX team for inspiring this work Signed-off-by: Laura Abbott Acked-by: Rafael J. Wysocki Cc: "Kirill A. Shutemov" Cc: Vlastimil Babka Cc: Michal Hocko Cc: Kees Cook Cc: Mathias Krause Cc: Dave Hansen Cc: Jianyu Zhan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ include/linux/poison.h | 4 ++++ kernel/power/hibernate.c | 17 +++++++++++++++++ mm/Kconfig.debug | 14 ++++++++++++++ mm/page_alloc.c | 11 ++++++++++- mm/page_ext.c | 10 ++++++++-- mm/page_poison.c | 7 +++++-- 7 files changed, 60 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/mm.h b/include/linux/mm.h index 99dcc8f36e28..b97243d6aa49 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2179,10 +2179,12 @@ extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, #ifdef CONFIG_PAGE_POISONING extern bool page_poisoning_enabled(void); extern void kernel_poison_pages(struct page *page, int numpages, int enable); +extern bool page_is_poisoned(struct page *page); #else static inline bool page_poisoning_enabled(void) { return false; } static inline void kernel_poison_pages(struct page *page, int numpages, int enable) { } +static inline bool page_is_poisoned(struct page *page) { return false; } #endif #ifdef CONFIG_DEBUG_PAGEALLOC diff --git a/include/linux/poison.h b/include/linux/poison.h index 4a27153574e2..51334edec506 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -30,7 +30,11 @@ #define TIMER_ENTRY_STATIC ((void *) 0x300 + POISON_POINTER_DELTA) /********** mm/debug-pagealloc.c **********/ +#ifdef CONFIG_PAGE_POISONING_ZERO +#define PAGE_POISON 0x00 +#else #define PAGE_POISON 0xaa +#endif /********** mm/page_alloc.c ************/ diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index b7342a24f559..aa0f26b58426 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1158,6 +1158,22 @@ static int __init kaslr_nohibernate_setup(char *str) return nohibernate_setup(str); } +static int __init page_poison_nohibernate_setup(char *str) +{ +#ifdef CONFIG_PAGE_POISONING_ZERO + /* + * The zeroing option for page poison skips the checks on alloc. + * since hibernation doesn't save free pages there's no way to + * guarantee the pages will still be zeroed. + */ + if (!strcmp(str, "on")) { + pr_info("Disabling hibernation due to page poisoning\n"); + return nohibernate_setup(str); + } +#endif + return 1; +} + __setup("noresume", noresume_setup); __setup("resume_offset=", resume_offset_setup); __setup("resume=", resume_setup); @@ -1166,3 +1182,4 @@ __setup("resumewait", resumewait_setup); __setup("resumedelay=", resumedelay_setup); __setup("nohibernate", nohibernate_setup); __setup("kaslr", kaslr_nohibernate_setup); +__setup("page_poison=", page_poison_nohibernate_setup); diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 1f99f9a0deae..5c50b238b770 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -65,3 +65,17 @@ config PAGE_POISONING_NO_SANITY If you are only interested in sanitization, say Y. Otherwise say N. + +config PAGE_POISONING_ZERO + bool "Use zero for poisoning instead of random data" + depends on PAGE_POISONING + ---help--- + Instead of using the existing poison value, fill the pages with + zeros. This makes it harder to detect when errors are occurring + due to sanitization but the zeroing at free means that it is + no longer necessary to write zeros when GFP_ZERO is used on + allocation. + + Enabling page poisoning with this option will disable hibernation + + If unsure, say N diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2a08349fbab2..50897dcaefdb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1405,15 +1405,24 @@ static inline int check_new_page(struct page *page) return 0; } +static inline bool free_pages_prezeroed(bool poisoned) +{ + return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && + page_poisoning_enabled() && poisoned; +} + static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, int alloc_flags) { int i; + bool poisoned = true; for (i = 0; i < (1 << order); i++) { struct page *p = page + i; if (unlikely(check_new_page(p))) return 1; + if (poisoned) + poisoned &= page_is_poisoned(p); } set_page_private(page, 0); @@ -1424,7 +1433,7 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, kernel_poison_pages(page, 1 << order, 1); kasan_alloc_pages(page, order); - if (gfp_flags & __GFP_ZERO) + if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO)) for (i = 0; i < (1 << order); i++) clear_highpage(page + i); diff --git a/mm/page_ext.c b/mm/page_ext.c index 292ca7b8debd..2d864e64f7fe 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -106,12 +106,15 @@ struct page_ext *lookup_page_ext(struct page *page) struct page_ext *base; base = NODE_DATA(page_to_nid(page))->node_page_ext; -#ifdef CONFIG_DEBUG_VM +#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. + * + * This check is also necessary for ensuring page poisoning + * works as expected when enabled */ if (unlikely(!base)) return NULL; @@ -180,12 +183,15 @@ struct page_ext *lookup_page_ext(struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); -#ifdef CONFIG_DEBUG_VM +#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. + * + * This check is also necessary for ensuring page poisoning + * works as expected when enabled */ if (!section->page_ext) return NULL; diff --git a/mm/page_poison.c b/mm/page_poison.c index 89d3bc773633..479e7ea2bea6 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -71,11 +71,14 @@ static inline void clear_page_poison(struct page *page) __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); } -static inline bool page_poison(struct page *page) +bool page_is_poisoned(struct page *page) { struct page_ext *page_ext; page_ext = lookup_page_ext(page); + if (!page_ext) + return false; + return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); } @@ -137,7 +140,7 @@ static void unpoison_page(struct page *page) { void *addr; - if (!page_poison(page)) + if (!page_is_poisoned(page)) return; addr = kmap_atomic(page); -- cgit From 2213e9a66bb87d8344a1256b4ef568220d9587fb Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 15 Mar 2016 14:58:19 -0700 Subject: kallsyms: add support for relative offsets in kallsyms address table Similar to how relative extables are implemented, it is possible to emit the kallsyms table in such a way that it contains offsets relative to some anchor point in the kernel image rather than absolute addresses. On 64-bit architectures, it cuts the size of the kallsyms address table in half, since offsets between kernel symbols can typically be expressed in 32 bits. This saves several hundreds of kilobytes of permanent .rodata on average. In addition, the kallsyms address table is no longer subject to dynamic relocation when CONFIG_RELOCATABLE is in effect, so the relocation work done after decompression now doesn't have to do relocation updates for all these values. This saves up to 24 bytes (i.e., the size of a ELF64 RELA relocation table entry) per value, which easily adds up to a couple of megabytes of uncompressed __init data on ppc64 or arm64. Even if these relocation entries typically compress well, the combined size reduction of 2.8 MB uncompressed for a ppc64_defconfig build (of which 2.4 MB is __init data) results in a ~500 KB space saving in the compressed image. Since it is useful for some architectures (like x86) to retain the ability to emit absolute values as well, this patch also adds support for capturing both absolute and relative values when KALLSYMS_ABSOLUTE_PERCPU is in effect, by emitting absolute per-cpu addresses as positive 32-bit values, and addresses relative to the lowest encountered relative symbol as negative values, which are subtracted from the runtime address of this base symbol to produce the actual address. Support for the above is enabled by default for all architectures except IA-64 and Tile-GX, whose symbols are too far apart to capture in this manner. Signed-off-by: Ard Biesheuvel Tested-by: Guenter Roeck Reviewed-by: Kees Cook Tested-by: Kees Cook Cc: Heiko Carstens Cc: Michael Ellerman Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Benjamin Herrenschmidt Cc: Michal Marek Cc: Rusty Russell Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/Kconfig | 18 +++++++++++ kernel/kallsyms.c | 42 ++++++++++++++++++++------ scripts/kallsyms.c | 79 ++++++++++++++++++++++++++++++++++++++++++------- scripts/link-vmlinux.sh | 4 +++ scripts/namespace.pl | 2 ++ 5 files changed, 126 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index b17824a875fa..fd664b3ab99e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1424,6 +1424,24 @@ config KALLSYMS_ABSOLUTE_PERCPU bool default X86_64 && SMP +config KALLSYMS_BASE_RELATIVE + bool + depends on KALLSYMS + default !IA64 && !(TILE && 64BIT) + help + Instead of emitting them as absolute values in the native word size, + emit the symbol references in the kallsyms table as 32-bit entries, + each containing a relative value in the range [base, base + U32_MAX] + or, when KALLSYMS_ABSOLUTE_PERCPU is in effect, each containing either + an absolute value in the range [0, S32_MAX] or a relative value in the + range [base, base + S32_MAX], where base is the lowest relative symbol + address encountered in the image. + + On 64-bit builds, this reduces the size of the address table by 50%, + but more importantly, it results in entries whose values are build + time constants, and no relocation pass is required at runtime to fix + up the entries based on the runtime load address of the kernel. + config PRINTK default y bool "Enable support for printk" if EXPERT diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 5c5987f10819..fafd1a3ef0da 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -38,6 +38,7 @@ * during the second link stage. */ extern const unsigned long kallsyms_addresses[] __weak; +extern const int kallsyms_offsets[] __weak; extern const u8 kallsyms_names[] __weak; /* @@ -47,6 +48,9 @@ extern const u8 kallsyms_names[] __weak; extern const unsigned long kallsyms_num_syms __attribute__((weak, section(".rodata"))); +extern const unsigned long kallsyms_relative_base +__attribute__((weak, section(".rodata"))); + extern const u8 kallsyms_token_table[] __weak; extern const u16 kallsyms_token_index[] __weak; @@ -176,6 +180,23 @@ static unsigned int get_symbol_offset(unsigned long pos) return name - kallsyms_names; } +static unsigned long kallsyms_sym_address(int idx) +{ + if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE)) + return kallsyms_addresses[idx]; + + /* values are unsigned offsets if --absolute-percpu is not in effect */ + if (!IS_ENABLED(CONFIG_KALLSYMS_ABSOLUTE_PERCPU)) + return kallsyms_relative_base + (u32)kallsyms_offsets[idx]; + + /* ...otherwise, positive offsets are absolute values */ + if (kallsyms_offsets[idx] >= 0) + return kallsyms_offsets[idx]; + + /* ...and negative offsets are relative to kallsyms_relative_base - 1 */ + return kallsyms_relative_base - 1 - kallsyms_offsets[idx]; +} + /* Lookup the address for this symbol. Returns 0 if not found. */ unsigned long kallsyms_lookup_name(const char *name) { @@ -187,7 +208,7 @@ unsigned long kallsyms_lookup_name(const char *name) off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); if (strcmp(namebuf, name) == 0) - return kallsyms_addresses[i]; + return kallsyms_sym_address(i); } return module_kallsyms_lookup_name(name); } @@ -204,7 +225,7 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, for (i = 0, off = 0; i < kallsyms_num_syms; i++) { off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); - ret = fn(data, namebuf, NULL, kallsyms_addresses[i]); + ret = fn(data, namebuf, NULL, kallsyms_sym_address(i)); if (ret != 0) return ret; } @@ -220,7 +241,10 @@ static unsigned long get_symbol_pos(unsigned long addr, unsigned long i, low, high, mid; /* This kernel should never had been booted. */ - BUG_ON(!kallsyms_addresses); + if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE)) + BUG_ON(!kallsyms_addresses); + else + BUG_ON(!kallsyms_offsets); /* Do a binary search on the sorted kallsyms_addresses array. */ low = 0; @@ -228,7 +252,7 @@ static unsigned long get_symbol_pos(unsigned long addr, while (high - low > 1) { mid = low + (high - low) / 2; - if (kallsyms_addresses[mid] <= addr) + if (kallsyms_sym_address(mid) <= addr) low = mid; else high = mid; @@ -238,15 +262,15 @@ static unsigned long get_symbol_pos(unsigned long addr, * Search for the first aliased symbol. Aliased * symbols are symbols with the same address. */ - while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low]) + while (low && kallsyms_sym_address(low-1) == kallsyms_sym_address(low)) --low; - symbol_start = kallsyms_addresses[low]; + symbol_start = kallsyms_sym_address(low); /* Search for next non-aliased symbol. */ for (i = low + 1; i < kallsyms_num_syms; i++) { - if (kallsyms_addresses[i] > symbol_start) { - symbol_end = kallsyms_addresses[i]; + if (kallsyms_sym_address(i) > symbol_start) { + symbol_end = kallsyms_sym_address(i); break; } } @@ -470,7 +494,7 @@ static unsigned long get_ksymbol_core(struct kallsym_iter *iter) unsigned off = iter->nameoff; iter->module_name[0] = '\0'; - iter->value = kallsyms_addresses[iter->pos]; + iter->value = kallsyms_sym_address(iter->pos); iter->type = kallsyms_get_symbol_type(off); diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c index d39a1eeb080e..638b143ee60f 100644 --- a/scripts/kallsyms.c +++ b/scripts/kallsyms.c @@ -22,6 +22,7 @@ #include #include #include +#include #ifndef ARRAY_SIZE #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0])) @@ -43,6 +44,7 @@ struct addr_range { }; static unsigned long long _text; +static unsigned long long relative_base; static struct addr_range text_ranges[] = { { "_stext", "_etext" }, { "_sinittext", "_einittext" }, @@ -62,6 +64,7 @@ static int all_symbols = 0; static int absolute_percpu = 0; static char symbol_prefix_char = '\0'; static unsigned long long kernel_start_addr = 0; +static int base_relative = 0; int token_profit[0x10000]; @@ -75,7 +78,7 @@ static void usage(void) fprintf(stderr, "Usage: kallsyms [--all-symbols] " "[--symbol-prefix=] " "[--page-offset=] " - "< in.map > out.S\n"); + "[--base-relative] < in.map > out.S\n"); exit(1); } @@ -205,6 +208,8 @@ static int symbol_valid(struct sym_entry *s) */ static char *special_symbols[] = { "kallsyms_addresses", + "kallsyms_offsets", + "kallsyms_relative_base", "kallsyms_num_syms", "kallsyms_names", "kallsyms_markers", @@ -349,16 +354,48 @@ static void write_src(void) printf("\t.section .rodata, \"a\"\n"); - /* Provide proper symbols relocatability by their '_text' - * relativeness. The symbol names cannot be used to construct - * normal symbol references as the list of symbols contains - * symbols that are declared static and are private to their - * .o files. This prevents .tmp_kallsyms.o or any other - * object from referencing them. + /* Provide proper symbols relocatability by their relativeness + * to a fixed anchor point in the runtime image, either '_text' + * for absolute address tables, in which case the linker will + * emit the final addresses at build time. Otherwise, use the + * offset relative to the lowest value encountered of all relative + * symbols, and emit non-relocatable fixed offsets that will be fixed + * up at runtime. + * + * The symbol names cannot be used to construct normal symbol + * references as the list of symbols contains symbols that are + * declared static and are private to their .o files. This prevents + * .tmp_kallsyms.o or any other object from referencing them. */ - output_label("kallsyms_addresses"); + if (!base_relative) + output_label("kallsyms_addresses"); + else + output_label("kallsyms_offsets"); + for (i = 0; i < table_cnt; i++) { - if (!symbol_absolute(&table[i])) { + if (base_relative) { + long long offset; + int overflow; + + if (!absolute_percpu) { + offset = table[i].addr - relative_base; + overflow = (offset < 0 || offset > UINT_MAX); + } else if (symbol_absolute(&table[i])) { + offset = table[i].addr; + overflow = (offset < 0 || offset > INT_MAX); + } else { + offset = relative_base - table[i].addr - 1; + overflow = (offset < INT_MIN || offset >= 0); + } + if (overflow) { + fprintf(stderr, "kallsyms failure: " + "%s symbol value %#llx out of range in relative mode\n", + symbol_absolute(&table[i]) ? "absolute" : "relative", + table[i].addr); + exit(EXIT_FAILURE); + } + printf("\t.long\t%#x\n", (int)offset); + } else if (!symbol_absolute(&table[i])) { if (_text <= table[i].addr) printf("\tPTR\t_text + %#llx\n", table[i].addr - _text); @@ -371,6 +408,12 @@ static void write_src(void) } printf("\n"); + if (base_relative) { + output_label("kallsyms_relative_base"); + printf("\tPTR\t_text - %#llx\n", _text - relative_base); + printf("\n"); + } + output_label("kallsyms_num_syms"); printf("\tPTR\t%d\n", table_cnt); printf("\n"); @@ -695,6 +738,18 @@ static void make_percpus_absolute(void) } } +/* find the minimum non-absolute symbol address */ +static void record_relative_base(void) +{ + unsigned int i; + + relative_base = -1ULL; + for (i = 0; i < table_cnt; i++) + if (!symbol_absolute(&table[i]) && + table[i].addr < relative_base) + relative_base = table[i].addr; +} + int main(int argc, char **argv) { if (argc >= 2) { @@ -713,7 +768,9 @@ int main(int argc, char **argv) } else if (strncmp(argv[i], "--page-offset=", 14) == 0) { const char *p = &argv[i][14]; kernel_start_addr = strtoull(p, NULL, 16); - } else + } else if (strcmp(argv[i], "--base-relative") == 0) + base_relative = 1; + else usage(); } } else if (argc != 1) @@ -722,6 +779,8 @@ int main(int argc, char **argv) read_map(stdin); if (absolute_percpu) make_percpus_absolute(); + if (base_relative) + record_relative_base(); sort_symbols(); optimize_token_table(); write_src(); diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 7a08bf9a9576..453ede9d2f3d 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -90,6 +90,10 @@ kallsyms() kallsymopt="${kallsymopt} --absolute-percpu" fi + if [ -n "${CONFIG_KALLSYMS_BASE_RELATIVE}" ]; then + kallsymopt="${kallsymopt} --base-relative" + fi + local aflags="${KBUILD_AFLAGS} ${KBUILD_AFLAGS_KERNEL} \ ${NOSTDINC_FLAGS} ${LINUXINCLUDE} ${KBUILD_CPPFLAGS}" diff --git a/scripts/namespace.pl b/scripts/namespace.pl index a71be6b7cdec..9f3c9d47a4a5 100755 --- a/scripts/namespace.pl +++ b/scripts/namespace.pl @@ -117,6 +117,8 @@ my %nameexception = ( 'kallsyms_names' => 1, 'kallsyms_num_syms' => 1, 'kallsyms_addresses'=> 1, + 'kallsyms_offsets' => 1, + 'kallsyms_relative_base'=> 1, '__this_module' => 1, '_etext' => 1, '_edata' => 1, -- cgit From 2b021cbf3cb6208f0d40fd2f1869f237934340ed Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 15 Mar 2016 20:43:04 -0400 Subject: cgroup: ignore css_sets associated with dead cgroups during migration Before 2e91fa7f6d45 ("cgroup: keep zombies associated with their original cgroups"), all dead tasks were associated with init_css_set. If a zombie task is requested for migration, while migration prep operations would still be performed on init_css_set, the actual migration would ignore zombie tasks. As init_css_set is always valid, this worked fine. However, after 2e91fa7f6d45, zombie tasks stay with the css_set it was associated with at the time of death. Let's say a task T associated with cgroup A on hierarchy H-1 and cgroup B on hiearchy H-2. After T becomes a zombie, it would still remain associated with A and B. If A only contains zombie tasks, it can be removed. On removal, A gets marked offline but stays pinned until all zombies are drained. At this point, if migration is initiated on T to a cgroup C on hierarchy H-2, migration path would try to prepare T's css_set for migration and trigger the following. WARNING: CPU: 0 PID: 1576 at kernel/cgroup.c:474 cgroup_get+0x121/0x160() CPU: 0 PID: 1576 Comm: bash Not tainted 4.4.0-work+ #289 ... Call Trace: [] dump_stack+0x4e/0x82 [] warn_slowpath_common+0x78/0xb0 [] warn_slowpath_null+0x15/0x20 [] cgroup_get+0x121/0x160 [] link_css_set+0x7b/0x90 [] find_css_set+0x3bc/0x5e0 [] cgroup_migrate_prepare_dst+0x89/0x1f0 [] cgroup_attach_task+0x157/0x230 [] __cgroup_procs_write+0x2b7/0x470 [] cgroup_tasks_write+0xc/0x10 [] cgroup_file_write+0x30/0x1b0 [] kernfs_fop_write+0x13c/0x180 [] __vfs_write+0x23/0xe0 [] vfs_write+0xa4/0x1a0 [] SyS_write+0x44/0xa0 [] entry_SYSCALL_64_fastpath+0x12/0x6f It doesn't make sense to prepare migration for css_sets pointing to dead cgroups as they are guaranteed to contain only zombies which are ignored later during migration. This patch makes cgroup destruction path mark all affected css_sets as dead and updates the migration path to ignore them during preparation. Signed-off-by: Tejun Heo Fixes: 2e91fa7f6d45 ("cgroup: keep zombies associated with their original cgroups") Cc: stable@vger.kernel.org # v4.4+ --- include/linux/cgroup-defs.h | 3 +++ kernel/cgroup.c | 20 ++++++++++++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 34b42f03fcd8..3e39ae5bc799 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -212,6 +212,9 @@ struct css_set { /* all css_task_iters currently walking this cset */ struct list_head task_iters; + /* dead and being drained, ignore for migration */ + bool dead; + /* For RCU-protected deletion */ struct rcu_head rcu_head; }; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e22df5d81e59..d57318950076 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2516,6 +2516,14 @@ static void cgroup_migrate_add_src(struct css_set *src_cset, lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&css_set_lock); + /* + * If ->dead, @src_set is associated with one or more dead cgroups + * and doesn't contain any migratable tasks. Ignore it early so + * that the rest of migration path doesn't get confused by it. + */ + if (src_cset->dead) + return; + src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); if (!list_empty(&src_cset->mg_preload_node)) @@ -5258,6 +5266,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct cgroup_subsys_state *css; + struct cgrp_cset_link *link; int ssid; lockdep_assert_held(&cgroup_mutex); @@ -5278,11 +5287,18 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) return -EBUSY; /* - * Mark @cgrp dead. This prevents further task migration and child - * creation by disabling cgroup_lock_live_group(). + * Mark @cgrp and the associated csets dead. The former prevents + * further task migration and child creation by disabling + * cgroup_lock_live_group(). The latter makes the csets ignored by + * the migration path. */ cgrp->self.flags &= ~CSS_ONLINE; + spin_lock_bh(&css_set_lock); + list_for_each_entry(link, &cgrp->cset_links, cset_link) + link->cset->dead = true; + spin_unlock_bh(&css_set_lock); + /* initiate massacre of all css's */ for_each_css(css, ssid, cgrp) kill_css(css); -- cgit From cfe02a8a973e7e5f66926b8ae38dfce404b19e29 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 15 Mar 2016 00:21:06 +0100 Subject: cgroup: avoid false positive gcc-6 warning When all subsystems are disabled, gcc notices that cgroup_subsys_enabled_key is a zero-length array and that any access to it must be out of bounds: In file included from ../include/linux/cgroup.h:19:0, from ../kernel/cgroup.c:31: ../kernel/cgroup.c: In function 'cgroup_add_cftypes': ../kernel/cgroup.c:261:53: error: array subscript is above array bounds [-Werror=array-bounds] return static_key_enabled(cgroup_subsys_enabled_key[ssid]); ~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~ ../include/linux/jump_label.h:271:40: note: in definition of macro 'static_key_enabled' static_key_count((struct static_key *)x) > 0; \ ^ We should never call the function in this particular case, so this is not a bug. In order to silence the warning, this adds an explicit check for the CGROUP_SUBSYS_COUNT==0 case. Signed-off-by: Arnd Bergmann Signed-off-by: Tejun Heo --- kernel/cgroup.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d57318950076..3fe02c152799 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -246,6 +246,9 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, */ static bool cgroup_ssid_enabled(int ssid) { + if (CGROUP_SUBSYS_COUNT == 0) + return false; + return static_key_enabled(cgroup_subsys_enabled_key[ssid]); } -- cgit From 4c973d1620ae08f5cbe27644c5f5b974c8f594ec Mon Sep 17 00:00:00 2001 From: Jessica Yu Date: Wed, 16 Mar 2016 20:55:38 -0400 Subject: modules: split part of complete_formation() into prepare_coming_module() Put all actions in complete_formation() that are performed after module->state is set to MODULE_STATE_COMING into a separate function prepare_coming_module(). This split prepares for the removal of the livepatch module notifiers in favor of hard-coding function calls to klp_module_{coming,going} in the module loader. The complete_formation -> prepare_coming_module split will also make error handling easier since we can jump to the appropriate error label to do any module GOING cleanup after all the COMING-actions have completed. Signed-off-by: Jessica Yu Reviewed-by: Josh Poimboeuf Reviewed-by: Petr Mladek Acked-by: Rusty Russell Signed-off-by: Jiri Kosina --- kernel/module.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 794ebe8e878d..6dbfad415d51 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3392,9 +3392,6 @@ static int complete_formation(struct module *mod, struct load_info *info) mod->state = MODULE_STATE_COMING; mutex_unlock(&module_mutex); - ftrace_module_enable(mod); - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_COMING, mod); return 0; out: @@ -3402,6 +3399,14 @@ out: return err; } +static int prepare_coming_module(struct module *mod) +{ + ftrace_module_enable(mod); + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_COMING, mod); + return 0; +} + static int unknown_module_param_cb(char *param, char *val, const char *modname, void *arg) { @@ -3516,13 +3521,17 @@ static int load_module(struct load_info *info, const char __user *uargs, if (err) goto ddebug_cleanup; + err = prepare_coming_module(mod); + if (err) + goto bug_cleanup; + /* Module is ready to execute: parsing args may do that. */ after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, -32768, 32767, mod, unknown_module_param_cb); if (IS_ERR(after_dashes)) { err = PTR_ERR(after_dashes); - goto bug_cleanup; + goto coming_cleanup; } else if (after_dashes) { pr_warn("%s: parameters '%s' after `--' ignored\n", mod->name, after_dashes); @@ -3531,7 +3540,7 @@ static int load_module(struct load_info *info, const char __user *uargs, /* Link in to syfs. */ err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); if (err < 0) - goto bug_cleanup; + goto coming_cleanup; /* Get rid of temporary copy. */ free_copy(info); @@ -3541,15 +3550,16 @@ static int load_module(struct load_info *info, const char __user *uargs, return do_init_module(mod); + coming_cleanup: + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_GOING, mod); + bug_cleanup: /* module_bug_cleanup needs module_mutex protection */ mutex_lock(&module_mutex); module_bug_cleanup(mod); mutex_unlock(&module_mutex); - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_GOING, mod); - /* we can't deallocate the module until we clear memory protection */ module_disable_ro(mod); module_disable_nx(mod); -- cgit From 7e545d6eca20ce8ef7f66a63146cbff82b2ba760 Mon Sep 17 00:00:00 2001 From: Jessica Yu Date: Wed, 16 Mar 2016 20:55:39 -0400 Subject: livepatch/module: remove livepatch module notifier Remove the livepatch module notifier in favor of directly enabling and disabling patches to modules in the module loader. Hard-coding the function calls ensures that ftrace_module_enable() is run before klp_module_coming() during module load, and that klp_module_going() is run before ftrace_release_mod() during module unload. This way, ftrace and livepatch code is run in the correct order during the module load/unload sequence without dependence on the module notifier call chain. Signed-off-by: Jessica Yu Reviewed-by: Petr Mladek Acked-by: Josh Poimboeuf Acked-by: Rusty Russell Signed-off-by: Jiri Kosina --- include/linux/livepatch.h | 13 ++++ kernel/livepatch/core.c | 147 ++++++++++++++++++++++------------------------ kernel/module.c | 10 ++++ 3 files changed, 94 insertions(+), 76 deletions(-) (limited to 'kernel') diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index c05679701e10..bd830d590465 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -24,6 +24,8 @@ #include #include +#if IS_ENABLED(CONFIG_LIVEPATCH) + #include enum klp_state { @@ -132,4 +134,15 @@ int klp_unregister_patch(struct klp_patch *); int klp_enable_patch(struct klp_patch *); int klp_disable_patch(struct klp_patch *); +/* Called from the module loader during module coming/going states */ +int klp_module_coming(struct module *mod); +void klp_module_going(struct module *mod); + +#else /* !CONFIG_LIVEPATCH */ + +static inline int klp_module_coming(struct module *mod) { return 0; } +static inline void klp_module_going(struct module *mod) { } + +#endif /* CONFIG_LIVEPATCH */ + #endif /* _LINUX_LIVEPATCH_H_ */ diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 780f00cdb7e5..d68fbf63b083 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -99,12 +99,12 @@ static void klp_find_object_module(struct klp_object *obj) /* * We do not want to block removal of patched modules and therefore * we do not take a reference here. The patches are removed by - * a going module handler instead. + * klp_module_going() instead. */ mod = find_module(obj->name); /* - * Do not mess work of the module coming and going notifiers. - * Note that the patch might still be needed before the going handler + * Do not mess work of klp_module_coming() and klp_module_going(). + * Note that the patch might still be needed before klp_module_going() * is called. Module functions can be called even in the GOING state * until mod->exit() finishes. This is especially important for * patches that modify semantic of the functions. @@ -866,103 +866,108 @@ int klp_register_patch(struct klp_patch *patch) } EXPORT_SYMBOL_GPL(klp_register_patch); -static int klp_module_notify_coming(struct klp_patch *patch, - struct klp_object *obj) +int klp_module_coming(struct module *mod) { - struct module *pmod = patch->mod; - struct module *mod = obj->mod; int ret; + struct klp_patch *patch; + struct klp_object *obj; - ret = klp_init_object_loaded(patch, obj); - if (ret) { - pr_warn("failed to initialize patch '%s' for module '%s' (%d)\n", - pmod->name, mod->name, ret); - return ret; - } + if (WARN_ON(mod->state != MODULE_STATE_COMING)) + return -EINVAL; - if (patch->state == KLP_DISABLED) - return 0; + mutex_lock(&klp_mutex); + /* + * Each module has to know that klp_module_coming() + * has been called. We never know what module will + * get patched by a new patch. + */ + mod->klp_alive = true; - pr_notice("applying patch '%s' to loading module '%s'\n", - pmod->name, mod->name); + list_for_each_entry(patch, &klp_patches, list) { + klp_for_each_object(patch, obj) { + if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) + continue; - ret = klp_enable_object(obj); - if (ret) - pr_warn("failed to apply patch '%s' to module '%s' (%d)\n", - pmod->name, mod->name, ret); - return ret; -} + obj->mod = mod; -static void klp_module_notify_going(struct klp_patch *patch, - struct klp_object *obj) -{ - struct module *pmod = patch->mod; - struct module *mod = obj->mod; + ret = klp_init_object_loaded(patch, obj); + if (ret) { + pr_warn("failed to initialize patch '%s' for module '%s' (%d)\n", + patch->mod->name, obj->mod->name, ret); + goto err; + } - if (patch->state == KLP_DISABLED) - goto disabled; + if (patch->state == KLP_DISABLED) + break; + + pr_notice("applying patch '%s' to loading module '%s'\n", + patch->mod->name, obj->mod->name); + + ret = klp_enable_object(obj); + if (ret) { + pr_warn("failed to apply patch '%s' to module '%s' (%d)\n", + patch->mod->name, obj->mod->name, ret); + goto err; + } + + break; + } + } - pr_notice("reverting patch '%s' on unloading module '%s'\n", - pmod->name, mod->name); + mutex_unlock(&klp_mutex); - klp_disable_object(obj); + return 0; -disabled: +err: + /* + * If a patch is unsuccessfully applied, return + * error to the module loader. + */ + pr_warn("patch '%s' failed for module '%s', refusing to load module '%s'\n", + patch->mod->name, obj->mod->name, obj->mod->name); + mod->klp_alive = false; klp_free_object_loaded(obj); + mutex_unlock(&klp_mutex); + + return ret; } -static int klp_module_notify(struct notifier_block *nb, unsigned long action, - void *data) +void klp_module_going(struct module *mod) { - int ret; - struct module *mod = data; struct klp_patch *patch; struct klp_object *obj; - if (action != MODULE_STATE_COMING && action != MODULE_STATE_GOING) - return 0; + if (WARN_ON(mod->state != MODULE_STATE_GOING && + mod->state != MODULE_STATE_COMING)) + return; mutex_lock(&klp_mutex); - /* - * Each module has to know that the notifier has been called. - * We never know what module will get patched by a new patch. + * Each module has to know that klp_module_going() + * has been called. We never know what module will + * get patched by a new patch. */ - if (action == MODULE_STATE_COMING) - mod->klp_alive = true; - else /* MODULE_STATE_GOING */ - mod->klp_alive = false; + mod->klp_alive = false; list_for_each_entry(patch, &klp_patches, list) { klp_for_each_object(patch, obj) { if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) continue; - if (action == MODULE_STATE_COMING) { - obj->mod = mod; - ret = klp_module_notify_coming(patch, obj); - if (ret) { - obj->mod = NULL; - pr_warn("patch '%s' is in an inconsistent state!\n", - patch->mod->name); - } - } else /* MODULE_STATE_GOING */ - klp_module_notify_going(patch, obj); + if (patch->state != KLP_DISABLED) { + pr_notice("reverting patch '%s' on unloading module '%s'\n", + patch->mod->name, obj->mod->name); + klp_disable_object(obj); + } + klp_free_object_loaded(obj); break; } } mutex_unlock(&klp_mutex); - - return 0; } -static struct notifier_block klp_module_nb = { - .notifier_call = klp_module_notify, - .priority = INT_MIN+1, /* called late but before ftrace notifier */ -}; - static int __init klp_init(void) { int ret; @@ -973,21 +978,11 @@ static int __init klp_init(void) return -EINVAL; } - ret = register_module_notifier(&klp_module_nb); - if (ret) - return ret; - klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj); - if (!klp_root_kobj) { - ret = -ENOMEM; - goto unregister; - } + if (!klp_root_kobj) + return -ENOMEM; return 0; - -unregister: - unregister_module_notifier(&klp_module_nb); - return ret; } module_init(klp_init); diff --git a/kernel/module.c b/kernel/module.c index 6dbfad415d51..4b65fbb10bdc 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -984,6 +985,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, mod->exit(); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); + klp_module_going(mod); ftrace_release_mod(mod); async_synchronize_full(); @@ -3315,6 +3317,7 @@ fail: module_put(mod); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); + klp_module_going(mod); ftrace_release_mod(mod); free_module(mod); wake_up_all(&module_wq); @@ -3401,7 +3404,13 @@ out: static int prepare_coming_module(struct module *mod) { + int err; + ftrace_module_enable(mod); + err = klp_module_coming(mod); + if (err) + return err; + blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); return 0; @@ -3553,6 +3562,7 @@ static int load_module(struct load_info *info, const char __user *uargs, coming_cleanup: blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); + klp_module_going(mod); bug_cleanup: /* module_bug_cleanup needs module_mutex protection */ -- cgit From a1ee1932aa6bea0bb074f5e3ced112664e4637ed Mon Sep 17 00:00:00 2001 From: Joshua Hunt Date: Thu, 17 Mar 2016 14:17:23 -0700 Subject: watchdog: don't run proc_watchdog_update if new value is same as old While working on a script to restore all sysctl params before a series of tests I found that writing any value into the /proc/sys/kernel/{nmi_watchdog,soft_watchdog,watchdog,watchdog_thresh} causes them to call proc_watchdog_update(). NMI watchdog: enabled on all CPUs, permanently consumes one hw-PMU counter. NMI watchdog: enabled on all CPUs, permanently consumes one hw-PMU counter. NMI watchdog: enabled on all CPUs, permanently consumes one hw-PMU counter. NMI watchdog: enabled on all CPUs, permanently consumes one hw-PMU counter. There doesn't appear to be a reason for doing this work every time a write occurs, so only do it when the values change. Signed-off-by: Josh Hunt Acked-by: Don Zickus Reviewed-by: Aaron Tomlin Cc: Ulrich Obergfell Cc: [4.1.x+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b3ace6ebbba3..9acb29f280ec 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -923,6 +923,9 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, * both lockup detectors are disabled if proc_watchdog_update() * returns an error. */ + if (old == new) + goto out; + err = proc_watchdog_update(); } out: @@ -967,7 +970,7 @@ int proc_soft_watchdog(struct ctl_table *table, int write, int proc_watchdog_thresh(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int err, old; + int err, old, new; get_online_cpus(); mutex_lock(&watchdog_proc_mutex); @@ -987,6 +990,10 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, /* * Update the sample period. Restore on failure. */ + new = ACCESS_ONCE(watchdog_thresh); + if (old == new) + goto out; + set_sample_period(); err = proc_watchdog_update(); if (err) { -- cgit From 12580e4b54ba8a1b22ec977c200be0174ca42348 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:17:38 -0700 Subject: mm: memcontrol: report kernel stack usage in cgroup2 memory.stat Show how much memory is allocated to kernel stacks. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroup-v2.txt | 4 ++++ include/linux/memcontrol.h | 3 ++- kernel/fork.c | 10 +++++++++- mm/memcontrol.c | 2 ++ 4 files changed, 17 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index e4e0c1d78cee..e2f4e7948a66 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -843,6 +843,10 @@ PAGE_SIZE multiple when read back. Amount of memory used to cache filesystem data, including tmpfs and shared memory. + kernel_stack + + Amount of memory allocated to kernel stacks. + slab Amount of memory used for storing in-kernel data diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e7af4834ffea..d6300313b298 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -52,9 +52,10 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ MEM_CGROUP_STAT_NSTATS, /* default hierarchy stats */ - MEMCG_SOCK = MEM_CGROUP_STAT_NSTATS, + MEMCG_KERNEL_STACK = MEM_CGROUP_STAT_NSTATS, MEMCG_SLAB_RECLAIMABLE, MEMCG_SLAB_UNRECLAIMABLE, + MEMCG_SOCK, MEMCG_NR_STAT, }; diff --git a/kernel/fork.c b/kernel/fork.c index 2e391c754ae7..accb7221d547 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -164,12 +164,20 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); + if (page) + memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, + 1 << THREAD_SIZE_ORDER); + return page ? page_address(page) : NULL; } static inline void free_thread_info(struct thread_info *ti) { - free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); + struct page *page = virt_to_page(ti); + + memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, + -(1 << THREAD_SIZE_ORDER)); + __free_kmem_pages(page, THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_info_cache; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3ad64bf464fd..4b7dda7c2e74 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5106,6 +5106,8 @@ static int memory_stat_show(struct seq_file *m, void *v) (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE); seq_printf(m, "file %llu\n", (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE); + seq_printf(m, "kernel_stack %llu\n", + (u64)stat[MEMCG_KERNEL_STACK] * PAGE_SIZE); seq_printf(m, "slab %llu\n", (u64)(stat[MEMCG_SLAB_RECLAIMABLE] + stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); -- cgit From 795ae7a0de6b834a0cc202aa55c190ef81496665 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 17 Mar 2016 14:19:14 -0700 Subject: mm: scale kswapd watermarks in proportion to memory In machines with 140G of memory and enterprise flash storage, we have seen read and write bursts routinely exceed the kswapd watermarks and cause thundering herds in direct reclaim. Unfortunately, the only way to tune kswapd aggressiveness is through adjusting min_free_kbytes - the system's emergency reserves - which is entirely unrelated to the system's latency requirements. In order to get kswapd to maintain a 250M buffer of free memory, the emergency reserves need to be set to 1G. That is a lot of memory wasted for no good reason. On the other hand, it's reasonable to assume that allocation bursts and overall allocation concurrency scale with memory capacity, so it makes sense to make kswapd aggressiveness a function of that as well. Change the kswapd watermark scale factor from the currently fixed 25% of the tunable emergency reserve to a tunable 0.1% of memory. Beyond 1G of memory, this will produce bigger watermark steps than the current formula in default settings. Ensure that the new formula never chooses steps smaller than that, i.e. 25% of the emergency reserve. On a 140G machine, this raises the default watermark steps - the distance between min and low, and low and high - from 16M to 143M. Signed-off-by: Johannes Weiner Acked-by: Mel Gorman Acked-by: Rik van Riel Acked-by: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 18 ++++++++++++++++++ include/linux/mm.h | 1 + include/linux/mmzone.h | 2 ++ kernel/sysctl.c | 10 ++++++++++ mm/page_alloc.c | 29 +++++++++++++++++++++++++++-- 5 files changed, 58 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 89a887c76629..cb0368459da3 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -803,6 +803,24 @@ performance impact. Reclaim code needs to take various locks to find freeable directory and inode objects. With vfs_cache_pressure=1000, it will look for ten times more freeable objects than there are. +============================================================= + +watermark_scale_factor: + +This factor controls the aggressiveness of kswapd. It defines the +amount of memory left in a node/system before kswapd is woken up and +how much memory needs to be free before kswapd goes back to sleep. + +The unit is in fractions of 10,000. The default value of 10 means the +distances between watermarks are 0.1% of the available memory in the +node/system. The maximum value is 1000, or 10% of memory. + +A high rate of threads entering direct reclaim (allocstall) or kswapd +going to sleep prematurely (kswapd_low_wmark_hit_quickly) can indicate +that the number of free pages kswapd maintains for latency reasons is +too small for the allocation bursts occurring in the system. This knob +can then be used to tune kswapd aggressiveness accordingly. + ============================================================== zone_reclaim_mode: diff --git a/include/linux/mm.h b/include/linux/mm.h index 75d1907b9009..f7fd64227d3a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1889,6 +1889,7 @@ extern void zone_pcp_reset(struct zone *zone); /* page_alloc.c */ extern int min_free_kbytes; +extern int watermark_scale_factor; /* nommu.c */ extern atomic_long_t mmap_pages_allocated; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index bdd9a270a813..c60df9257cc7 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -841,6 +841,8 @@ static inline int is_highmem(struct zone *zone) struct ctl_table; int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); +int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, + void __user *, size_t *, loff_t *); extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1]; int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f5102fabef7f..725587f10667 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -126,6 +126,7 @@ static int __maybe_unused two = 2; static int __maybe_unused four = 4; static unsigned long one_ul = 1; static int one_hundred = 100; +static int one_thousand = 1000; #ifdef CONFIG_PRINTK static int ten_thousand = 10000; #endif @@ -1403,6 +1404,15 @@ static struct ctl_table vm_table[] = { .proc_handler = min_free_kbytes_sysctl_handler, .extra1 = &zero, }, + { + .procname = "watermark_scale_factor", + .data = &watermark_scale_factor, + .maxlen = sizeof(watermark_scale_factor), + .mode = 0644, + .proc_handler = watermark_scale_factor_sysctl_handler, + .extra1 = &one, + .extra2 = &one_thousand, + }, { .procname = "percpu_pagelist_fraction", .data = &percpu_pagelist_fraction, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 941b802e11ec..d156310aedeb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -249,6 +249,7 @@ compound_page_dtor * const compound_page_dtors[] = { int min_free_kbytes = 1024; int user_min_free_kbytes = -1; +int watermark_scale_factor = 10; static unsigned long __meminitdata nr_kernel_pages; static unsigned long __meminitdata nr_all_pages; @@ -6347,8 +6348,17 @@ static void __setup_per_zone_wmarks(void) zone->watermark[WMARK_MIN] = tmp; } - zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); - zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); + /* + * Set the kswapd watermarks distance according to the + * scale factor in proportion to available memory, but + * ensure a minimum size on small systems. + */ + tmp = max_t(u64, tmp >> 2, + mult_frac(zone->managed_pages, + watermark_scale_factor, 10000)); + + zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; + zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; __mod_zone_page_state(zone, NR_ALLOC_BATCH, high_wmark_pages(zone) - low_wmark_pages(zone) - @@ -6489,6 +6499,21 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, return 0; } +int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int rc; + + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (rc) + return rc; + + if (write) + setup_per_zone_wmarks(); + + return 0; +} + #ifdef CONFIG_NUMA int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) -- cgit From da8b44d5a9f8bf26da637b7336508ca534d6b319 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 17 Mar 2016 14:20:51 -0700 Subject: timer: convert timer_slack_ns from unsigned long to u64 This patchset introduces a /proc//timerslack_ns interface which would allow controlling processes to be able to set the timerslack value on other processes in order to save power by avoiding wakeups (Something Android currently does via out-of-tree patches). The first patch tries to fix the internal timer_slack_ns usage which was defined as a long, which limits the slack range to ~4 seconds on 32bit systems. It converts it to a u64, which provides the same basically unlimited slack (500 years) on both 32bit and 64bit machines. The second patch introduces the /proc//timerslack_ns interface which allows the full 64bit slack range for a task to be read or set on both 32bit and 64bit machines. With these two patches, on a 32bit machine, after setting the slack on bash to 10 seconds: $ time sleep 1 real 0m10.747s user 0m0.001s sys 0m0.005s The first patch is a little ugly, since I had to chase the slack delta arguments through a number of functions converting them to u64s. Let me know if it makes sense to break that up more or not. Other than that things are fairly straightforward. This patch (of 2): The timer_slack_ns value in the task struct is currently a unsigned long. This means that on 32bit applications, the maximum slack is just over 4 seconds. However, on 64bit machines, its much much larger (~500 years). This disparity could make application development a little (as well as the default_slack) to a u64. This means both 32bit and 64bit systems have the same effective internal slack range. Now the existing ABI via PR_GET_TIMERSLACK and PR_SET_TIMERSLACK specify the interface as a unsigned long, so we preserve that limitation on 32bit systems, where SET_TIMERSLACK can only set the slack to a unsigned long value, and GET_TIMERSLACK will return ULONG_MAX if the slack is actually larger then what can be stored by an unsigned long. This patch also modifies hrtimer functions which specified the slack delta as a unsigned long. Signed-off-by: John Stultz Cc: Arjan van de Ven Cc: Thomas Gleixner Cc: Oren Laadan Cc: Ruchi Kandoi Cc: Rom Lemarchand Cc: Kees Cook Cc: Android Kernel Team Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 2 +- fs/select.c | 8 ++++---- include/linux/freezer.h | 2 +- include/linux/hrtimer.h | 12 +++++++----- include/linux/poll.h | 2 +- include/linux/sched.h | 4 ++-- kernel/sys.c | 5 ++++- kernel/time/hrtimer.c | 8 ++++---- kernel/time/timer.c | 4 ++-- 9 files changed, 26 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index cde60741cad2..8a74a2a52e0f 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1616,7 +1616,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, { int res = 0, eavail, timed_out = 0; unsigned long flags; - long slack = 0; + u64 slack = 0; wait_queue_t wait; ktime_t expires, *to = NULL; diff --git a/fs/select.c b/fs/select.c index 79d0d4953cad..869293988c2a 100644 --- a/fs/select.c +++ b/fs/select.c @@ -70,9 +70,9 @@ static long __estimate_accuracy(struct timespec *tv) return slack; } -long select_estimate_accuracy(struct timespec *tv) +u64 select_estimate_accuracy(struct timespec *tv) { - unsigned long ret; + u64 ret; struct timespec now; /* @@ -402,7 +402,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) struct poll_wqueues table; poll_table *wait; int retval, i, timed_out = 0; - unsigned long slack = 0; + u64 slack = 0; unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; unsigned long busy_end = 0; @@ -784,7 +784,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait, poll_table* pt = &wait->pt; ktime_t expire, *to = NULL; int timed_out = 0, count = 0; - unsigned long slack = 0; + u64 slack = 0; unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; unsigned long busy_end = 0; diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 6b7fd9cf5ea2..dd03e837ebb7 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -231,7 +231,7 @@ static inline long freezable_schedule_timeout_killable_unsafe(long timeout) * call this with locks held. */ static inline int freezable_schedule_hrtimeout_range(ktime_t *expires, - unsigned long delta, const enum hrtimer_mode mode) + u64 delta, const enum hrtimer_mode mode) { int __retval; freezer_do_not_count(); diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 2ead22dd74a0..c98c6539e2c2 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -220,7 +220,7 @@ static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time timer->node.expires = ktime_add_safe(time, delta); } -static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, unsigned long delta) +static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, u64 delta) { timer->_softexpires = time; timer->node.expires = ktime_add_safe(time, ns_to_ktime(delta)); @@ -378,7 +378,7 @@ static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { } /* Basic timer operations: */ extern void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long range_ns, const enum hrtimer_mode mode); + u64 range_ns, const enum hrtimer_mode mode); /** * hrtimer_start - (re)start an hrtimer on the current CPU @@ -399,7 +399,7 @@ extern int hrtimer_try_to_cancel(struct hrtimer *timer); static inline void hrtimer_start_expires(struct hrtimer *timer, enum hrtimer_mode mode) { - unsigned long delta; + u64 delta; ktime_t soft, hard; soft = hrtimer_get_softexpires(timer); hard = hrtimer_get_expires(timer); @@ -477,10 +477,12 @@ extern long hrtimer_nanosleep_restart(struct restart_block *restart_block); extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *tsk); -extern int schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, +extern int schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode); extern int schedule_hrtimeout_range_clock(ktime_t *expires, - unsigned long delta, const enum hrtimer_mode mode, int clock); + u64 delta, + const enum hrtimer_mode mode, + int clock); extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode); /* Soft interrupt function to run the hrtimer queues: */ diff --git a/include/linux/poll.h b/include/linux/poll.h index c08386fb3e08..9fb4f40d9a26 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -96,7 +96,7 @@ extern void poll_initwait(struct poll_wqueues *pwq); extern void poll_freewait(struct poll_wqueues *pwq); extern int poll_schedule_timeout(struct poll_wqueues *pwq, int state, ktime_t *expires, unsigned long slack); -extern long select_estimate_accuracy(struct timespec *tv); +extern u64 select_estimate_accuracy(struct timespec *tv); static inline int poll_schedule(struct poll_wqueues *pwq, int state) diff --git a/include/linux/sched.h b/include/linux/sched.h index eb7f2f84009b..3284d07edec7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1792,8 +1792,8 @@ struct task_struct { * time slack values; these are used to round up poll() and * select() etc timeout values. These are in nanoseconds. */ - unsigned long timer_slack_ns; - unsigned long default_timer_slack_ns; + u64 timer_slack_ns; + u64 default_timer_slack_ns; #ifdef CONFIG_KASAN unsigned int kasan_depth; diff --git a/kernel/sys.c b/kernel/sys.c index 78947de6f969..cf8ba545c7d3 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2169,7 +2169,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = perf_event_task_enable(); break; case PR_GET_TIMERSLACK: - error = current->timer_slack_ns; + if (current->timer_slack_ns > ULONG_MAX) + error = ULONG_MAX; + else + error = current->timer_slack_ns; break; case PR_SET_TIMERSLACK: if (arg2 <= 0) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index fa909f9fd559..58a321c34cfb 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -979,7 +979,7 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, * relative (HRTIMER_MODE_REL) */ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long delta_ns, const enum hrtimer_mode mode) + u64 delta_ns, const enum hrtimer_mode mode) { struct hrtimer_clock_base *base, *new_base; unsigned long flags; @@ -1548,7 +1548,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, struct restart_block *restart; struct hrtimer_sleeper t; int ret = 0; - unsigned long slack; + u64 slack; slack = current->timer_slack_ns; if (dl_task(current) || rt_task(current)) @@ -1724,7 +1724,7 @@ void __init hrtimers_init(void) * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME */ int __sched -schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, +schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, const enum hrtimer_mode mode, int clock) { struct hrtimer_sleeper t; @@ -1792,7 +1792,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, * * Returns 0 when the timer has expired otherwise -EINTR */ -int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, +int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode) { return schedule_hrtimeout_range_clock(expires, delta, mode, diff --git a/kernel/time/timer.c b/kernel/time/timer.c index bbc5d1114583..d1798fa0c743 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1698,10 +1698,10 @@ EXPORT_SYMBOL(msleep_interruptible); static void __sched do_usleep_range(unsigned long min, unsigned long max) { ktime_t kmin; - unsigned long delta; + u64 delta; kmin = ktime_set(0, min * NSEC_PER_USEC); - delta = (max - min) * NSEC_PER_USEC; + delta = (u64)(max - min) * NSEC_PER_USEC; schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); } -- cgit From a8199371afc27946d72f0d53e938e78d2ea0bae3 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 17 Mar 2016 14:21:20 -0700 Subject: printk: move can_use_console() out of console_trylock_for_printk() console_unlock() allows to cond_resched() if its caller has set `console_may_schedule' to 1 (this functionality is present since 8d91f8b15361 ("printk: do cond_resched() between lines while outputting to consoles"). The rules are: -- console_lock() always sets `console_may_schedule' to 1 -- console_trylock() always sets `console_may_schedule' to 0 printk() calls console_unlock() with preemption desabled, which basically can lead to RCU stalls, watchdog soft lockups, etc. if something is simultaneously calling printk() frequent enough (IOW, console_sem owner always has new data to send to console divers and can't leave console_unlock() for a long time). printk()->console_trylock() callers do not necessarily execute in atomic contexts, and some of them can cond_resched() in console_unlock(). console_trylock() can set `console_may_schedule' to 1 (allow cond_resched() later in consoe_unlock()) when it's safe. This patch (of 3): vprintk_emit() disables preemption around console_trylock_for_printk() and console_unlock() calls for a strong reason -- can_use_console() check. The thing is that vprintl_emit() can be called on a CPU that is not fully brought up yet (!cpu_online()), which potentially can cause problems if console driver wants to access per-cpu data. A console driver can explicitly state that it's safe to call it from !online cpu by setting CON_ANYTIME bit in console ->flags. That's why for !cpu_online() can_use_console() iterates all the console to find out if there is a CON_ANYTIME console, otherwise console_unlock() must be avoided. can_use_console() ensures that console_unlock() call is safe in vprintk_emit() only; console_lock() and console_trylock() are not covered by this check. Even though call_console_drivers(), invoked from console_cont_flush() and console_unlock(), tests `!cpu_online() && CON_ANYTIME' for_each_console(), it may be too late, which can result in messages loss. Assume that we have 2 cpus -- CPU0 is online, CPU1 is !online, and no CON_ANYTIME consoles available. CPU0 online CPU1 !online console_trylock() ... console_unlock() console_cont_flush spin_lock logbuf_lock if (!cont.len) { spin_unlock logbuf_lock return } for (;;) { vprintk_emit spin_lock logbuf_lock log_store spin_unlock logbuf_lock spin_lock logbuf_lock !console_trylock_for_printk msg_print_text return console_idx = log_next() console_seq++ console_prev = msg->flags spin_unlock logbuf_lock call_console_drivers() for_each_console(con) { if (!cpu_online() && !(con->flags & CON_ANYTIME)) continue; } /* * no message printed, we lost it */ vprintk_emit spin_lock logbuf_lock log_store spin_unlock logbuf_lock !console_trylock_for_printk return /* * go to the beginning of the loop, * find out there are new messages, * lose it */ } console_trylock()/console_lock() call on CPU1 may come from cpu notifiers registered on that CPU. Since notifiers are not getting unregistered when CPU is going DOWN, all of the notifiers receive notifications during CPU UP. For example, on my x86_64, I see around 50 notification sent from offline CPU to itself [swapper/2] from cpu:2 to:2 action:CPU_STARTING hotplug_hrtick [swapper/2] from cpu:2 to:2 action:CPU_STARTING blk_mq_main_cpu_notify [swapper/2] from cpu:2 to:2 action:CPU_STARTING blk_mq_queue_reinit_notify [swapper/2] from cpu:2 to:2 action:CPU_STARTING console_cpu_notify while doing echo 0 > /sys/devices/system/cpu/cpu2/online echo 1 > /sys/devices/system/cpu/cpu2/online So grabbing the console_sem lock while CPU is !online is possible, in theory. This patch moves can_use_console() check out of console_trylock_for_printk(). Instead it calls it in console_unlock(), so now console_lock()/console_unlock() are also 'protected' by can_use_console(). This also means that console_trylock_for_printk() is not really needed anymore and can be removed. Signed-off-by: Sergey Senozhatsky Reviewed-by: Petr Mladek Cc: Jan Kara Cc: Tejun Heo Cc: Kyle McMartin Cc: Dave Jones Cc: Calvin Owens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 97 ++++++++++++++++++++++---------------------------- 1 file changed, 42 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index c963ba534a78..2523332bd998 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1483,58 +1483,6 @@ static void zap_locks(void) sema_init(&console_sem, 1); } -/* - * Check if we have any console that is capable of printing while cpu is - * booting or shutting down. Requires console_sem. - */ -static int have_callable_console(void) -{ - struct console *con; - - for_each_console(con) - if (con->flags & CON_ANYTIME) - return 1; - - return 0; -} - -/* - * Can we actually use the console at this time on this cpu? - * - * Console drivers may assume that per-cpu resources have been allocated. So - * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't - * call them until this CPU is officially up. - */ -static inline int can_use_console(unsigned int cpu) -{ - return cpu_online(cpu) || have_callable_console(); -} - -/* - * Try to get console ownership to actually show the kernel - * messages from a 'printk'. Return true (and with the - * console_lock held, and 'console_locked' set) if it - * is successful, false otherwise. - */ -static int console_trylock_for_printk(void) -{ - unsigned int cpu = smp_processor_id(); - - if (!console_trylock()) - return 0; - /* - * If we can't use the console, we need to release the console - * semaphore by hand to avoid flushing the buffer. We need to hold the - * console semaphore in order to do this test safely. - */ - if (!can_use_console(cpu)) { - console_locked = 0; - up_console_sem(); - return 0; - } - return 1; -} - int printk_delay_msec __read_mostly; static inline void printk_delay(void) @@ -1681,7 +1629,6 @@ asmlinkage int vprintk_emit(int facility, int level, boot_delay_msec(level); printk_delay(); - /* This stops the holder of console_sem just where we want him */ local_irq_save(flags); this_cpu = smp_processor_id(); @@ -1705,6 +1652,7 @@ asmlinkage int vprintk_emit(int facility, int level, } lockdep_off(); + /* This stops the holder of console_sem just where we want him */ raw_spin_lock(&logbuf_lock); logbuf_cpu = this_cpu; @@ -1821,7 +1769,7 @@ asmlinkage int vprintk_emit(int facility, int level, * semaphore. The release will print out buffers and wake up * /dev/kmsg and syslog() users. */ - if (console_trylock_for_printk()) + if (console_trylock()) console_unlock(); preempt_enable(); lockdep_on(); @@ -2184,6 +2132,33 @@ int is_console_locked(void) return console_locked; } +/* + * Check if we have any console that is capable of printing while cpu is + * booting or shutting down. Requires console_sem. + */ +static int have_callable_console(void) +{ + struct console *con; + + for_each_console(con) + if (con->flags & CON_ANYTIME) + return 1; + + return 0; +} + +/* + * Can we actually use the console at this time on this cpu? + * + * Console drivers may assume that per-cpu resources have been allocated. So + * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't + * call them until this CPU is officially up. + */ +static inline int can_use_console(void) +{ + return cpu_online(raw_smp_processor_id()) || have_callable_console(); +} + static void console_cont_flush(char *text, size_t size) { unsigned long flags; @@ -2254,9 +2229,21 @@ void console_unlock(void) do_cond_resched = console_may_schedule; console_may_schedule = 0; +again: + /* + * We released the console_sem lock, so we need to recheck if + * cpu is online and (if not) is there at least one CON_ANYTIME + * console. + */ + if (!can_use_console()) { + console_locked = 0; + up_console_sem(); + return; + } + /* flush buffered message fragment immediately to console */ console_cont_flush(text, sizeof(text)); -again: + for (;;) { struct printk_log *msg; size_t ext_len = 0; -- cgit From 6b97a20d3a7909daa06625d4440c2c52d7bf08d7 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 17 Mar 2016 14:21:23 -0700 Subject: printk: set may_schedule for some of console_trylock() callers console_unlock() allows to cond_resched() if its caller has set `console_may_schedule' to 1, since 8d91f8b15361 ("printk: do cond_resched() between lines while outputting to consoles"). The rules are: -- console_lock() always sets `console_may_schedule' to 1 -- console_trylock() always sets `console_may_schedule' to 0 However, console_trylock() callers (among them is printk()) do not always call printk() from atomic contexts, and some of them can cond_resched() in console_unlock(), so console_trylock() can set `console_may_schedule' to 1 for such processes. For !CONFIG_PREEMPT_COUNT kernels, however, console_trylock() always sets `console_may_schedule' to 0. It's possible to drop explicit preempt_disable()/preempt_enable() in vprintk_emit(), because console_unlock() and console_trylock() are now smart enough: a) console_unlock() does not cond_resched() when it's unsafe (console_trylock() takes care of that) b) console_unlock() does can_use_console() check. Signed-off-by: Sergey Senozhatsky Reviewed-by: Petr Mladek Cc: Jan Kara Cc: Tejun Heo Cc: Kyle McMartin Cc: Dave Jones Cc: Calvin Owens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 2523332bd998..a6d023c3b852 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1757,13 +1757,6 @@ asmlinkage int vprintk_emit(int facility, int level, /* If called from the scheduler, we can not call up(). */ if (!in_sched) { lockdep_off(); - /* - * Disable preemption to avoid being preempted while holding - * console_sem which would prevent anyone from printing to - * console - */ - preempt_disable(); - /* * Try to acquire and then immediately release the console * semaphore. The release will print out buffers and wake up @@ -1771,7 +1764,6 @@ asmlinkage int vprintk_emit(int facility, int level, */ if (console_trylock()) console_unlock(); - preempt_enable(); lockdep_on(); } @@ -2122,7 +2114,20 @@ int console_trylock(void) return 0; } console_locked = 1; - console_may_schedule = 0; + /* + * When PREEMPT_COUNT disabled we can't reliably detect if it's + * safe to schedule (e.g. calling printk while holding a spin_lock), + * because preempt_disable()/preempt_enable() are just barriers there + * and preempt_count() is always 0. + * + * RCU read sections have a separate preemption counter when + * PREEMPT_RCU enabled thus we must take extra care and check + * rcu_preempt_depth(), otherwise RCU read sections modify + * preempt_count(). + */ + console_may_schedule = !oops_in_progress && + preemptible() && + !rcu_preempt_depth(); return 1; } EXPORT_SYMBOL(console_trylock); -- cgit From adaf6590ee7db23c3a124fb9f213c90c15cecf96 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 17 Mar 2016 14:21:27 -0700 Subject: printk: check CON_ENABLED in have_callable_console() have_callable_console() must also test CON_ENABLED bit, not just CON_ANYTIME. We may have disabled CON_ANYTIME console so printk can wrongly assume that it's safe to call_console_drivers(). Signed-off-by: Sergey Senozhatsky Reviewed-by: Petr Mladek Cc: Jan Kara Cc: Tejun Heo Cc: Kyle McMartin Cc: Dave Jones Cc: Calvin Owens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a6d023c3b852..d5fd844e5b08 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2146,7 +2146,8 @@ static int have_callable_console(void) struct console *con; for_each_console(con) - if (con->flags & CON_ANYTIME) + if ((con->flags & CON_ENABLED) && + (con->flags & CON_ANYTIME)) return 1; return 0; -- cgit From f468908bb55a0b01d9424c74f8ec8eb906835150 Mon Sep 17 00:00:00 2001 From: Ivan Delalande Date: Thu, 17 Mar 2016 14:21:30 -0700 Subject: printk: add clear_idx symbol to vmcoreinfo This allows us to extract from the vmcore only the messages emitted since the last time the ring buffer was cleared. We just have to make sure its value is always up-to-date, when old messages are discarded to free space in log_make_free_space() for example. Signed-off-by: Zeyu Zhao Signed-off-by: Ivan Delalande Cc: Kay Sievers Cc: Neil Horman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index d5fd844e5b08..bfbf284e4218 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -367,16 +367,20 @@ static int logbuf_has_space(u32 msg_size, bool empty) static int log_make_free_space(u32 msg_size) { - while (log_first_seq < log_next_seq) { - if (logbuf_has_space(msg_size, false)) - return 0; + while (log_first_seq < log_next_seq && + !logbuf_has_space(msg_size, false)) { /* drop old messages until we have enough contiguous space */ log_first_idx = log_next(log_first_idx); log_first_seq++; } + if (clear_seq < log_first_seq) { + clear_seq = log_first_seq; + clear_idx = log_first_idx; + } + /* sequence numbers are equal, so the log buffer is empty */ - if (logbuf_has_space(msg_size, true)) + if (logbuf_has_space(msg_size, log_first_seq == log_next_seq)) return 0; return -ENOMEM; @@ -854,6 +858,7 @@ void log_buf_kexec_setup(void) VMCOREINFO_SYMBOL(log_buf); VMCOREINFO_SYMBOL(log_buf_len); VMCOREINFO_SYMBOL(log_first_idx); + VMCOREINFO_SYMBOL(clear_idx); VMCOREINFO_SYMBOL(log_next_idx); /* * Export struct printk_log size and field offsets. User space tools can @@ -1216,12 +1221,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear) u32 idx; enum log_flags prev; - if (clear_seq < log_first_seq) { - /* messages are gone, move to first available one */ - clear_seq = log_first_seq; - clear_idx = log_first_idx; - } - /* * Find first record that fits, including all following records, * into the user-provided buffer for this dump. -- cgit From 4cc7ecb7f2a60e8deb783b8fbf7c1ae467acb920 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Mar 2016 14:23:00 -0700 Subject: param: convert some "on"/"off" users to strtobool This changes several users of manual "on"/"off" parsing to use strtobool. Some side-effects: - these uses will now parse y/n/1/0 meaningfully too - the early_param uses will now bubble up parse errors Signed-off-by: Kees Cook Acked-by: Heiko Carstens Acked-by: Michael Ellerman Cc: Amitkumar Karwar Cc: Andy Shevchenko Cc: Daniel Borkmann Cc: Joe Perches Cc: Kalle Valo Cc: Martin Schwidefsky Cc: Nishant Sarmukadam Cc: Rasmus Villemoes Cc: Steve French Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kernel/rtasd.c | 9 ++------- arch/powerpc/platforms/pseries/hotplug-cpu.c | 10 ++-------- arch/s390/kernel/time.c | 8 ++------ arch/s390/kernel/topology.c | 7 ++----- arch/x86/kernel/aperture_64.c | 12 ++---------- include/linux/tick.h | 2 +- kernel/time/hrtimer.c | 10 ++-------- kernel/time/tick-sched.c | 10 ++-------- 8 files changed, 15 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c index 5a2c049c1c61..aa610ce8742f 100644 --- a/arch/powerpc/kernel/rtasd.c +++ b/arch/powerpc/kernel/rtasd.c @@ -49,7 +49,7 @@ static unsigned int rtas_error_log_buffer_max; static unsigned int event_scan; static unsigned int rtas_event_scan_rate; -static int full_rtas_msgs = 0; +static bool full_rtas_msgs; /* Stop logging to nvram after first fatal error */ static int logging_enabled; /* Until we initialize everything, @@ -592,11 +592,6 @@ __setup("surveillance=", surveillance_setup); static int __init rtasmsgs_setup(char *str) { - if (strcmp(str, "on") == 0) - full_rtas_msgs = 1; - else if (strcmp(str, "off") == 0) - full_rtas_msgs = 0; - - return 1; + return (kstrtobool(str, &full_rtas_msgs) == 0); } __setup("rtasmsgs=", rtasmsgs_setup); diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index 32274f72fe3f..282837a1d74b 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -47,20 +47,14 @@ static DEFINE_PER_CPU(enum cpu_state_vals, current_state) = CPU_STATE_OFFLINE; static enum cpu_state_vals default_offline_state = CPU_STATE_OFFLINE; -static int cede_offline_enabled __read_mostly = 1; +static bool cede_offline_enabled __read_mostly = true; /* * Enable/disable cede_offline when available. */ static int __init setup_cede_offline(char *str) { - if (!strcmp(str, "off")) - cede_offline_enabled = 0; - else if (!strcmp(str, "on")) - cede_offline_enabled = 1; - else - return 0; - return 1; + return (kstrtobool(str, &cede_offline_enabled) == 0); } __setup("cede_offline=", setup_cede_offline); diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index c4e5f183f225..9409d32f285e 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -1432,7 +1432,7 @@ device_initcall(etr_init_sysfs); /* * Server Time Protocol (STP) code. */ -static int stp_online; +static bool stp_online; static struct stp_sstpi stp_info; static void *stp_page; @@ -1443,11 +1443,7 @@ static struct timer_list stp_timer; static int __init early_parse_stp(char *p) { - if (strncmp(p, "off", 3) == 0) - stp_online = 0; - else if (strncmp(p, "on", 2) == 0) - stp_online = 1; - return 0; + return kstrtobool(p, &stp_online); } early_param("stp", early_parse_stp); diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 40b8102fdadb..64298a867589 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -37,7 +37,7 @@ static void set_topology_timer(void); static void topology_work_fn(struct work_struct *work); static struct sysinfo_15_1_x *tl_info; -static int topology_enabled = 1; +static bool topology_enabled = true; static DECLARE_WORK(topology_work, topology_work_fn); /* @@ -444,10 +444,7 @@ static const struct cpumask *cpu_book_mask(int cpu) static int __init early_parse_topology(char *p) { - if (strncmp(p, "off", 3)) - return 0; - topology_enabled = 0; - return 0; + return kstrtobool(p, &topology_enabled); } early_param("topology", early_parse_topology); diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 6e85f713641d..0a2bb1f62e72 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -227,19 +227,11 @@ static u32 __init search_agp_bridge(u32 *order, int *valid_agp) return 0; } -static int gart_fix_e820 __initdata = 1; +static bool gart_fix_e820 __initdata = true; static int __init parse_gart_mem(char *p) { - if (!p) - return -EINVAL; - - if (!strncmp(p, "off", 3)) - gart_fix_e820 = 0; - else if (!strncmp(p, "on", 2)) - gart_fix_e820 = 1; - - return 0; + return kstrtobool(p, &gart_fix_e820); } early_param("gart_fix_e820", parse_gart_mem); diff --git a/include/linux/tick.h b/include/linux/tick.h index 21f73649a4dc..62be0786d6d0 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -111,7 +111,7 @@ enum tick_dep_bits { #define TICK_DEP_MASK_CLOCK_UNSTABLE (1 << TICK_DEP_BIT_CLOCK_UNSTABLE) #ifdef CONFIG_NO_HZ_COMMON -extern int tick_nohz_enabled; +extern bool tick_nohz_enabled; extern int tick_nohz_tick_stopped(void); extern void tick_nohz_idle_enter(void); extern void tick_nohz_idle_exit(void); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 58a321c34cfb..fa0b983290cf 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -515,7 +515,7 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) /* * High resolution timer enabled ? */ -static int hrtimer_hres_enabled __read_mostly = 1; +static bool hrtimer_hres_enabled __read_mostly = true; unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; EXPORT_SYMBOL_GPL(hrtimer_resolution); @@ -524,13 +524,7 @@ EXPORT_SYMBOL_GPL(hrtimer_resolution); */ static int __init setup_hrtimer_hres(char *str) { - if (!strcmp(str, "off")) - hrtimer_hres_enabled = 0; - else if (!strcmp(str, "on")) - hrtimer_hres_enabled = 1; - else - return 0; - return 1; + return (kstrtobool(str, &hrtimer_hres_enabled) == 0); } __setup("highres=", setup_hrtimer_hres); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 969e6704c3c9..195fe7d2caad 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -486,20 +486,14 @@ void __init tick_nohz_init(void) /* * NO HZ enabled ? */ -int tick_nohz_enabled __read_mostly = 1; +bool tick_nohz_enabled __read_mostly = true; unsigned long tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode */ static int __init setup_tick_nohz(char *str) { - if (!strcmp(str, "off")) - tick_nohz_enabled = 0; - else if (!strcmp(str, "on")) - tick_nohz_enabled = 1; - else - return 0; - return 1; + return (kstrtobool(str, &tick_nohz_enabled) == 0); } __setup("nohz=", setup_tick_nohz); -- cgit From 2553b67a1fbe7bf202e4e8070ab0b00d3d3a06a2 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 17 Mar 2016 14:23:04 -0700 Subject: lib/bug.c: use common WARN helper The traceoff_on_warning option doesn't have any effect on s390, powerpc, arm64, parisc, and sh because there are two different types of WARN implementations: 1) The above mentioned architectures treat WARN() as a special case of a BUG() exception. They handle warnings in report_bug() in lib/bug.c. 2) All other architectures just call warn_slowpath_*() directly. Their warnings are handled in warn_slowpath_common() in kernel/panic.c. Support traceoff_on_warning on all architectures and prevent any future divergence by using a single common function to emit the warning. Also remove the '()' from '%pS()', because the parentheses look funky: [ 45.607629] WARNING: at /root/warn_mod/warn_mod.c:17 .init_dummy+0x20/0x40 [warn_mod]() Reported-by: Chunyu Hu Signed-off-by: Josh Poimboeuf Acked-by: Heiko Carstens Tested-by: Prarit Bhargava Acked-by: Prarit Bhargava Acked-by: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bug.h | 6 ++++++ kernel/panic.c | 41 ++++++++++++++++++++++++++--------------- lib/bug.c | 26 ++------------------------ 3 files changed, 34 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index fdf6fa078422..f90588abbfd4 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -81,6 +81,12 @@ extern void warn_slowpath_null(const char *file, const int line); do { printk(arg); __WARN_TAINT(taint); } while (0) #endif +/* used internally by panic.c */ +struct warn_args; + +void __warn(const char *file, int line, void *caller, unsigned taint, + struct pt_regs *regs, struct warn_args *args); + #ifndef WARN_ON #define WARN_ON(condition) ({ \ int __ret_warn_on = !!(condition); \ diff --git a/kernel/panic.c b/kernel/panic.c index d96469de72dc..fa400852bf6c 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -24,6 +24,7 @@ #include #include #include +#include #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 @@ -449,20 +450,25 @@ void oops_exit(void) kmsg_dump(KMSG_DUMP_OOPS); } -#ifdef WANT_WARN_ON_SLOWPATH -struct slowpath_args { +struct warn_args { const char *fmt; va_list args; }; -static void warn_slowpath_common(const char *file, int line, void *caller, - unsigned taint, struct slowpath_args *args) +void __warn(const char *file, int line, void *caller, unsigned taint, + struct pt_regs *regs, struct warn_args *args) { disable_trace_on_warning(); pr_warn("------------[ cut here ]------------\n"); - pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS()\n", - raw_smp_processor_id(), current->pid, file, line, caller); + + if (file) + pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n", + raw_smp_processor_id(), current->pid, file, line, + caller); + else + pr_warn("WARNING: CPU: %d PID: %d at %pS\n", + raw_smp_processor_id(), current->pid, caller); if (args) vprintk(args->fmt, args->args); @@ -479,20 +485,27 @@ static void warn_slowpath_common(const char *file, int line, void *caller, } print_modules(); - dump_stack(); + + if (regs) + show_regs(regs); + else + dump_stack(); + print_oops_end_marker(); + /* Just a warning, don't kill lockdep. */ add_taint(taint, LOCKDEP_STILL_OK); } +#ifdef WANT_WARN_ON_SLOWPATH void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) { - struct slowpath_args args; + struct warn_args args; args.fmt = fmt; va_start(args.args, fmt); - warn_slowpath_common(file, line, __builtin_return_address(0), - TAINT_WARN, &args); + __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, + &args); va_end(args.args); } EXPORT_SYMBOL(warn_slowpath_fmt); @@ -500,20 +513,18 @@ EXPORT_SYMBOL(warn_slowpath_fmt); void warn_slowpath_fmt_taint(const char *file, int line, unsigned taint, const char *fmt, ...) { - struct slowpath_args args; + struct warn_args args; args.fmt = fmt; va_start(args.args, fmt); - warn_slowpath_common(file, line, __builtin_return_address(0), - taint, &args); + __warn(file, line, __builtin_return_address(0), taint, NULL, &args); va_end(args.args); } EXPORT_SYMBOL(warn_slowpath_fmt_taint); void warn_slowpath_null(const char *file, int line) { - warn_slowpath_common(file, line, __builtin_return_address(0), - TAINT_WARN, NULL); + __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, NULL); } EXPORT_SYMBOL(warn_slowpath_null); #endif diff --git a/lib/bug.c b/lib/bug.c index 6cde380f09de..bc3656e944d2 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -167,30 +167,8 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) if (warning) { /* this is a WARN_ON rather than BUG/BUG_ON */ - pr_warn("------------[ cut here ]------------\n"); - - if (file) - pr_warn("WARNING: at %s:%u\n", file, line); - else - pr_warn("WARNING: at %p [verbose debug info unavailable]\n", - (void *)bugaddr); - - if (panic_on_warn) { - /* - * This thread may hit another WARN() in the panic path. - * Resetting this prevents additional WARN() from - * panicking the system on this thread. Other threads - * are blocked by the panic_mutex in panic(). - */ - panic_on_warn = 0; - panic("panic_on_warn set ...\n"); - } - - print_modules(); - show_regs(regs); - print_oops_end_marker(); - /* Just a warning, don't kill lockdep. */ - add_taint(BUG_GET_TAINT(bug), LOCKDEP_STILL_OK); + __warn(file, line, (void *)bugaddr, BUG_GET_TAINT(bug), regs, + NULL); return BUG_TRAP_TYPE_WARN; } -- cgit From 84b6d3e6149c5280bc18b42e2f12efdaf354e49c Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 16 Mar 2016 15:34:32 +0100 Subject: ftrace: Make ftrace_hash_rec_enable return update bool Change __ftrace_hash_rec_update to return true in case we need to update dynamic ftrace call records. It return false in case no update is needed. Link: http://lkml.kernel.org/r/1458138873-1553-5-git-send-email-jolsa@kernel.org Acked-by: Namhyung Kim Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 57a6eea84694..11ffcfd3804e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1610,7 +1610,7 @@ static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec) return keep_regs; } -static void __ftrace_hash_rec_update(struct ftrace_ops *ops, +static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, int filter_hash, bool inc) { @@ -1618,12 +1618,13 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, struct ftrace_hash *other_hash; struct ftrace_page *pg; struct dyn_ftrace *rec; + bool update = false; int count = 0; int all = 0; /* Only update if the ops has been registered */ if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) - return; + return false; /* * In the filter_hash case: @@ -1650,7 +1651,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, * then there's nothing to do. */ if (ftrace_hash_empty(hash)) - return; + return false; } do_for_each_ftrace_rec(pg, rec) { @@ -1694,7 +1695,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, if (inc) { rec->flags++; if (FTRACE_WARN_ON(ftrace_rec_count(rec) == FTRACE_REF_MAX)) - return; + return false; /* * If there's only a single callback registered to a @@ -1720,7 +1721,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, rec->flags |= FTRACE_FL_REGS; } else { if (FTRACE_WARN_ON(ftrace_rec_count(rec) == 0)) - return; + return false; rec->flags--; /* @@ -1753,22 +1754,28 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, */ } count++; + + /* Must match FTRACE_UPDATE_CALLS in ftrace_modify_all_code() */ + update |= ftrace_test_record(rec, 1) != FTRACE_UPDATE_IGNORE; + /* Shortcut, if we handled all records, we are done. */ if (!all && count == hash->count) - return; + return update; } while_for_each_ftrace_rec(); + + return update; } -static void ftrace_hash_rec_disable(struct ftrace_ops *ops, +static bool ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash) { - __ftrace_hash_rec_update(ops, filter_hash, 0); + return __ftrace_hash_rec_update(ops, filter_hash, 0); } -static void ftrace_hash_rec_enable(struct ftrace_ops *ops, +static bool ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash) { - __ftrace_hash_rec_update(ops, filter_hash, 1); + return __ftrace_hash_rec_update(ops, filter_hash, 1); } static void ftrace_hash_rec_update_modify(struct ftrace_ops *ops, -- cgit From 7f50d06bb6b825d34f069c6c7a1aab96ad0b94d9 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 16 Mar 2016 15:34:33 +0100 Subject: ftrace: Update dynamic ftrace calls only if necessary Currently dynamic ftrace calls are updated any time the ftrace_ops is un/registered. If we do this update only when it's needed, we save lot of time for perf system wide ftrace function sampling/counting. The reason is that for system wide sampling/counting, perf creates event for each cpu in the system. Each event then registers separate copy of ftrace_ops, which ends up in FTRACE_UPDATE_CALLS updates. On servers with many cpus that means serious stall (240 cpus server): Counting: # time ./perf stat -e ftrace:function -a sleep 1 Performance counter stats for 'system wide': 370,663 ftrace:function 1.401427505 seconds time elapsed real 3m51.743s user 0m0.023s sys 3m48.569s Sampling: # time ./perf record -e ftrace:function -a sleep 1 [ perf record: Woken up 0 times to write data ] Warning: Processed 141200 events and lost 5 chunks! [ perf record: Captured and wrote 10.703 MB perf.data (135950 samples) ] real 2m31.429s user 0m0.213s sys 2m29.494s There's no reason to do the FTRACE_UPDATE_CALLS update for each event in perf case, because all the ftrace_ops always share the same filter, so the updated calls are always the same. It's required that only first ftrace_ops registration does the FTRACE_UPDATE_CALLS update (also sometimes the second if the first one used the trampoline), but the rest can be only cheaply linked into the ftrace_ops list. Counting: # time ./perf stat -e ftrace:function -a sleep 1 Performance counter stats for 'system wide': 398,571 ftrace:function 1.377503733 seconds time elapsed real 0m2.787s user 0m0.005s sys 0m1.883s Sampling: # time ./perf record -e ftrace:function -a sleep 1 [ perf record: Woken up 0 times to write data ] Warning: Processed 261730 events and lost 9 chunks! [ perf record: Captured and wrote 19.907 MB perf.data (256293 samples) ] real 1m31.948s user 0m0.309s sys 1m32.051s Link: http://lkml.kernel.org/r/1458138873-1553-6-git-send-email-jolsa@kernel.org Acked-by: Namhyung Kim Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 11ffcfd3804e..d3850cbb840f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2651,7 +2651,6 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) return ret; ftrace_start_up++; - command |= FTRACE_UPDATE_CALLS; /* * Note that ftrace probes uses this to start up @@ -2672,7 +2671,8 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) return ret; } - ftrace_hash_rec_enable(ops, 1); + if (ftrace_hash_rec_enable(ops, 1)) + command |= FTRACE_UPDATE_CALLS; ftrace_startup_enable(command); @@ -2702,11 +2702,11 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) /* Disabling ipmodify never fails */ ftrace_hash_ipmodify_disable(ops); - ftrace_hash_rec_disable(ops, 1); - ops->flags &= ~FTRACE_OPS_FL_ENABLED; + if (ftrace_hash_rec_disable(ops, 1)) + command |= FTRACE_UPDATE_CALLS; - command |= FTRACE_UPDATE_CALLS; + ops->flags &= ~FTRACE_OPS_FL_ENABLED; if (saved_ftrace_func != ftrace_trace_function) { saved_ftrace_func = ftrace_trace_function; -- cgit From 6363c6b599ae67b779d01a48642a7c0d7d721814 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 15 Mar 2016 22:12:34 +0800 Subject: ftrace: Use kasprintf() in ftrace_profile_tracefs() Use kasprintf() instead of kmalloc() and snprintf(). Link: http://lkml.kernel.org/r/135a7bc36e51fd9eaa57124dd2140285b771f738.1458050835.git.geliangtang@163.com Acked-by: Namhyung Kim Signed-off-by: Geliang Tang Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d3850cbb840f..6a93faafbea4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1030,8 +1030,7 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer) for_each_possible_cpu(cpu) { stat = &per_cpu(ftrace_profile_stats, cpu); - /* allocate enough for function name + cpu number */ - name = kmalloc(32, GFP_KERNEL); + name = kasprintf(GFP_KERNEL, "function%d", cpu); if (!name) { /* * The files created are permanent, if something happens @@ -1043,7 +1042,6 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer) return; } stat->stat = function_stats; - snprintf(name, 32, "function%d", cpu); stat->stat.name = name; ret = register_stat_tracer(&stat->stat); if (ret) { -- cgit From c8ca003b2fde177b83de87f9f20f6a5933fb50bf Mon Sep 17 00:00:00 2001 From: Chunyu Hu Date: Mon, 14 Mar 2016 20:35:41 +0800 Subject: tracing: Fix return while holding a lock in register_tracer() commit d39cdd2036a6 ("tracing: Make tracer_flags use the right set_flag callback") introduces a potential mutex deadlock issue, as it forgets to free the mutex when allocaing the tracer_flags gets fail. The issue was found by Dan Carpenter through Smatch static code check tool. Link: http://lkml.kernel.org/r/1457958941-30265-1-git-send-email-chuhu@redhat.com Fixes: d39cdd2036a6 ("tracing: Make tracer_flags use the right set_flag callback") Reported-by: Dan Carpenter Signed-off-by: Chunyu Hu Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b401a1892dc6..0ae46048f724 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1256,8 +1256,10 @@ int __init register_tracer(struct tracer *type) if (!type->flags) { /*allocate a dummy tracer_flags*/ type->flags = kmalloc(sizeof(*type->flags), GFP_KERNEL); - if (!type->flags) - return -ENOMEM; + if (!type->flags) { + ret = -ENOMEM; + goto out; + } type->flags->val = 0; type->flags->opts = dummy_tracer_opt; } else -- cgit From cb86e05390debcc084cfdb0a71ed4c5dbbec517d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 18 Mar 2016 12:27:43 -0400 Subject: tracing: Have preempt(irqs)off trace preempt disabled functions Joel Fernandes reported that the function tracing of preempt disabled sections was not being reported when running either the preemptirqsoff or preemptoff tracers. This was due to the fact that the function tracer callback for those tracers checked if irqs were disabled before tracing. But this fails when we want to trace preempt off locations as well. Joel explained that he wanted to see funcitons where interrupts are enabled but preemption was disabled. The expected output he wanted: <...>-2265 1d.h1 3419us : preempt_count_sub <-irq_exit <...>-2265 1d..1 3419us : __do_softirq <-irq_exit <...>-2265 1d..1 3419us : msecs_to_jiffies <-__do_softirq <...>-2265 1d..1 3420us : irqtime_account_irq <-__do_softirq <...>-2265 1d..1 3420us : __local_bh_disable_ip <-__do_softirq <...>-2265 1..s1 3421us : run_timer_softirq <-__do_softirq <...>-2265 1..s1 3421us : hrtimer_run_pending <-run_timer_softirq <...>-2265 1..s1 3421us : _raw_spin_lock_irq <-run_timer_softirq <...>-2265 1d.s1 3422us : preempt_count_add <-_raw_spin_lock_irq <...>-2265 1d.s2 3422us : _raw_spin_unlock_irq <-run_timer_softirq <...>-2265 1..s2 3422us : preempt_count_sub <-_raw_spin_unlock_irq <...>-2265 1..s1 3423us : rcu_bh_qs <-__do_softirq <...>-2265 1d.s1 3423us : irqtime_account_irq <-__do_softirq <...>-2265 1d.s1 3423us : __local_bh_enable <-__do_softirq There's a comment saying that the irq disabled check is because there's a possible race that tracing_cpu may be set when the function is executed. But I don't remember that race. For now, I added a check for preemption being enabled too to not record the function, as there would be no race if that was the case. I need to re-investigate this, as I'm now thinking that the tracing_cpu will always be correct. But no harm in keeping the check for now, except for the slight performance hit. Link: http://lkml.kernel.org/r/1457770386-88717-1-git-send-email-agnel.joel@gmail.com Fixes: 5e6d2b9cfa3a "tracing: Use one prologue for the preempt irqs off tracer function tracers" Cc: stable@vget.kernel.org # 2.6.37+ Reported-by: Joel Fernandes Signed-off-by: Steven Rostedt --- kernel/trace/trace_irqsoff.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index e4e56589ec1d..be3222b7d72e 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -109,8 +109,12 @@ static int func_prolog_dec(struct trace_array *tr, return 0; local_save_flags(*flags); - /* slight chance to get a false positive on tracing_cpu */ - if (!irqs_disabled_flags(*flags)) + /* + * Slight chance to get a false positive on tracing_cpu, + * although I'm starting to think there isn't a chance. + * Leave this for now just to be paranoid. + */ + if (!irqs_disabled_flags(*flags) && !preempt_count()) return 0; *data = per_cpu_ptr(tr->trace_buffer.data, cpu); -- cgit From a29054d9478d0435ab01b7544da4f674ab13f533 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 18 Mar 2016 15:46:48 -0400 Subject: tracing: Fix crash from reading trace_pipe with sendfile If tracing contains data and the trace_pipe file is read with sendfile(), then it can trigger a NULL pointer dereference and various BUG_ON within the VM code. There's a patch to fix this in the splice_to_pipe() code, but it's also a good idea to not let that happen from trace_pipe either. Link: http://lkml.kernel.org/r/1457641146-9068-1-git-send-email-rabin@rab.in Cc: stable@vger.kernel.org # 2.6.30+ Reported-by: Rabin Vincent Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0ae46048f724..cb2b708e4ea7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4954,7 +4954,10 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, spd.nr_pages = i; - ret = splice_to_pipe(pipe, &spd); + if (i) + ret = splice_to_pipe(pipe, &spd); + else + ret = 0; out: splice_shrink_spd(&spd); return ret; -- cgit From 741f3a69f101250dc6b171b88e14ea51b099b1a9 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Mon, 19 Oct 2015 21:10:26 +0300 Subject: tracing: Remove redundant reset per-CPU buff in irqsoff tracer There is no reason to do it twice: from commit b6f11df26fdc28 ("trace: Call tracing_reset_online_cpus before tracer->init()") resetting of per-CPU buffers done before tracer->init() call. tracer->init() calls {irqs,preempt,preemptirqs}off_tracer_init() and it calls __irqsoff_tracer_init(), which resets per-CPU ringbuffer second time. It's slowpath, but anyway. Link: http://lkml.kernel.org/r/1445278226-16187-1-git-send-email-0x7f454c46@gmail.com Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_irqsoff.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index be3222b7d72e..03cdff84d026 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -626,7 +626,6 @@ static int __irqsoff_tracer_init(struct trace_array *tr) irqsoff_trace = tr; /* make sure that the tracer is visible */ smp_wmb(); - tracing_reset_online_cpus(&tr->trace_buffer); ftrace_init_array_ops(tr, irqsoff_tracer_call); -- cgit From 1e02cd40f15190b78fcc6b3f50c952fb4028e9a5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Mar 2016 15:39:24 +0100 Subject: perf/core: Fix the unthrottle logic Its possible to IOC_PERIOD while the event is throttled, this would re-start the event and the next tick would then try to unthrottle it, and find the event wasn't actually stopped anymore. This would tickle a WARN in the x86-pmu code which isn't expecting to start a !stopped event. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: dvyukov@google.com Cc: oleg@redhat.com Cc: panand@redhat.com Cc: sasha.levin@oracle.com Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160310143924.GR6356@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 712570dddacd..d39477390415 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4210,6 +4210,14 @@ static void __perf_event_period(struct perf_event *event, active = (event->state == PERF_EVENT_STATE_ACTIVE); if (active) { perf_pmu_disable(ctx->pmu); + /* + * We could be throttled; unthrottle now to avoid the tick + * trying to unthrottle while we already re-started the event. + */ + if (event->hw.interrupts == MAX_INTERRUPTS) { + event->hw.interrupts = 0; + perf_log_throttle(event, 1); + } event->pmu->stop(event, PERF_EF_UPDATE); } -- cgit From 91a612eea9a316c464cc170ff8492ec09e7d1c69 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 Mar 2016 15:17:35 +0100 Subject: perf/core: Fix dynamic interrupt throttle There were two problems with the dynamic interrupt throttle mechanism, both triggered by the same action. When you (or perf_fuzzer) write a huge value into /proc/sys/kernel/perf_event_max_sample_rate the computed perf_sample_allowed_ns becomes 0. This effectively disables the whole dynamic throttle. This is fixed by ensuring update_perf_cpu_limits() never sets the value to 0. However, we allow disabling of the dynamic throttle by writing 100 to /proc/sys/kernel/perf_cpu_time_max_percent. This will generate a warning in dmesg. The second problem is that by setting the max_sample_rate to a huge number, the adaptive process can take a few tries, since it halfs the limit each time. Change that to directly compute a new value based on the observed duration. Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 87 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 51 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d39477390415..43e35faf576e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -376,8 +376,11 @@ static void update_perf_cpu_limits(void) u64 tmp = perf_sample_period_ns; tmp *= sysctl_perf_cpu_time_max_percent; - do_div(tmp, 100); - ACCESS_ONCE(perf_sample_allowed_ns) = tmp; + tmp = div_u64(tmp, 100); + if (!tmp) + tmp = 1; + + WRITE_ONCE(perf_sample_allowed_ns, tmp); } static int perf_rotate_context(struct perf_cpu_context *cpuctx); @@ -409,7 +412,13 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, if (ret || !write) return ret; - update_perf_cpu_limits(); + if (sysctl_perf_cpu_time_max_percent == 100) { + printk(KERN_WARNING + "perf: Dynamic interrupt throttling disabled, can hang your system!\n"); + WRITE_ONCE(perf_sample_allowed_ns, 0); + } else { + update_perf_cpu_limits(); + } return 0; } @@ -423,62 +432,68 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, #define NR_ACCUMULATED_SAMPLES 128 static DEFINE_PER_CPU(u64, running_sample_length); +static u64 __report_avg; +static u64 __report_allowed; + static void perf_duration_warn(struct irq_work *w) { - u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns); - u64 avg_local_sample_len; - u64 local_samples_len; - - local_samples_len = __this_cpu_read(running_sample_length); - avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; - printk_ratelimited(KERN_WARNING - "perf interrupt took too long (%lld > %lld), lowering " - "kernel.perf_event_max_sample_rate to %d\n", - avg_local_sample_len, allowed_ns >> 1, - sysctl_perf_event_sample_rate); + "perf: interrupt took too long (%lld > %lld), lowering " + "kernel.perf_event_max_sample_rate to %d\n", + __report_avg, __report_allowed, + sysctl_perf_event_sample_rate); } static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn); void perf_sample_event_took(u64 sample_len_ns) { - u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns); - u64 avg_local_sample_len; - u64 local_samples_len; + u64 max_len = READ_ONCE(perf_sample_allowed_ns); + u64 running_len; + u64 avg_len; + u32 max; - if (allowed_ns == 0) + if (max_len == 0) return; - /* decay the counter by 1 average sample */ - local_samples_len = __this_cpu_read(running_sample_length); - local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES; - local_samples_len += sample_len_ns; - __this_cpu_write(running_sample_length, local_samples_len); + /* Decay the counter by 1 average sample. */ + running_len = __this_cpu_read(running_sample_length); + running_len -= running_len/NR_ACCUMULATED_SAMPLES; + running_len += sample_len_ns; + __this_cpu_write(running_sample_length, running_len); /* - * note: this will be biased artifically low until we have - * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us + * Note: this will be biased artifically low until we have + * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us * from having to maintain a count. */ - avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; - - if (avg_local_sample_len <= allowed_ns) + avg_len = running_len/NR_ACCUMULATED_SAMPLES; + if (avg_len <= max_len) return; - if (max_samples_per_tick <= 1) - return; + __report_avg = avg_len; + __report_allowed = max_len; - max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2); - sysctl_perf_event_sample_rate = max_samples_per_tick * HZ; - perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; + /* + * Compute a throttle threshold 25% below the current duration. + */ + avg_len += avg_len / 4; + max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent; + if (avg_len < max) + max /= (u32)avg_len; + else + max = 1; - update_perf_cpu_limits(); + WRITE_ONCE(perf_sample_allowed_ns, avg_len); + WRITE_ONCE(max_samples_per_tick, max); + + sysctl_perf_event_sample_rate = max * HZ; + perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; if (!irq_work_queue(&perf_duration_work)) { - early_printk("perf interrupt took too long (%lld > %lld), lowering " + early_printk("perf: interrupt took too long (%lld > %lld), lowering " "kernel.perf_event_max_sample_rate to %d\n", - avg_local_sample_len, allowed_ns >> 1, + __report_avg, __report_allowed, sysctl_perf_event_sample_rate); } } -- cgit From 8184059e93c200757f5c0805dae0f14e880eab5d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 Jan 2016 15:17:51 +0100 Subject: perf/core: Fix Undefined behaviour in rb_alloc() Sasha reported: [ 3494.030114] UBSAN: Undefined behaviour in kernel/events/ring_buffer.c:685:22 [ 3494.030647] shift exponent -1 is negative Andrey spotted that this is because: It happens if nr_pages = 0: rb->page_order = ilog2(nr_pages); Fix it by making both assignments conditional on nr_pages; since otherwise they should both be 0 anyway, and will be because of the kzalloc() used to allocate the structure. Reported-by: Sasha Levin Reported-by: Andrey Ryabinin Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/20160129141751.GA407@worktop Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 1faad2cfdb9e..c61f0cbd308b 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -746,8 +746,10 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) rb->user_page = all_buf; rb->data_pages[0] = all_buf + PAGE_SIZE; - rb->page_order = ilog2(nr_pages); - rb->nr_pages = !!nr_pages; + if (nr_pages) { + rb->nr_pages = 1; + rb->page_order = ilog2(nr_pages); + } ring_buffer_init(rb, watermark, flags); -- cgit From 1dcaac1ce07db672e2cb5b4ef9642990689bb2a1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 8 Mar 2016 17:56:05 +0100 Subject: perf/core: Document some hotplug bits Document some of the hotplug notifier usage. Requested-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 43e35faf576e..de24fbce5277 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9449,10 +9449,29 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: + /* + * This must be done before the CPU comes alive, because the + * moment we can run tasks we can encounter (software) events. + * + * Specifically, someone can have inherited events on kthreadd + * or a pre-existing worker thread that gets re-bound. + */ perf_event_init_cpu(cpu); break; case CPU_DOWN_PREPARE: + /* + * This must be done before the CPU dies because after that an + * active event might want to IPI the CPU and that'll not work + * so great for dead CPUs. + * + * XXX smp_call_function_single() return -ENXIO without a warn + * so we could possibly deal with this. + * + * This is safe against new events arriving because + * sys_perf_event_open() serializes against hotplug using + * get_online_cpus(). + */ perf_event_exit_cpu(cpu); break; default: -- cgit From 2f5177f0fd7e531b26d54633be62d1d4cb94621c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Mar 2016 16:22:45 +0100 Subject: sched/cgroup: Fix/cleanup cgroup teardown/init The CPU controller hasn't kept up with the various changes in the whole cgroup initialization / destruction sequence, and commit: 2e91fa7f6d45 ("cgroup: keep zombies associated with their original cgroups") caused it to explode. The reason for this is that zombies do not inhibit css_offline() from being called, but do stall css_released(). Now we tear down the cfs_rq structures on css_offline() but zombies can run after that, leading to use-after-free issues. The solution is to move the tear-down to css_released(), which guarantees nobody (including no zombies) is still using our cgroup. Furthermore, a few simple cleanups are possible too. There doesn't appear to be any point to us using css_online() (anymore?) so fold that in css_alloc(). And since cgroup code guarantees an RCU grace period between css_released() and css_free() we can forgo using call_rcu() and free the stuff immediately. Suggested-by: Tejun Heo Reported-by: Kazuki Yamaguchi Reported-by: Niklas Cassel Tested-by: Niklas Cassel Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 2e91fa7f6d45 ("cgroup: keep zombies associated with their original cgroups") Link: http://lkml.kernel.org/r/20160316152245.GY6344@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4d872e19244b..2a87bdde8d4e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7536,7 +7536,7 @@ void set_curr_task(int cpu, struct task_struct *p) /* task_group_lock serializes the addition/removal of task groups */ static DEFINE_SPINLOCK(task_group_lock); -static void free_sched_group(struct task_group *tg) +static void sched_free_group(struct task_group *tg) { free_fair_sched_group(tg); free_rt_sched_group(tg); @@ -7562,7 +7562,7 @@ struct task_group *sched_create_group(struct task_group *parent) return tg; err: - free_sched_group(tg); + sched_free_group(tg); return ERR_PTR(-ENOMEM); } @@ -7582,17 +7582,16 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) } /* rcu callback to free various structures associated with a task group */ -static void free_sched_group_rcu(struct rcu_head *rhp) +static void sched_free_group_rcu(struct rcu_head *rhp) { /* now it should be safe to free those cfs_rqs */ - free_sched_group(container_of(rhp, struct task_group, rcu)); + sched_free_group(container_of(rhp, struct task_group, rcu)); } -/* Destroy runqueue etc associated with a task group */ void sched_destroy_group(struct task_group *tg) { /* wait for possible concurrent references to cfs_rqs complete */ - call_rcu(&tg->rcu, free_sched_group_rcu); + call_rcu(&tg->rcu, sched_free_group_rcu); } void sched_offline_group(struct task_group *tg) @@ -8051,31 +8050,26 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); + sched_online_group(tg, parent); + return &tg->css; } -static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) +static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); - struct task_group *parent = css_tg(css->parent); - if (parent) - sched_online_group(tg, parent); - return 0; + sched_offline_group(tg); } static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); - sched_destroy_group(tg); -} - -static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) -{ - struct task_group *tg = css_tg(css); - - sched_offline_group(tg); + /* + * Relies on the RCU grace period between css_released() and this. + */ + sched_free_group(tg); } static void cpu_cgroup_fork(struct task_struct *task) @@ -8435,9 +8429,8 @@ static struct cftype cpu_files[] = { struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, + .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, - .css_online = cpu_cgroup_css_online, - .css_offline = cpu_cgroup_css_offline, .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, -- cgit From 3a47d5124a957358274e9ca7b115b2f3a914f56d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 9 Mar 2016 13:04:03 +0100 Subject: sched/fair: Fix fairness issue on migration Pavan reported that in the presence of very light tasks (or cgroups) the placement of migrated tasks can cause severe fairness issues. The problem is that enqueue_entity() places the task before it updates time, thereby it can place the task far in the past (remember that light tasks will shoot virtual time forward at a high speed, so in relation to the pre-existing light task, we can land far in the past). This is done because update_curr() needs the current task, and we might be placing the current task. The obvious solution is to differentiate between the current and any other task; placing the current before we update time, and placing any other task after, such that !curr tasks end up at the current moment in time, and not in the past. Reported-by: Pavan Kondeti Tested-by: Pavan Kondeti Signed-off-by: Peter Zijlstra (Intel) Cc: Ben Segall Cc: Linus Torvalds Cc: Matt Fleming Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: byungchul.park@lge.com Link: http://lkml.kernel.org/r/20160309120403.GK6344@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 33130529e9b5..3c114d971d84 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3157,17 +3157,25 @@ static inline void check_schedstat_required(void) static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { + bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING); + bool curr = cfs_rq->curr == se; + /* - * Update the normalized vruntime before updating min_vruntime - * through calling update_curr(). + * If we're the current task, we must renormalise before calling + * update_curr(). */ - if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) + if (renorm && curr) se->vruntime += cfs_rq->min_vruntime; + update_curr(cfs_rq); + /* - * Update run-time statistics of the 'current'. + * Otherwise, renormalise after, such that we're placed at the current + * moment in time, instead of some random moment in the past. */ - update_curr(cfs_rq); + if (renorm && !curr) + se->vruntime += cfs_rq->min_vruntime; + enqueue_entity_load_avg(cfs_rq, se); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); @@ -3183,7 +3191,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_stats_enqueue(cfs_rq, se); check_spread(cfs_rq, se); } - if (se != cfs_rq->curr) + if (!curr) __enqueue_entity(cfs_rq, se); se->on_rq = 1; -- cgit From d4335581dc30ec6545999c7443bb9fead274a980 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Wed, 9 Mar 2016 14:59:08 +0000 Subject: sched/fair: Add comments to explain select_idle_sibling() It's not entirely obvious how the main loop in select_idle_sibling() works on first glance. Sprinkle a few comments to explain the design and intention behind the loop based on some conversations with Mike and Peter. Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1457535548-15329-1-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3c114d971d84..303d6392b389 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5055,7 +5055,19 @@ static int select_idle_sibling(struct task_struct *p, int target) return i; /* - * Otherwise, iterate the domains and find an elegible idle cpu. + * Otherwise, iterate the domains and find an eligible idle cpu. + * + * A completely idle sched group at higher domains is more + * desirable than an idle group at a lower level, because lower + * domains have smaller groups and usually share hardware + * resources which causes tasks to contend on them, e.g. x86 + * hyperthread siblings in the lowest domain (SMT) can contend + * on the shared cpu pipeline. + * + * However, while we prefer idle groups at higher domains + * finding an idle cpu at the lowest domain is still better than + * returning 'target', which we've already established, isn't + * idle. */ sd = rcu_dereference(per_cpu(sd_llc, target)); for_each_lower_domain(sd) { @@ -5065,11 +5077,16 @@ static int select_idle_sibling(struct task_struct *p, int target) tsk_cpus_allowed(p))) goto next; + /* Ensure the entire group is idle */ for_each_cpu(i, sched_group_cpus(sg)) { if (i == target || !idle_cpu(i)) goto next; } + /* + * It doesn't matter which cpu we pick, the + * whole group is idle. + */ target = cpumask_first_and(sched_group_cpus(sg), tsk_cpus_allowed(p)); goto done; -- cgit From 1a736b77a3f50910843d076623204ba6e5057dc1 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Mon, 21 Dec 2015 19:14:42 +0800 Subject: sched/cpuacct: Rename parameter in cpuusage_write() for readability The name of the 'reset' parameter to cpuusage_write() is quite confusing, because the only valid value we allow is '0', so !reset is actually the case that resets ... Rename it to 'val' and explain it in a comment that we only allow 0. Signed-off-by: Dongsheng Yang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: cgroups@vger.kernel.org Cc: tj@kernel.org Link: http://lkml.kernel.org/r/1450696483-2864-1-git-send-email-yangds.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/sched/cpuacct.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index dd7cbb55bbf2..9c2bbf7efa1a 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -145,13 +145,16 @@ static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) } static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, - u64 reset) + u64 val) { struct cpuacct *ca = css_ca(css); int err = 0; int i; - if (reset) { + /* + * Only allow '0' here to do a reset. + */ + if (val) { err = -EINVAL; goto out; } -- cgit From 73e6aafd9ea81498d31361f01db84a0118da2d1c Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Thu, 17 Mar 2016 12:19:43 +0800 Subject: sched/cpuacct: Simplify the cpuacct code - Use for() instead of while() loop in some functions to make the code simpler. - Use this_cpu_ptr() instead of per_cpu_ptr() to make the code cleaner and a bit faster. Suggested-by: Peter Zijlstra Signed-off-by: Zhao Lei Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Tejun Heo Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/d8a7ef9592f55224630cb26dea239f05b6398a4e.1458187654.git.zhaolei@cn.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/sched/cpuacct.c | 28 +++++----------------------- kernel/sched/cpuacct.h | 4 ++-- 2 files changed, 7 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 9c2bbf7efa1a..434c2fa41352 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -238,23 +238,10 @@ static struct cftype files[] = { void cpuacct_charge(struct task_struct *tsk, u64 cputime) { struct cpuacct *ca; - int cpu; - - cpu = task_cpu(tsk); rcu_read_lock(); - - ca = task_ca(tsk); - - while (true) { - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - *cpuusage += cputime; - - ca = parent_ca(ca); - if (!ca) - break; - } - + for (ca = task_ca(tsk); ca; ca = parent_ca(ca)) + *this_cpu_ptr(ca->cpuusage) += cputime; rcu_read_unlock(); } @@ -263,18 +250,13 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) * * Note: it's the caller that updates the account of the root cgroup. */ -void cpuacct_account_field(struct task_struct *p, int index, u64 val) +void cpuacct_account_field(struct task_struct *tsk, int index, u64 val) { - struct kernel_cpustat *kcpustat; struct cpuacct *ca; rcu_read_lock(); - ca = task_ca(p); - while (ca != &root_cpuacct) { - kcpustat = this_cpu_ptr(ca->cpustat); - kcpustat->cpustat[index] += val; - ca = parent_ca(ca); - } + for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca)) + this_cpu_ptr(ca->cpustat)->cpustat[index] += val; rcu_read_unlock(); } diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h index ed605624a5e7..ba72807c73d4 100644 --- a/kernel/sched/cpuacct.h +++ b/kernel/sched/cpuacct.h @@ -1,7 +1,7 @@ #ifdef CONFIG_CGROUP_CPUACCT extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); -extern void cpuacct_account_field(struct task_struct *p, int index, u64 val); +extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); #else @@ -10,7 +10,7 @@ static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) } static inline void -cpuacct_account_field(struct task_struct *p, int index, u64 val) +cpuacct_account_field(struct task_struct *tsk, int index, u64 val) { } -- cgit From 3debb0a9ddb16526de8b456491b7db60114f7b5e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 22 Mar 2016 17:30:58 -0400 Subject: tracing: Fix trace_printk() to print when not using bprintk() The trace_printk() code will allocate extra buffers if the compile detects that a trace_printk() is used. To do this, the format of the trace_printk() is saved to the __trace_printk_fmt section, and if that section is bigger than zero, the buffers are allocated (along with a message that this has happened). If trace_printk() uses a format that is not a constant, and thus something not guaranteed to be around when the print happens, the compiler optimizes the fmt out, as it is not used, and the __trace_printk_fmt section is not filled. This means the kernel will not allocate the special buffers needed for the trace_printk() and the trace_printk() will not write anything to the tracing buffer. Adding a "__used" to the variable in the __trace_printk_fmt section will keep it around, even though it is set to NULL. This will keep the string from being printed in the debugfs/tracing/printk_formats section as it is not needed. Reported-by: Vlastimil Babka Fixes: 07d777fe8c398 "tracing: Add percpu buffers for trace_printk()" Cc: stable@vger.kernel.org # v3.5+ Signed-off-by: Steven Rostedt --- include/linux/kernel.h | 6 +++--- kernel/trace/trace_printk.c | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index f31638c6e873..95452f72349a 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -635,7 +635,7 @@ do { \ #define do_trace_printk(fmt, args...) \ do { \ - static const char *trace_printk_fmt \ + static const char *trace_printk_fmt __used \ __attribute__((section("__trace_printk_fmt"))) = \ __builtin_constant_p(fmt) ? fmt : NULL; \ \ @@ -679,7 +679,7 @@ int __trace_printk(unsigned long ip, const char *fmt, ...); */ #define trace_puts(str) ({ \ - static const char *trace_printk_fmt \ + static const char *trace_printk_fmt __used \ __attribute__((section("__trace_printk_fmt"))) = \ __builtin_constant_p(str) ? str : NULL; \ \ @@ -701,7 +701,7 @@ extern void trace_dump_stack(int skip); #define ftrace_vprintk(fmt, vargs) \ do { \ if (__builtin_constant_p(fmt)) { \ - static const char *trace_printk_fmt \ + static const char *trace_printk_fmt __used \ __attribute__((section("__trace_printk_fmt"))) = \ __builtin_constant_p(fmt) ? fmt : NULL; \ \ diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 060df67dbdd1..f96f0383f6c6 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -296,6 +296,9 @@ static int t_show(struct seq_file *m, void *v) const char *str = *fmt; int i; + if (!*fmt) + return 0; + seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt); /* -- cgit From 7e6867bf831c71fe0e47438831ae3a94d4c7ab3c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 18 Mar 2016 16:28:04 +0100 Subject: tracing: Record and show NMI state The latency tracer format has a nice column to indicate IRQ state, but this is not able to tell us about NMI state. When tracing perf interrupt handlers (which often run in NMI context) it is very useful to see how the events nest. Link: http://lkml.kernel.org/r/20160318153022.105068893@infradead.org Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 1 + kernel/trace/trace.h | 1 + kernel/trace/trace_output.c | 10 +++++++--- 3 files changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index cb2b708e4ea7..7bdf8ba323ec 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1664,6 +1664,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, #else TRACE_FLAG_IRQS_NOSUPPORT | #endif + ((pc & NMI_MASK ) ? TRACE_FLAG_NMI : 0) | ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) | diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 39588c23dd8b..3fff4adfd431 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -125,6 +125,7 @@ enum trace_flag_type { TRACE_FLAG_HARDIRQ = 0x08, TRACE_FLAG_SOFTIRQ = 0x10, TRACE_FLAG_PREEMPT_RESCHED = 0x20, + TRACE_FLAG_NMI = 0x40, }; #define TRACE_BUF_SIZE 1024 diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 282982195e09..0bb9cf2d53e6 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -389,7 +389,9 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) char irqs_off; int hardirq; int softirq; + int nmi; + nmi = entry->flags & TRACE_FLAG_NMI; hardirq = entry->flags & TRACE_FLAG_HARDIRQ; softirq = entry->flags & TRACE_FLAG_SOFTIRQ; @@ -415,10 +417,12 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) } hardsoft_irq = + (nmi && hardirq) ? 'Z' : + nmi ? 'z' : (hardirq && softirq) ? 'H' : - hardirq ? 'h' : - softirq ? 's' : - '.'; + hardirq ? 'h' : + softirq ? 's' : + '.' ; trace_seq_printf(s, "%c%c%c", irqs_off, need_resched, hardsoft_irq); -- cgit From b4aa14a63cb3194d8eab355fcee194838ab09121 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 22 Mar 2016 14:24:39 -0700 Subject: kernel/hung_task.c: use timeout diff when timeout is updated When new timeout is written to /proc/sys/kernel/hung_task_timeout_secs, khungtaskd is interrupted and again sleeps for full timeout duration. This means that hang task will not be checked if new timeout is written periodically within old timeout duration and/or checking of hang task will be delayed for up to previous timeout duration. Fix this by remembering last time khungtaskd checked hang task. This change will allow other watchdog tasks (if any) to share khungtaskd by sleeping for minimal timeout diff of all watchdog tasks. Doing more watchdog tasks from khungtaskd will reduce the possibility of printk() collisions by multiple watchdog threads. Signed-off-by: Tetsuo Handa Cc: Oleg Nesterov Cc: Aaron Tomlin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hung_task.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index e0f90c2b57aa..d234022805dc 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -185,10 +185,12 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) rcu_read_unlock(); } -static unsigned long timeout_jiffies(unsigned long timeout) +static long hung_timeout_jiffies(unsigned long last_checked, + unsigned long timeout) { /* timeout of 0 will disable the watchdog */ - return timeout ? timeout * HZ : MAX_SCHEDULE_TIMEOUT; + return timeout ? last_checked - jiffies + timeout * HZ : + MAX_SCHEDULE_TIMEOUT; } /* @@ -224,18 +226,21 @@ EXPORT_SYMBOL_GPL(reset_hung_task_detector); */ static int watchdog(void *dummy) { + unsigned long hung_last_checked = jiffies; + set_user_nice(current, 0); for ( ; ; ) { unsigned long timeout = sysctl_hung_task_timeout_secs; + long t = hung_timeout_jiffies(hung_last_checked, timeout); - while (schedule_timeout_interruptible(timeout_jiffies(timeout))) - timeout = sysctl_hung_task_timeout_secs; - - if (atomic_xchg(&reset_hung_task, 0)) + if (t <= 0) { + if (!atomic_xchg(&reset_hung_task, 0)) + check_hung_uninterruptible_tasks(timeout); + hung_last_checked = jiffies; continue; - - check_hung_uninterruptible_tasks(timeout); + } + schedule_timeout_interruptible(t); } return 0; -- cgit From 5c38065e021bc76f97fc08997f6d7fc7ea3fb7a7 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 22 Mar 2016 14:24:52 -0700 Subject: seccomp: check in_compat_syscall, not is_compat_task, in strict mode Seccomp wants to know the syscall bitness, not the caller task bitness, when it selects the syscall whitelist. As far as I know, this makes no difference on any architecture, so it's not a security problem. (It generates identical code everywhere except sparc, and, on sparc, the syscall numbering is the same for both ABIs.) Signed-off-by: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/seccomp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 15a1795bbba1..e1e5a354854e 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -395,7 +395,7 @@ seccomp_prepare_user_filter(const char __user *user_filter) struct seccomp_filter *filter = ERR_PTR(-EFAULT); #ifdef CONFIG_COMPAT - if (is_compat_task()) { + if (in_compat_syscall()) { struct compat_sock_fprog fprog32; if (copy_from_user(&fprog32, user_filter, sizeof(fprog32))) goto out; @@ -529,7 +529,7 @@ static void __secure_computing_strict(int this_syscall) { int *syscall_whitelist = mode1_syscalls; #ifdef CONFIG_COMPAT - if (is_compat_task()) + if (in_compat_syscall()) syscall_whitelist = mode1_syscalls_32; #endif do { -- cgit From 5c465217a930d4bbb7dd35a56bde1eea5bbd14d6 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 22 Mar 2016 14:24:55 -0700 Subject: ptrace: in PEEK_SIGINFO, check syscall bitness, not task bitness Users of the 32-bit ptrace() ABI expect the full 32-bit ABI. siginfo translation should check ptrace() ABI, not caller task ABI. This is an ABI change on SPARC. Let's hope that no one relied on the old buggy ABI. Signed-off-by: Andy Lutomirski Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 2341efe7fe02..c79b91d09e35 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -681,7 +681,7 @@ static int ptrace_peek_siginfo(struct task_struct *child, break; #ifdef CONFIG_COMPAT - if (unlikely(is_compat_task())) { + if (unlikely(in_compat_syscall())) { compat_siginfo_t __user *uinfo = compat_ptr(data); if (copy_siginfo_to_user32(uinfo, &info) || -- cgit From efbc0fbf34927bd4d3d49b50b370990be82809c2 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 22 Mar 2016 14:24:58 -0700 Subject: auditsc: for seccomp events, log syscall compat state using in_compat_syscall Except on SPARC, this is what the code always did. SPARC compat seccomp was buggy, although the impact of the bug was limited because SPARC 32-bit and 64-bit syscall numbers are the same. Signed-off-by: Andy Lutomirski Cc: Paul Moore Cc: Eric Paris Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditsc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 195ffaee50b9..7d0e3cf8abe1 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2412,8 +2412,8 @@ void __audit_seccomp(unsigned long syscall, long signr, int code) return; audit_log_task(ab); audit_log_format(ab, " sig=%ld arch=%x syscall=%ld compat=%d ip=0x%lx code=0x%x", - signr, syscall_get_arch(), syscall, is_compat_task(), - KSTK_EIP(current), code); + signr, syscall_get_arch(), syscall, + in_compat_syscall(), KSTK_EIP(current), code); audit_log_end(ab); } -- cgit From 1333ab03150478df8d6f5673a91df1e50dc6ab97 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 22 Mar 2016 14:25:33 -0700 Subject: ptrace: change __ptrace_unlink() to clear ->ptrace under ->siglock This test-case (simplified version of generated by syzkaller) #include #include #include void test(void) { for (;;) { if (fork()) { wait(NULL); continue; } ptrace(PTRACE_SEIZE, getppid(), 0, 0); ptrace(PTRACE_INTERRUPT, getppid(), 0, 0); _exit(0); } } int main(void) { int np; for (np = 0; np < 8; ++np) if (!fork()) test(); while (wait(NULL) > 0) ; return 0; } triggers the 2nd WARN_ON_ONCE(!signr) warning in do_jobctl_trap(). The problem is that __ptrace_unlink() clears task->jobctl under siglock but task->ptrace is cleared without this lock held; this fools the "else" branch which assumes that !PT_SEIZED means PT_PTRACED. Note also that most of other PTRACE_SEIZE checks can race with detach from the exiting tracer too. Say, the callers of ptrace_trap_notify() assume that SEIZED can't go away after it was checked. Signed-off-by: Oleg Nesterov Reported-by: Dmitry Vyukov Cc: Tejun Heo Cc: syzkaller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index c79b91d09e35..d49bfa1e53e6 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -73,12 +73,11 @@ void __ptrace_unlink(struct task_struct *child) { BUG_ON(!child->ptrace); - child->ptrace = 0; child->parent = child->real_parent; list_del_init(&child->ptrace_entry); spin_lock(&child->sighand->siglock); - + child->ptrace = 0; /* * Clear all pending traps and TRAPPING. TRAPPING should be * cleared regardless of JOBCTL_STOP_PENDING. Do it explicitly. -- cgit From 378c6520e7d29280f400ef2ceaf155c86f05a71a Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 22 Mar 2016 14:25:36 -0700 Subject: fs/coredump: prevent fsuid=0 dumps into user-controlled directories This commit fixes the following security hole affecting systems where all of the following conditions are fulfilled: - The fs.suid_dumpable sysctl is set to 2. - The kernel.core_pattern sysctl's value starts with "/". (Systems where kernel.core_pattern starts with "|/" are not affected.) - Unprivileged user namespace creation is permitted. (This is true on Linux >=3.8, but some distributions disallow it by default using a distro patch.) Under these conditions, if a program executes under secure exec rules, causing it to run with the SUID_DUMP_ROOT flag, then unshares its user namespace, changes its root directory and crashes, the coredump will be written using fsuid=0 and a path derived from kernel.core_pattern - but this path is interpreted relative to the root directory of the process, allowing the attacker to control where a coredump will be written with root privileges. To fix the security issue, always interpret core_pattern for dumps that are written under SUID_DUMP_ROOT relative to the root directory of init. Signed-off-by: Jann Horn Acked-by: Kees Cook Cc: Al Viro Cc: "Eric W. Biederman" Cc: Andy Lutomirski Cc: Oleg Nesterov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/drivers/mconsole_kern.c | 2 +- fs/coredump.c | 30 ++++++++++++++++++++++++++---- fs/fhandle.c | 2 +- fs/open.c | 6 ++---- include/linux/fs.h | 2 +- kernel/sysctl_binary.c | 2 +- 6 files changed, 32 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index b821b13d343a..8a6b57108ac2 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -133,7 +133,7 @@ void mconsole_proc(struct mc_request *req) ptr += strlen("proc"); ptr = skip_spaces(ptr); - file = file_open_root(mnt->mnt_root, mnt, ptr, O_RDONLY); + file = file_open_root(mnt->mnt_root, mnt, ptr, O_RDONLY, 0); if (IS_ERR(file)) { mconsole_reply(req, "Failed to open file", 1, 0); printk(KERN_ERR "open /proc/%s: %ld\n", ptr, PTR_ERR(file)); diff --git a/fs/coredump.c b/fs/coredump.c index 9ea87e9fdccf..47c32c3bfa1d 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -32,6 +32,9 @@ #include #include #include +#include +#include +#include #include #include @@ -649,6 +652,8 @@ void do_coredump(const siginfo_t *siginfo) } } else { struct inode *inode; + int open_flags = O_CREAT | O_RDWR | O_NOFOLLOW | + O_LARGEFILE | O_EXCL; if (cprm.limit < binfmt->min_coredump) goto fail_unlock; @@ -687,10 +692,27 @@ void do_coredump(const siginfo_t *siginfo) * what matters is that at least one of the two processes * writes its coredump successfully, not which one. */ - cprm.file = filp_open(cn.corename, - O_CREAT | 2 | O_NOFOLLOW | - O_LARGEFILE | O_EXCL, - 0600); + if (need_suid_safe) { + /* + * Using user namespaces, normal user tasks can change + * their current->fs->root to point to arbitrary + * directories. Since the intention of the "only dump + * with a fully qualified path" rule is to control where + * coredumps may be placed using root privileges, + * current->fs->root must not be used. Instead, use the + * root directory of init_task. + */ + struct path root; + + task_lock(&init_task); + get_fs_root(init_task.fs, &root); + task_unlock(&init_task); + cprm.file = file_open_root(root.dentry, root.mnt, + cn.corename, open_flags, 0600); + path_put(&root); + } else { + cprm.file = filp_open(cn.corename, open_flags, 0600); + } if (IS_ERR(cprm.file)) goto fail_unlock; diff --git a/fs/fhandle.c b/fs/fhandle.c index d59712dfa3e7..ca3c3dd01789 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -228,7 +228,7 @@ long do_handle_open(int mountdirfd, path_put(&path); return fd; } - file = file_open_root(path.dentry, path.mnt, "", open_flag); + file = file_open_root(path.dentry, path.mnt, "", open_flag, 0); if (IS_ERR(file)) { put_unused_fd(fd); retval = PTR_ERR(file); diff --git a/fs/open.c b/fs/open.c index 55bdc75e2172..17cb6b1dab75 100644 --- a/fs/open.c +++ b/fs/open.c @@ -992,14 +992,12 @@ struct file *filp_open(const char *filename, int flags, umode_t mode) EXPORT_SYMBOL(filp_open); struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, - const char *filename, int flags) + const char *filename, int flags, umode_t mode) { struct open_flags op; - int err = build_open_flags(flags, 0, &op); + int err = build_open_flags(flags, mode, &op); if (err) return ERR_PTR(err); - if (flags & O_CREAT) - return ERR_PTR(-EINVAL); return do_file_open_root(dentry, mnt, filename, &op); } EXPORT_SYMBOL(file_open_root); diff --git a/include/linux/fs.h b/include/linux/fs.h index 35d99266ca9a..14a97194b34b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2263,7 +2263,7 @@ extern long do_sys_open(int dfd, const char __user *filename, int flags, extern struct file *file_open_name(struct filename *, int, umode_t); extern struct file *filp_open(const char *, int, umode_t); extern struct file *file_open_root(struct dentry *, struct vfsmount *, - const char *, int); + const char *, int, umode_t); extern struct file * dentry_open(const struct path *, int, const struct cred *); extern int filp_close(struct file *, fl_owner_t id); diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 7e7746a42a62..10a1d7dc9313 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1321,7 +1321,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, } mnt = task_active_pid_ns(current)->proc_mnt; - file = file_open_root(mnt->mnt_root, mnt, pathname, flags); + file = file_open_root(mnt->mnt_root, mnt, pathname, flags, 0); result = PTR_ERR(file); if (IS_ERR(file)) goto out_putname; -- cgit From ebc41f20d77f6ad91f1f2d2af5147dc9bb6b5eea Mon Sep 17 00:00:00 2001 From: Hidehiro Kawai Date: Tue, 22 Mar 2016 14:27:17 -0700 Subject: panic: change nmi_panic from macro to function Commit 1717f2096b54 ("panic, x86: Fix re-entrance problem due to panic on NMI") and commit 58c5661f2144 ("panic, x86: Allow CPUs to save registers even if looping in NMI context") introduced nmi_panic() which prevents concurrent/recursive execution of panic(). It also saves registers for the crash dump on x86. However, there are some cases where NMI handlers still use panic(). This patch set partially replaces them with nmi_panic() in those cases. Even this patchset is applied, some NMI or similar handlers (e.g. MCE handler) continue to use panic(). This is because I can't test them well and actual problems won't happen. For example, the possibility that normal panic and panic on MCE happen simultaneously is very low. This patch (of 3): Convert nmi_panic() to a proper function and export it instead of exporting internal implementation details to modules, for obvious reasons. Signed-off-by: Hidehiro Kawai Acked-by: Borislav Petkov Acked-by: Michal Nazarewicz Cc: Michal Hocko Cc: Rasmus Villemoes Cc: Nicolas Iooss Cc: Javi Merino Cc: Gobinda Charan Maji Cc: "Steven Rostedt (Red Hat)" Cc: Thomas Gleixner Cc: Vitaly Kuznetsov Cc: HATAYAMA Daisuke Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 21 +-------------------- kernel/panic.c | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index b82646ee70eb..a13c52ccd8ac 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -255,7 +255,7 @@ extern long (*panic_blink)(int state); __printf(1, 2) void panic(const char *fmt, ...) __noreturn __cold; -void nmi_panic_self_stop(struct pt_regs *); +void nmi_panic(struct pt_regs *regs, const char *msg); extern void oops_enter(void); extern void oops_exit(void); void print_oops_end_marker(void); @@ -456,25 +456,6 @@ extern bool crash_kexec_post_notifiers; extern atomic_t panic_cpu; #define PANIC_CPU_INVALID -1 -/* - * A variant of panic() called from NMI context. We return if we've already - * panicked on this CPU. If another CPU already panicked, loop in - * nmi_panic_self_stop() which can provide architecture dependent code such - * as saving register state for crash dump. - */ -#define nmi_panic(regs, fmt, ...) \ -do { \ - int old_cpu, cpu; \ - \ - cpu = raw_smp_processor_id(); \ - old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, cpu); \ - \ - if (old_cpu == PANIC_CPU_INVALID) \ - panic(fmt, ##__VA_ARGS__); \ - else if (old_cpu != cpu) \ - nmi_panic_self_stop(regs); \ -} while (0) - /* * Only to be used by arch init code. If the user over-wrote the default * CONFIG_PANIC_TIMEOUT, honor it. diff --git a/kernel/panic.c b/kernel/panic.c index fa400852bf6c..535c96510a44 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -73,6 +73,26 @@ void __weak nmi_panic_self_stop(struct pt_regs *regs) atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); +/* + * A variant of panic() called from NMI context. We return if we've already + * panicked on this CPU. If another CPU already panicked, loop in + * nmi_panic_self_stop() which can provide architecture dependent code such + * as saving register state for crash dump. + */ +void nmi_panic(struct pt_regs *regs, const char *msg) +{ + int old_cpu, cpu; + + cpu = raw_smp_processor_id(); + old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, cpu); + + if (old_cpu == PANIC_CPU_INVALID) + panic("%s", msg); + else if (old_cpu != cpu) + nmi_panic_self_stop(regs); +} +EXPORT_SYMBOL(nmi_panic); + /** * panic - halt the system * @fmt: The text string to print -- cgit From ade356b99a4187578609f2a91c4d2ed88e4e70dc Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 22 Mar 2016 14:27:26 -0700 Subject: profile: hide unused functions when !CONFIG_PROC_FS A couple of functions and variables in the profile implementation are used only on SMP systems by the procfs code, but are unused if either procfs is disabled or in uniprocessor kernels. gcc prints a harmless warning about the unused symbols: kernel/profile.c:243:13: error: 'profile_flip_buffers' defined but not used [-Werror=unused-function] static void profile_flip_buffers(void) ^ kernel/profile.c:266:13: error: 'profile_discard_flip_buffers' defined but not used [-Werror=unused-function] static void profile_discard_flip_buffers(void) ^ kernel/profile.c:330:12: error: 'profile_cpu_callback' defined but not used [-Werror=unused-function] static int profile_cpu_callback(struct notifier_block *info, ^ This adds further #ifdef to the file, to annotate exactly in which cases they are used. I have done several thousand ARM randconfig kernels with this patch applied and no longer get any warnings in this file. Signed-off-by: Arnd Bergmann Cc: Vlastimil Babka Cc: Robin Holt Cc: Johannes Weiner Cc: Christoph Lameter Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/profile.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index 51369697466e..c2199e9901c9 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -44,7 +44,7 @@ int prof_on __read_mostly; EXPORT_SYMBOL_GPL(prof_on); static cpumask_var_t prof_cpu_mask; -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); static DEFINE_PER_CPU(int, cpu_profile_flip); static DEFINE_MUTEX(profile_flip_mutex); @@ -202,7 +202,7 @@ int profile_event_unregister(enum profile_type type, struct notifier_block *n) } EXPORT_SYMBOL_GPL(profile_event_unregister); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) /* * Each cpu has a pair of open-addressed hashtables for pending * profile hits. read_profile() IPI's all cpus to request them -- cgit From 5c9a8750a6409c63a0f01d51a9024861022f6593 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Tue, 22 Mar 2016 14:27:30 -0700 Subject: kernel: add kcov code coverage kcov provides code coverage collection for coverage-guided fuzzing (randomized testing). Coverage-guided fuzzing is a testing technique that uses coverage feedback to determine new interesting inputs to a system. A notable user-space example is AFL (http://lcamtuf.coredump.cx/afl/). However, this technique is not widely used for kernel testing due to missing compiler and kernel support. kcov does not aim to collect as much coverage as possible. It aims to collect more or less stable coverage that is function of syscall inputs. To achieve this goal it does not collect coverage in soft/hard interrupts and instrumentation of some inherently non-deterministic or non-interesting parts of kernel is disbled (e.g. scheduler, locking). Currently there is a single coverage collection mode (tracing), but the API anticipates additional collection modes. Initially I also implemented a second mode which exposes coverage in a fixed-size hash table of counters (what Quentin used in his original patch). I've dropped the second mode for simplicity. This patch adds the necessary support on kernel side. The complimentary compiler support was added in gcc revision 231296. We've used this support to build syzkaller system call fuzzer, which has found 90 kernel bugs in just 2 months: https://github.com/google/syzkaller/wiki/Found-Bugs We've also found 30+ bugs in our internal systems with syzkaller. Another (yet unexplored) direction where kcov coverage would greatly help is more traditional "blob mutation". For example, mounting a random blob as a filesystem, or receiving a random blob over wire. Why not gcov. Typical fuzzing loop looks as follows: (1) reset coverage, (2) execute a bit of code, (3) collect coverage, repeat. A typical coverage can be just a dozen of basic blocks (e.g. an invalid input). In such context gcov becomes prohibitively expensive as reset/collect coverage steps depend on total number of basic blocks/edges in program (in case of kernel it is about 2M). Cost of kcov depends only on number of executed basic blocks/edges. On top of that, kernel requires per-thread coverage because there are always background threads and unrelated processes that also produce coverage. With inlined gcov instrumentation per-thread coverage is not possible. kcov exposes kernel PCs and control flow to user-space which is insecure. But debugfs should not be mapped as user accessible. Based on a patch by Quentin Casasnovas. [akpm@linux-foundation.org: make task_struct.kcov_mode have type `enum kcov_mode'] [akpm@linux-foundation.org: unbreak allmodconfig] [akpm@linux-foundation.org: follow x86 Makefile layout standards] Signed-off-by: Dmitry Vyukov Reviewed-by: Kees Cook Cc: syzkaller Cc: Vegard Nossum Cc: Catalin Marinas Cc: Tavis Ormandy Cc: Will Deacon Cc: Quentin Casasnovas Cc: Kostya Serebryany Cc: Eric Dumazet Cc: Alexander Potapenko Cc: Kees Cook Cc: Bjorn Helgaas Cc: Sasha Levin Cc: David Drysdale Cc: Ard Biesheuvel Cc: Andrey Ryabinin Cc: Kirill A. Shutemov Cc: Jiri Slaby Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kcov.txt | 111 ++++++++++++++ Makefile | 11 +- arch/x86/Kconfig | 1 + arch/x86/boot/Makefile | 7 + arch/x86/boot/compressed/Makefile | 3 + arch/x86/entry/vdso/Makefile | 3 + arch/x86/kernel/Makefile | 6 + arch/x86/kernel/apic/Makefile | 4 + arch/x86/kernel/cpu/Makefile | 4 + arch/x86/lib/Makefile | 3 + arch/x86/mm/Makefile | 3 + arch/x86/realmode/rm/Makefile | 3 + drivers/firmware/efi/libstub/Makefile | 3 + include/linux/kcov.h | 29 ++++ include/linux/sched.h | 11 ++ include/uapi/linux/kcov.h | 10 ++ kernel/Makefile | 12 ++ kernel/exit.c | 2 + kernel/fork.c | 3 + kernel/kcov.c | 273 ++++++++++++++++++++++++++++++++++ kernel/locking/Makefile | 3 + kernel/rcu/Makefile | 4 + kernel/sched/Makefile | 4 + lib/Kconfig.debug | 21 +++ lib/Makefile | 12 ++ mm/Makefile | 15 ++ mm/kasan/Makefile | 1 + scripts/Makefile.lib | 6 + 28 files changed, 567 insertions(+), 1 deletion(-) create mode 100644 Documentation/kcov.txt create mode 100644 include/linux/kcov.h create mode 100644 include/uapi/linux/kcov.h create mode 100644 kernel/kcov.c (limited to 'kernel') diff --git a/Documentation/kcov.txt b/Documentation/kcov.txt new file mode 100644 index 000000000000..779ff4ab1c1d --- /dev/null +++ b/Documentation/kcov.txt @@ -0,0 +1,111 @@ +kcov: code coverage for fuzzing +=============================== + +kcov exposes kernel code coverage information in a form suitable for coverage- +guided fuzzing (randomized testing). Coverage data of a running kernel is +exported via the "kcov" debugfs file. Coverage collection is enabled on a task +basis, and thus it can capture precise coverage of a single system call. + +Note that kcov does not aim to collect as much coverage as possible. It aims +to collect more or less stable coverage that is function of syscall inputs. +To achieve this goal it does not collect coverage in soft/hard interrupts +and instrumentation of some inherently non-deterministic parts of kernel is +disbled (e.g. scheduler, locking). + +Usage: +====== + +Configure kernel with: + + CONFIG_KCOV=y + +CONFIG_KCOV requires gcc built on revision 231296 or later. +Profiling data will only become accessible once debugfs has been mounted: + + mount -t debugfs none /sys/kernel/debug + +The following program demonstrates kcov usage from within a test program: + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define KCOV_INIT_TRACE _IOR('c', 1, unsigned long) +#define KCOV_ENABLE _IO('c', 100) +#define KCOV_DISABLE _IO('c', 101) +#define COVER_SIZE (64<<10) + +int main(int argc, char **argv) +{ + int fd; + unsigned long *cover, n, i; + + /* A single fd descriptor allows coverage collection on a single + * thread. + */ + fd = open("/sys/kernel/debug/kcov", O_RDWR); + if (fd == -1) + perror("open"), exit(1); + /* Setup trace mode and trace size. */ + if (ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE)) + perror("ioctl"), exit(1); + /* Mmap buffer shared between kernel- and user-space. */ + cover = (unsigned long*)mmap(NULL, COVER_SIZE * sizeof(unsigned long), + PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if ((void*)cover == MAP_FAILED) + perror("mmap"), exit(1); + /* Enable coverage collection on the current thread. */ + if (ioctl(fd, KCOV_ENABLE, 0)) + perror("ioctl"), exit(1); + /* Reset coverage from the tail of the ioctl() call. */ + __atomic_store_n(&cover[0], 0, __ATOMIC_RELAXED); + /* That's the target syscal call. */ + read(-1, NULL, 0); + /* Read number of PCs collected. */ + n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED); + for (i = 0; i < n; i++) + printf("0x%lx\n", cover[i + 1]); + /* Disable coverage collection for the current thread. After this call + * coverage can be enabled for a different thread. + */ + if (ioctl(fd, KCOV_DISABLE, 0)) + perror("ioctl"), exit(1); + /* Free resources. */ + if (munmap(cover, COVER_SIZE * sizeof(unsigned long))) + perror("munmap"), exit(1); + if (close(fd)) + perror("close"), exit(1); + return 0; +} + +After piping through addr2line output of the program looks as follows: + +SyS_read +fs/read_write.c:562 +__fdget_pos +fs/file.c:774 +__fget_light +fs/file.c:746 +__fget_light +fs/file.c:750 +__fget_light +fs/file.c:760 +__fdget_pos +fs/file.c:784 +SyS_read +fs/read_write.c:562 + +If a program needs to collect coverage from several threads (independently), +it needs to open /sys/kernel/debug/kcov in each thread separately. + +The interface is fine-grained to allow efficient forking of test processes. +That is, a parent process opens /sys/kernel/debug/kcov, enables trace mode, +mmaps coverage buffer and then forks child processes in a loop. Child processes +only need to enable coverage (disable happens automatically on thread end). diff --git a/Makefile b/Makefile index e055b969c325..b98a4f70d1b5 100644 --- a/Makefile +++ b/Makefile @@ -365,6 +365,7 @@ LDFLAGS_MODULE = CFLAGS_KERNEL = AFLAGS_KERNEL = CFLAGS_GCOV = -fprofile-arcs -ftest-coverage +CFLAGS_KCOV = -fsanitize-coverage=trace-pc # Use USERINCLUDE when you must reference the UAPI directories only. @@ -411,7 +412,7 @@ export MAKE AWK GENKSYMS INSTALLKERNEL PERL PYTHON UTS_MACHINE export HOSTCXX HOSTCXXFLAGS LDFLAGS_MODULE CHECK CHECKFLAGS export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS LDFLAGS -export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_GCOV CFLAGS_KASAN CFLAGS_UBSAN +export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_GCOV CFLAGS_KCOV CFLAGS_KASAN CFLAGS_UBSAN export KBUILD_AFLAGS AFLAGS_KERNEL AFLAGS_MODULE export KBUILD_AFLAGS_MODULE KBUILD_CFLAGS_MODULE KBUILD_LDFLAGS_MODULE export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL @@ -673,6 +674,14 @@ endif endif KBUILD_CFLAGS += $(stackp-flag) +ifdef CONFIG_KCOV + ifeq ($(call cc-option, $(CFLAGS_KCOV)),) + $(warning Cannot use CONFIG_KCOV: \ + -fsanitize-coverage=trace-pc is not supported by compiler) + CFLAGS_KCOV = + endif +endif + ifeq ($(cc-name),clang) KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,) KBUILD_CPPFLAGS += $(call cc-option,-Wno-unknown-warning-option,) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8b680a5cb25b..54478b7635de 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -28,6 +28,7 @@ config X86 select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_HAS_KCOV if X86_64 select ARCH_HAS_PMEM_API if X86_64 select ARCH_HAS_MMIO_FLUSH select ARCH_HAS_SG_CHAIN diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 0bf6749522d9..b1ef9e489084 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -12,6 +12,13 @@ KASAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y +# Kernel does not boot with kcov instrumentation here. +# One of the problems observed was insertion of __sanitizer_cov_trace_pc() +# callback into middle of per-cpu data enabling code. Thus the callback observed +# inconsistent state and crashed. We are interested mostly in syscall coverage, +# so boot code is not interesting anyway. +KCOV_INSTRUMENT := n + # If you want to preset the SVGA mode, uncomment the next line and # set SVGA_MODE to whatever number you want. # Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode. diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 5e1d26e09407..6915ff2bd996 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -19,6 +19,9 @@ KASAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y +# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. +KCOV_INSTRUMENT := n + targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index f9fb859c98b9..6874da5f67fc 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -7,6 +7,9 @@ KASAN_SANITIZE := n UBSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y +# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. +KCOV_INSTRUMENT := n + VDSO64-$(CONFIG_X86_64) := y VDSOX32-$(CONFIG_X86_X32_ABI) := y VDSO32-$(CONFIG_X86_32) := y diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index d5fb0871aba3..adaae2c781c1 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -25,6 +25,12 @@ OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y OBJECT_FILES_NON_STANDARD_mcount_$(BITS).o := y OBJECT_FILES_NON_STANDARD_test_nx.o := y +# If instrumentation of this dir is enabled, boot hangs during first second. +# Probably could be more selective here, but note that files related to irqs, +# boot, dumpstack/stacktrace, etc are either non-interesting or can lead to +# non-deterministic coverage. +KCOV_INSTRUMENT := n + CFLAGS_irq.o := -I$(src)/../include/asm/trace obj-y := process_$(BITS).o signal.o diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 8bb12ddc5db8..8e63ebdcbd0b 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -2,6 +2,10 @@ # Makefile for local APIC drivers and for the IO-APIC code # +# Leads to non-deterministic coverage that is not a function of syscall inputs. +# In particualr, smp_apic_timer_interrupt() is called in random places. +KCOV_INSTRUMENT := n + obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o ipi.o vector.o obj-y += hw_nmi.o diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 0d373d7affc8..4a8697f7d4ef 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -8,6 +8,10 @@ CFLAGS_REMOVE_common.o = -pg CFLAGS_REMOVE_perf_event.o = -pg endif +# If these files are instrumented, boot hangs during the first second. +KCOV_INSTRUMENT_common.o := n +KCOV_INSTRUMENT_perf_event.o := n + # Make sure load_percpu_segment has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_common.o := $(nostackp) diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index a501fa25da41..72a576752a7e 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -2,6 +2,9 @@ # Makefile for x86 specific library files. # +# Produces uninteresting flaky coverage. +KCOV_INSTRUMENT_delay.o := n + inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt quiet_cmd_inat_tables = GEN $@ diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 67cf2e1e557b..f98913258c63 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,3 +1,6 @@ +# Kernel does not boot with instrumentation of tlb.c. +KCOV_INSTRUMENT_tlb.o := n + obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ pat.o pgtable.o physaddr.o gup.o setup_nx.o diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 053abe7b0ef7..b95964610ea7 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -9,6 +9,9 @@ KASAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y +# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. +KCOV_INSTRUMENT := n + always := realmode.bin realmode.relocs wakeup-objs := wakeup_asm.o wakemain.o video-mode.o diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index a15841eced4e..da99bbb74aeb 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -25,6 +25,9 @@ KASAN_SANITIZE := n UBSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y +# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. +KCOV_INSTRUMENT := n + lib-y := efi-stub-helper.o # include the stub's generic dependencies from lib/ when building for ARM/arm64 diff --git a/include/linux/kcov.h b/include/linux/kcov.h new file mode 100644 index 000000000000..2883ac98c280 --- /dev/null +++ b/include/linux/kcov.h @@ -0,0 +1,29 @@ +#ifndef _LINUX_KCOV_H +#define _LINUX_KCOV_H + +#include + +struct task_struct; + +#ifdef CONFIG_KCOV + +void kcov_task_init(struct task_struct *t); +void kcov_task_exit(struct task_struct *t); + +enum kcov_mode { + /* Coverage collection is not enabled yet. */ + KCOV_MODE_DISABLED = 0, + /* + * Tracing coverage collection mode. + * Covered PCs are collected in a per-task buffer. + */ + KCOV_MODE_TRACE = 1, +}; + +#else + +static inline void kcov_task_init(struct task_struct *t) {} +static inline void kcov_task_exit(struct task_struct *t) {} + +#endif /* CONFIG_KCOV */ +#endif /* _LINUX_KCOV_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 084ed9fba620..34495d2d2d7b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -51,6 +51,7 @@ struct sched_param { #include #include #include +#include #include #include #include @@ -1818,6 +1819,16 @@ struct task_struct { /* bitmask and counter of trace recursion */ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ +#ifdef CONFIG_KCOV + /* Coverage collection mode enabled for this task (0 if disabled). */ + enum kcov_mode kcov_mode; + /* Size of the kcov_area. */ + unsigned kcov_size; + /* Buffer for coverage collection. */ + void *kcov_area; + /* kcov desciptor wired with this task or NULL. */ + struct kcov *kcov; +#endif #ifdef CONFIG_MEMCG struct mem_cgroup *memcg_in_oom; gfp_t memcg_oom_gfp_mask; diff --git a/include/uapi/linux/kcov.h b/include/uapi/linux/kcov.h new file mode 100644 index 000000000000..574e22ec640d --- /dev/null +++ b/include/uapi/linux/kcov.h @@ -0,0 +1,10 @@ +#ifndef _LINUX_KCOV_IOCTLS_H +#define _LINUX_KCOV_IOCTLS_H + +#include + +#define KCOV_INIT_TRACE _IOR('c', 1, unsigned long) +#define KCOV_ENABLE _IO('c', 100) +#define KCOV_DISABLE _IO('c', 101) + +#endif /* _LINUX_KCOV_IOCTLS_H */ diff --git a/kernel/Makefile b/kernel/Makefile index baa55e50a315..f0c40bf49d9f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -18,6 +18,17 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE) endif +# Prevents flicker of uninteresting __do_softirq()/__local_bh_disable_ip() +# in coverage traces. +KCOV_INSTRUMENT_softirq.o := n +# These are called from save_stack_trace() on slub debug path, +# and produce insane amounts of uninteresting coverage. +KCOV_INSTRUMENT_module.o := n +KCOV_INSTRUMENT_extable.o := n +# Don't self-instrument. +KCOV_INSTRUMENT_kcov.o := n +KASAN_SANITIZE_kcov.o := n + # cond_syscall is currently not LTO compatible CFLAGS_sys_ni.o = $(DISABLE_LTO) @@ -68,6 +79,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o audit_fsnotify.o obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_GCOV_KERNEL) += gcov/ +obj-$(CONFIG_KCOV) += kcov.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o diff --git a/kernel/exit.c b/kernel/exit.c index 10e088237fed..953d1a1c0387 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include @@ -655,6 +656,7 @@ void do_exit(long code) TASKS_RCU(int tasks_rcu_i); profile_task_exit(tsk); + kcov_task_exit(tsk); WARN_ON(blk_needs_flush_plug(tsk)); diff --git a/kernel/fork.c b/kernel/fork.c index 5b8d1e7ceeea..d277e83ed3e0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -75,6 +75,7 @@ #include #include #include +#include #include #include @@ -392,6 +393,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) account_kernel_stack(ti, 1); + kcov_task_init(tsk); + return tsk; free_ti: diff --git a/kernel/kcov.c b/kernel/kcov.c new file mode 100644 index 000000000000..3efbee0834a8 --- /dev/null +++ b/kernel/kcov.c @@ -0,0 +1,273 @@ +#define pr_fmt(fmt) "kcov: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * kcov descriptor (one per opened debugfs file). + * State transitions of the descriptor: + * - initial state after open() + * - then there must be a single ioctl(KCOV_INIT_TRACE) call + * - then, mmap() call (several calls are allowed but not useful) + * - then, repeated enable/disable for a task (only one task a time allowed) + */ +struct kcov { + /* + * Reference counter. We keep one for: + * - opened file descriptor + * - task with enabled coverage (we can't unwire it from another task) + */ + atomic_t refcount; + /* The lock protects mode, size, area and t. */ + spinlock_t lock; + enum kcov_mode mode; + /* Size of arena (in long's for KCOV_MODE_TRACE). */ + unsigned size; + /* Coverage buffer shared with user space. */ + void *area; + /* Task for which we collect coverage, or NULL. */ + struct task_struct *t; +}; + +/* + * Entry point from instrumented code. + * This is called once per basic-block/edge. + */ +void __sanitizer_cov_trace_pc(void) +{ + struct task_struct *t; + enum kcov_mode mode; + + t = current; + /* + * We are interested in code coverage as a function of a syscall inputs, + * so we ignore code executed in interrupts. + */ + if (!t || in_interrupt()) + return; + mode = READ_ONCE(t->kcov_mode); + if (mode == KCOV_MODE_TRACE) { + unsigned long *area; + unsigned long pos; + + /* + * There is some code that runs in interrupts but for which + * in_interrupt() returns false (e.g. preempt_schedule_irq()). + * READ_ONCE()/barrier() effectively provides load-acquire wrt + * interrupts, there are paired barrier()/WRITE_ONCE() in + * kcov_ioctl_locked(). + */ + barrier(); + area = t->kcov_area; + /* The first word is number of subsequent PCs. */ + pos = READ_ONCE(area[0]) + 1; + if (likely(pos < t->kcov_size)) { + area[pos] = _RET_IP_; + WRITE_ONCE(area[0], pos); + } + } +} +EXPORT_SYMBOL(__sanitizer_cov_trace_pc); + +static void kcov_get(struct kcov *kcov) +{ + atomic_inc(&kcov->refcount); +} + +static void kcov_put(struct kcov *kcov) +{ + if (atomic_dec_and_test(&kcov->refcount)) { + vfree(kcov->area); + kfree(kcov); + } +} + +void kcov_task_init(struct task_struct *t) +{ + t->kcov_mode = KCOV_MODE_DISABLED; + t->kcov_size = 0; + t->kcov_area = NULL; + t->kcov = NULL; +} + +void kcov_task_exit(struct task_struct *t) +{ + struct kcov *kcov; + + kcov = t->kcov; + if (kcov == NULL) + return; + spin_lock(&kcov->lock); + if (WARN_ON(kcov->t != t)) { + spin_unlock(&kcov->lock); + return; + } + /* Just to not leave dangling references behind. */ + kcov_task_init(t); + kcov->t = NULL; + spin_unlock(&kcov->lock); + kcov_put(kcov); +} + +static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) +{ + int res = 0; + void *area; + struct kcov *kcov = vma->vm_file->private_data; + unsigned long size, off; + struct page *page; + + area = vmalloc_user(vma->vm_end - vma->vm_start); + if (!area) + return -ENOMEM; + + spin_lock(&kcov->lock); + size = kcov->size * sizeof(unsigned long); + if (kcov->mode == KCOV_MODE_DISABLED || vma->vm_pgoff != 0 || + vma->vm_end - vma->vm_start != size) { + res = -EINVAL; + goto exit; + } + if (!kcov->area) { + kcov->area = area; + vma->vm_flags |= VM_DONTEXPAND; + spin_unlock(&kcov->lock); + for (off = 0; off < size; off += PAGE_SIZE) { + page = vmalloc_to_page(kcov->area + off); + if (vm_insert_page(vma, vma->vm_start + off, page)) + WARN_ONCE(1, "vm_insert_page() failed"); + } + return 0; + } +exit: + spin_unlock(&kcov->lock); + vfree(area); + return res; +} + +static int kcov_open(struct inode *inode, struct file *filep) +{ + struct kcov *kcov; + + kcov = kzalloc(sizeof(*kcov), GFP_KERNEL); + if (!kcov) + return -ENOMEM; + atomic_set(&kcov->refcount, 1); + spin_lock_init(&kcov->lock); + filep->private_data = kcov; + return nonseekable_open(inode, filep); +} + +static int kcov_close(struct inode *inode, struct file *filep) +{ + kcov_put(filep->private_data); + return 0; +} + +static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, + unsigned long arg) +{ + struct task_struct *t; + unsigned long size, unused; + + switch (cmd) { + case KCOV_INIT_TRACE: + /* + * Enable kcov in trace mode and setup buffer size. + * Must happen before anything else. + */ + if (kcov->mode != KCOV_MODE_DISABLED) + return -EBUSY; + /* + * Size must be at least 2 to hold current position and one PC. + * Later we allocate size * sizeof(unsigned long) memory, + * that must not overflow. + */ + size = arg; + if (size < 2 || size > INT_MAX / sizeof(unsigned long)) + return -EINVAL; + kcov->size = size; + kcov->mode = KCOV_MODE_TRACE; + return 0; + case KCOV_ENABLE: + /* + * Enable coverage for the current task. + * At this point user must have been enabled trace mode, + * and mmapped the file. Coverage collection is disabled only + * at task exit or voluntary by KCOV_DISABLE. After that it can + * be enabled for another task. + */ + unused = arg; + if (unused != 0 || kcov->mode == KCOV_MODE_DISABLED || + kcov->area == NULL) + return -EINVAL; + if (kcov->t != NULL) + return -EBUSY; + t = current; + /* Cache in task struct for performance. */ + t->kcov_size = kcov->size; + t->kcov_area = kcov->area; + /* See comment in __sanitizer_cov_trace_pc(). */ + barrier(); + WRITE_ONCE(t->kcov_mode, kcov->mode); + t->kcov = kcov; + kcov->t = t; + /* This is put either in kcov_task_exit() or in KCOV_DISABLE. */ + kcov_get(kcov); + return 0; + case KCOV_DISABLE: + /* Disable coverage for the current task. */ + unused = arg; + if (unused != 0 || current->kcov != kcov) + return -EINVAL; + t = current; + if (WARN_ON(kcov->t != t)) + return -EINVAL; + kcov_task_init(t); + kcov->t = NULL; + kcov_put(kcov); + return 0; + default: + return -ENOTTY; + } +} + +static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) +{ + struct kcov *kcov; + int res; + + kcov = filep->private_data; + spin_lock(&kcov->lock); + res = kcov_ioctl_locked(kcov, cmd, arg); + spin_unlock(&kcov->lock); + return res; +} + +static const struct file_operations kcov_fops = { + .open = kcov_open, + .unlocked_ioctl = kcov_ioctl, + .mmap = kcov_mmap, + .release = kcov_close, +}; + +static int __init kcov_init(void) +{ + if (!debugfs_create_file("kcov", 0600, NULL, NULL, &kcov_fops)) { + pr_err("failed to create kcov in debugfs\n"); + return -ENOMEM; + } + return 0; +} + +device_initcall(kcov_init); diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 8e96f6cc2a4a..31322a4275cd 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,3 +1,6 @@ +# Any varying coverage in these files is non-deterministic +# and is generally not a function of system call inputs. +KCOV_INSTRUMENT := n obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 61a16569ffbf..032b2c015beb 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -1,3 +1,7 @@ +# Any varying coverage in these files is non-deterministic +# and is generally not a function of system call inputs. +KCOV_INSTRUMENT := n + obj-y += update.o sync.o obj-$(CONFIG_SRCU) += srcu.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 302d6ebd64f7..414d9c16da42 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -2,6 +2,10 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE) endif +# These files are disabled because they produce non-interesting flaky coverage +# that is not a function of syscall inputs. E.g. involuntary context switches. +KCOV_INSTRUMENT := n + ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is # needed for x86 only. Why this used to be enabled for all architectures is beyond diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 5a60f45cd9bb..532d4d52d1df 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -696,6 +696,27 @@ source "lib/Kconfig.kasan" endmenu # "Memory Debugging" +config ARCH_HAS_KCOV + bool + help + KCOV does not have any arch-specific code, but currently it is enabled + only for x86_64. KCOV requires testing on other archs, and most likely + disabling of instrumentation for some early boot code. + +config KCOV + bool "Code coverage for fuzzing" + depends on ARCH_HAS_KCOV + select DEBUG_FS + help + KCOV exposes kernel code coverage information in a form suitable + for coverage-guided fuzzing (randomized testing). + + If RANDOMIZE_BASE is enabled, PC values will not be stable across + different machines and across reboots. If you need stable PC values, + disable RANDOMIZE_BASE. + + For more details, see Documentation/kcov.txt. + config DEBUG_SHIRQ bool "Debug shared IRQ handlers" depends on DEBUG_KERNEL diff --git a/lib/Makefile b/lib/Makefile index 4962d14c450f..a1de5b61ff40 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -7,6 +7,18 @@ ORIG_CFLAGS := $(KBUILD_CFLAGS) KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS)) endif +# These files are disabled because they produce lots of non-interesting and/or +# flaky coverage that is not a function of syscall inputs. For example, +# rbtree can be global and individual rotations don't correlate with inputs. +KCOV_INSTRUMENT_string.o := n +KCOV_INSTRUMENT_rbtree.o := n +KCOV_INSTRUMENT_list_debug.o := n +KCOV_INSTRUMENT_debugobjects.o := n +KCOV_INSTRUMENT_dynamic_debug.o := n +# Kernel does not boot if we instrument this file as it uses custom calling +# convention (see CONFIG_ARCH_HWEIGHT_CFLAGS). +KCOV_INSTRUMENT_hweight.o := n + lib-y := ctype.o string.o vsprintf.o cmdline.o \ rbtree.o radix-tree.o dump_stack.o timerqueue.o\ idr.o int_sqrt.o extable.o \ diff --git a/mm/Makefile b/mm/Makefile index 6da300a1414b..f5e797cbd128 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -5,6 +5,21 @@ KASAN_SANITIZE_slab_common.o := n KASAN_SANITIZE_slub.o := n +# These files are disabled because they produce non-interesting and/or +# flaky coverage that is not a function of syscall inputs. E.g. slab is out of +# free pages, or a task is migrated between nodes. +KCOV_INSTRUMENT_slab_common.o := n +KCOV_INSTRUMENT_slob.o := n +KCOV_INSTRUMENT_slab.o := n +KCOV_INSTRUMENT_slub.o := n +KCOV_INSTRUMENT_page_alloc.o := n +KCOV_INSTRUMENT_debug-pagealloc.o := n +KCOV_INSTRUMENT_kmemleak.o := n +KCOV_INSTRUMENT_kmemcheck.o := n +KCOV_INSTRUMENT_memcontrol.o := n +KCOV_INSTRUMENT_mmzone.o := n +KCOV_INSTRUMENT_vmstat.o := n + mmu-y := nommu.o mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index a61460d9f5b0..131daadf40e4 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -1,5 +1,6 @@ KASAN_SANITIZE := n UBSAN_SANITIZE_kasan.o := n +KCOV_INSTRUMENT := n CFLAGS_REMOVE_kasan.o = -pg # Function splitter causes unnecessary splits in __asan_load1/__asan_store1 diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index ad50d5859ac4..ddf83d0181e7 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -136,6 +136,12 @@ _c_flags += $(if $(patsubst n%,, \ $(CFLAGS_UBSAN)) endif +ifeq ($(CONFIG_KCOV),y) +_c_flags += $(if $(patsubst n%,, \ + $(KCOV_INSTRUMENT_$(basetarget).o)$(KCOV_INSTRUMENT)y), \ + $(CFLAGS_KCOV)) +endif + # If building the kernel in a separate objtree expand all occurrences # of -Idir to -I$(srctree)/dir except for absolute paths (starting with '/'). -- cgit From 41b27154874b3a40d6673052d08c8e9fd0c6404f Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Tue, 22 Mar 2016 14:27:54 -0700 Subject: kernel/signal.c: add compile-time check for __ARCH_SI_PREAMBLE_SIZE The value of __ARCH_SI_PREAMBLE_SIZE defines the size (including padding) of the part of the struct siginfo that is before the union, and it is then used to calculate the needed padding (SI_PAD_SIZE) to make the size of struct siginfo equal to 128 (SI_MAX_SIZE) bytes. Depending on the target architecture and word width it equals to either 3 or 4 times sizeof int. Since the very beginning we had __ARCH_SI_PREAMBLE_SIZE wrong on the parisc architecture for the 64bit kernel build. It's even more frustrating, because it can easily be checked at compile time if the value was defined correctly. This patch adds such a check for the correctness of __ARCH_SI_PREAMBLE_SIZE in the hope that it will prevent existing and future architectures from running into the same problem. I refrained from replacing __ARCH_SI_PREAMBLE_SIZE by offsetof() in copy_siginfo() in include/asm-generic/siginfo.h, because a) it doesn't make any difference and b) it's used in the Documentation/kmemcheck.txt example. I ran this patch through the 0-DAY kernel test infrastructure and only the parisc architecture triggered as expected. That means that this patch should be OK for all major architectures. Signed-off-by: Helge Deller Cc: Stephen Rothwell Cc: Michael Ellerman Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index fe8ed298373c..aa9bf00749c1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3585,6 +3585,10 @@ __weak const char *arch_vma_name(struct vm_area_struct *vma) void __init signals_init(void) { + /* If this check fails, the __ARCH_SI_PREAMBLE_SIZE value is wrong! */ + BUILD_BUG_ON(__ARCH_SI_PREAMBLE_SIZE + != offsetof(struct siginfo, _sifields._pad)); + sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); } -- cgit From cf61e2a1487d833e4748dead4096584de70bf742 Mon Sep 17 00:00:00 2001 From: Brian Starkey Date: Tue, 22 Mar 2016 14:27:57 -0700 Subject: memremap: don't modify flags These patches implement a MEMREMAP_WC flag for memremap(), which can be used to obtain writecombine mappings. This is then used for setting up dma_coherent_mem regions which use the DMA_MEMORY_MAP flag. The motivation is to fix an alignment fault on arm64, and the suggestion to implement MEMREMAP_WC for this case was made at [1]. That particular issue is handled in patch 4, which makes sure that the appropriate memset function is used when zeroing allocations mapped as IO memory. This patch (of 4): Don't modify the flags input argument to memremap(). MEMREMAP_WB is already a special case so we can check for it directly instead of clearing flag bits in each mapper. Signed-off-by: Brian Starkey Cc: Catalin Marinas Cc: Dan Williams Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 584febd13e2e..e5e685e6ff2a 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -64,6 +64,9 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); void *addr = NULL; + if (!flags) + return NULL; + if (is_ram == REGION_MIXED) { WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n", &offset, (unsigned long) size); @@ -72,7 +75,6 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) /* Try all mapping types requested until one returns non-NULL */ if (flags & MEMREMAP_WB) { - flags &= ~MEMREMAP_WB; /* * MEMREMAP_WB is special in that it can be satisifed * from the direct map. Some archs depend on the @@ -86,21 +88,19 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) } /* - * If we don't have a mapping yet and more request flags are - * pending then we will be attempting to establish a new virtual + * If we don't have a mapping yet and other request flags are + * present then we will be attempting to establish a new virtual * address mapping. Enforce that this mapping is not aliasing * System RAM. */ - if (!addr && is_ram == REGION_INTERSECTS && flags) { + if (!addr && is_ram == REGION_INTERSECTS && flags != MEMREMAP_WB) { WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n", &offset, (unsigned long) size); return NULL; } - if (!addr && (flags & MEMREMAP_WT)) { - flags &= ~MEMREMAP_WT; + if (!addr && (flags & MEMREMAP_WT)) addr = ioremap_wt(offset, size); - } return addr; } -- cgit From c907e0eb43a522de60fb651c011c553f87273222 Mon Sep 17 00:00:00 2001 From: Brian Starkey Date: Tue, 22 Mar 2016 14:28:00 -0700 Subject: memremap: add MEMREMAP_WC flag Add a flag to memremap() for writecombine mappings. Mappings satisfied by this flag will not be cached, however writes may be delayed or combined into more efficient bursts. This is most suitable for buffers written sequentially by the CPU for use by other DMA devices. Signed-off-by: Brian Starkey Reviewed-by: Catalin Marinas Cc: Dan Williams Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/io.h | 1 + kernel/memremap.c | 13 +++++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/io.h b/include/linux/io.h index 32403b5716e5..e2c8419278c1 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -135,6 +135,7 @@ enum { /* See memremap() kernel-doc for usage description... */ MEMREMAP_WB = 1 << 0, MEMREMAP_WT = 1 << 1, + MEMREMAP_WC = 1 << 2, }; void *memremap(resource_size_t offset, size_t size, unsigned long flags); diff --git a/kernel/memremap.c b/kernel/memremap.c index e5e685e6ff2a..a6d382312e6f 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -41,11 +41,13 @@ static void *try_ram_remap(resource_size_t offset, size_t size) * memremap() - remap an iomem_resource as cacheable memory * @offset: iomem resource start address * @size: size of remap - * @flags: either MEMREMAP_WB or MEMREMAP_WT + * @flags: any of MEMREMAP_WB, MEMREMAP_WT and MEMREMAP_WC * * memremap() is "ioremap" for cases where it is known that the resource * being mapped does not have i/o side effects and the __iomem - * annotation is not applicable. + * annotation is not applicable. In the case of multiple flags, the different + * mapping types will be attempted in the order listed below until one of + * them succeeds. * * MEMREMAP_WB - matches the default mapping for System RAM on * the architecture. This is usually a read-allocate write-back cache. @@ -57,6 +59,10 @@ static void *try_ram_remap(resource_size_t offset, size_t size) * cache or are written through to memory and never exist in a * cache-dirty state with respect to program visibility. Attempts to * map System RAM with this mapping type will fail. + * + * MEMREMAP_WC - establish a writecombine mapping, whereby writes may + * be coalesced together (e.g. in the CPU's write buffers), but is otherwise + * uncached. Attempts to map System RAM with this mapping type will fail. */ void *memremap(resource_size_t offset, size_t size, unsigned long flags) { @@ -102,6 +108,9 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) if (!addr && (flags & MEMREMAP_WT)) addr = ioremap_wt(offset, size); + if (!addr && (flags & MEMREMAP_WC)) + addr = ioremap_wc(offset, size); + return addr; } EXPORT_SYMBOL(memremap); -- cgit From a395d6a7e3d6e3d1d316376db0c4c8b5d2995930 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 22 Mar 2016 14:28:09 -0700 Subject: kernel/...: convert pr_warning to pr_warn Use the more common logging method with the eventual goal of removing pr_warning altogether. Miscellanea: - Realign arguments - Coalesce formats - Add missing space between a few coalesced formats Signed-off-by: Joe Perches Acked-by: Rafael J. Wysocki [kernel/power/suspend.c] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 4 ++-- kernel/power/suspend.c | 3 +-- kernel/time/tick-sched.c | 8 ++++---- kernel/trace/blktrace.c | 4 ++-- kernel/trace/ftrace.c | 7 +++---- kernel/trace/trace.c | 40 ++++++++++++++++++------------------ kernel/trace/trace_functions_graph.c | 6 +++--- kernel/trace/trace_kprobe.c | 27 ++++++++++-------------- kernel/trace/trace_mmiotrace.c | 2 +- kernel/trace/trace_probe.c | 4 ++-- kernel/trace/trace_stat.c | 3 +-- kernel/trace/trace_uprobe.c | 2 +- kernel/tracepoint.c | 2 +- 13 files changed, 52 insertions(+), 60 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 64731e84c982..cc1cc641d653 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1322,8 +1322,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (nmsk != omsk) /* hope the handler works with current trigger mode */ - pr_warning("irq %d uses trigger mode %u; requested %u\n", - irq, nmsk, omsk); + pr_warn("irq %d uses trigger mode %u; requested %u\n", + irq, nmsk, omsk); } *old_ptr = new; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 230a77225e2e..5b70d64b871e 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -473,8 +473,7 @@ static int enter_state(suspend_state_t state) if (state == PM_SUSPEND_FREEZE) { #ifdef CONFIG_PM_DEBUG if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) { - pr_warning("PM: Unsupported test mode for suspend to idle," - "please choose none/freezer/devices/platform.\n"); + pr_warn("PM: Unsupported test mode for suspend to idle, please choose none/freezer/devices/platform.\n"); return -EAGAIN; } #endif diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 195fe7d2caad..084b79f5917e 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -378,7 +378,7 @@ static int __init tick_nohz_full_setup(char *str) { alloc_bootmem_cpumask_var(&tick_nohz_full_mask); if (cpulist_parse(str, tick_nohz_full_mask) < 0) { - pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); + pr_warn("NO_HZ: Incorrect nohz_full cpumask\n"); free_bootmem_cpumask_var(tick_nohz_full_mask); return 1; } @@ -446,8 +446,7 @@ void __init tick_nohz_init(void) * interrupts to avoid circular dependency on the tick */ if (!arch_irq_work_has_interrupt()) { - pr_warning("NO_HZ: Can't run full dynticks because arch doesn't " - "support irq work self-IPIs\n"); + pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n"); cpumask_clear(tick_nohz_full_mask); cpumask_copy(housekeeping_mask, cpu_possible_mask); tick_nohz_full_running = false; @@ -457,7 +456,8 @@ void __init tick_nohz_init(void) cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { - pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); + pr_warn("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", + cpu); cpumask_clear_cpu(cpu, tick_nohz_full_mask); } diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2aeb6ffc0a1e..f94e7a21f52d 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1437,12 +1437,12 @@ static struct trace_event trace_blk_event = { static int __init init_blk_tracer(void) { if (!register_trace_event(&trace_blk_event)) { - pr_warning("Warning: could not register block events\n"); + pr_warn("Warning: could not register block events\n"); return 1; } if (register_tracer(&blk_tracer) != 0) { - pr_warning("Warning: could not register the block tracer\n"); + pr_warn("Warning: could not register the block tracer\n"); unregister_trace_event(&trace_blk_event); return 1; } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 57a6eea84694..2ece9f1a3e5a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1058,8 +1058,7 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer) entry = tracefs_create_file("function_profile_enabled", 0644, d_tracer, NULL, &ftrace_profile_fops); if (!entry) - pr_warning("Could not create tracefs " - "'function_profile_enabled' entry\n"); + pr_warn("Could not create tracefs 'function_profile_enabled' entry\n"); } #else /* CONFIG_FUNCTION_PROFILER */ @@ -2314,8 +2313,8 @@ unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) if (rec->flags & FTRACE_FL_TRAMP_EN) { ops = ftrace_find_tramp_ops_curr(rec); if (FTRACE_WARN_ON(!ops)) { - pr_warning("Bad trampoline accounting at: %p (%pS)\n", - (void *)rec->ip, (void *)rec->ip); + pr_warn("Bad trampoline accounting at: %p (%pS)\n", + (void *)rec->ip, (void *)rec->ip); /* Ftrace is shutting down, return anything */ return (unsigned long)FTRACE_ADDR; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d9293402ee68..032b388bea66 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2071,20 +2071,20 @@ void trace_printk_init_buffers(void) /* trace_printk() is for debug use only. Don't use it in production. */ - pr_warning("\n"); - pr_warning("**********************************************************\n"); - pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); - pr_warning("** **\n"); - pr_warning("** trace_printk() being used. Allocating extra memory. **\n"); - pr_warning("** **\n"); - pr_warning("** This means that this is a DEBUG kernel and it is **\n"); - pr_warning("** unsafe for production use. **\n"); - pr_warning("** **\n"); - pr_warning("** If you see this message and you are not debugging **\n"); - pr_warning("** the kernel, report this immediately to your vendor! **\n"); - pr_warning("** **\n"); - pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); - pr_warning("**********************************************************\n"); + pr_warn("\n"); + pr_warn("**********************************************************\n"); + pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); + pr_warn("** **\n"); + pr_warn("** trace_printk() being used. Allocating extra memory. **\n"); + pr_warn("** **\n"); + pr_warn("** This means that this is a DEBUG kernel and it is **\n"); + pr_warn("** unsafe for production use. **\n"); + pr_warn("** **\n"); + pr_warn("** If you see this message and you are not debugging **\n"); + pr_warn("** the kernel, report this immediately to your vendor! **\n"); + pr_warn("** **\n"); + pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); + pr_warn("**********************************************************\n"); /* Expand the buffers to set size */ tracing_update_buffers(); @@ -4101,7 +4101,7 @@ trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start, */ map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL); if (!map_array) { - pr_warning("Unable to allocate trace enum mapping\n"); + pr_warn("Unable to allocate trace enum mapping\n"); return; } @@ -6131,7 +6131,7 @@ tracing_init_tracefs_percpu(struct trace_array *tr, long cpu) snprintf(cpu_dir, 30, "cpu%ld", cpu); d_cpu = tracefs_create_dir(cpu_dir, d_percpu); if (!d_cpu) { - pr_warning("Could not create tracefs '%s' entry\n", cpu_dir); + pr_warn("Could not create tracefs '%s' entry\n", cpu_dir); return; } @@ -6318,7 +6318,7 @@ struct dentry *trace_create_file(const char *name, ret = tracefs_create_file(name, mode, parent, data, fops); if (!ret) - pr_warning("Could not create tracefs '%s' entry\n", name); + pr_warn("Could not create tracefs '%s' entry\n", name); return ret; } @@ -6337,7 +6337,7 @@ static struct dentry *trace_options_init_dentry(struct trace_array *tr) tr->options = tracefs_create_dir("options", d_tracer); if (!tr->options) { - pr_warning("Could not create tracefs directory 'options'\n"); + pr_warn("Could not create tracefs directory 'options'\n"); return NULL; } @@ -7248,8 +7248,8 @@ __init static int tracer_alloc_buffers(void) if (trace_boot_clock) { ret = tracing_set_clock(&global_trace, trace_boot_clock); if (ret < 0) - pr_warning("Trace clock %s not defined, going back to default\n", - trace_boot_clock); + pr_warn("Trace clock %s not defined, going back to default\n", + trace_boot_clock); } /* diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index a663cbb84107..91d6a63a2ea7 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1350,7 +1350,7 @@ void graph_trace_open(struct trace_iterator *iter) out_err_free: kfree(data); out_err: - pr_warning("function graph tracer: not enough memory\n"); + pr_warn("function graph tracer: not enough memory\n"); } void graph_trace_close(struct trace_iterator *iter) @@ -1468,12 +1468,12 @@ static __init int init_graph_trace(void) max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); if (!register_trace_event(&graph_trace_entry_event)) { - pr_warning("Warning: could not register graph trace events\n"); + pr_warn("Warning: could not register graph trace events\n"); return 1; } if (!register_trace_event(&graph_trace_ret_event)) { - pr_warning("Warning: could not register graph trace events\n"); + pr_warn("Warning: could not register graph trace events\n"); return 1; } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 21b81a41dae5..919e0ddd8fcc 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -459,16 +459,14 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) if (ret == 0) tk->tp.flags |= TP_FLAG_REGISTERED; else { - pr_warning("Could not insert probe at %s+%lu: %d\n", - trace_kprobe_symbol(tk), trace_kprobe_offset(tk), ret); + pr_warn("Could not insert probe at %s+%lu: %d\n", + trace_kprobe_symbol(tk), trace_kprobe_offset(tk), ret); if (ret == -ENOENT && trace_kprobe_is_on_module(tk)) { - pr_warning("This probe might be able to register after" - "target module is loaded. Continue.\n"); + pr_warn("This probe might be able to register after target module is loaded. Continue.\n"); ret = 0; } else if (ret == -EILSEQ) { - pr_warning("Probing address(0x%p) is not an " - "instruction boundary.\n", - tk->rp.kp.addr); + pr_warn("Probing address(0x%p) is not an instruction boundary.\n", + tk->rp.kp.addr); ret = -EINVAL; } } @@ -529,7 +527,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk) /* Register new event */ ret = register_kprobe_event(tk); if (ret) { - pr_warning("Failed to register probe event(%d)\n", ret); + pr_warn("Failed to register probe event(%d)\n", ret); goto end; } @@ -564,10 +562,9 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, __unregister_trace_kprobe(tk); ret = __register_trace_kprobe(tk); if (ret) - pr_warning("Failed to re-register probe %s on" - "%s: %d\n", - trace_event_name(&tk->tp.call), - mod->name, ret); + pr_warn("Failed to re-register probe %s on %s: %d\n", + trace_event_name(&tk->tp.call), + mod->name, ret); } } mutex_unlock(&probe_lock); @@ -1336,16 +1333,14 @@ static __init int init_kprobe_trace(void) /* Event list interface */ if (!entry) - pr_warning("Could not create tracefs " - "'kprobe_events' entry\n"); + pr_warn("Could not create tracefs 'kprobe_events' entry\n"); /* Profile interface */ entry = tracefs_create_file("kprobe_profile", 0444, d_tracer, NULL, &kprobe_profile_ops); if (!entry) - pr_warning("Could not create tracefs " - "'kprobe_profile' entry\n"); + pr_warn("Could not create tracefs 'kprobe_profile' entry\n"); return 0; } fs_initcall(init_kprobe_trace); diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 2be8c4f2403d..68f376ca6d3f 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -146,7 +146,7 @@ static ssize_t mmio_read(struct trace_iterator *iter, struct file *filp, /* XXX: This is later than where events were lost. */ trace_seq_printf(s, "MARK 0.000000 Lost %lu events.\n", n); if (!overrun_detected) - pr_warning("mmiotrace has lost events.\n"); + pr_warn("mmiotrace has lost events\n"); overrun_detected = true; goto print_out; } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 1769a81da8a7..1d372fa6fefb 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -636,8 +636,8 @@ ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer, *tmp = '\0'; size = tmp - kbuf + 1; } else if (done + size < count) { - pr_warning("Line length is too long: " - "Should be less than %d.", WRITE_BUFSIZE); + pr_warn("Line length is too long: Should be less than %d\n", + WRITE_BUFSIZE); ret = -EINVAL; goto out; } diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 6cf935316769..413ff108fbd0 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -281,8 +281,7 @@ static int tracing_stat_init(void) stat_dir = tracefs_create_dir("trace_stat", d_tracing); if (!stat_dir) - pr_warning("Could not create tracefs " - "'trace_stat' entry\n"); + pr_warn("Could not create tracefs 'trace_stat' entry\n"); return 0; } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index d2f6d0be3503..7915142c89e4 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -334,7 +334,7 @@ static int register_trace_uprobe(struct trace_uprobe *tu) ret = register_uprobe_event(tu); if (ret) { - pr_warning("Failed to register probe event(%d)\n", ret); + pr_warn("Failed to register probe event(%d)\n", ret); goto end; } diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index ecd536de603a..d0639d917899 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -491,7 +491,7 @@ static __init int init_tracepoints(void) ret = register_module_notifier(&tracepoint_module_nb); if (ret) - pr_warning("Failed to register tracepoint module enter notifier\n"); + pr_warn("Failed to register tracepoint module enter notifier\n"); return ret; } -- cgit From 276142730c39c9839465a36a90e5674a8c34e839 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Wed, 23 Mar 2016 00:11:20 +0100 Subject: PM / sleep: Clear pm_suspend_global_flags upon hibernate When suspending to RAM, waking up and later suspending to disk, we gratuitously runtime resume devices after the thaw phase. This does not occur if we always suspend to RAM or always to disk. pm_complete_with_resume_check(), which gets called from pci_pm_complete() among others, schedules a runtime resume if PM_SUSPEND_FLAG_FW_RESUME is set. The flag is set during a suspend-to-RAM cycle. It is cleared at the beginning of the suspend-to-RAM cycle but not afterwards and it is not cleared during a suspend-to-disk cycle at all. Fix it. Fixes: ef25ba047601 (PM / sleep: Add flags to indicate platform firmware involvement) Signed-off-by: Lukas Wunner Cc: 4.4+ # 4.4+ Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index b7342a24f559..b7dd5718836e 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -339,6 +339,7 @@ int hibernation_snapshot(int platform_mode) pm_message_t msg; int error; + pm_suspend_clear_flags(); error = platform_begin(platform_mode); if (error) goto Close; -- cgit From 69b27baf00fa9b7b14b3263c105390d1683425b2 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 25 Mar 2016 14:20:21 -0700 Subject: sched: add schedule_timeout_idle() This will be needed in the patch "mm, oom: introduce oom reaper". Acked-by: Michal Hocko Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + kernel/time/timer.c | 11 +++++++++++ 2 files changed, 12 insertions(+) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 589c4780b077..478b41de7f7d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -426,6 +426,7 @@ extern signed long schedule_timeout(signed long timeout); extern signed long schedule_timeout_interruptible(signed long timeout); extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); +extern signed long schedule_timeout_idle(signed long timeout); asmlinkage void schedule(void); extern void schedule_preempt_disabled(void); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index d1798fa0c743..73164c3aa56b 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1566,6 +1566,17 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout) } EXPORT_SYMBOL(schedule_timeout_uninterruptible); +/* + * Like schedule_timeout_uninterruptible(), except this task will not contribute + * to load average. + */ +signed long __sched schedule_timeout_idle(signed long timeout) +{ + __set_current_state(TASK_IDLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_idle); + #ifdef CONFIG_HOTPLUG_CPU static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head) { -- cgit From 36324a990cf578b57828c04cd85ac62cd25cf5a4 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 25 Mar 2016 14:20:27 -0700 Subject: oom: clear TIF_MEMDIE after oom_reaper managed to unmap the address space When oom_reaper manages to unmap all the eligible vmas there shouldn't be much of the freable memory held by the oom victim left anymore so it makes sense to clear the TIF_MEMDIE flag for the victim and allow the OOM killer to select another task. The lack of TIF_MEMDIE also means that the victim cannot access memory reserves anymore but that shouldn't be a problem because it would get the access again if it needs to allocate and hits the OOM killer again due to the fatal_signal_pending resp. PF_EXITING check. We can safely hide the task from the OOM killer because it is clearly not a good candidate anymore as everyhing reclaimable has been torn down already. This patch will allow to cap the time an OOM victim can keep TIF_MEMDIE and thus hold off further global OOM killer actions granted the oom reaper is able to take mmap_sem for the associated mm struct. This is not guaranteed now but further steps should make sure that mmap_sem for write should be blocked killable which will help to reduce such a lock contention. This is not done by this patch. Note that exit_oom_victim might be called on a remote task from __oom_reap_task now so we have to check and clear the flag atomically otherwise we might race and underflow oom_victims or wake up waiters too early. Signed-off-by: Michal Hocko Suggested-by: Johannes Weiner Suggested-by: Tetsuo Handa Cc: Andrea Argangeli Cc: David Rientjes Cc: Hugh Dickins Cc: Mel Gorman Cc: Oleg Nesterov Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 2 +- kernel/exit.c | 2 +- mm/oom_kill.c | 73 +++++++++++++++++++++++++++++++++++------------------ 3 files changed, 50 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/include/linux/oom.h b/include/linux/oom.h index 03e6257321f0..45993b840ed6 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -91,7 +91,7 @@ extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, extern bool out_of_memory(struct oom_control *oc); -extern void exit_oom_victim(void); +extern void exit_oom_victim(struct task_struct *tsk); extern int register_oom_notifier(struct notifier_block *nb); extern int unregister_oom_notifier(struct notifier_block *nb); diff --git a/kernel/exit.c b/kernel/exit.c index 953d1a1c0387..fd90195667e1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -435,7 +435,7 @@ static void exit_mm(struct task_struct *tsk) mm_update_next_owner(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) - exit_oom_victim(); + exit_oom_victim(tsk); } static struct task_struct *find_alive_thread(struct task_struct *p) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f7ed6ece0719..2830b1c6483e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -416,20 +416,36 @@ bool oom_killer_disabled __read_mostly; * victim (if that is possible) to help the OOM killer to move on. */ static struct task_struct *oom_reaper_th; -static struct mm_struct *mm_to_reap; +static struct task_struct *task_to_reap; static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); -static bool __oom_reap_vmas(struct mm_struct *mm) +static bool __oom_reap_task(struct task_struct *tsk) { struct mmu_gather tlb; struct vm_area_struct *vma; + struct mm_struct *mm; + struct task_struct *p; struct zap_details details = {.check_swap_entries = true, .ignore_dirty = true}; bool ret = true; - /* We might have raced with exit path */ - if (!atomic_inc_not_zero(&mm->mm_users)) + /* + * Make sure we find the associated mm_struct even when the particular + * thread has already terminated and cleared its mm. + * We might have race with exit path so consider our work done if there + * is no mm. + */ + p = find_lock_task_mm(tsk); + if (!p) + return true; + + mm = p->mm; + if (!atomic_inc_not_zero(&mm->mm_users)) { + task_unlock(p); return true; + } + + task_unlock(p); if (!down_read_trylock(&mm->mmap_sem)) { ret = false; @@ -464,60 +480,66 @@ static bool __oom_reap_vmas(struct mm_struct *mm) } tlb_finish_mmu(&tlb, 0, -1); up_read(&mm->mmap_sem); + + /* + * Clear TIF_MEMDIE because the task shouldn't be sitting on a + * reasonably reclaimable memory anymore. OOM killer can continue + * by selecting other victim if unmapping hasn't led to any + * improvements. This also means that selecting this task doesn't + * make any sense. + */ + tsk->signal->oom_score_adj = OOM_SCORE_ADJ_MIN; + exit_oom_victim(tsk); out: mmput(mm); return ret; } -static void oom_reap_vmas(struct mm_struct *mm) +static void oom_reap_task(struct task_struct *tsk) { int attempts = 0; /* Retry the down_read_trylock(mmap_sem) a few times */ - while (attempts++ < 10 && !__oom_reap_vmas(mm)) + while (attempts++ < 10 && !__oom_reap_task(tsk)) schedule_timeout_idle(HZ/10); /* Drop a reference taken by wake_oom_reaper */ - mmdrop(mm); + put_task_struct(tsk); } static int oom_reaper(void *unused) { while (true) { - struct mm_struct *mm; + struct task_struct *tsk; wait_event_freezable(oom_reaper_wait, - (mm = READ_ONCE(mm_to_reap))); - oom_reap_vmas(mm); - WRITE_ONCE(mm_to_reap, NULL); + (tsk = READ_ONCE(task_to_reap))); + oom_reap_task(tsk); + WRITE_ONCE(task_to_reap, NULL); } return 0; } -static void wake_oom_reaper(struct mm_struct *mm) +static void wake_oom_reaper(struct task_struct *tsk) { - struct mm_struct *old_mm; + struct task_struct *old_tsk; if (!oom_reaper_th) return; - /* - * Pin the given mm. Use mm_count instead of mm_users because - * we do not want to delay the address space tear down. - */ - atomic_inc(&mm->mm_count); + get_task_struct(tsk); /* * Make sure that only a single mm is ever queued for the reaper * because multiple are not necessary and the operation might be * disruptive so better reduce it to the bare minimum. */ - old_mm = cmpxchg(&mm_to_reap, NULL, mm); - if (!old_mm) + old_tsk = cmpxchg(&task_to_reap, NULL, tsk); + if (!old_tsk) wake_up(&oom_reaper_wait); else - mmdrop(mm); + put_task_struct(tsk); } static int __init oom_init(void) @@ -532,7 +554,7 @@ static int __init oom_init(void) } subsys_initcall(oom_init) #else -static void wake_oom_reaper(struct mm_struct *mm) +static void wake_oom_reaper(struct task_struct *tsk) { } #endif @@ -563,9 +585,10 @@ void mark_oom_victim(struct task_struct *tsk) /** * exit_oom_victim - note the exit of an OOM victim */ -void exit_oom_victim(void) +void exit_oom_victim(struct task_struct *tsk) { - clear_thread_flag(TIF_MEMDIE); + if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE)) + return; if (!atomic_dec_return(&oom_victims)) wake_up_all(&oom_victims_wait); @@ -748,7 +771,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, rcu_read_unlock(); if (can_oom_reap) - wake_oom_reaper(mm); + wake_oom_reaper(victim); mmdrop(mm); put_task_struct(victim); -- cgit From be7635e7287e0e8013af3c89a6354a9e0182594c Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Fri, 25 Mar 2016 14:22:05 -0700 Subject: arch, ftrace: for KASAN put hard/soft IRQ entries into separate sections KASAN needs to know whether the allocation happens in an IRQ handler. This lets us strip everything below the IRQ entry point to reduce the number of unique stack traces needed to be stored. Move the definition of __irq_entry to so that the users don't need to pull in . Also introduce the __softirq_entry macro which is similar to __irq_entry, but puts the corresponding functions to the .softirqentry.text section. Signed-off-by: Alexander Potapenko Acked-by: Steven Rostedt Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Konstantin Serebryany Cc: Dmitry Chernenkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/exception.h | 2 +- arch/arm/kernel/vmlinux.lds.S | 1 + arch/arm64/include/asm/exception.h | 2 +- arch/arm64/kernel/vmlinux.lds.S | 1 + arch/blackfin/kernel/vmlinux.lds.S | 1 + arch/c6x/kernel/vmlinux.lds.S | 1 + arch/metag/kernel/vmlinux.lds.S | 1 + arch/microblaze/kernel/vmlinux.lds.S | 1 + arch/mips/kernel/vmlinux.lds.S | 1 + arch/nios2/kernel/vmlinux.lds.S | 1 + arch/openrisc/kernel/vmlinux.lds.S | 1 + arch/parisc/kernel/vmlinux.lds.S | 1 + arch/powerpc/kernel/vmlinux.lds.S | 1 + arch/s390/kernel/vmlinux.lds.S | 1 + arch/sh/kernel/vmlinux.lds.S | 1 + arch/sparc/kernel/vmlinux.lds.S | 1 + arch/tile/kernel/vmlinux.lds.S | 1 + arch/x86/kernel/vmlinux.lds.S | 1 + include/asm-generic/vmlinux.lds.h | 12 +++++++++++- include/linux/ftrace.h | 11 ----------- include/linux/interrupt.h | 20 ++++++++++++++++++++ kernel/softirq.c | 2 +- kernel/trace/trace_functions_graph.c | 1 + 23 files changed, 51 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/arch/arm/include/asm/exception.h b/arch/arm/include/asm/exception.h index 5abaf5bbd985..bf1991263d2d 100644 --- a/arch/arm/include/asm/exception.h +++ b/arch/arm/include/asm/exception.h @@ -7,7 +7,7 @@ #ifndef __ASM_ARM_EXCEPTION_H #define __ASM_ARM_EXCEPTION_H -#include +#include #define __exception __attribute__((section(".exception.text"))) #ifdef CONFIG_FUNCTION_GRAPH_TRACER diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index 1fab979daeaf..e2c6da096cef 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -108,6 +108,7 @@ SECTIONS *(.exception.text) __exception_text_end = .; IRQENTRY_TEXT + SOFTIRQENTRY_TEXT TEXT_TEXT SCHED_TEXT LOCK_TEXT diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index 6cb7e1a6bc02..0c2eec490abf 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -18,7 +18,7 @@ #ifndef __ASM_EXCEPTION_H #define __ASM_EXCEPTION_H -#include +#include #define __exception __attribute__((section(".exception.text"))) #ifdef CONFIG_FUNCTION_GRAPH_TRACER diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 37f624df68fa..5a1939a74ff3 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -103,6 +103,7 @@ SECTIONS *(.exception.text) __exception_text_end = .; IRQENTRY_TEXT + SOFTIRQENTRY_TEXT TEXT_TEXT SCHED_TEXT LOCK_TEXT diff --git a/arch/blackfin/kernel/vmlinux.lds.S b/arch/blackfin/kernel/vmlinux.lds.S index c9eec84aa258..d920b959ff3a 100644 --- a/arch/blackfin/kernel/vmlinux.lds.S +++ b/arch/blackfin/kernel/vmlinux.lds.S @@ -35,6 +35,7 @@ SECTIONS #endif LOCK_TEXT IRQENTRY_TEXT + SOFTIRQENTRY_TEXT KPROBES_TEXT #ifdef CONFIG_ROMKERNEL __sinittext = .; diff --git a/arch/c6x/kernel/vmlinux.lds.S b/arch/c6x/kernel/vmlinux.lds.S index 5a6e141d1641..50bc10f97bcb 100644 --- a/arch/c6x/kernel/vmlinux.lds.S +++ b/arch/c6x/kernel/vmlinux.lds.S @@ -72,6 +72,7 @@ SECTIONS SCHED_TEXT LOCK_TEXT IRQENTRY_TEXT + SOFTIRQENTRY_TEXT KPROBES_TEXT *(.fixup) *(.gnu.warning) diff --git a/arch/metag/kernel/vmlinux.lds.S b/arch/metag/kernel/vmlinux.lds.S index e12055e88bfe..150ace92c7ad 100644 --- a/arch/metag/kernel/vmlinux.lds.S +++ b/arch/metag/kernel/vmlinux.lds.S @@ -24,6 +24,7 @@ SECTIONS LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT + SOFTIRQENTRY_TEXT *(.text.*) *(.gnu.warning) } diff --git a/arch/microblaze/kernel/vmlinux.lds.S b/arch/microblaze/kernel/vmlinux.lds.S index be9488d69734..0a47f0410554 100644 --- a/arch/microblaze/kernel/vmlinux.lds.S +++ b/arch/microblaze/kernel/vmlinux.lds.S @@ -36,6 +36,7 @@ SECTIONS { LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT + SOFTIRQENTRY_TEXT . = ALIGN (4) ; _etext = . ; } diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S index 0a93e83cd014..54d653ee17e1 100644 --- a/arch/mips/kernel/vmlinux.lds.S +++ b/arch/mips/kernel/vmlinux.lds.S @@ -58,6 +58,7 @@ SECTIONS LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT + SOFTIRQENTRY_TEXT *(.text.*) *(.fixup) *(.gnu.warning) diff --git a/arch/nios2/kernel/vmlinux.lds.S b/arch/nios2/kernel/vmlinux.lds.S index 326fab40a9de..e23e89539967 100644 --- a/arch/nios2/kernel/vmlinux.lds.S +++ b/arch/nios2/kernel/vmlinux.lds.S @@ -39,6 +39,7 @@ SECTIONS SCHED_TEXT LOCK_TEXT IRQENTRY_TEXT + SOFTIRQENTRY_TEXT KPROBES_TEXT } =0 _etext = .; diff --git a/arch/openrisc/kernel/vmlinux.lds.S b/arch/openrisc/kernel/vmlinux.lds.S index 2d69a853b742..d936de4c07ca 100644 --- a/arch/openrisc/kernel/vmlinux.lds.S +++ b/arch/openrisc/kernel/vmlinux.lds.S @@ -50,6 +50,7 @@ SECTIONS LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT + SOFTIRQENTRY_TEXT *(.fixup) *(.text.__*) _etext = .; diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S index 308f29081d46..f3ead0b6ce46 100644 --- a/arch/parisc/kernel/vmlinux.lds.S +++ b/arch/parisc/kernel/vmlinux.lds.S @@ -72,6 +72,7 @@ SECTIONS LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT + SOFTIRQENTRY_TEXT *(.text.do_softirq) *(.text.sys_exit) *(.text.do_sigaltstack) diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index d41fd0af8980..2dd91f79de05 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -55,6 +55,7 @@ SECTIONS LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT + SOFTIRQENTRY_TEXT #ifdef CONFIG_PPC32 *(.got1) diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 445657fe658c..0f41a8286378 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -28,6 +28,7 @@ SECTIONS LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT + SOFTIRQENTRY_TEXT *(.fixup) *(.gnu.warning) } :text = 0x0700 diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S index db88cbf9eafd..235a4101999f 100644 --- a/arch/sh/kernel/vmlinux.lds.S +++ b/arch/sh/kernel/vmlinux.lds.S @@ -39,6 +39,7 @@ SECTIONS LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT + SOFTIRQENTRY_TEXT *(.fixup) *(.gnu.warning) _etext = .; /* End of text section */ diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S index f1a2f688b28a..aadd321aa05d 100644 --- a/arch/sparc/kernel/vmlinux.lds.S +++ b/arch/sparc/kernel/vmlinux.lds.S @@ -48,6 +48,7 @@ SECTIONS LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT + SOFTIRQENTRY_TEXT *(.gnu.warning) } = 0 _etext = .; diff --git a/arch/tile/kernel/vmlinux.lds.S b/arch/tile/kernel/vmlinux.lds.S index 0e059a0101ea..378f5d8d1ec8 100644 --- a/arch/tile/kernel/vmlinux.lds.S +++ b/arch/tile/kernel/vmlinux.lds.S @@ -45,6 +45,7 @@ SECTIONS LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT + SOFTIRQENTRY_TEXT __fix_text_end = .; /* tile-cpack won't rearrange before this */ ALIGN_FUNCTION(); *(.hottext*) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index d239639e0c1d..4c941f88d405 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -101,6 +101,7 @@ SECTIONS KPROBES_TEXT ENTRY_TEXT IRQENTRY_TEXT + SOFTIRQENTRY_TEXT *(.fixup) *(.gnu.warning) /* End of text section */ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 8f5a12ab2f2b..339125bb4d2c 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -456,7 +456,7 @@ *(.entry.text) \ VMLINUX_SYMBOL(__entry_text_end) = .; -#ifdef CONFIG_FUNCTION_GRAPH_TRACER +#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) #define IRQENTRY_TEXT \ ALIGN_FUNCTION(); \ VMLINUX_SYMBOL(__irqentry_text_start) = .; \ @@ -466,6 +466,16 @@ #define IRQENTRY_TEXT #endif +#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) +#define SOFTIRQENTRY_TEXT \ + ALIGN_FUNCTION(); \ + VMLINUX_SYMBOL(__softirqentry_text_start) = .; \ + *(.softirqentry.text) \ + VMLINUX_SYMBOL(__softirqentry_text_end) = .; +#else +#define SOFTIRQENTRY_TEXT +#endif + /* Section used for early init (in .S files) */ #define HEAD_TEXT *(.head.text) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 6d9df3f7e334..dea12a6e413b 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -811,16 +811,6 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, */ #define __notrace_funcgraph notrace -/* - * We want to which function is an entrypoint of a hardirq. - * That will help us to put a signal on output. - */ -#define __irq_entry __attribute__((__section__(".irqentry.text"))) - -/* Limits of hardirq entrypoints */ -extern char __irqentry_text_start[]; -extern char __irqentry_text_end[]; - #define FTRACE_NOTRACE_DEPTH 65536 #define FTRACE_RETFUNC_DEPTH 50 #define FTRACE_RETSTACK_ALLOC_SIZE 32 @@ -857,7 +847,6 @@ static inline void unpause_graph_tracing(void) #else /* !CONFIG_FUNCTION_GRAPH_TRACER */ #define __notrace_funcgraph -#define __irq_entry #define INIT_FTRACE_GRAPH static inline void ftrace_graph_init_task(struct task_struct *t) { } diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 358076eda364..9fcabeb07787 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -683,4 +683,24 @@ extern int early_irq_init(void); extern int arch_probe_nr_irqs(void); extern int arch_early_irq_init(void); +#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) +/* + * We want to know which function is an entrypoint of a hardirq or a softirq. + */ +#define __irq_entry __attribute__((__section__(".irqentry.text"))) +#define __softirq_entry \ + __attribute__((__section__(".softirqentry.text"))) + +/* Limits of hardirq entrypoints */ +extern char __irqentry_text_start[]; +extern char __irqentry_text_end[]; +/* Limits of softirq entrypoints */ +extern char __softirqentry_text_start[]; +extern char __softirqentry_text_end[]; + +#else +#define __irq_entry +#define __softirq_entry +#endif + #endif diff --git a/kernel/softirq.c b/kernel/softirq.c index 8aae49dd7da8..17caf4b63342 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -227,7 +227,7 @@ static inline bool lockdep_softirq_start(void) { return false; } static inline void lockdep_softirq_end(bool in_hardirq) { } #endif -asmlinkage __visible void __do_softirq(void) +asmlinkage __visible void __softirq_entry __do_softirq(void) { unsigned long end = jiffies + MAX_SOFTIRQ_TIME; unsigned long old_flags = current->flags; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 91d6a63a2ea7..3a0244ff7ea8 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -8,6 +8,7 @@ */ #include #include +#include #include #include -- cgit