From 8a6b173287bb94b3ef8360119020e856afb1c934 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 30 Mar 2014 18:56:22 +0200 Subject: uprobes: Kill UPROBE_SKIP_SSTEP and can_skip_sstep() UPROBE_COPY_INSN, UPROBE_SKIP_SSTEP, and uprobe->flags must die. This patch kills UPROBE_SKIP_SSTEP. I never understood why it was added; not only it doesn't help, it harms. It can only help to avoid arch_uprobe_skip_sstep() if it was already called before and failed. But this is ugly, if we want to know whether we can emulate this instruction or not we should do this analysis in arch_uprobe_analyze_insn(), not when we hit this probe for the first time. And in fact this logic is simply wrong. arch_uprobe_skip_sstep() can fail or not depending on the task/register state, if this insn can be emulated but, say, put_user() fails we need to xol it this time, but this doesn't mean we shouldn't try to emulate it when this or another thread hits this bp next time. And this is the actual reason for this change. We need to emulate the "call" insn, but push(return-address) can obviously fail. Per-arch notes: x86: __skip_sstep() can only emulate "rep;nop". With this change it will be called every time and most probably for no reason. This will be fixed by the next changes. We need to change this suboptimal code anyway. arm: Should not be affected. It has its own "bool simulate" flag checked in arch_uprobe_skip_sstep(). ppc: Looks like, it can emulate almost everything. Does it actually need to record the fact that emulate_step() failed? Hopefully not. But if yes, it can add the ppc- specific flag into arch_uprobe. TODO: rename arch_uprobe_skip_sstep() to arch_uprobe_emulate_insn(), Signed-off-by: Oleg Nesterov Reviewed-by: Masami Hiramatsu Reviewed-by: David A. Long Reviewed-by: Jim Keniston Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 04709b66369d..ea2a7c0728bb 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -60,8 +60,6 @@ static struct percpu_rw_semaphore dup_mmap_sem; /* Have a copy of original instruction */ #define UPROBE_COPY_INSN 0 -/* Can skip singlestep */ -#define UPROBE_SKIP_SSTEP 1 struct uprobe { struct rb_node rb_node; /* node in the rb tree */ @@ -491,12 +489,9 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) uprobe->offset = offset; init_rwsem(&uprobe->register_rwsem); init_rwsem(&uprobe->consumer_rwsem); - /* For now assume that the instruction need not be single-stepped */ - __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); /* add to uprobes_tree, sorted on inode:offset */ cur_uprobe = insert_uprobe(uprobe); - /* a uprobe exists for this inode:offset combination */ if (cur_uprobe) { kfree(uprobe); @@ -1628,20 +1623,6 @@ bool uprobe_deny_signal(void) return true; } -/* - * Avoid singlestepping the original instruction if the original instruction - * is a NOP or can be emulated. 
- */ -static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) -{ - if (test_bit(UPROBE_SKIP_SSTEP, &uprobe->flags)) { - if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) - return true; - clear_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); - } - return false; -} - static void mmf_recalc_uprobes(struct mm_struct *mm) { struct vm_area_struct *vma; @@ -1868,13 +1849,13 @@ static void handle_swbp(struct pt_regs *regs) handler_chain(uprobe, regs); - if (can_skip_sstep(uprobe, regs)) + if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) goto out; if (!pre_ssout(uprobe, regs, bp_vaddr)) return; - /* can_skip_sstep() succeeded, or restart if can't singlestep */ + /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */ out: put_uprobe(uprobe); } -- cgit From 014940bad8e46ca7bd0483f760f9cba60088a3d4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 3 Apr 2014 20:20:10 +0200 Subject: uprobes/x86: Send SIGILL if arch_uprobe_post_xol() fails Currently the error from arch_uprobe_post_xol() is silently ignored. This doesn't look good, and it can lead to hard-to-debug problems. 1. Change handle_singlestep() to loudly complain and send SIGILL. Note: this only affects x86; ppc/arm can't fail. 2. Change arch_uprobe_post_xol() to call arch_uprobe_abort_xol() and avoid TF games if it is going to return an error. This can help to analyze the problem; if nothing else, we should not report ->ip = xol_slot in the core-file. Note: this means that handle_riprel_post_xol() can be called twice, but this is fine because it is idempotent. Signed-off-by: Oleg Nesterov Reviewed-by: Masami Hiramatsu Reviewed-by: Jim Keniston --- arch/x86/kernel/uprobes.c | 16 ++++++++++++---- kernel/events/uprobes.c | 8 +++++++- 2 files changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 08cdb82815fe..e72903eacd43 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -594,6 +594,15 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) struct uprobe_task *utask = current->utask; WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR); + + if (auprobe->ops->post_xol) { + int err = auprobe->ops->post_xol(auprobe, regs); + if (err) { + arch_uprobe_abort_xol(auprobe, regs); + return err; + } + } + current->thread.trap_nr = utask->autask.saved_trap_nr; /* * arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP @@ -605,8 +614,6 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) else if (!(auprobe->fixups & UPROBE_FIX_SETF)) regs->flags &= ~X86_EFLAGS_TF; - if (auprobe->ops->post_xol) - return auprobe->ops->post_xol(auprobe, regs); return 0; } @@ -641,8 +648,9 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, /* * This function gets called when XOL instruction either gets trapped or - * the thread has a fatal signal, so reset the instruction pointer to its - * probed address. + * the thread has a fatal signal, or if arch_uprobe_post_xol() failed. + * Reset the instruction pointer to its probed address for the potential + * restart or for post mortem analysis.
*/ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ea2a7c0728bb..d1edc5e6fd03 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1867,10 +1867,11 @@ out: static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) { struct uprobe *uprobe; + int err = 0; uprobe = utask->active_uprobe; if (utask->state == UTASK_SSTEP_ACK) - arch_uprobe_post_xol(&uprobe->arch, regs); + err = arch_uprobe_post_xol(&uprobe->arch, regs); else if (utask->state == UTASK_SSTEP_TRAPPED) arch_uprobe_abort_xol(&uprobe->arch, regs); else @@ -1884,6 +1885,11 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) spin_lock_irq(¤t->sighand->siglock); recalc_sigpending(); /* see uprobe_deny_signal() */ spin_unlock_irq(¤t->sighand->siglock); + + if (unlikely(err)) { + uprobe_warn(current, "execute the probed insn, sending SIGILL."); + force_sig_info(SIGILL, SEND_SIG_FORCED, current); + } } /* -- cgit From febdbfe8a91ce0d11939d4940b592eb0dba8d663 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 Feb 2014 18:16:07 +0100 Subject: arch: Prepare for smp_mb__{before,after}_atomic() Since the smp_mb__{before,after}*() ops are fundamentally dependent on how an arch can implement atomics it doesn't make sense to have 3 variants of them. They must all be the same. Furthermore, the 3 variants suggest they're only valid for those 3 atomic ops, while we have many more where they could be applied. So move away from smp_mb__{before,after}_{atomic,clear}_{dec,inc,bit}() and reduce the interface to just the two: smp_mb__{before,after}_atomic(). This patch prepares the way by introducing default implementations in asm-generic/barrier.h that default to a full barrier and providing __deprecated inlines for the previous 6 barriers if they're not provided by the arch. This should allow for a mostly painless transition (lots of deprecated warns in the interim). Signed-off-by: Peter Zijlstra Acked-by: Paul E. 
McKenney Link: http://lkml.kernel.org/n/tip-wr59327qdyi9mbzn6x937s4e@git.kernel.org Cc: Arnd Bergmann Cc: "Chen, Gong" Cc: John Sullivan Cc: Linus Torvalds Cc: Mauro Carvalho Chehab Cc: Srinivas Pandruvada Cc: "Theodore Ts'o" Cc: linux-arch@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/asm-generic/atomic.h | 7 +------ include/asm-generic/barrier.h | 8 ++++++++ include/asm-generic/bitops.h | 9 +-------- include/linux/atomic.h | 36 ++++++++++++++++++++++++++++++++++++ include/linux/bitops.h | 20 ++++++++++++++++++++ kernel/sched/core.c | 16 ++++++++++++++++ 6 files changed, 82 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h index 33bd2de3bc1e..9c79e7603459 100644 --- a/include/asm-generic/atomic.h +++ b/include/asm-generic/atomic.h @@ -16,6 +16,7 @@ #define __ASM_GENERIC_ATOMIC_H #include +#include #ifdef CONFIG_SMP /* Force people to define core atomics */ @@ -182,11 +183,5 @@ static inline void atomic_set_mask(unsigned int mask, atomic_t *v) } #endif -/* Assume that atomic operations are already serializing */ -#define smp_mb__before_atomic_dec() barrier() -#define smp_mb__after_atomic_dec() barrier() -#define smp_mb__before_atomic_inc() barrier() -#define smp_mb__after_atomic_inc() barrier() - #endif /* __KERNEL__ */ #endif /* __ASM_GENERIC_ATOMIC_H */ diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h index 6f692f8ac664..1402fa855388 100644 --- a/include/asm-generic/barrier.h +++ b/include/asm-generic/barrier.h @@ -62,6 +62,14 @@ #define set_mb(var, value) do { (var) = (value); mb(); } while (0) #endif +#ifndef smp_mb__before_atomic +#define smp_mb__before_atomic() smp_mb() +#endif + +#ifndef smp_mb__after_atomic +#define smp_mb__after_atomic() smp_mb() +#endif + #define smp_store_release(p, v) \ do { \ compiletime_assert_atomic_type(*p); \ diff --git a/include/asm-generic/bitops.h b/include/asm-generic/bitops.h index 280ca7a96f75..dcdcacf2fd2b 100644 --- a/include/asm-generic/bitops.h +++ b/include/asm-generic/bitops.h @@ -11,14 +11,7 @@ #include #include - -/* - * clear_bit may not imply a memory barrier - */ -#ifndef smp_mb__before_clear_bit -#define smp_mb__before_clear_bit() smp_mb() -#define smp_mb__after_clear_bit() smp_mb() -#endif +#include #include #include diff --git a/include/linux/atomic.h b/include/linux/atomic.h index 5b08a8540ecf..fef3a809e7cf 100644 --- a/include/linux/atomic.h +++ b/include/linux/atomic.h @@ -3,6 +3,42 @@ #define _LINUX_ATOMIC_H #include +/* + * Provide __deprecated wrappers for the new interface, avoid flag day changes. + * We need the ugly external functions to break header recursion hell. 
+ */ +#ifndef smp_mb__before_atomic_inc +static inline void __deprecated smp_mb__before_atomic_inc(void) +{ + extern void __smp_mb__before_atomic(void); + __smp_mb__before_atomic(); +} +#endif + +#ifndef smp_mb__after_atomic_inc +static inline void __deprecated smp_mb__after_atomic_inc(void) +{ + extern void __smp_mb__after_atomic(void); + __smp_mb__after_atomic(); +} +#endif + +#ifndef smp_mb__before_atomic_dec +static inline void __deprecated smp_mb__before_atomic_dec(void) +{ + extern void __smp_mb__before_atomic(void); + __smp_mb__before_atomic(); +} +#endif + +#ifndef smp_mb__after_atomic_dec +static inline void __deprecated smp_mb__after_atomic_dec(void) +{ + extern void __smp_mb__after_atomic(void); + __smp_mb__after_atomic(); +} +#endif + /** * atomic_add_unless - add unless the number is already a given value * @v: pointer of type atomic_t diff --git a/include/linux/bitops.h b/include/linux/bitops.h index be5fd38bd5a0..cbc5833fb221 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -32,6 +32,26 @@ extern unsigned long __sw_hweight64(__u64 w); */ #include +/* + * Provide __deprecated wrappers for the new interface, avoid flag day changes. + * We need the ugly external functions to break header recursion hell. + */ +#ifndef smp_mb__before_clear_bit +static inline void __deprecated smp_mb__before_clear_bit(void) +{ + extern void __smp_mb__before_atomic(void); + __smp_mb__before_atomic(); +} +#endif + +#ifndef smp_mb__after_clear_bit +static inline void __deprecated smp_mb__after_clear_bit(void) +{ + extern void __smp_mb__after_atomic(void); + __smp_mb__after_atomic(); +} +#endif + #define for_each_set_bit(bit, addr, size) \ for ((bit) = find_first_bit((addr), (size)); \ (bit) < (size); \ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 268a45ea238c..8a70ec091760 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -90,6 +90,22 @@ #define CREATE_TRACE_POINTS #include +#ifdef smp_mb__before_atomic +void __smp_mb__before_atomic(void) +{ + smp_mb__before_atomic(); +} +EXPORT_SYMBOL(__smp_mb__before_atomic); +#endif + +#ifdef smp_mb__after_atomic +void __smp_mb__after_atomic(void) +{ + smp_mb__after_atomic(); +} +EXPORT_SYMBOL(__smp_mb__after_atomic); +#endif + void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) { unsigned long delta; -- cgit From cadefd3d6cc914d95163ba1eda766bfe7ce1e5b7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 27 Feb 2014 10:40:35 +0100 Subject: sched: Make scale_rt_power() deal with backward clocks Mike reported that, while unlikely, its entirely possible for scale_rt_power() to see the time go backwards. This yields rather 'interesting' results. So like all other sites that deal with clocks; make this one ignore backward clock movement too. 
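The failure mode is plain unsigned wraparound: rq_clock() and rq->age_stamp are both u64, so when the clock reads lower than age_stamp, the old expression sched_avg_period() + (rq_clock(rq) - age_stamp) picks up an enormous positive value instead of a small negative one. A minimal standalone sketch of the arithmetic (plain userspace C for illustration only, not kernel code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t clock = 1000, age_stamp = 1500;	/* the clock "went backwards" */
	int64_t delta;

	/* old behaviour: the u64 difference wraps to roughly 2^64 - 500 */
	printf("unclamped: %llu\n", (unsigned long long)(clock - age_stamp));

	/* the fix: treat the difference as signed and clamp it at zero */
	delta = (int64_t)(clock - age_stamp);
	if (delta < 0)
		delta = 0;
	printf("clamped: %lld\n", (long long)delta);
	return 0;
}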
Reported-by: Mike Galbraith Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140227094035.GZ9987@twins.programming.kicks-ass.net Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7570dd969c28..5e157f157d85 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5564,6 +5564,7 @@ static unsigned long scale_rt_power(int cpu) { struct rq *rq = cpu_rq(cpu); u64 total, available, age_stamp, avg; + s64 delta; /* * Since we're reading these variables without serialization make sure @@ -5572,7 +5573,11 @@ static unsigned long scale_rt_power(int cpu) age_stamp = ACCESS_ONCE(rq->age_stamp); avg = ACCESS_ONCE(rq->rt_avg); - total = sched_avg_period() + (rq_clock(rq) - age_stamp); + delta = rq_clock(rq) - age_stamp; + if (unlikely(delta < 0)) + delta = 0; + + total = sched_avg_period() + delta; if (unlikely(total < avg)) { /* Ensures that power won't end up being negative */ -- cgit From 10447917551e0fffb8d1892d46e633c3e0a9c1ec Mon Sep 17 00:00:00 2001 From: Kirill V Tkhai Date: Wed, 12 Mar 2014 06:18:33 -0400 Subject: sched/rt: Do not try to push tasks if pinned task switches to RT Just switched pinned task is not able to be pushed. If the rq had had several RT tasks before they have already been considered as candidates to be pushed (or pulled). Signed-off-by: Kirill V Tkhai Acked-by: Steven Rostedt Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140312061833.3a43aa64@gandalf.local.home Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index bd2267ad404f..1e4992eb5166 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1892,9 +1892,9 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) */ if (p->on_rq && rq->curr != p) { #ifdef CONFIG_SMP - if (rq->rt.overloaded && push_rt_task(rq) && + if (p->nr_cpus_allowed > 1 && rq->rt.overloaded && /* Don't resched if we changed runqueues */ - rq != task_rq(p)) + push_rt_task(rq) && rq != task_rq(p)) check_resched = 0; #endif /* CONFIG_SMP */ if (check_resched && p->prio < rq->curr->prio) -- cgit From 8698a745d800c59cd5a576398bdeccd578ac66f1 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Tue, 11 Mar 2014 18:09:12 +0800 Subject: sched, treewide: Replace hardcoded nice values with MIN_NICE/MAX_NICE Replace various -20/+19 hardcoded nice values with MIN_NICE/MAX_NICE. Signed-off-by: Dongsheng Yang Acked-by: Tejun Heo Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/ff13819fd09b7a5dba5ab5ae797f2e7019bdfa17.1394532288.git.yangds.fnst@cn.fujitsu.com Cc: devel@driverdev.osuosl.org Cc: devicetree@vger.kernel.org Cc: fcoe-devel@open-fcoe.org Cc: linux390@de.ibm.com Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-s390@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: nbd-general@lists.sourceforge.net Cc: ocfs2-devel@oss.oracle.com Cc: openipmi-developer@lists.sourceforge.net Cc: qla2xxx-upstream@qlogic.com Cc: linux-arch@vger.kernel.org [ Consolidated the patches, twiddled the changelog. 
] Signed-off-by: Ingo Molnar --- drivers/block/loop.c | 2 +- drivers/block/nbd.c | 2 +- drivers/block/pktcdvd.c | 2 +- drivers/char/ipmi/ipmi_si_intf.c | 2 +- drivers/s390/crypto/ap_bus.c | 2 +- drivers/scsi/bnx2fc/bnx2fc_fcoe.c | 4 ++-- drivers/scsi/bnx2i/bnx2i_hwi.c | 2 +- drivers/scsi/fcoe/fcoe.c | 2 +- drivers/scsi/ibmvscsi/ibmvfc.c | 2 +- drivers/scsi/ibmvscsi/ibmvscsi.c | 2 +- drivers/scsi/lpfc/lpfc_hbadisc.c | 2 +- drivers/scsi/qla2xxx/qla_os.c | 2 +- drivers/staging/android/binder.c | 2 +- drivers/staging/lustre/lustre/llite/lloop.c | 2 +- fs/ocfs2/cluster/heartbeat.c | 2 +- kernel/locking/locktorture.c | 2 +- kernel/workqueue.c | 6 +++--- mm/huge_memory.c | 2 +- 18 files changed, 21 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 66e8c3b94ef3..c8bf270b7890 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -548,7 +548,7 @@ static int loop_thread(void *data) struct loop_device *lo = data; struct bio *bio; - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); while (!kthread_should_stop() || !bio_list_empty(&lo->lo_bio_list)) { diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 55298db36b2d..2a1f26bd6640 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -533,7 +533,7 @@ static int nbd_thread(void *data) struct nbd_device *nbd = data; struct request *req; - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) { /* wait for something to do */ wait_event_interruptible(nbd->waiting_wq, diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index a2af73db187b..ef166ad2dbad 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -1463,7 +1463,7 @@ static int kcdrwd(void *foobar) struct packet_data *pkt; long min_sleep_time, residue; - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); set_freezable(); for (;;) { diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index b7efd3c1a882..0f20d3646a46 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -998,7 +998,7 @@ static int ipmi_thread(void *data) struct timespec busy_until; ipmi_si_set_not_busy(&busy_until); - set_user_nice(current, 19); + set_user_nice(current, MAX_NICE); while (!kthread_should_stop()) { int busy_wait; diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index ab3baa7f9508..8eec1653c9cc 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -1803,7 +1803,7 @@ static int ap_poll_thread(void *data) int requests; struct ap_device *ap_dev; - set_user_nice(current, 19); + set_user_nice(current, MAX_NICE); while (1) { if (ap_suspend_flag) return 0; diff --git a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c index 6287f6a8b79d..3455cc5e4bfd 100644 --- a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c +++ b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c @@ -464,7 +464,7 @@ static int bnx2fc_l2_rcv_thread(void *arg) struct fcoe_percpu_s *bg = arg; struct sk_buff *skb; - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { schedule(); @@ -602,7 +602,7 @@ int bnx2fc_percpu_io_thread(void *arg) struct bnx2fc_work *work, *tmp; LIST_HEAD(work_list); - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { schedule(); diff --git 
a/drivers/scsi/bnx2i/bnx2i_hwi.c b/drivers/scsi/bnx2i/bnx2i_hwi.c index b5ffd280a1ae..d6d491c2f004 100644 --- a/drivers/scsi/bnx2i/bnx2i_hwi.c +++ b/drivers/scsi/bnx2i/bnx2i_hwi.c @@ -1870,7 +1870,7 @@ int bnx2i_percpu_io_thread(void *arg) struct bnx2i_work *work, *tmp; LIST_HEAD(work_list); - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); while (!kthread_should_stop()) { spin_lock_bh(&p->p_work_lock); diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c index f3170008ae71..843a679d2a5e 100644 --- a/drivers/scsi/fcoe/fcoe.c +++ b/drivers/scsi/fcoe/fcoe.c @@ -1872,7 +1872,7 @@ static int fcoe_percpu_receive_thread(void *arg) skb_queue_head_init(&tmp); - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); retry: while (!kthread_should_stop()) { diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c index 23f5ba5e6472..8dd47689d584 100644 --- a/drivers/scsi/ibmvscsi/ibmvfc.c +++ b/drivers/scsi/ibmvscsi/ibmvfc.c @@ -4515,7 +4515,7 @@ static int ibmvfc_work(void *data) struct ibmvfc_host *vhost = data; int rc; - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); while (1) { rc = wait_event_interruptible(vhost->work_wait_q, diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c index fa764406df68..2ebfb2bb0f42 100644 --- a/drivers/scsi/ibmvscsi/ibmvscsi.c +++ b/drivers/scsi/ibmvscsi/ibmvscsi.c @@ -2213,7 +2213,7 @@ static int ibmvscsi_work(void *data) struct ibmvscsi_host_data *hostdata = data; int rc; - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); while (1) { rc = wait_event_interruptible(hostdata->work_wait_q, diff --git a/drivers/scsi/lpfc/lpfc_hbadisc.c b/drivers/scsi/lpfc/lpfc_hbadisc.c index 59b51c529ba0..294c072e9083 100644 --- a/drivers/scsi/lpfc/lpfc_hbadisc.c +++ b/drivers/scsi/lpfc/lpfc_hbadisc.c @@ -731,7 +731,7 @@ lpfc_do_work(void *p) struct lpfc_hba *phba = p; int rc; - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); current->flags |= PF_NOFREEZE; phba->data_flags = 0; diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 19e99cc33724..afc84814e9bb 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -4828,7 +4828,7 @@ qla2x00_do_dpc(void *data) ha = (struct qla_hw_data *)data; base_vha = pci_get_drvdata(ha->pdev); - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { diff --git a/drivers/staging/android/binder.c b/drivers/staging/android/binder.c index cfe4bc8f05cb..179b21b66504 100644 --- a/drivers/staging/android/binder.c +++ b/drivers/staging/android/binder.c @@ -441,7 +441,7 @@ static void binder_set_nice(long nice) "%d: nice value %ld not allowed use %ld instead\n", current->pid, nice, min_nice); set_user_nice(current, min_nice); - if (min_nice < 20) + if (min_nice <= MAX_NICE) return; binder_user_error("%d RLIMIT_NICE not set\n", current->pid); } diff --git a/drivers/staging/lustre/lustre/llite/lloop.c b/drivers/staging/lustre/lustre/llite/lloop.c index f78eda235c7a..d4c2cd91add4 100644 --- a/drivers/staging/lustre/lustre/llite/lloop.c +++ b/drivers/staging/lustre/lustre/llite/lloop.c @@ -407,7 +407,7 @@ static int loop_thread(void *data) int refcheck; int ret = 0; - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); lo->lo_state = LLOOP_BOUND; diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index bf482dfed14f..73039295d0d1 100644 --- 
a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1107,7 +1107,7 @@ static int o2hb_thread(void *data) mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n"); - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); /* Pin node */ o2nm_depend_this_node(); diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index f26b1a18e34e..23343be46e91 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -216,7 +216,7 @@ static int lock_torture_writer(void *arg) static DEFINE_TORTURE_RANDOM(rand); VERBOSE_TOROUT_STRING("lock_torture_writer task started"); - set_user_nice(current, 19); + set_user_nice(current, MAX_NICE); do { schedule_timeout_uninterruptible(1); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0ee63af30bd1..c30c01b32ece 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -100,10 +100,10 @@ enum { /* * Rescue workers are used only on emergencies and shared by - * all cpus. Give -20. + * all cpus. Give MIN_NICE. */ - RESCUER_NICE_LEVEL = -20, - HIGHPRI_NICE_LEVEL = -20, + RESCUER_NICE_LEVEL = MIN_NICE, + HIGHPRI_NICE_LEVEL = MIN_NICE, WQ_NAME_LEN = 24, }; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1546655a2d78..dcdb6f9adea1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2803,7 +2803,7 @@ static int khugepaged(void *none) struct mm_slot *mm_slot; set_freezable(); - set_user_nice(current, 19); + set_user_nice(current, MAX_NICE); while (!kthread_should_stop()) { khugepaged_do_scan(); -- cgit From 22abdef37cebcdd4933c72339401a174b7d87768 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Sat, 15 Mar 2014 02:14:49 +0400 Subject: sched/rt: Sum number of all children tasks in hierarchy at ->rt_nr_running {inc,dec}_rt_tasks() used to count entities which are directly queued on the rt_rq. If an entity was not a task (i.e., it is some queue), its children were not counted. There is no problem here, but now we want to count the number of all tasks which are actually queued under the rt_rq in all the hierarchy (except throttled rt queues). Empty queues are not able to be queued, and all of the places which use ->rt_nr_running just compare it with zero, so we do not break anything here. Signed-off-by: Kirill Tkhai Reviewed-by: Preeti U Murthy Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1394835289.18748.31.camel@HP-250-G1-Notebook-PC Cc: linux-kernel@vger.kernel.org [ Twiddled the changelog. ] Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 1e4992eb5166..6892ed7d644b 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1044,13 +1044,24 @@ void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {} #endif /* CONFIG_RT_GROUP_SCHED */ +static inline +unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se) +{ + struct rt_rq *group_rq = group_rt_rq(rt_se); + + if (group_rq) + return group_rq->rt_nr_running; + else + return 1; +} + static inline void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { int prio = rt_se_prio(rt_se); WARN_ON(!rt_prio(prio)); - rt_rq->rt_nr_running++; + rt_rq->rt_nr_running += rt_se_nr_running(rt_se); inc_rt_prio(rt_rq, prio); inc_rt_migration(rt_se, rt_rq); @@ -1062,7 +1073,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { WARN_ON(!rt_prio(rt_se_prio(rt_se))); WARN_ON(!rt_rq->rt_nr_running); - rt_rq->rt_nr_running--; + rt_rq->rt_nr_running -= rt_se_nr_running(rt_se); dec_rt_prio(rt_rq, rt_se_prio(rt_se)); dec_rt_migration(rt_se, rt_rq); -- cgit From 653d07a6989a9a4166dcd1025aa252b3605737fd Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Sat, 15 Mar 2014 02:14:55 +0400 Subject: sched/rt: Add accessors rq_of_rt_se() Two accessors for the RT_GROUP_SCHED and !RT_GROUP_SCHED cases. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1394835295.18748.32.camel@HP-250-G1-Notebook-PC Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 6892ed7d644b..f6aa3cdbee84 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -112,6 +112,13 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) return rt_se->rt_rq; } +static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se) +{ + struct rt_rq *rt_rq = rt_se->rt_rq; + + return rt_rq->rq; +} + void free_rt_sched_group(struct task_group *tg) { int i; @@ -211,10 +218,16 @@ static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) return container_of(rt_rq, struct rq, rt); } -static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) +static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se) { struct task_struct *p = rt_task_of(rt_se); - struct rq *rq = task_rq(p); + + return task_rq(p); +} + +static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) +{ + struct rq *rq = rq_of_rt_se(rt_se); return &rq->rt; } -- cgit From f4ebcbc0d7e009783256c9daf76bc4b90e645c14 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Sat, 15 Mar 2014 02:15:00 +0400 Subject: sched/rt: Substract number of tasks of throttled queues from rq->nr_running rq->rt can now be in a dequeued or enqueued state. We add a new member, rt_rq->rt_queued, which is used to indicate this. The member is used only for the top queue rq->rt_rq. The goal is to fit the generic scheme used in the deadline and fair classes, i.e. a throttled rt_rq's rt_nr_running is being subtracted from rq->nr_running.
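In other words, the patch maintains the invariant that the top-level rt_rq contributes its rt_nr_running to rq->nr_running only while it is actually queued, i.e. neither throttled nor empty. A toy sketch of that bookkeeping (illustrative C only, not the kernel code; the real enqueue_top_rt_rq()/dequeue_top_rt_rq() appear in the diff below):

struct toy_rt_rq { int rt_nr_running, rt_throttled, rt_queued; };
struct toy_rq { int nr_running; struct toy_rt_rq rt; };

static void toy_enqueue_top(struct toy_rq *rq)
{
	if (rq->rt.rt_queued || rq->rt.rt_throttled || !rq->rt.rt_nr_running)
		return;			/* already accounted, or must stay hidden */
	rq->nr_running += rq->rt.rt_nr_running;
	rq->rt.rt_queued = 1;
}

static void toy_dequeue_top(struct toy_rq *rq)
{
	if (!rq->rt.rt_queued)
		return;			/* nothing was accounted */
	rq->nr_running -= rq->rt.rt_nr_running;
	rq->rt.rt_queued = 0;
}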
Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1394835300.18748.33.camel@HP-250-G1-Notebook-PC Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 73 +++++++++++++++++++++++++++++++++++++++++++--------- kernel/sched/sched.h | 2 ++ 2 files changed, 63 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f6aa3cdbee84..2add019ddbd0 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -79,6 +79,8 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) rt_rq->overloaded = 0; plist_head_init(&rt_rq->pushable_tasks); #endif + /* We start is dequeued state, because no RT tasks are queued */ + rt_rq->rt_queued = 0; rt_rq->rt_time = 0; rt_rq->rt_throttled = 0; @@ -404,6 +406,9 @@ static inline void set_post_schedule(struct rq *rq) } #endif /* CONFIG_SMP */ +static void enqueue_top_rt_rq(struct rt_rq *rt_rq); +static void dequeue_top_rt_rq(struct rt_rq *rt_rq); + static inline int on_rt_rq(struct sched_rt_entity *rt_se) { return !list_empty(&rt_se->run_list); @@ -465,8 +470,11 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) rt_se = rt_rq->tg->rt_se[cpu]; if (rt_rq->rt_nr_running) { - if (rt_se && !on_rt_rq(rt_se)) + if (!rt_se) + enqueue_top_rt_rq(rt_rq); + else if (!on_rt_rq(rt_se)) enqueue_rt_entity(rt_se, false); + if (rt_rq->highest_prio.curr < curr->prio) resched_task(curr); } @@ -479,7 +487,9 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) rt_se = rt_rq->tg->rt_se[cpu]; - if (rt_se && on_rt_rq(rt_se)) + if (!rt_se) + dequeue_top_rt_rq(rt_rq); + else if (on_rt_rq(rt_se)) dequeue_rt_entity(rt_se); } @@ -545,12 +555,18 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { - if (rt_rq->rt_nr_running) - resched_task(rq_of_rt_rq(rt_rq)->curr); + struct rq *rq = rq_of_rt_rq(rt_rq); + + if (!rt_rq->rt_nr_running) + return; + + enqueue_top_rt_rq(rt_rq); + resched_task(rq->curr); } static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) { + dequeue_top_rt_rq(rt_rq); } static inline const struct cpumask *sched_rt_period_mask(void) @@ -935,6 +951,38 @@ static void update_curr_rt(struct rq *rq) } } +static void +dequeue_top_rt_rq(struct rt_rq *rt_rq) +{ + struct rq *rq = rq_of_rt_rq(rt_rq); + + BUG_ON(&rq->rt != rt_rq); + + if (!rt_rq->rt_queued) + return; + + BUG_ON(!rq->nr_running); + + rq->nr_running -= rt_rq->rt_nr_running; + rt_rq->rt_queued = 0; +} + +static void +enqueue_top_rt_rq(struct rt_rq *rt_rq) +{ + struct rq *rq = rq_of_rt_rq(rt_rq); + + BUG_ON(&rq->rt != rt_rq); + + if (rt_rq->rt_queued) + return; + if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running) + return; + + rq->nr_running += rt_rq->rt_nr_running; + rt_rq->rt_queued = 1; +} + #if defined CONFIG_SMP static void @@ -1143,6 +1191,8 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se) back = rt_se; } + dequeue_top_rt_rq(rt_rq_of_se(back)); + for (rt_se = back; rt_se; rt_se = rt_se->back) { if (on_rt_rq(rt_se)) __dequeue_rt_entity(rt_se); @@ -1151,13 +1201,18 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se) static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) { + struct rq *rq = rq_of_rt_se(rt_se); + dequeue_rt_stack(rt_se); for_each_sched_rt_entity(rt_se) __enqueue_rt_entity(rt_se, head); + enqueue_top_rt_rq(&rq->rt); } static void dequeue_rt_entity(struct sched_rt_entity *rt_se) { + struct rq *rq = rq_of_rt_se(rt_se); + dequeue_rt_stack(rt_se); 
for_each_sched_rt_entity(rt_se) { @@ -1166,6 +1221,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) if (rt_rq && rt_rq->rt_nr_running) __enqueue_rt_entity(rt_se, false); } + enqueue_top_rt_rq(&rq->rt); } /* @@ -1183,8 +1239,6 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); - - inc_nr_running(rq); } static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) @@ -1195,8 +1249,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) dequeue_rt_entity(rt_se); dequeue_pushable_task(rq, p); - - dec_nr_running(rq); } /* @@ -1401,10 +1453,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) if (prev->sched_class == &rt_sched_class) update_curr_rt(rq); - if (!rt_rq->rt_nr_running) - return NULL; - - if (rt_rq_throttled(rt_rq)) + if (!rt_rq->rt_queued) return NULL; put_prev_task(rq, prev); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 456e492a3dca..c8d9ee418ca7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -409,6 +409,8 @@ struct rt_rq { int overloaded; struct plist_head pushable_tasks; #endif + int rt_queued; + int rt_throttled; u64 rt_time; u64 rt_runtime; -- cgit From 46383648b3c769fa74794ae6425ab993fc113bdb Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Sat, 15 Mar 2014 02:15:07 +0400 Subject: sched: Revert commit 4c6c4e38c4e9 ("sched/core: Fix endless loop in pick_next_task()") This reverts commit 4c6c4e38c4e9 ("sched/core: Fix endless loop in pick_next_task()"), which is not necessary after ("sched/rt: Substract number of tasks of throttled queues from rq->nr_running"). Signed-off-by: Kirill Tkhai Reviewed-by: Preeti U Murthy [conflict resolution with stop task checking patch] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1394835307.18748.34.camel@HP-250-G1-Notebook-PC Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 5 +---- kernel/sched/rt.c | 10 ++++++++++ kernel/sched/sched.h | 12 ------------ 3 files changed, 11 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5e157f157d85..43232b8bacde 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6732,10 +6732,7 @@ static int idle_balance(struct rq *this_rq) out: /* Is there a task of a high priority class? 
*/ - if (this_rq->nr_running != this_rq->cfs.h_nr_running && - ((this_rq->stop && this_rq->stop->on_rq) || - this_rq->dl.dl_nr_running || - (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt)))) + if (this_rq->nr_running != this_rq->cfs.h_nr_running) pulled_task = -1; if (pulled_task) { diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 2add019ddbd0..7795e292f4c9 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -493,6 +493,11 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) dequeue_rt_entity(rt_se); } +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ + return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; +} + static int rt_se_boosted(struct sched_rt_entity *rt_se) { struct rt_rq *rt_rq = group_rt_rq(rt_se); @@ -569,6 +574,11 @@ static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) dequeue_top_rt_rq(rt_rq); } +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ + return rt_rq->rt_throttled; +} + static inline const struct cpumask *sched_rt_period_mask(void) { return cpu_online_mask; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c8d9ee418ca7..b2cbe81308af 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -425,18 +425,6 @@ struct rt_rq { #endif }; -#ifdef CONFIG_RT_GROUP_SCHED -static inline int rt_rq_throttled(struct rt_rq *rt_rq) -{ - return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; -} -#else -static inline int rt_rq_throttled(struct rt_rq *rt_rq) -{ - return rt_rq->rt_throttled; -} -#endif - /* Deadline class' related fields in a runqueue */ struct dl_rq { /* runqueue is an rbtree, ordered by deadline */ -- cgit From c464c76eec4be587604ca082e8cded7e6b89f3bf Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 18 Mar 2014 16:56:41 +0800 Subject: perf: Allow building PMU drivers as modules This patch adds support for building a PMU driver as a module. It exports the functions perf_pmu_{register,unregister}() and adds reference tracking for the PMU driver module. When the PMU driver is built as a module, each active event of the PMU holds a reference to the driver module.
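For driver authors, the upshot is that a PMU can now live entirely in a module: it sets pmu->module so the core can pin the module while events are live, registers in module init, and unregisters on exit. A hypothetical skeleton under those assumptions (names and stub callbacks are illustrative, not from this patch):

#include <linux/module.h>
#include <linux/perf_event.h>

static int demo_event_init(struct perf_event *event)
{
	return -ENOENT;		/* stub: claim no events */
}

static int demo_add(struct perf_event *event, int flags) { return 0; }
static void demo_del(struct perf_event *event, int flags) { }
static void demo_start(struct perf_event *event, int flags) { }
static void demo_stop(struct perf_event *event, int flags) { }
static void demo_read(struct perf_event *event) { }

static struct pmu demo_pmu = {
	.module		= THIS_MODULE,	/* lets perf take a reference per active event */
	.task_ctx_nr	= perf_invalid_context,
	.event_init	= demo_event_init,
	.add		= demo_add,
	.del		= demo_del,
	.start		= demo_start,
	.stop		= demo_stop,
	.read		= demo_read,
};

static int __init demo_pmu_init(void)
{
	return perf_pmu_register(&demo_pmu, "demo_pmu", -1);
}

static void __exit demo_pmu_exit(void)
{
	perf_pmu_unregister(&demo_pmu);
}

module_init(demo_pmu_init);
module_exit(demo_pmu_exit);
MODULE_LICENSE("GPL");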
Signed-off-by: Yan, Zheng Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1395133004-23205-1-git-send-email-zheng.z.yan@intel.com Cc: eranian@google.com Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 1 + kernel/events/core.c | 15 +++++++++++++++ 2 files changed, 16 insertions(+) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 3356abcfff18..af6dcf1d9e47 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -172,6 +172,7 @@ struct perf_event; struct pmu { struct list_head entry; + struct module *module; struct device *dev; const struct attribute_group **attr_groups; const char *name; diff --git a/kernel/events/core.c b/kernel/events/core.c index f83a71a3e46d..5129b1201050 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -39,6 +39,7 @@ #include #include #include +#include #include "internal.h" @@ -3229,6 +3230,9 @@ static void __free_event(struct perf_event *event) if (event->ctx) put_ctx(event->ctx); + if (event->pmu) + module_put(event->pmu->module); + call_rcu(&event->rcu_head, free_event_rcu); } static void free_event(struct perf_event *event) @@ -6551,6 +6555,7 @@ free_pdc: free_percpu(pmu->pmu_disable_count); goto unlock; } +EXPORT_SYMBOL_GPL(perf_pmu_register); void perf_pmu_unregister(struct pmu *pmu) { @@ -6572,6 +6577,7 @@ void perf_pmu_unregister(struct pmu *pmu) put_device(pmu->dev); free_pmu_context(pmu); } +EXPORT_SYMBOL_GPL(perf_pmu_unregister); struct pmu *perf_init_event(struct perf_event *event) { @@ -6585,6 +6591,10 @@ struct pmu *perf_init_event(struct perf_event *event) pmu = idr_find(&pmu_idr, event->attr.type); rcu_read_unlock(); if (pmu) { + if (!try_module_get(pmu->module)) { + pmu = ERR_PTR(-ENODEV); + goto unlock; + } event->pmu = pmu; ret = pmu->event_init(event); if (ret) @@ -6593,6 +6603,10 @@ struct pmu *perf_init_event(struct perf_event *event) } list_for_each_entry_rcu(pmu, &pmus, entry) { + if (!try_module_get(pmu->module)) { + pmu = ERR_PTR(-ENODEV); + goto unlock; + } event->pmu = pmu; ret = pmu->event_init(event); if (!ret) @@ -6771,6 +6785,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, err_pmu: if (event->destroy) event->destroy(event); + module_put(pmu->module); err_ns: if (event->ns) put_pid_ns(event->ns); -- cgit From 8588a2bbddc524325d84d1d6996758e9242e4ffc Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 18 Mar 2014 16:56:42 +0800 Subject: hrtimer: Export __hrtimer_start_range_ns() Export __hrtimer_start_range_ns() to allow building perf Intel uncore driver as a module. 
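A sketch of the kind of modular caller this export enables (hypothetical names; compared to the hrtimer_start() helper, __hrtimer_start_range_ns() takes an extra slack value and a wakeup flag):

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer demo_timer;	/* e.g. a periodic PMU polling timer */

static enum hrtimer_restart demo_timer_fn(struct hrtimer *t)
{
	/* read hardware counters here, then rearm or stop */
	return HRTIMER_NORESTART;
}

static void demo_timer_setup(void)
{
	hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	demo_timer.function = demo_timer_fn;
	/* delta_ns = 0, wakeup = 0: start without forcing a softirq wakeup */
	__hrtimer_start_range_ns(&demo_timer, ns_to_ktime(NSEC_PER_MSEC),
				 0, HRTIMER_MODE_REL_PINNED, 0);
}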
Signed-off-by: Yan, Zheng Acked-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1395133004-23205-2-git-send-email-zheng.z.yan@intel.com Cc: eranian@google.com Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index d55092ceee29..53d26829cd4d 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1017,6 +1017,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, return ret; } +EXPORT_SYMBOL_GPL(__hrtimer_start_range_ns); /** * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU -- cgit From 4e857c58efeb99393cba5a5d0d8ec7117183137c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 17 Mar 2014 18:06:10 +0100 Subject: arch: Mass conversion of smp_mb__*() Mostly scripted conversion of the smp_mb__* barriers. Signed-off-by: Peter Zijlstra Acked-by: Paul E. McKenney Link: http://lkml.kernel.org/n/tip-55dhyhocezdw1dg7u19hmh1u@git.kernel.org Cc: Linus Torvalds Cc: linux-arch@vger.kernel.org Signed-off-by: Ingo Molnar --- block/blk-iopoll.c | 4 +- crypto/chainiv.c | 2 +- drivers/base/power/domain.c | 2 +- drivers/block/mtip32xx/mtip32xx.c | 4 +- drivers/cpuidle/coupled.c | 2 +- drivers/firewire/ohci.c | 2 +- drivers/gpu/drm/drm_irq.c | 10 ++-- drivers/gpu/drm/i915/i915_irq.c | 2 +- drivers/md/bcache/bcache.h | 2 +- drivers/md/bcache/closure.h | 2 +- drivers/md/dm-bufio.c | 8 ++-- drivers/md/dm-snap.c | 4 +- drivers/md/dm.c | 2 +- drivers/md/raid5.c | 2 +- drivers/media/usb/dvb-usb-v2/dvb_usb_core.c | 6 +-- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 6 +-- drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 18 ++++---- drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c | 26 +++++------ drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c | 8 ++-- drivers/net/ethernet/broadcom/cnic.c | 8 ++-- drivers/net/ethernet/brocade/bna/bnad.c | 6 +-- drivers/net/ethernet/chelsio/cxgb/cxgb2.c | 2 +- drivers/net/ethernet/chelsio/cxgb3/sge.c | 6 +-- drivers/net/ethernet/chelsio/cxgb4/sge.c | 2 +- drivers/net/ethernet/chelsio/cxgb4vf/sge.c | 2 +- drivers/net/ethernet/freescale/gianfar.c | 8 ++-- drivers/net/ethernet/intel/i40e/i40e_main.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 8 ++-- drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 6 +-- drivers/net/wireless/ti/wlcore/main.c | 2 +- drivers/pci/xen-pcifront.c | 4 +- drivers/scsi/isci/remote_device.c | 2 +- drivers/target/loopback/tcm_loop.c | 4 +- drivers/target/target_core_alua.c | 26 +++++------ drivers/target/target_core_device.c | 6 +-- drivers/target/target_core_iblock.c | 2 +- drivers/target/target_core_pr.c | 56 +++++++++++------------ drivers/target/target_core_transport.c | 16 +++---- drivers/target/target_core_ua.c | 10 ++-- drivers/tty/n_tty.c | 2 +- drivers/tty/serial/mxs-auart.c | 4 +- drivers/usb/gadget/tcm_usb_gadget.c | 4 +- drivers/usb/serial/usb_wwan.c | 2 +- drivers/vhost/scsi.c | 2 +- drivers/w1/w1_family.c | 4 +- drivers/xen/xen-pciback/pciback_ops.c | 4 +- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/extent_io.c | 2 +- fs/btrfs/inode.c | 6 +-- fs/btrfs/ioctl.c | 2 +- fs/buffer.c | 2 +- fs/ext4/resize.c | 2 +- fs/gfs2/glock.c | 8 ++-- fs/gfs2/glops.c | 2 +- fs/gfs2/lock_dlm.c | 4 +- fs/gfs2/recovery.c | 2 +- fs/gfs2/sys.c | 4 +- fs/jbd2/commit.c | 6 +-- fs/nfs/dir.c | 12 ++--- fs/nfs/inode.c | 2 +- fs/nfs/nfs4filelayoutdev.c | 4 +- fs/nfs/nfs4state.c | 4 +- fs/nfs/pagelist.c | 6 +-- fs/nfs/pnfs.c | 2 +- 
fs/nfs/pnfs.h | 2 +- fs/nfs/write.c | 4 +- fs/ubifs/lpt_commit.c | 4 +- fs/ubifs/tnc_commit.c | 4 +- include/asm-generic/bitops/atomic.h | 2 +- include/asm-generic/bitops/lock.h | 2 +- include/linux/buffer_head.h | 2 +- include/linux/genhd.h | 2 +- include/linux/interrupt.h | 8 ++-- include/linux/netdevice.h | 2 +- include/linux/sched.h | 6 +-- include/linux/sunrpc/sched.h | 8 ++-- include/linux/sunrpc/xprt.h | 8 ++-- include/linux/tracehook.h | 2 +- include/net/ip_vs.h | 4 +- kernel/debug/debug_core.c | 4 +- kernel/futex.c | 4 +- kernel/kmod.c | 2 +- kernel/rcu/tree.c | 22 ++++----- kernel/rcu/tree_plugin.h | 8 ++-- kernel/sched/cpupri.c | 6 +-- kernel/sched/wait.c | 2 +- mm/backing-dev.c | 2 +- mm/filemap.c | 4 +- net/atm/pppoatm.c | 2 +- net/bluetooth/hci_event.c | 4 +- net/core/dev.c | 8 ++-- net/core/link_watch.c | 2 +- net/ipv4/inetpeer.c | 2 +- net/ipv4/tcp_output.c | 4 +- net/netfilter/nf_conntrack_core.c | 2 +- net/rds/ib_recv.c | 4 +- net/rds/iw_recv.c | 4 +- net/rds/send.c | 6 +-- net/rds/tcp_send.c | 2 +- net/sunrpc/auth.c | 2 +- net/sunrpc/auth_gss/auth_gss.c | 2 +- net/sunrpc/backchannel_rqst.c | 4 +- net/sunrpc/xprt.c | 4 +- net/sunrpc/xprtsock.c | 16 +++---- net/unix/af_unix.c | 2 +- sound/pci/bt87x.c | 4 +- 106 files changed, 284 insertions(+), 288 deletions(-) (limited to 'kernel') diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c index c11d24e379e2..f8c6a11b13f0 100644 --- a/block/blk-iopoll.c +++ b/block/blk-iopoll.c @@ -49,7 +49,7 @@ EXPORT_SYMBOL(blk_iopoll_sched); void __blk_iopoll_complete(struct blk_iopoll *iop) { list_del(&iop->list); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit_unlock(IOPOLL_F_SCHED, &iop->state); } EXPORT_SYMBOL(__blk_iopoll_complete); @@ -161,7 +161,7 @@ EXPORT_SYMBOL(blk_iopoll_disable); void blk_iopoll_enable(struct blk_iopoll *iop) { BUG_ON(!test_bit(IOPOLL_F_SCHED, &iop->state)); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit_unlock(IOPOLL_F_SCHED, &iop->state); } EXPORT_SYMBOL(blk_iopoll_enable); diff --git a/crypto/chainiv.c b/crypto/chainiv.c index 834d8dd3d4fc..9c294c8f9a07 100644 --- a/crypto/chainiv.c +++ b/crypto/chainiv.c @@ -126,7 +126,7 @@ static int async_chainiv_schedule_work(struct async_chainiv_ctx *ctx) int err = ctx->err; if (!ctx->queue.qlen) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(CHAINIV_STATE_INUSE, &ctx->state); if (!ctx->queue.qlen || diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index ae098a261fcd..eee55c1e5fde 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -105,7 +105,7 @@ static bool genpd_sd_counter_dec(struct generic_pm_domain *genpd) static void genpd_sd_counter_inc(struct generic_pm_domain *genpd) { atomic_inc(&genpd->sd_count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); } static void genpd_acquire_lock(struct generic_pm_domain *genpd) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 59c5abe32f06..4fd8d6c1c3d2 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -224,9 +224,9 @@ static int get_slot(struct mtip_port *port) */ static inline void release_slot(struct mtip_port *port, int tag) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(tag, port->allocated); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } /* diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c index cb6654bfad77..73fe2f8d7f96 100644 --- a/drivers/cpuidle/coupled.c +++ 
b/drivers/cpuidle/coupled.c @@ -159,7 +159,7 @@ void cpuidle_coupled_parallel_barrier(struct cpuidle_device *dev, atomic_t *a) { int n = dev->coupled->online_count; - smp_mb__before_atomic_inc(); + smp_mb__before_atomic(); atomic_inc(a); while (atomic_read(a) < n) diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index 8db663219560..995dd42a2627 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -3498,7 +3498,7 @@ static int ohci_flush_iso_completions(struct fw_iso_context *base) } clear_bit_unlock(0, &ctx->flushing_completions); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } tasklet_enable(&ctx->context.tasklet); diff --git a/drivers/gpu/drm/drm_irq.c b/drivers/gpu/drm/drm_irq.c index c2676b5908d9..ec5c3f4cdd01 100644 --- a/drivers/gpu/drm/drm_irq.c +++ b/drivers/gpu/drm/drm_irq.c @@ -156,7 +156,7 @@ static void vblank_disable_and_save(struct drm_device *dev, int crtc) */ if ((vblrc > 0) && (abs64(diff_ns) > 1000000)) { atomic_inc(&dev->vblank[crtc].count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); } /* Invalidate all timestamps while vblank irq's are off. */ @@ -864,9 +864,9 @@ static void drm_update_vblank_count(struct drm_device *dev, int crtc) vblanktimestamp(dev, crtc, tslot) = t_vblank; } - smp_mb__before_atomic_inc(); + smp_mb__before_atomic(); atomic_add(diff, &dev->vblank[crtc].count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); } /** @@ -1330,9 +1330,9 @@ bool drm_handle_vblank(struct drm_device *dev, int crtc) /* Increment cooked vblank count. This also atomically commits * the timestamp computed above. */ - smp_mb__before_atomic_inc(); + smp_mb__before_atomic(); atomic_inc(&dev->vblank[crtc].count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); } else { DRM_DEBUG("crtc %d: Redundant vblirq ignored. diff_ns = %d\n", crtc, (int) diff_ns); diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index 7753249b3a95..5409bfafff63 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -2147,7 +2147,7 @@ static void i915_error_work_func(struct work_struct *work) * updates before * the counter increment. 
*/ - smp_mb__before_atomic_inc(); + smp_mb__before_atomic(); atomic_inc(&dev_priv->gpu_error.reset_counter); kobject_uevent_env(&dev->primary->kdev->kobj, diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 82c9c5d35251..d2ebcf323094 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -828,7 +828,7 @@ static inline bool cached_dev_get(struct cached_dev *dc) return false; /* Paired with the mb in cached_dev_attach */ - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); return true; } diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index 7ef7461912be..a08e3eeac3c5 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h @@ -243,7 +243,7 @@ static inline void set_closure_fn(struct closure *cl, closure_fn *fn, cl->fn = fn; cl->wq = wq; /* between atomic_dec() in closure_put() */ - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); } static inline void closure_queue(struct closure *cl) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 66c5d130c8c2..4e84095833db 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -607,9 +607,9 @@ static void write_endio(struct bio *bio, int error) BUG_ON(!test_bit(B_WRITING, &b->state)); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(B_WRITING, &b->state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&b->state, B_WRITING); } @@ -997,9 +997,9 @@ static void read_endio(struct bio *bio, int error) BUG_ON(!test_bit(B_READING, &b->state)); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(B_READING, &b->state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&b->state, B_READING); } diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index ebddef5237e4..8e0caed0bf74 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -642,7 +642,7 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe) struct dm_snapshot *s = pe->snap; mempool_free(pe, s->pending_pool); - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); atomic_dec(&s->pending_exceptions_count); } @@ -783,7 +783,7 @@ static int init_hash_tables(struct dm_snapshot *s) static void merge_shutdown(struct dm_snapshot *s) { clear_bit_unlock(RUNNING_MERGE, &s->state_bits); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&s->state_bits, RUNNING_MERGE); } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 455e64916498..2db768e4553f 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2447,7 +2447,7 @@ static void dm_wq_work(struct work_struct *work) static void dm_queue_flush(struct mapped_device *md) { clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); queue_work(md->wq, &md->work); } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index ad1b9bea446e..2afef4ec9312 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4400,7 +4400,7 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) * STRIPE_ON_UNPLUG_LIST clear but the stripe * is still in our list */ - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state); /* * STRIPE_ON_RELEASE_LIST could be set here. 
In that diff --git a/drivers/media/usb/dvb-usb-v2/dvb_usb_core.c b/drivers/media/usb/dvb-usb-v2/dvb_usb_core.c index de02db802ace..e35580618936 100644 --- a/drivers/media/usb/dvb-usb-v2/dvb_usb_core.c +++ b/drivers/media/usb/dvb-usb-v2/dvb_usb_core.c @@ -399,7 +399,7 @@ static int dvb_usb_stop_feed(struct dvb_demux_feed *dvbdmxfeed) /* clear 'streaming' status bit */ clear_bit(ADAP_STREAMING, &adap->state_bits); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&adap->state_bits, ADAP_STREAMING); skip_feed_stop: @@ -550,7 +550,7 @@ static int dvb_usb_fe_init(struct dvb_frontend *fe) err: if (!adap->suspend_resume_active) { clear_bit(ADAP_INIT, &adap->state_bits); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&adap->state_bits, ADAP_INIT); } @@ -591,7 +591,7 @@ err: if (!adap->suspend_resume_active) { adap->active_fe = -1; clear_bit(ADAP_SLEEP, &adap->state_bits); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&adap->state_bits, ADAP_SLEEP); } diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index 9261d5313b5b..dd57c7c5a3da 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -2781,7 +2781,7 @@ int bnx2x_nic_load(struct bnx2x *bp, int load_mode) case LOAD_OPEN: netif_tx_start_all_queues(bp->dev); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); break; case LOAD_DIAG: @@ -4939,9 +4939,9 @@ void bnx2x_update_coalesce_sb_index(struct bnx2x *bp, u8 fw_sb_id, void bnx2x_schedule_sp_rtnl(struct bnx2x *bp, enum sp_rtnl_flag flag, u32 verbose) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); set_bit(flag, &bp->sp_rtnl_state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); DP((BNX2X_MSG_SP | verbose), "Scheduling sp_rtnl task [Flag: %d]\n", flag); schedule_delayed_work(&bp->sp_rtnl_task, 0); diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c index a78edaccceee..16391db2e8c9 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c @@ -1858,10 +1858,10 @@ void bnx2x_sp_event(struct bnx2x_fastpath *fp, union eth_rx_cqe *rr_cqe) return; #endif - smp_mb__before_atomic_inc(); + smp_mb__before_atomic(); atomic_inc(&bp->cq_spq_left); /* push the change in bp->spq_left and towards the memory */ - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); DP(BNX2X_MSG_SP, "bp->cq_spq_left %x\n", atomic_read(&bp->cq_spq_left)); @@ -1876,11 +1876,11 @@ void bnx2x_sp_event(struct bnx2x_fastpath *fp, union eth_rx_cqe *rr_cqe) * sp_state is cleared, and this order prevents * races */ - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); set_bit(BNX2X_AFEX_PENDING_VIFSET_MCP_ACK, &bp->sp_state); wmb(); clear_bit(BNX2X_AFEX_FCOE_Q_UPDATE_PENDING, &bp->sp_state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); /* schedule the sp task as mcp ack is required */ bnx2x_schedule_sp_task(bp); @@ -5272,9 +5272,9 @@ static void bnx2x_after_function_update(struct bnx2x *bp) __clear_bit(RAMROD_COMP_WAIT, &queue_params.ramrod_flags); /* mark latest Q bit */ - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); set_bit(BNX2X_AFEX_FCOE_Q_UPDATE_PENDING, &bp->sp_state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); /* send Q update ramrod for FCoE Q */ rc = bnx2x_queue_state_change(bp, &queue_params); @@ -5500,7 +5500,7 @@ next_spqe: spqe_cnt++; } /* for */ - smp_mb__before_atomic_inc(); + 
smp_mb__before_atomic(); atomic_add(spqe_cnt, &bp->eq_spq_left); bp->eq_cons = sw_cons; @@ -13869,9 +13869,9 @@ static int bnx2x_drv_ctl(struct net_device *dev, struct drv_ctl_info *ctl) case DRV_CTL_RET_L2_SPQ_CREDIT_CMD: { int count = ctl->data.credit.credit_count; - smp_mb__before_atomic_inc(); + smp_mb__before_atomic(); atomic_add(count, &bp->cq_spq_left); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); break; } case DRV_CTL_ULP_REGISTER_CMD: { diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c index 31297266b743..d725317c4277 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c @@ -258,16 +258,16 @@ static bool bnx2x_raw_check_pending(struct bnx2x_raw_obj *o) static void bnx2x_raw_clear_pending(struct bnx2x_raw_obj *o) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(o->state, o->pstate); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } static void bnx2x_raw_set_pending(struct bnx2x_raw_obj *o) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); set_bit(o->state, o->pstate); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } /** @@ -2131,7 +2131,7 @@ static int bnx2x_set_rx_mode_e1x(struct bnx2x *bp, /* The operation is completed */ clear_bit(p->state, p->pstate); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); return 0; } @@ -3576,16 +3576,16 @@ error_exit1: static void bnx2x_mcast_clear_sched(struct bnx2x_mcast_obj *o) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(o->sched_state, o->raw.pstate); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } static void bnx2x_mcast_set_sched(struct bnx2x_mcast_obj *o) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); set_bit(o->sched_state, o->raw.pstate); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } static bool bnx2x_mcast_check_sched(struct bnx2x_mcast_obj *o) @@ -4200,7 +4200,7 @@ int bnx2x_queue_state_change(struct bnx2x *bp, if (rc) { o->next_state = BNX2X_Q_STATE_MAX; clear_bit(pending_bit, pending); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); return rc; } @@ -4288,7 +4288,7 @@ static int bnx2x_queue_comp_cmd(struct bnx2x *bp, wmb(); clear_bit(cmd, &o->pending); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); return 0; } @@ -5279,7 +5279,7 @@ static inline int bnx2x_func_state_change_comp(struct bnx2x *bp, wmb(); clear_bit(cmd, &o->pending); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); return 0; } @@ -5926,7 +5926,7 @@ int bnx2x_func_state_change(struct bnx2x *bp, if (rc) { o->next_state = BNX2X_F_STATE_MAX; clear_bit(cmd, pending); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); return rc; } diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c index 5c523b32db70..f82ac5ac2336 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c @@ -1626,9 +1626,9 @@ static void bnx2x_vf_handle_filters_eqe(struct bnx2x *bp, struct bnx2x_virtf *vf) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(BNX2X_FILTER_RX_MODE_PENDING, &vf->filter_state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } static void bnx2x_vf_handle_rss_update_eqe(struct bnx2x *bp, @@ -2960,9 +2960,9 @@ void bnx2x_iov_task(struct work_struct *work) void bnx2x_schedule_iov_task(struct bnx2x *bp, enum bnx2x_iov_flag flag) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); 
set_bit(flag, &bp->iov_task_state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); DP(BNX2X_MSG_IOV, "Scheduling iov task [Flag: %d]\n", flag); queue_delayed_work(bnx2x_iov_wq, &bp->iov_task, 0); } diff --git a/drivers/net/ethernet/broadcom/cnic.c b/drivers/net/ethernet/broadcom/cnic.c index 09f3fefcbf9c..4dd48d2fa804 100644 --- a/drivers/net/ethernet/broadcom/cnic.c +++ b/drivers/net/ethernet/broadcom/cnic.c @@ -436,7 +436,7 @@ static int cnic_offld_prep(struct cnic_sock *csk) static int cnic_close_prep(struct cnic_sock *csk) { clear_bit(SK_F_CONNECT_START, &csk->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); if (test_and_clear_bit(SK_F_OFFLD_COMPLETE, &csk->flags)) { while (test_and_set_bit(SK_F_OFFLD_SCHED, &csk->flags)) @@ -450,7 +450,7 @@ static int cnic_close_prep(struct cnic_sock *csk) static int cnic_abort_prep(struct cnic_sock *csk) { clear_bit(SK_F_CONNECT_START, &csk->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); while (test_and_set_bit(SK_F_OFFLD_SCHED, &csk->flags)) msleep(1); @@ -3646,7 +3646,7 @@ static int cnic_cm_destroy(struct cnic_sock *csk) csk_hold(csk); clear_bit(SK_F_INUSE, &csk->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); while (atomic_read(&csk->ref_count) != 1) msleep(1); cnic_cm_cleanup(csk); @@ -4026,7 +4026,7 @@ static void cnic_cm_process_kcqe(struct cnic_dev *dev, struct kcqe *kcqe) L4_KCQE_COMPLETION_STATUS_PARITY_ERROR) set_bit(SK_F_HW_ERR, &csk->flags); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(SK_F_OFFLD_SCHED, &csk->flags); cnic_cm_upcall(cp, csk, opcode); break; diff --git a/drivers/net/ethernet/brocade/bna/bnad.c b/drivers/net/ethernet/brocade/bna/bnad.c index 675550fe8ee9..3a77f9ead004 100644 --- a/drivers/net/ethernet/brocade/bna/bnad.c +++ b/drivers/net/ethernet/brocade/bna/bnad.c @@ -249,7 +249,7 @@ bnad_tx_complete(struct bnad *bnad, struct bna_tcb *tcb) if (likely(test_bit(BNAD_TXQ_TX_STARTED, &tcb->flags))) bna_ib_ack(tcb->i_dbell, sent); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(BNAD_TXQ_FREE_SENT, &tcb->flags); return sent; @@ -1126,7 +1126,7 @@ bnad_tx_cleanup(struct delayed_work *work) bnad_txq_cleanup(bnad, tcb); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(BNAD_TXQ_FREE_SENT, &tcb->flags); } @@ -2992,7 +2992,7 @@ bnad_start_xmit(struct sk_buff *skb, struct net_device *netdev) sent = bnad_txcmpl_process(bnad, tcb); if (likely(test_bit(BNAD_TXQ_TX_STARTED, &tcb->flags))) bna_ib_ack(tcb->i_dbell, sent); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(BNAD_TXQ_FREE_SENT, &tcb->flags); } else { netif_stop_queue(netdev); diff --git a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c index 0fe7ff750d77..05613a85ce61 100644 --- a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c +++ b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c @@ -281,7 +281,7 @@ static int cxgb_close(struct net_device *dev) if (adapter->params.stats_update_period && !(adapter->open_device_map & PORT_MASK)) { /* Stop statistics accumulation. 
*/ - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); spin_lock(&adapter->work_lock); /* sync with update task */ spin_unlock(&adapter->work_lock); cancel_mac_stats_update(adapter); diff --git a/drivers/net/ethernet/chelsio/cxgb3/sge.c b/drivers/net/ethernet/chelsio/cxgb3/sge.c index 8b069f96e920..3dfcf600fcc6 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb3/sge.c @@ -1379,7 +1379,7 @@ static inline int check_desc_avail(struct adapter *adap, struct sge_txq *q, struct sge_qset *qs = txq_to_qset(q, qid); set_bit(qid, &qs->txq_stopped); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); if (should_restart_tx(q) && test_and_clear_bit(qid, &qs->txq_stopped)) @@ -1492,7 +1492,7 @@ static void restart_ctrlq(unsigned long data) if (!skb_queue_empty(&q->sendq)) { set_bit(TXQ_CTRL, &qs->txq_stopped); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); if (should_restart_tx(q) && test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) @@ -1697,7 +1697,7 @@ again: reclaim_completed_tx(adap, q, TX_RECLAIM_CHUNK); if (unlikely(q->size - q->in_use < ndesc)) { set_bit(TXQ_OFLD, &qs->txq_stopped); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); if (should_restart_tx(q) && test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c index ca95cf2954eb..e249528c8e60 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c @@ -2031,7 +2031,7 @@ static void sge_rx_timer_cb(unsigned long data) struct sge_fl *fl = s->egr_map[id]; clear_bit(id, s->starving_fl); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); if (fl_starving(fl)) { rxq = container_of(fl, struct sge_eth_rxq, fl); diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c index 9cfa4b4bb089..9d88c1d50b49 100644 --- a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c @@ -1951,7 +1951,7 @@ static void sge_rx_timer_cb(unsigned long data) struct sge_fl *fl = s->egr_map[id]; clear_bit(id, s->starving_fl); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); /* * Since we are accessing fl without a lock there's a diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c index 9125d9abf099..d82f092cae90 100644 --- a/drivers/net/ethernet/freescale/gianfar.c +++ b/drivers/net/ethernet/freescale/gianfar.c @@ -1797,9 +1797,9 @@ void stop_gfar(struct net_device *dev) netif_tx_stop_all_queues(dev); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); set_bit(GFAR_DOWN, &priv->state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); disable_napi(priv); @@ -2042,9 +2042,9 @@ int startup_gfar(struct net_device *ndev) gfar_init_tx_rx_base(priv); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(GFAR_DOWN, &priv->state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); /* Start Rx/Tx DMA and enable the interrupts */ gfar_start(priv); diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 861b722c2672..1e526c072a44 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -4671,7 +4671,7 @@ static void i40e_service_event_complete(struct i40e_pf *pf) BUG_ON(!test_bit(__I40E_SERVICE_SCHED, &pf->state)); /* flush memory to make sure state is correct before next watchog */ - smp_mb__before_clear_bit(); + 
smp_mb__before_atomic(); clear_bit(__I40E_SERVICE_SCHED, &pf->state); } diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index c4c526b7f99f..2fecc2626de5 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -376,7 +376,7 @@ static void ixgbe_service_event_complete(struct ixgbe_adapter *adapter) BUG_ON(!test_bit(__IXGBE_SERVICE_SCHED, &adapter->state)); /* flush memory to make sure state is correct before next watchdog */ - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(__IXGBE_SERVICE_SCHED, &adapter->state); } @@ -4671,7 +4671,7 @@ static void ixgbe_up_complete(struct ixgbe_adapter *adapter) if (hw->mac.ops.enable_tx_laser) hw->mac.ops.enable_tx_laser(hw); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(__IXGBE_DOWN, &adapter->state); ixgbe_napi_enable_all(adapter); @@ -5567,7 +5567,7 @@ static int ixgbe_resume(struct pci_dev *pdev) e_dev_err("Cannot enable PCI device from suspend\n"); return err; } - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(__IXGBE_DISABLED, &adapter->state); pci_set_master(pdev); @@ -8541,7 +8541,7 @@ static pci_ers_result_t ixgbe_io_slot_reset(struct pci_dev *pdev) e_err(probe, "Cannot re-enable PCI device after reset.\n"); result = PCI_ERS_RESULT_DISCONNECT; } else { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(__IXGBE_DISABLED, &adapter->state); adapter->hw.hw_addr = adapter->io_addr; pci_set_master(pdev); diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index d0799e8e31e4..de2793b06305 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -1668,7 +1668,7 @@ static void ixgbevf_up_complete(struct ixgbevf_adapter *adapter) spin_unlock_bh(&adapter->mbx_lock); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(__IXGBEVF_DOWN, &adapter->state); ixgbevf_napi_enable_all(adapter); @@ -3354,7 +3354,7 @@ static int ixgbevf_resume(struct pci_dev *pdev) dev_err(&pdev->dev, "Cannot enable PCI device from suspend\n"); return err; } - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(__IXGBEVF_DISABLED, &adapter->state); pci_set_master(pdev); @@ -3712,7 +3712,7 @@ static pci_ers_result_t ixgbevf_io_slot_reset(struct pci_dev *pdev) return PCI_ERS_RESULT_DISCONNECT; } - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(__IXGBEVF_DISABLED, &adapter->state); pci_set_master(pdev); diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c index ed88d3913483..e71eae353368 100644 --- a/drivers/net/wireless/ti/wlcore/main.c +++ b/drivers/net/wireless/ti/wlcore/main.c @@ -543,7 +543,7 @@ static int wlcore_irq_locked(struct wl1271 *wl) * wl1271_ps_elp_wakeup cannot be called concurrently. 
*/ clear_bit(WL1271_FLAG_IRQ_RUNNING, &wl->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); ret = wlcore_fw_status(wl, wl->fw_status); if (ret < 0) diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c index 179b8edc2262..53df39a22c8a 100644 --- a/drivers/pci/xen-pcifront.c +++ b/drivers/pci/xen-pcifront.c @@ -662,9 +662,9 @@ static void pcifront_do_aer(struct work_struct *data) notify_remote_via_evtchn(pdev->evtchn); /*in case of we lost an aer request in four lines time_window*/ - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(_PDEVB_op_active, &pdev->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); schedule_pcifront_aer_op(pdev); diff --git a/drivers/scsi/isci/remote_device.c b/drivers/scsi/isci/remote_device.c index 96a26f454673..cc51f38b116d 100644 --- a/drivers/scsi/isci/remote_device.c +++ b/drivers/scsi/isci/remote_device.c @@ -1541,7 +1541,7 @@ void isci_remote_device_release(struct kref *kref) clear_bit(IDEV_STOP_PENDING, &idev->flags); clear_bit(IDEV_IO_READY, &idev->flags); clear_bit(IDEV_GONE, &idev->flags); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(IDEV_ALLOCATED, &idev->flags); wake_up(&ihost->eventq); } diff --git a/drivers/target/loopback/tcm_loop.c b/drivers/target/loopback/tcm_loop.c index c886ad1c39fb..73ab75ddaf42 100644 --- a/drivers/target/loopback/tcm_loop.c +++ b/drivers/target/loopback/tcm_loop.c @@ -951,7 +951,7 @@ static int tcm_loop_port_link( struct tcm_loop_hba *tl_hba = tl_tpg->tl_hba; atomic_inc(&tl_tpg->tl_tpg_port_count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); /* * Add Linux/SCSI struct scsi_device by HCTL */ @@ -986,7 +986,7 @@ static void tcm_loop_port_unlink( scsi_device_put(sd); atomic_dec(&tl_tpg->tl_tpg_port_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); pr_debug("TCM_Loop_ConfigFS: Port Unlink Successful\n"); } diff --git a/drivers/target/target_core_alua.c b/drivers/target/target_core_alua.c index fcbe6125b73e..0b79b852f4b2 100644 --- a/drivers/target/target_core_alua.c +++ b/drivers/target/target_core_alua.c @@ -393,7 +393,7 @@ target_emulate_set_target_port_groups(struct se_cmd *cmd) continue; atomic_inc(&tg_pt_gp->tg_pt_gp_ref_cnt); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); spin_unlock(&dev->t10_alua.tg_pt_gps_lock); @@ -404,7 +404,7 @@ target_emulate_set_target_port_groups(struct se_cmd *cmd) spin_lock(&dev->t10_alua.tg_pt_gps_lock); atomic_dec(&tg_pt_gp->tg_pt_gp_ref_cnt); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); break; } spin_unlock(&dev->t10_alua.tg_pt_gps_lock); @@ -990,7 +990,7 @@ static void core_alua_do_transition_tg_pt_work(struct work_struct *work) * TARGET PORT GROUPS command */ atomic_inc(&mem->tg_pt_gp_mem_ref_cnt); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); spin_unlock(&tg_pt_gp->tg_pt_gp_lock); spin_lock_bh(&port->sep_alua_lock); @@ -1020,7 +1020,7 @@ static void core_alua_do_transition_tg_pt_work(struct work_struct *work) spin_lock(&tg_pt_gp->tg_pt_gp_lock); atomic_dec(&mem->tg_pt_gp_mem_ref_cnt); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); } spin_unlock(&tg_pt_gp->tg_pt_gp_lock); /* @@ -1054,7 +1054,7 @@ static void core_alua_do_transition_tg_pt_work(struct work_struct *work) core_alua_dump_state(tg_pt_gp->tg_pt_gp_alua_pending_state)); spin_lock(&dev->t10_alua.tg_pt_gps_lock); atomic_dec(&tg_pt_gp->tg_pt_gp_ref_cnt); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); spin_unlock(&dev->t10_alua.tg_pt_gps_lock); if 
(tg_pt_gp->tg_pt_gp_transition_complete) @@ -1116,7 +1116,7 @@ static int core_alua_do_transition_tg_pt( */ spin_lock(&dev->t10_alua.tg_pt_gps_lock); atomic_inc(&tg_pt_gp->tg_pt_gp_ref_cnt); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); spin_unlock(&dev->t10_alua.tg_pt_gps_lock); if (!explicit && tg_pt_gp->tg_pt_gp_implicit_trans_secs) { @@ -1159,7 +1159,7 @@ int core_alua_do_port_transition( spin_lock(&local_lu_gp_mem->lu_gp_mem_lock); lu_gp = local_lu_gp_mem->lu_gp; atomic_inc(&lu_gp->lu_gp_ref_cnt); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); spin_unlock(&local_lu_gp_mem->lu_gp_mem_lock); /* * For storage objects that are members of the 'default_lu_gp', @@ -1176,7 +1176,7 @@ int core_alua_do_port_transition( rc = core_alua_do_transition_tg_pt(l_tg_pt_gp, new_state, explicit); atomic_dec(&lu_gp->lu_gp_ref_cnt); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); return rc; } /* @@ -1190,7 +1190,7 @@ int core_alua_do_port_transition( dev = lu_gp_mem->lu_gp_mem_dev; atomic_inc(&lu_gp_mem->lu_gp_mem_ref_cnt); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); spin_unlock(&lu_gp->lu_gp_lock); spin_lock(&dev->t10_alua.tg_pt_gps_lock); @@ -1219,7 +1219,7 @@ int core_alua_do_port_transition( tg_pt_gp->tg_pt_gp_alua_nacl = NULL; } atomic_inc(&tg_pt_gp->tg_pt_gp_ref_cnt); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); spin_unlock(&dev->t10_alua.tg_pt_gps_lock); /* * core_alua_do_transition_tg_pt() will always return @@ -1230,7 +1230,7 @@ int core_alua_do_port_transition( spin_lock(&dev->t10_alua.tg_pt_gps_lock); atomic_dec(&tg_pt_gp->tg_pt_gp_ref_cnt); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); if (rc) break; } @@ -1238,7 +1238,7 @@ int core_alua_do_port_transition( spin_lock(&lu_gp->lu_gp_lock); atomic_dec(&lu_gp_mem->lu_gp_mem_ref_cnt); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); } spin_unlock(&lu_gp->lu_gp_lock); @@ -1252,7 +1252,7 @@ int core_alua_do_port_transition( } atomic_dec(&lu_gp->lu_gp_ref_cnt); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); return rc; } diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c index 65001e133670..72618776ede4 100644 --- a/drivers/target/target_core_device.c +++ b/drivers/target/target_core_device.c @@ -225,7 +225,7 @@ struct se_dev_entry *core_get_se_deve_from_rtpi( continue; atomic_inc(&deve->pr_ref_count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); spin_unlock_irq(&nacl->device_list_lock); return deve; @@ -1392,7 +1392,7 @@ int core_dev_add_initiator_node_lun_acl( spin_lock(&lun->lun_acl_lock); list_add_tail(&lacl->lacl_list, &lun->lun_acl_list); atomic_inc(&lun->lun_acl_count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); spin_unlock(&lun->lun_acl_lock); pr_debug("%s_TPG[%hu]_LUN[%u->%u] - Added %s ACL for " @@ -1426,7 +1426,7 @@ int core_dev_del_initiator_node_lun_acl( spin_lock(&lun->lun_acl_lock); list_del(&lacl->lacl_list); atomic_dec(&lun->lun_acl_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); spin_unlock(&lun->lun_acl_lock); core_disable_device_list_for_node(lun, NULL, lacl->mapped_lun, diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index 9e0232cca92e..7e6b857c6b3f 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -323,7 +323,7 @@ static void iblock_bio_done(struct bio *bio, int err) * Bump the ib_bio_err_cnt and release bio. 
*/ atomic_inc(&ibr->ib_bio_err_cnt); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); } bio_put(bio); diff --git a/drivers/target/target_core_pr.c b/drivers/target/target_core_pr.c index 3013287a2aaa..df357862286e 100644 --- a/drivers/target/target_core_pr.c +++ b/drivers/target/target_core_pr.c @@ -675,7 +675,7 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration( spin_lock(&dev->se_port_lock); list_for_each_entry_safe(port, port_tmp, &dev->dev_sep_list, sep_list) { atomic_inc(&port->sep_tg_pt_ref_cnt); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); spin_unlock(&dev->se_port_lock); spin_lock_bh(&port->sep_alua_lock); @@ -710,7 +710,7 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration( continue; atomic_inc(&deve_tmp->pr_ref_count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); spin_unlock_bh(&port->sep_alua_lock); /* * Grab a configfs group dependency that is released @@ -723,9 +723,9 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration( pr_err("core_scsi3_lunacl_depend" "_item() failed\n"); atomic_dec(&port->sep_tg_pt_ref_cnt); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); atomic_dec(&deve_tmp->pr_ref_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); goto out; } /* @@ -740,9 +740,9 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration( sa_res_key, all_tg_pt, aptpl); if (!pr_reg_atp) { atomic_dec(&port->sep_tg_pt_ref_cnt); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); atomic_dec(&deve_tmp->pr_ref_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); core_scsi3_lunacl_undepend_item(deve_tmp); goto out; } @@ -755,7 +755,7 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration( spin_lock(&dev->se_port_lock); atomic_dec(&port->sep_tg_pt_ref_cnt); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); } spin_unlock(&dev->se_port_lock); @@ -1110,7 +1110,7 @@ static struct t10_pr_registration *__core_scsi3_locate_pr_reg( continue; } atomic_inc(&pr_reg->pr_res_holders); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); spin_unlock(&pr_tmpl->registration_lock); return pr_reg; } @@ -1125,7 +1125,7 @@ static struct t10_pr_registration *__core_scsi3_locate_pr_reg( continue; atomic_inc(&pr_reg->pr_res_holders); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); spin_unlock(&pr_tmpl->registration_lock); return pr_reg; } @@ -1155,7 +1155,7 @@ static struct t10_pr_registration *core_scsi3_locate_pr_reg( static void core_scsi3_put_pr_reg(struct t10_pr_registration *pr_reg) { atomic_dec(&pr_reg->pr_res_holders); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); } static int core_scsi3_check_implicit_release( @@ -1349,7 +1349,7 @@ static void core_scsi3_tpg_undepend_item(struct se_portal_group *tpg) &tpg->tpg_group.cg_item); atomic_dec(&tpg->tpg_pr_ref_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); } static int core_scsi3_nodeacl_depend_item(struct se_node_acl *nacl) @@ -1369,7 +1369,7 @@ static void core_scsi3_nodeacl_undepend_item(struct se_node_acl *nacl) if (nacl->dynamic_node_acl) { atomic_dec(&nacl->acl_pr_ref_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); return; } @@ -1377,7 +1377,7 @@ static void core_scsi3_nodeacl_undepend_item(struct se_node_acl *nacl) &nacl->acl_group.cg_item); atomic_dec(&nacl->acl_pr_ref_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); } static int core_scsi3_lunacl_depend_item(struct se_dev_entry *se_deve) @@ -1408,7 +1408,7 @@ static void 
core_scsi3_lunacl_undepend_item(struct se_dev_entry *se_deve) */ if (!lun_acl) { atomic_dec(&se_deve->pr_ref_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); return; } nacl = lun_acl->se_lun_nacl; @@ -1418,7 +1418,7 @@ static void core_scsi3_lunacl_undepend_item(struct se_dev_entry *se_deve) &lun_acl->se_lun_group.cg_item); atomic_dec(&se_deve->pr_ref_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); } static sense_reason_t @@ -1552,14 +1552,14 @@ core_scsi3_decode_spec_i_port( continue; atomic_inc(&tmp_tpg->tpg_pr_ref_count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); spin_unlock(&dev->se_port_lock); if (core_scsi3_tpg_depend_item(tmp_tpg)) { pr_err(" core_scsi3_tpg_depend_item()" " for tmp_tpg\n"); atomic_dec(&tmp_tpg->tpg_pr_ref_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; goto out_unmap; } @@ -1573,7 +1573,7 @@ core_scsi3_decode_spec_i_port( tmp_tpg, i_str); if (dest_node_acl) { atomic_inc(&dest_node_acl->acl_pr_ref_count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); } spin_unlock_irq(&tmp_tpg->acl_node_lock); @@ -1587,7 +1587,7 @@ core_scsi3_decode_spec_i_port( pr_err("configfs_depend_item() failed" " for dest_node_acl->acl_group\n"); atomic_dec(&dest_node_acl->acl_pr_ref_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); core_scsi3_tpg_undepend_item(tmp_tpg); ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; goto out_unmap; @@ -1647,7 +1647,7 @@ core_scsi3_decode_spec_i_port( pr_err("core_scsi3_lunacl_depend_item()" " failed\n"); atomic_dec(&dest_se_deve->pr_ref_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); core_scsi3_nodeacl_undepend_item(dest_node_acl); core_scsi3_tpg_undepend_item(dest_tpg); ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; @@ -3168,14 +3168,14 @@ core_scsi3_emulate_pro_register_and_move(struct se_cmd *cmd, u64 res_key, continue; atomic_inc(&dest_se_tpg->tpg_pr_ref_count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); spin_unlock(&dev->se_port_lock); if (core_scsi3_tpg_depend_item(dest_se_tpg)) { pr_err("core_scsi3_tpg_depend_item() failed" " for dest_se_tpg\n"); atomic_dec(&dest_se_tpg->tpg_pr_ref_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; goto out_put_pr_reg; } @@ -3273,7 +3273,7 @@ after_iport_check: initiator_str); if (dest_node_acl) { atomic_inc(&dest_node_acl->acl_pr_ref_count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); } spin_unlock_irq(&dest_se_tpg->acl_node_lock); @@ -3289,7 +3289,7 @@ after_iport_check: pr_err("core_scsi3_nodeacl_depend_item() for" " dest_node_acl\n"); atomic_dec(&dest_node_acl->acl_pr_ref_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); dest_node_acl = NULL; ret = TCM_INVALID_PARAMETER_LIST; goto out; @@ -3314,7 +3314,7 @@ after_iport_check: if (core_scsi3_lunacl_depend_item(dest_se_deve)) { pr_err("core_scsi3_lunacl_depend_item() failed\n"); atomic_dec(&dest_se_deve->pr_ref_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); dest_se_deve = NULL; ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; goto out; @@ -3880,7 +3880,7 @@ core_scsi3_pri_read_full_status(struct se_cmd *cmd) add_desc_len = 0; atomic_inc(&pr_reg->pr_res_holders); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); spin_unlock(&pr_tmpl->registration_lock); /* * Determine expected length of $FABRIC_MOD specific @@ -3894,7 +3894,7 @@ core_scsi3_pri_read_full_status(struct se_cmd *cmd) " out of buffer: %d\n", 
cmd->data_length); spin_lock(&pr_tmpl->registration_lock); atomic_dec(&pr_reg->pr_res_holders); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); break; } /* @@ -3956,7 +3956,7 @@ core_scsi3_pri_read_full_status(struct se_cmd *cmd) spin_lock(&pr_tmpl->registration_lock); atomic_dec(&pr_reg->pr_res_holders); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); /* * Set the ADDITIONAL DESCRIPTOR LENGTH */ diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index d4b98690a736..4badca1cd625 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -736,7 +736,7 @@ void target_qf_do_work(struct work_struct *work) list_for_each_entry_safe(cmd, cmd_tmp, &qf_cmd_list, se_qf_node) { list_del(&cmd->se_qf_node); atomic_dec(&dev->dev_qf_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); pr_debug("Processing %s cmd: %p QUEUE_FULL in work queue" " context: %s\n", cmd->se_tfo->get_fabric_name(), cmd, @@ -1148,7 +1148,7 @@ transport_check_alloc_task_attr(struct se_cmd *cmd) * Dormant to Active status. */ cmd->se_ordered_id = atomic_inc_return(&dev->dev_ordered_id); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); pr_debug("Allocated se_ordered_id: %u for Task Attr: 0x%02x on %s\n", cmd->se_ordered_id, cmd->sam_task_attr, dev->transport->name); @@ -1705,7 +1705,7 @@ static bool target_handle_task_attr(struct se_cmd *cmd) return false; case MSG_ORDERED_TAG: atomic_inc(&dev->dev_ordered_sync); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); pr_debug("Added ORDERED for CDB: 0x%02x to ordered list, " " se_ordered_id: %u\n", @@ -1723,7 +1723,7 @@ static bool target_handle_task_attr(struct se_cmd *cmd) * For SIMPLE and UNTAGGED Task Attribute commands */ atomic_inc(&dev->simple_cmds); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); break; } @@ -1828,7 +1828,7 @@ static void transport_complete_task_attr(struct se_cmd *cmd) if (cmd->sam_task_attr == MSG_SIMPLE_TAG) { atomic_dec(&dev->simple_cmds); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); dev->dev_cur_ordered_id++; pr_debug("Incremented dev->dev_cur_ordered_id: %u for" " SIMPLE: %u\n", dev->dev_cur_ordered_id, @@ -1840,7 +1840,7 @@ static void transport_complete_task_attr(struct se_cmd *cmd) cmd->se_ordered_id); } else if (cmd->sam_task_attr == MSG_ORDERED_TAG) { atomic_dec(&dev->dev_ordered_sync); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); dev->dev_cur_ordered_id++; pr_debug("Incremented dev_cur_ordered_id: %u for ORDERED:" @@ -1899,7 +1899,7 @@ static void transport_handle_queue_full( spin_lock_irq(&dev->qf_cmd_lock); list_add_tail(&cmd->se_qf_node, &cmd->se_dev->qf_cmd_list); atomic_inc(&dev->dev_qf_count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); spin_unlock_irq(&cmd->se_dev->qf_cmd_lock); schedule_work(&cmd->se_dev->qf_work_queue); @@ -2875,7 +2875,7 @@ void transport_send_task_abort(struct se_cmd *cmd) if (cmd->se_tfo->write_pending_status(cmd) != 0) { cmd->transport_state |= CMD_T_ABORTED; cmd->se_cmd_flags |= SCF_SEND_DELAYED_TAS; - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); return; } } diff --git a/drivers/target/target_core_ua.c b/drivers/target/target_core_ua.c index 505519b10cb7..101858e245b3 100644 --- a/drivers/target/target_core_ua.c +++ b/drivers/target/target_core_ua.c @@ -162,7 +162,7 @@ int core_scsi3_ua_allocate( spin_unlock_irq(&nacl->device_list_lock); atomic_inc(&deve->ua_count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); return 0; } 
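
The target-core conversions above are all instances of one reference-count idiom. A minimal sketch of that idiom with the new barrier name, assuming an illustrative object type (struct obj and obj_get() are invented for illustration, not code from the patch):

#include <linux/atomic.h>

struct obj {
        atomic_t refs;
};

static void obj_get(struct obj *o)
{
        atomic_inc(&o->refs);
        /* was smp_mb__after_atomic_inc(): order the increment before
         * the loads and stores that follow it on this CPU. */
        smp_mb__after_atomic();
}

The matching put side pairs atomic_dec() with the same barrier whenever a later access must not be reordered before the decrement.
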
list_add_tail(&ua->ua_nacl_list, &deve->ua_list); @@ -175,7 +175,7 @@ int core_scsi3_ua_allocate( asc, ascq); atomic_inc(&deve->ua_count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); return 0; } @@ -190,7 +190,7 @@ void core_scsi3_ua_release_all( kmem_cache_free(se_ua_cache, ua); atomic_dec(&deve->ua_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); } spin_unlock(&deve->ua_lock); } @@ -251,7 +251,7 @@ void core_scsi3_ua_for_check_condition( kmem_cache_free(se_ua_cache, ua); atomic_dec(&deve->ua_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); } spin_unlock(&deve->ua_lock); spin_unlock_irq(&nacl->device_list_lock); @@ -310,7 +310,7 @@ int core_scsi3_ua_clear_for_request_sense( kmem_cache_free(se_ua_cache, ua); atomic_dec(&deve->ua_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); } spin_unlock(&deve->ua_lock); spin_unlock_irq(&nacl->device_list_lock); diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c index 41fe8a047d37..746ae80b972f 100644 --- a/drivers/tty/n_tty.c +++ b/drivers/tty/n_tty.c @@ -2041,7 +2041,7 @@ static int canon_copy_from_read_buf(struct tty_struct *tty, if (found) clear_bit(eol, ldata->read_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); ldata->read_tail += c; if (found) { diff --git a/drivers/tty/serial/mxs-auart.c b/drivers/tty/serial/mxs-auart.c index aa97fd845b4d..4b5b3c2fe328 100644 --- a/drivers/tty/serial/mxs-auart.c +++ b/drivers/tty/serial/mxs-auart.c @@ -200,7 +200,7 @@ static void dma_tx_callback(void *param) /* clear the bit used to serialize the DMA tx. */ clear_bit(MXS_AUART_DMA_TX_SYNC, &s->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); /* wake up the possible processes. */ if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) @@ -275,7 +275,7 @@ static void mxs_auart_tx_chars(struct mxs_auart_port *s) mxs_auart_dma_tx(s, i); } else { clear_bit(MXS_AUART_DMA_TX_SYNC, &s->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } return; } diff --git a/drivers/usb/gadget/tcm_usb_gadget.c b/drivers/usb/gadget/tcm_usb_gadget.c index f058c0368d61..819875c7e394 100644 --- a/drivers/usb/gadget/tcm_usb_gadget.c +++ b/drivers/usb/gadget/tcm_usb_gadget.c @@ -1851,7 +1851,7 @@ static int usbg_port_link(struct se_portal_group *se_tpg, struct se_lun *lun) struct usbg_tpg *tpg = container_of(se_tpg, struct usbg_tpg, se_tpg); atomic_inc(&tpg->tpg_port_count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); return 0; } @@ -1861,7 +1861,7 @@ static void usbg_port_unlink(struct se_portal_group *se_tpg, struct usbg_tpg *tpg = container_of(se_tpg, struct usbg_tpg, se_tpg); atomic_dec(&tpg->tpg_port_count); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); } static int usbg_check_stop_free(struct se_cmd *se_cmd) diff --git a/drivers/usb/serial/usb_wwan.c b/drivers/usb/serial/usb_wwan.c index 640fe0173236..f1ec1680e822 100644 --- a/drivers/usb/serial/usb_wwan.c +++ b/drivers/usb/serial/usb_wwan.c @@ -325,7 +325,7 @@ static void usb_wwan_outdat_callback(struct urb *urb) for (i = 0; i < N_OUT_URB; ++i) { if (portdata->out_urbs[i] == urb) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(i, &portdata->out_busy); break; } diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index cf50ce93975b..aeb513108448 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -1255,7 +1255,7 @@ vhost_scsi_set_endpoint(struct vhost_scsi *vs, tpg->tv_tpg_vhost_count++; tpg->vhost_scsi = vs; vs_tpg[tpg->tport_tpgt] = tpg; - smp_mb__after_atomic_inc(); + 
smp_mb__after_atomic(); match = true; } mutex_unlock(&tpg->tv_tpg_mutex); diff --git a/drivers/w1/w1_family.c b/drivers/w1/w1_family.c index 3bff6b37b472..3651ec801f45 100644 --- a/drivers/w1/w1_family.c +++ b/drivers/w1/w1_family.c @@ -139,9 +139,9 @@ void w1_family_get(struct w1_family *f) void __w1_family_get(struct w1_family *f) { - smp_mb__before_atomic_inc(); + smp_mb__before_atomic(); atomic_inc(&f->refcnt); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); } EXPORT_SYMBOL(w1_unregister_family); diff --git a/drivers/xen/xen-pciback/pciback_ops.c b/drivers/xen/xen-pciback/pciback_ops.c index 607e41460c0d..c4a0666de6f5 100644 --- a/drivers/xen/xen-pciback/pciback_ops.c +++ b/drivers/xen/xen-pciback/pciback_ops.c @@ -348,9 +348,9 @@ void xen_pcibk_do_op(struct work_struct *data) notify_remote_via_irq(pdev->evtchn_irq); /* Mark that we're done. */ - smp_mb__before_clear_bit(); /* /after/ clearing PCIF_active */ + smp_mb__before_atomic(); /* /after/ clearing PCIF_active */ clear_bit(_PDEVF_op_active, &pdev->flags); - smp_mb__after_clear_bit(); /* /before/ final check for work */ + smp_mb__after_atomic(); /* /before/ final check for work */ /* Check to see if the driver domain tried to start another request in * between clearing _XEN_PCIF_active and clearing _PDEVF_op_active. diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index c9a24444ec9a..2256e9cceec5 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -279,7 +279,7 @@ static inline void btrfs_inode_block_unlocked_dio(struct inode *inode) static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags); } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3955e475ceec..f29a54e454d4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3458,7 +3458,7 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb, static void end_extent_buffer_writeback(struct extent_buffer *eb) { clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5f805bc944fa..5a3b8371772e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7126,7 +7126,7 @@ static void btrfs_end_dio_bio(struct bio *bio, int err) * before atomic variable goto zero, we must make sure * dip->errors is perceived to be set. */ - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); } /* if there are more bios still pending for this dio, just exit */ @@ -7306,7 +7306,7 @@ out_err: * before atomic variable goto zero, we must * make sure dip->errors is perceived to be set. 
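
The btrfs comment above states the general rule behind many of these sites: a plain store must be ordered before a reference drop that can trigger teardown on another CPU. A hedged sketch of that shape (struct my_dio and complete_dio() are illustrative stand-ins, not the patch's code):

#include <linux/atomic.h>

struct my_dio {
        int errors;
        atomic_t pending_bios;
};

static void complete_dio(struct my_dio *dip)
{
        /* error handling / teardown elided in this sketch */
}

static void dio_bio_failed(struct my_dio *dip)
{
        dip->errors = 1;                /* plain store */
        /* was smp_mb__before_atomic_dec(): whichever CPU sees the
         * counter reach zero must also see ->errors set. */
        smp_mb__before_atomic();
        if (atomic_dec_and_test(&dip->pending_bios))
                complete_dio(dip);
}
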
*/ - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); if (atomic_dec_and_test(&dip->pending_bios)) bio_io_error(dip->orig_bio); @@ -7449,7 +7449,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, return 0; atomic_inc(&inode->i_dio_count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); /* * The generic stuff only does filemap_write_and_wait_range, which diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e79ff6b90cb7..f45040a4bb76 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -642,7 +642,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, return -EINVAL; atomic_inc(&root->will_be_snapshoted); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); btrfs_wait_nocow_write(root); ret = btrfs_start_delalloc_inodes(root, 0); diff --git a/fs/buffer.c b/fs/buffer.c index 9ddb9fc7d923..6a8110c03a47 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -77,7 +77,7 @@ EXPORT_SYMBOL(__lock_buffer); void unlock_buffer(struct buffer_head *bh) { clear_bit_unlock(BH_Lock, &bh->b_state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&bh->b_state, BH_Lock); } EXPORT_SYMBOL(unlock_buffer); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index f3b84cd9de56..08b3c116915b 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -42,7 +42,7 @@ int ext4_resize_begin(struct super_block *sb) void ext4_resize_end(struct super_block *sb) { clear_bit_unlock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb, diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index aec7f73832f0..c355f7320e44 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -277,7 +277,7 @@ static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holde static void gfs2_holder_wake(struct gfs2_holder *gh) { clear_bit(HIF_WAIT, &gh->gh_iflags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&gh->gh_iflags, HIF_WAIT); } @@ -411,7 +411,7 @@ static void gfs2_demote_wake(struct gfs2_glock *gl) { gl->gl_demote_state = LM_ST_EXCLUSIVE; clear_bit(GLF_DEMOTE, &gl->gl_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&gl->gl_flags, GLF_DEMOTE); } @@ -620,7 +620,7 @@ out: out_sched: clear_bit(GLF_LOCK, &gl->gl_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); gl->gl_lockref.count++; if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) gl->gl_lockref.count--; @@ -628,7 +628,7 @@ out_sched: out_unlock: clear_bit(GLF_LOCK, &gl->gl_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); return; } diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 54b66809e818..74d9a3dbf16f 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -221,7 +221,7 @@ static void inode_go_sync(struct gfs2_glock *gl) * Writeback of the data mapping may cause the dirty flag to be set * so we have to clear it again here. 
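
unlock_buffer(), ext4_resize_end() and the gfs2 wake helpers above all share the clear-then-wake idiom; a hedged, generic rendering (the function name is invented for illustration):

#include <linux/bitops.h>
#include <linux/wait.h>

static void clear_and_wake(unsigned long *word, int bit)
{
        clear_bit(bit, word);
        /* was smp_mb__after_clear_bit(): the clear must be visible
         * before wake_up_bit() inspects the waitqueue. */
        smp_mb__after_atomic();
        wake_up_bit(word, bit);
}
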
*/ - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(GLF_DIRTY, &gl->gl_flags); } diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index c1eb555dc588..91f274de1246 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -1134,7 +1134,7 @@ static void gdlm_recover_done(void *arg, struct dlm_slot *slots, int num_slots, queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work, 0); clear_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY); spin_unlock(&ls->ls_recover_spin); } @@ -1271,7 +1271,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table) ls->ls_first = !!test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags); clear_bit(SDF_NOJOURNALID, &sdp->sd_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID); return 0; diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 7ad4094d68c0..fe7a56fb6084 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -587,7 +587,7 @@ fail: gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); done: clear_bit(JDF_RECOVERY, &jd->jd_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&jd->jd_flags, JDF_RECOVERY); } diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index de25d5577e5d..529d9a9eb897 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -333,7 +333,7 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len) set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); else if (val == 0) { clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); gfs2_glock_thaw(sdp); } else { ret = -EINVAL; @@ -482,7 +482,7 @@ static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len) rv = jid = -EINVAL; sdp->sd_lockstruct.ls_jid = jid; clear_bit(SDF_NOJOURNALID, &sdp->sd_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID); out: spin_unlock(&sdp->sd_jindex_spin); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 5f26139a165a..6fac74349856 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -43,7 +43,7 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) clear_buffer_uptodate(bh); if (orig_bh) { clear_bit_unlock(BH_Shadow, &orig_bh->b_state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&orig_bh->b_state, BH_Shadow); } unlock_buffer(bh); @@ -239,7 +239,7 @@ static int journal_submit_data_buffers(journal_t *journal, spin_lock(&journal->j_list_lock); J_ASSERT(jinode->i_transaction == commit_transaction); clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); } spin_unlock(&journal->j_list_lock); @@ -277,7 +277,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal, } spin_lock(&journal->j_list_lock); clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index d9f3d067cd15..4a3d4ef76127 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2032,9 +2032,9 @@ static void nfs_access_free_entry(struct nfs_access_entry *entry) { put_rpccred(entry->cred); kfree(entry); - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); atomic_long_dec(&nfs_access_nr_entries); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); } static void 
nfs_access_free_list(struct list_head *head) @@ -2082,9 +2082,9 @@ nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc) else { remove_lru_entry: list_del_init(&nfsi->access_cache_inode_lru); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } spin_unlock(&inode->i_lock); } @@ -2232,9 +2232,9 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) nfs_access_add_rbtree(inode, cache); /* Update accounting */ - smp_mb__before_atomic_inc(); + smp_mb__before_atomic(); atomic_long_inc(&nfs_access_nr_entries); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); /* Add inode to global LRU list */ if (!test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 0c438973f3c8..e6f7398d2b3c 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1085,7 +1085,7 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) trace_nfs_invalidate_mapping_exit(inode, ret); clear_bit_unlock(NFS_INO_INVALIDATING, bitlock); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(bitlock, NFS_INO_INVALIDATING); out: return ret; diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index efac602edb37..b9c61efe9660 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -789,9 +789,9 @@ static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds) static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(NFS4DS_CONNECTING, &ds->ds_state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING); } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 2349518eef2c..c0583b9bef71 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1140,9 +1140,9 @@ static int nfs4_run_state_manager(void *); static void nfs4_clear_state_manager_bit(struct nfs_client *clp) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING); rpc_wake_up(&clp->cl_rpcwaitq); } diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 2ffebf2081ce..03ed984ab4d8 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -95,7 +95,7 @@ nfs_iocounter_dec(struct nfs_io_counter *c) { if (atomic_dec_and_test(&c->io_count)) { clear_bit(NFS_IO_INPROGRESS, &c->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&c->flags, NFS_IO_INPROGRESS); } } @@ -193,9 +193,9 @@ void nfs_unlock_request(struct nfs_page *req) printk(KERN_ERR "NFS: Invalid unlock attempted\n"); BUG(); } - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(PG_BUSY, &req->wb_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&req->wb_flags, PG_BUSY); } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index cb53d450ae32..fd9536e494bc 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1810,7 +1810,7 @@ static void pnfs_clear_layoutcommitting(struct inode *inode) unsigned long *bitlock = &NFS_I(inode)->flags; clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); } diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 023793909778..c3058a076596 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -275,7 
+275,7 @@ pnfs_get_lseg(struct pnfs_layout_segment *lseg) { if (lseg) { atomic_inc(&lseg->pls_refcount); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); } return lseg; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 9a3b6a4cd6b9..ffb9459f180b 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -405,7 +405,7 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) nfs_pageio_complete(&pgio); clear_bit_unlock(NFS_INO_FLUSHING, bitlock); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(bitlock, NFS_INO_FLUSHING); if (err < 0) @@ -1458,7 +1458,7 @@ static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) static void nfs_commit_clear_lock(struct nfs_inode *nfsi) { clear_bit(NFS_INO_COMMIT, &nfsi->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&nfsi->flags, NFS_INO_COMMIT); } diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index 4b826abb1528..45d4e96a6bac 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -460,9 +460,9 @@ static int write_cnodes(struct ubifs_info *c) * important. */ clear_bit(DIRTY_CNODE, &cnode->flags); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(COW_CNODE, &cnode->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); offs += len; dbg_chk_lpt_sz(c, 1, len); cnode = cnode->cnext; diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index 52a6559275c4..3600994f8411 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -895,9 +895,9 @@ static int write_index(struct ubifs_info *c) * the reason for the second barrier. */ clear_bit(DIRTY_ZNODE, &znode->flags); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(COW_ZNODE, &znode->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); /* * We have marked the znode as clean but have not updated the diff --git a/include/asm-generic/bitops/atomic.h b/include/asm-generic/bitops/atomic.h index 9ae6c34dc191..49673510b484 100644 --- a/include/asm-generic/bitops/atomic.h +++ b/include/asm-generic/bitops/atomic.h @@ -80,7 +80,7 @@ static inline void set_bit(int nr, volatile unsigned long *addr) * * clear_bit() is atomic and may not be reordered. However, it does * not contain a memory barrier, so if it is used for locking purposes, - * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit() + * you should call smp_mb__before_atomic() and/or smp_mb__after_atomic() * in order to ensure changes are visible on other processors. 
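
Concretely, the locking usage that the clear_bit() comment above describes now reads as below -- a minimal sketch, assuming an illustrative lock bit (MY_LOCK_BIT and lock_word are placeholders); the clear_bit_unlock() helper in the next hunk wraps exactly the release half:

#include <linux/bitops.h>
#include <asm/processor.h>

static unsigned long lock_word;         /* illustrative state word */
#define MY_LOCK_BIT     0

static void my_bit_lock(void)
{
        /* test_and_set_bit() implies a full barrier, so the acquire
         * side needs nothing extra. */
        while (test_and_set_bit(MY_LOCK_BIT, &lock_word))
                cpu_relax();
}

static void my_bit_unlock(void)
{
        /* clear_bit() has no implied barrier; fence explicitly, as
         * the documentation above instructs. */
        smp_mb__before_atomic();
        clear_bit(MY_LOCK_BIT, &lock_word);
}
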
*/ static inline void clear_bit(int nr, volatile unsigned long *addr) diff --git a/include/asm-generic/bitops/lock.h b/include/asm-generic/bitops/lock.h index 308a9e22c802..c30266e94806 100644 --- a/include/asm-generic/bitops/lock.h +++ b/include/asm-generic/bitops/lock.h @@ -20,7 +20,7 @@ */ #define clear_bit_unlock(nr, addr) \ do { \ - smp_mb__before_clear_bit(); \ + smp_mb__before_atomic(); \ clear_bit(nr, addr); \ } while (0) diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index c40302f909ce..7cbf837a279c 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -278,7 +278,7 @@ static inline void get_bh(struct buffer_head *bh) static inline void put_bh(struct buffer_head *bh) { - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); atomic_dec(&bh->b_count); } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 9f3c275e053e..ec274e0f4ed2 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -649,7 +649,7 @@ static inline void hd_ref_init(struct hd_struct *part) static inline void hd_struct_get(struct hd_struct *part) { atomic_inc(&part->ref); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); } static inline int hd_struct_try_get(struct hd_struct *part) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index c7bfac1c4a7b..157111043281 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -453,7 +453,7 @@ static inline int tasklet_trylock(struct tasklet_struct *t) static inline void tasklet_unlock(struct tasklet_struct *t) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(TASKLET_STATE_RUN, &(t)->state); } @@ -501,7 +501,7 @@ static inline void tasklet_hi_schedule_first(struct tasklet_struct *t) static inline void tasklet_disable_nosync(struct tasklet_struct *t) { atomic_inc(&t->count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); } static inline void tasklet_disable(struct tasklet_struct *t) @@ -513,13 +513,13 @@ static inline void tasklet_disable(struct tasklet_struct *t) static inline void tasklet_enable(struct tasklet_struct *t) { - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); atomic_dec(&t->count); } static inline void tasklet_hi_enable(struct tasklet_struct *t) { - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); atomic_dec(&t->count); } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7ed3a3aa6604..616415a4fee4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -493,7 +493,7 @@ static inline void napi_disable(struct napi_struct *n) static inline void napi_enable(struct napi_struct *n) { BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(NAPI_STATE_SCHED, &n->state); } diff --git a/include/linux/sched.h b/include/linux/sched.h index 25f54c79f757..010cde3b44cb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2782,10 +2782,8 @@ static inline bool __must_check current_set_polling_and_test(void) /* * Polling state must be visible before we test NEED_RESCHED, * paired by resched_task() - * - * XXX: assumes set/clear bit are identical barrier wise. 
*/ - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); return unlikely(tif_need_resched()); } @@ -2803,7 +2801,7 @@ static inline bool __must_check current_clr_polling_and_test(void) * Polling state must be visible before we test NEED_RESCHED, * paired by resched_task() */ - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); return unlikely(tif_need_resched()); } diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 3a847de83fab..ad7dbe2cfecd 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -142,18 +142,18 @@ struct rpc_task_setup { test_and_set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate) #define rpc_clear_running(t) \ do { \ - smp_mb__before_clear_bit(); \ + smp_mb__before_atomic(); \ clear_bit(RPC_TASK_RUNNING, &(t)->tk_runstate); \ - smp_mb__after_clear_bit(); \ + smp_mb__after_atomic(); \ } while (0) #define RPC_IS_QUEUED(t) test_bit(RPC_TASK_QUEUED, &(t)->tk_runstate) #define rpc_set_queued(t) set_bit(RPC_TASK_QUEUED, &(t)->tk_runstate) #define rpc_clear_queued(t) \ do { \ - smp_mb__before_clear_bit(); \ + smp_mb__before_atomic(); \ clear_bit(RPC_TASK_QUEUED, &(t)->tk_runstate); \ - smp_mb__after_clear_bit(); \ + smp_mb__after_atomic(); \ } while (0) #define RPC_IS_ACTIVATED(t) test_bit(RPC_TASK_ACTIVE, &(t)->tk_runstate) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 3e5efb2b236e..3876f0f1dfd3 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -379,9 +379,9 @@ static inline int xprt_test_and_clear_connected(struct rpc_xprt *xprt) static inline void xprt_clear_connecting(struct rpc_xprt *xprt) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(XPRT_CONNECTING, &xprt->state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } static inline int xprt_connecting(struct rpc_xprt *xprt) @@ -411,9 +411,9 @@ static inline void xprt_clear_bound(struct rpc_xprt *xprt) static inline void xprt_clear_binding(struct rpc_xprt *xprt) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(XPRT_BINDING, &xprt->state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } static inline int xprt_test_and_set_binding(struct rpc_xprt *xprt) diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 1e98b5530425..6f8ab7da27c4 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -191,7 +191,7 @@ static inline void tracehook_notify_resume(struct pt_regs *regs) * pairs with task_work_add()->set_notify_resume() after * hlist_add_head(task->task_works); */ - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); if (unlikely(current->task_works)) task_work_run(); } diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 5679d927562b..624a8a54806d 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1204,7 +1204,7 @@ static inline bool __ip_vs_conn_get(struct ip_vs_conn *cp) /* put back the conn without restarting its timer */ static inline void __ip_vs_conn_put(struct ip_vs_conn *cp) { - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); atomic_dec(&cp->refcnt); } void ip_vs_conn_put(struct ip_vs_conn *cp); @@ -1408,7 +1408,7 @@ static inline void ip_vs_dest_hold(struct ip_vs_dest *dest) static inline void ip_vs_dest_put(struct ip_vs_dest *dest) { - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); atomic_dec(&dest->refcnt); } diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 2956c8da1605..1adf62b39b96 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ 
-534,7 +534,7 @@ return_normal: kgdb_info[cpu].exception_state &= ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); kgdb_info[cpu].enter_kgdb--; - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); atomic_dec(&slaves_in_kgdb); dbg_touch_watchdogs(); local_irq_restore(flags); @@ -662,7 +662,7 @@ kgdb_restore: kgdb_info[cpu].exception_state &= ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); kgdb_info[cpu].enter_kgdb--; - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); atomic_dec(&masters_in_kgdb); /* Free kgdb_active */ atomic_set(&kgdb_active, -1); diff --git a/kernel/futex.c b/kernel/futex.c index 5f589279e462..b991ec05b8f9 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -267,7 +267,7 @@ static inline void futex_get_mm(union futex_key *key) * get_futex_key() implies a full barrier. This is relied upon * as full barrier (B), see the ordering comment above. */ - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); } /* @@ -280,7 +280,7 @@ static inline void hb_waiters_inc(struct futex_hash_bucket *hb) /* * Full barrier (A), see the ordering comment above. */ - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); #endif } diff --git a/kernel/kmod.c b/kernel/kmod.c index 6b375af4958d..0ac67a5861c5 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -498,7 +498,7 @@ int __usermodehelper_disable(enum umh_disable_depth depth) static void helper_lock(void) { atomic_inc(&running_helpers); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); } static void helper_unlock(void) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0c47e300210a..88b4a1dcb58c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -387,9 +387,9 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, } rcu_prepare_for_idle(smp_processor_id()); /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ - smp_mb__before_atomic_inc(); /* See above. */ + smp_mb__before_atomic(); /* See above. */ atomic_inc(&rdtp->dynticks); - smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ + smp_mb__after_atomic(); /* Force ordering with next sojourn. */ WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); /* @@ -507,10 +507,10 @@ void rcu_irq_exit(void) static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, int user) { - smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ + smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */ atomic_inc(&rdtp->dynticks); /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ - smp_mb__after_atomic_inc(); /* See above. */ + smp_mb__after_atomic(); /* See above. */ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); rcu_cleanup_after_idle(smp_processor_id()); trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); @@ -635,10 +635,10 @@ void rcu_nmi_enter(void) (atomic_read(&rdtp->dynticks) & 0x1)) return; rdtp->dynticks_nmi_nesting++; - smp_mb__before_atomic_inc(); /* Force delay from prior write. */ + smp_mb__before_atomic(); /* Force delay from prior write. */ atomic_inc(&rdtp->dynticks); /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ - smp_mb__after_atomic_inc(); /* See above. */ + smp_mb__after_atomic(); /* See above. */ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); } @@ -657,9 +657,9 @@ void rcu_nmi_exit(void) --rdtp->dynticks_nmi_nesting != 0) return; /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ - smp_mb__before_atomic_inc(); /* See above. */ + smp_mb__before_atomic(); /* See above. 
*/ atomic_inc(&rdtp->dynticks); - smp_mb__after_atomic_inc(); /* Force delay to next write. */ + smp_mb__after_atomic(); /* Force delay to next write. */ WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); } @@ -2790,7 +2790,7 @@ void synchronize_sched_expedited(void) s = atomic_long_read(&rsp->expedited_done); if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { /* ensure test happens before caller kfree */ - smp_mb__before_atomic_inc(); /* ^^^ */ + smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(&rsp->expedited_workdone1); return; } @@ -2808,7 +2808,7 @@ void synchronize_sched_expedited(void) s = atomic_long_read(&rsp->expedited_done); if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { /* ensure test happens before caller kfree */ - smp_mb__before_atomic_inc(); /* ^^^ */ + smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(&rsp->expedited_workdone2); return; } @@ -2837,7 +2837,7 @@ void synchronize_sched_expedited(void) s = atomic_long_read(&rsp->expedited_done); if (ULONG_CMP_GE((ulong)s, (ulong)snap)) { /* ensure test happens before caller kfree */ - smp_mb__before_atomic_inc(); /* ^^^ */ + smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(&rsp->expedited_done_lost); break; } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 962d1d589929..56db2f853e43 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2523,9 +2523,9 @@ static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) /* Record start of fully idle period. */ j = jiffies; ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j; - smp_mb__before_atomic_inc(); + smp_mb__before_atomic(); atomic_inc(&rdtp->dynticks_idle); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1); } @@ -2590,9 +2590,9 @@ static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) } /* Record end of idle period. */ - smp_mb__before_atomic_inc(); + smp_mb__before_atomic(); atomic_inc(&rdtp->dynticks_idle); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1)); /* diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 8b836b376d91..746bc9344969 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -165,7 +165,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) * do a write memory barrier, and then update the count, to * make sure the vector is visible when count is set. */ - smp_mb__before_atomic_inc(); + smp_mb__before_atomic(); atomic_inc(&(vec)->count); do_mb = 1; } @@ -185,14 +185,14 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) * the new priority vec. */ if (do_mb) - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); /* * When removing from the vector, we decrement the counter first * do a memory barrier and then clear the mask. */ atomic_dec(&(vec)->count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); cpumask_clear_cpu(cpu, vec->mask); } diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 7d50f794e248..0ffa20ae657b 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -394,7 +394,7 @@ EXPORT_SYMBOL(__wake_up_bit); * * In order for this to function properly, as it uses waitqueue_active() * internally, some kind of memory barrier must be done prior to calling - * this. Typically, this will be smp_mb__after_clear_bit(), but in some + * this. 
Typically, this will be smp_mb__after_atomic(), but in some * cases where bitflags are manipulated non-atomically under a lock, one * may need to use a less regular barrier, such fs/inode.c's smp_mb(), * because spin_unlock() does not guarantee a memory barrier. diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 09d9591b7708..1706cbbdf5f0 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -557,7 +557,7 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync) bit = sync ? BDI_sync_congested : BDI_async_congested; if (test_and_clear_bit(bit, &bdi->state)) atomic_dec(&nr_bdi_congested[sync]); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); if (waitqueue_active(wqh)) wake_up(wqh); } diff --git a/mm/filemap.c b/mm/filemap.c index a82fbe4c9e8e..c73535c914cc 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -740,7 +740,7 @@ void unlock_page(struct page *page) { VM_BUG_ON_PAGE(!PageLocked(page), page); clear_bit_unlock(PG_locked, &page->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_page(page, PG_locked); } EXPORT_SYMBOL(unlock_page); @@ -757,7 +757,7 @@ void end_page_writeback(struct page *page) if (!test_clear_page_writeback(page)) BUG(); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_page(page, PG_writeback); } EXPORT_SYMBOL(end_page_writeback); diff --git a/net/atm/pppoatm.c b/net/atm/pppoatm.c index 8c93267ce969..c4e09846d1de 100644 --- a/net/atm/pppoatm.c +++ b/net/atm/pppoatm.c @@ -252,7 +252,7 @@ static int pppoatm_may_send(struct pppoatm_vcc *pvcc, int size) * we need to ensure there's a memory barrier after it. The bit * *must* be set before we do the atomic_inc() on pvcc->inflight. * There's no smp_mb__after_set_bit(), so it's this or abuse - * smp_mb__after_clear_bit(). + * smp_mb__after_atomic(). */ test_and_set_bit(BLOCKED, &pvcc->blocked); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 49774912cb01..74014420b3c7 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -45,7 +45,7 @@ static void hci_cc_inquiry_cancel(struct hci_dev *hdev, struct sk_buff *skb) return; clear_bit(HCI_INQUIRY, &hdev->flags); - smp_mb__after_clear_bit(); /* wake_up_bit advises about this barrier */ + smp_mb__after_atomic(); /* wake_up_bit advises about this barrier */ wake_up_bit(&hdev->flags, HCI_INQUIRY); hci_conn_check_pending(hdev); @@ -1768,7 +1768,7 @@ static void hci_inquiry_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) if (!test_and_clear_bit(HCI_INQUIRY, &hdev->flags)) return; - smp_mb__after_clear_bit(); /* wake_up_bit advises about this barrier */ + smp_mb__after_atomic(); /* wake_up_bit advises about this barrier */ wake_up_bit(&hdev->flags, HCI_INQUIRY); if (!test_bit(HCI_MGMT, &hdev->dev_flags)) diff --git a/net/core/dev.c b/net/core/dev.c index 5b3042e69f85..e14f1cba591a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1326,7 +1326,7 @@ static int __dev_close_many(struct list_head *head) * dev->stop() will invoke napi_disable() on all of it's * napi_struct instances on this device. */ - smp_mb__after_clear_bit(); /* Commit netif_running(). */ + smp_mb__after_atomic(); /* Commit netif_running(). 
*/ } dev_deactivate_many(head); @@ -3343,7 +3343,7 @@ static void net_tx_action(struct softirq_action *h) root_lock = qdisc_lock(q); if (spin_trylock(root_lock)) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(__QDISC_STATE_SCHED, &q->state); qdisc_run(q); @@ -3353,7 +3353,7 @@ static void net_tx_action(struct softirq_action *h) &q->state)) { __netif_reschedule(q); } else { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(__QDISC_STATE_SCHED, &q->state); } @@ -4244,7 +4244,7 @@ void __napi_complete(struct napi_struct *n) BUG_ON(n->gro_list); list_del(&n->poll_list); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(NAPI_STATE_SCHED, &n->state); } EXPORT_SYMBOL(__napi_complete); diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 9c3a839322ba..bd0767e6b2b3 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -147,7 +147,7 @@ static void linkwatch_do_dev(struct net_device *dev) * Make sure the above read is complete since it can be * rewritten as soon as we clear the bit below. */ - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); /* We are about to handle this device, * so new events can be accepted diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 48f424465112..56cd458a1b8c 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -522,7 +522,7 @@ EXPORT_SYMBOL_GPL(inet_getpeer); void inet_putpeer(struct inet_peer *p) { p->dtime = (__u32)jiffies; - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); atomic_dec(&p->refcnt); } EXPORT_SYMBOL_GPL(inet_putpeer); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 025e25093984..366cf06587b8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1930,10 +1930,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, /* It is possible TX completion already happened * before we set TSQ_THROTTLED, so we must * test again the condition. 
- * We abuse smp_mb__after_clear_bit() because - * there is no smp_mb__after_set_bit() yet */ - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); if (atomic_read(&sk->sk_wmem_alloc) > limit) break; } diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 75421f2ba8be..1f4f954c4b47 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -914,7 +914,7 @@ void nf_conntrack_free(struct nf_conn *ct) nf_ct_ext_destroy(ct); nf_ct_ext_free(ct); kmem_cache_free(net->ct.nf_conntrack_cachep, ct); - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); atomic_dec(&net->ct.count); } EXPORT_SYMBOL_GPL(nf_conntrack_free); diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index b7ebe23cdedf..d67de453c35a 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -598,7 +598,7 @@ static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, { atomic64_set(&ic->i_ack_next, seq); if (ack_required) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); } } @@ -606,7 +606,7 @@ static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, static u64 rds_ib_get_ack(struct rds_ib_connection *ic) { clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); return atomic64_read(&ic->i_ack_next); } diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c index 45033358358e..aa8bf6786008 100644 --- a/net/rds/iw_recv.c +++ b/net/rds/iw_recv.c @@ -429,7 +429,7 @@ static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq, { atomic64_set(&ic->i_ack_next, seq); if (ack_required) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); } } @@ -437,7 +437,7 @@ static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq, static u64 rds_iw_get_ack(struct rds_iw_connection *ic) { clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); return atomic64_read(&ic->i_ack_next); } diff --git a/net/rds/send.c b/net/rds/send.c index a82fb660ec00..23718160d71e 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -107,7 +107,7 @@ static int acquire_in_xmit(struct rds_connection *conn) static void release_in_xmit(struct rds_connection *conn) { clear_bit(RDS_IN_XMIT, &conn->c_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); /* * We don't use wait_on_bit()/wake_up_bit() because our waking is in a * hot path and finding waiters is very rare. 
We don't want to walk @@ -661,7 +661,7 @@ void rds_send_drop_acked(struct rds_connection *conn, u64 ack, /* order flag updates with spin locks */ if (!list_empty(&list)) - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); spin_unlock_irqrestore(&conn->c_lock, flags); @@ -691,7 +691,7 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) } /* order flag updates with the rs lock */ - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); spin_unlock_irqrestore(&rs->rs_lock, flags); diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index 81cf5a4c5e40..53b17ca0dff5 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -93,7 +93,7 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, rm->m_ack_seq = tc->t_last_sent_nxt + sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len) - 1; - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); set_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags); tc->t_last_expected_una = rm->m_ack_seq + 1; diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 5285ead196c0..247e973544bf 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -296,7 +296,7 @@ static void rpcauth_unhash_cred_locked(struct rpc_cred *cred) { hlist_del_rcu(&cred->cr_hash); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags); } diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 36e431ee1c90..b6e440baccc3 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -143,7 +143,7 @@ gss_cred_set_ctx(struct rpc_cred *cred, struct gss_cl_ctx *ctx) gss_get_ctx(ctx); rcu_assign_pointer(gss_cred->gc_ctx, ctx); set_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(RPCAUTH_CRED_NEW, &cred->cr_flags); } diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c index 3513d559bc45..9761a0da964d 100644 --- a/net/sunrpc/backchannel_rqst.c +++ b/net/sunrpc/backchannel_rqst.c @@ -244,10 +244,10 @@ void xprt_free_bc_request(struct rpc_rqst *req) dprintk("RPC: free backchannel req=%p\n", req); req->rq_connect_cookie = xprt->connect_cookie - 1; - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state)); clear_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); if (!xprt_need_to_requeue(xprt)) { /* diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index d173f79947c6..89d051de6b3e 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -230,9 +230,9 @@ static void xprt_clear_locked(struct rpc_xprt *xprt) { xprt->snd_task = NULL; if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state)) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(XPRT_LOCKED, &xprt->state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } else queue_work(rpciod_workqueue, &xprt->task_cleanup); } diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 25a3dcf15cae..402a7e9a16b7 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -893,11 +893,11 @@ static void xs_close(struct rpc_xprt *xprt) xs_reset_transport(transport); xprt->reestablish_timeout = 0; - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); clear_bit(XPRT_CLOSING, &xprt->state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); xprt_disconnect_done(xprt); } @@ -1497,12 +1497,12 @@ static void 
xs_tcp_cancel_linger_timeout(struct rpc_xprt *xprt) static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); clear_bit(XPRT_CONNECTION_CLOSE, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); clear_bit(XPRT_CLOSING, &xprt->state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } static void xs_sock_mark_closed(struct rpc_xprt *xprt) @@ -1556,10 +1556,10 @@ static void xs_tcp_state_change(struct sock *sk) xprt->connect_cookie++; xprt->reestablish_timeout = 0; set_bit(XPRT_CLOSING, &xprt->state); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(XPRT_CONNECTED, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); xs_tcp_schedule_linger_timeout(xprt, xs_tcp_fin_timeout); break; case TCP_CLOSE_WAIT: @@ -1578,9 +1578,9 @@ static void xs_tcp_state_change(struct sock *sk) case TCP_LAST_ACK: set_bit(XPRT_CLOSING, &xprt->state); xs_tcp_schedule_linger_timeout(xprt, xs_tcp_fin_timeout); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(XPRT_CONNECTED, &xprt->state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); break; case TCP_CLOSE: xs_tcp_cancel_linger_timeout(xprt); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index bb7e8ba821f4..749f80c21e22 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1207,7 +1207,7 @@ restart: sk->sk_state = TCP_ESTABLISHED; sock_hold(newsk); - smp_mb__after_atomic_inc(); /* sock_hold() does an atomic_inc() */ + smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */ unix_peer(sk) = newsk; unix_state_unlock(sk); diff --git a/sound/pci/bt87x.c b/sound/pci/bt87x.c index 8546711d12f9..70951fd9b354 100644 --- a/sound/pci/bt87x.c +++ b/sound/pci/bt87x.c @@ -443,7 +443,7 @@ static int snd_bt87x_pcm_open(struct snd_pcm_substream *substream) _error: clear_bit(0, &chip->opened); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); return err; } @@ -458,7 +458,7 @@ static int snd_bt87x_close(struct snd_pcm_substream *substream) chip->substream = NULL; clear_bit(0, &chip->opened); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); return 0; } -- cgit From 1413c03893332366e5b4d1e26f942ada25f3e82a Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 8 Jan 2014 14:21:46 -0500 Subject: lockdep: Increase static allocations Fuzzing a recent kernel with a large configuration hits the static allocation limits and disables lockdep. This patch doubles the limits. Signed-off-by: Sasha Levin Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1389208906-24338-1-git-send-email-sasha.levin@oracle.com Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/lockdep_internals.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 4f560cfedc8f..51c4b24b6328 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -54,9 +54,9 @@ enum { * table (if it's not there yet), and we check it for lock order * conflicts and deadlocks. 
*/ -#define MAX_LOCKDEP_ENTRIES 16384UL +#define MAX_LOCKDEP_ENTRIES 32768UL -#define MAX_LOCKDEP_CHAINS_BITS 15 +#define MAX_LOCKDEP_CHAINS_BITS 16 #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) #define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) @@ -65,7 +65,7 @@ enum { * Stack-trace: tightly packed array of stack backtrace * addresses. Protected by the hash_lock. */ -#define MAX_STACK_TRACE_ENTRIES 262144UL +#define MAX_STACK_TRACE_ENTRIES 524288UL extern struct list_head all_lock_classes; extern struct lock_chain lock_chains[]; -- cgit From 534a3fbb3f37c42ec32de9876681c3195be0b658 Mon Sep 17 00:00:00 2001 From: Daeseok Youn Date: Fri, 18 Apr 2014 09:08:14 +0900 Subject: workqueue: simplify wq_update_unbound_numa() by jumping to use_dfl_pwq if the target cpumask equals wq's When wq_update_unbound_numa() decides that the newly updated cpumask equals the default, it checks whether the current pwq is already the default one and, if so, skips installing it. This extra step is unnecessary; we can always jump to use_dfl_pwq instead. Simplify the code by removing the conditional. This doesn't make any functional difference. Signed-off-by: Daeseok Youn Signed-off-by: Tejun Heo --- kernel/workqueue.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8edc87185427..c3f076f0f8fd 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4103,17 +4103,13 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, * Let's determine what needs to be done. If the target cpumask is * different from wq's, we need to compare it to @pwq's and create * a new one if they don't match. If the target cpumask equals - * wq's, the default pwq should be used. If @pwq is already the - * default one, nothing to do; otherwise, install the default one. + * wq's, the default pwq should be used. */ if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) { if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask)) goto out_unlock; } else { - if (pwq == wq->dfl_pwq) - goto out_unlock; - else - goto use_dfl_pwq; + goto use_dfl_pwq; } mutex_unlock(&wq->mutex); -- cgit From 4104d326b670c2b66f575d2004daa28b2d1b4c8d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 10 Jan 2014 17:01:58 -0500 Subject: ftrace: Remove global function list and call function directly Instead of having a list of global functions that are called, as only one global function is allowed to be enabled at a time, there's no reason to have a list. Instead, simply have all the users of the global ops use the global ops directly, instead of registering their own ftrace_ops. Just switch what function is used before enabling the function tracer. This removes a lot of code as well as the complexity involved with it.
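In essence the patch replaces a list walk with a direct call. A minimal sketch of the before/after shape, using names taken from the diff below (the bodies are illustrative, not verbatim kernel code):

        /* Before: a dedicated callback walked the list of global ops. */
        static void ftrace_global_list_func(unsigned long ip, unsigned long parent_ip,
                                            struct ftrace_ops *op, struct pt_regs *regs)
        {
                do_for_each_ftrace_op(op, ftrace_global_list) {
                        op->func(ip, parent_ip, op, regs);
                } while_for_each_ftrace_op(op);
        }

        /* After: a tracer points the shared global ops at its own callback
         * and registers that directly; there is no list left to walk. */
        ftrace_init_array_ops(tr, function_trace_call);
        register_ftrace_function(tr->ops);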
Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 20 ++--- kernel/trace/ftrace.c | 152 +++++++++++--------------------------- kernel/trace/trace.c | 2 + kernel/trace/trace.h | 19 +++-- kernel/trace/trace_functions.c | 55 +++++--------- kernel/trace/trace_irqsoff.c | 33 ++++----- kernel/trace/trace_sched_wakeup.c | 40 +++++----- kernel/trace/trace_selftest.c | 33 +++++---- 8 files changed, 133 insertions(+), 221 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 9212b017bc72..f0ff2c2453e7 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -62,9 +62,6 @@ typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip, * set in the flags member. * * ENABLED - set/unset when ftrace_ops is registered/unregistered - * GLOBAL - set manualy by ftrace_ops user to denote the ftrace_ops - * is part of the global tracers sharing the same filter - * via set_ftrace_* debugfs files. * DYNAMIC - set when ftrace_ops is registered to denote dynamically * allocated ftrace_ops which need special care * CONTROL - set manualy by ftrace_ops user to denote the ftrace_ops @@ -96,15 +93,14 @@ typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip, */ enum { FTRACE_OPS_FL_ENABLED = 1 << 0, - FTRACE_OPS_FL_GLOBAL = 1 << 1, - FTRACE_OPS_FL_DYNAMIC = 1 << 2, - FTRACE_OPS_FL_CONTROL = 1 << 3, - FTRACE_OPS_FL_SAVE_REGS = 1 << 4, - FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED = 1 << 5, - FTRACE_OPS_FL_RECURSION_SAFE = 1 << 6, - FTRACE_OPS_FL_STUB = 1 << 7, - FTRACE_OPS_FL_INITIALIZED = 1 << 8, - FTRACE_OPS_FL_DELETED = 1 << 9, + FTRACE_OPS_FL_DYNAMIC = 1 << 1, + FTRACE_OPS_FL_CONTROL = 1 << 2, + FTRACE_OPS_FL_SAVE_REGS = 1 << 3, + FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED = 1 << 4, + FTRACE_OPS_FL_RECURSION_SAFE = 1 << 5, + FTRACE_OPS_FL_STUB = 1 << 6, + FTRACE_OPS_FL_INITIALIZED = 1 << 7, + FTRACE_OPS_FL_DELETED = 1 << 8, }; /* diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1fd4b9479210..8f61ef70a297 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -62,7 +62,7 @@ #define FTRACE_HASH_DEFAULT_BITS 10 #define FTRACE_HASH_MAX_BITS 12 -#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL) +#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL) #ifdef CONFIG_DYNAMIC_FTRACE #define INIT_REGEX_LOCK(opsname) \ @@ -103,7 +103,6 @@ static int ftrace_disabled __read_mostly; static DEFINE_MUTEX(ftrace_lock); -static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; @@ -171,23 +170,6 @@ int ftrace_nr_registered_ops(void) return cnt; } -static void -ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs) -{ - int bit; - - bit = trace_test_and_set_recursion(TRACE_GLOBAL_START, TRACE_GLOBAL_MAX); - if (bit < 0) - return; - - do_for_each_ftrace_op(op, ftrace_global_list) { - op->func(ip, parent_ip, op, regs); - } while_for_each_ftrace_op(op); - - trace_clear_recursion(bit); -} - static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) { @@ -237,43 +219,6 @@ static int control_ops_alloc(struct ftrace_ops *ops) return 0; } -static void update_global_ops(void) -{ - ftrace_func_t func = ftrace_global_list_func; - void *private = NULL; - - /* 
The list has its own recursion protection. */ - global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE; - - /* - * If there's only one function registered, then call that - * function directly. Otherwise, we need to iterate over the - * registered callers. - */ - if (ftrace_global_list == &ftrace_list_end || - ftrace_global_list->next == &ftrace_list_end) { - func = ftrace_global_list->func; - private = ftrace_global_list->private; - /* - * As we are calling the function directly. - * If it does not have recursion protection, - * the function_trace_op needs to be updated - * accordingly. - */ - if (!(ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE)) - global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE; - } - - /* If we filter on pids, update to use the pid function */ - if (!list_empty(&ftrace_pids)) { - set_ftrace_pid_function(func); - func = ftrace_pid_func; - } - - global_ops.func = func; - global_ops.private = private; -} - static void ftrace_sync(struct work_struct *work) { /* @@ -301,8 +246,6 @@ static void update_ftrace_function(void) { ftrace_func_t func; - update_global_ops(); - /* * If we are at the end of the list and this ops is * recursion safe and not dynamic and the arch supports passing ops, @@ -314,10 +257,7 @@ static void update_ftrace_function(void) (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) && !FTRACE_FORCE_LIST_FUNC)) { /* Set the ftrace_ops that the arch callback uses */ - if (ftrace_ops_list == &global_ops) - set_function_trace_op = ftrace_global_list; - else - set_function_trace_op = ftrace_ops_list; + set_function_trace_op = ftrace_ops_list; func = ftrace_ops_list->func; } else { /* Just use the default ftrace_ops */ @@ -434,16 +374,9 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if (ops->flags & FTRACE_OPS_FL_DELETED) return -EINVAL; - if (FTRACE_WARN_ON(ops == &global_ops)) - return -EINVAL; - if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) return -EBUSY; - /* We don't support both control and global flags set. 
*/ - if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) - return -EINVAL; - #ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS /* * If the ftrace_ops specifies SAVE_REGS, then it only can be used @@ -461,10 +394,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if (!core_kernel_data((unsigned long)ops)) ops->flags |= FTRACE_OPS_FL_DYNAMIC; - if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - add_ftrace_list_ops(&ftrace_global_list, &global_ops, ops); - ops->flags |= FTRACE_OPS_FL_ENABLED; - } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { + if (ops->flags & FTRACE_OPS_FL_CONTROL) { if (control_ops_alloc(ops)) return -ENOMEM; add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); @@ -484,15 +414,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) return -EBUSY; - if (FTRACE_WARN_ON(ops == &global_ops)) - return -EINVAL; - - if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - ret = remove_ftrace_list_ops(&ftrace_global_list, - &global_ops, ops); - if (!ret) - ops->flags &= ~FTRACE_OPS_FL_ENABLED; - } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { + if (ops->flags & FTRACE_OPS_FL_CONTROL) { ret = remove_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); } else @@ -2128,15 +2050,6 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) ftrace_start_up++; command |= FTRACE_UPDATE_CALLS; - /* ops marked global share the filter hashes */ - if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - ops = &global_ops; - /* Don't update hash if global is already set */ - if (global_start_up) - hash_enable = false; - global_start_up++; - } - ops->flags |= FTRACE_OPS_FL_ENABLED; if (hash_enable) ftrace_hash_rec_enable(ops, 1); @@ -2166,21 +2079,10 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) */ WARN_ON_ONCE(ftrace_start_up < 0); - if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - ops = &global_ops; - global_start_up--; - WARN_ON_ONCE(global_start_up < 0); - /* Don't update hash if global still has users */ - if (global_start_up) { - WARN_ON_ONCE(!ftrace_start_up); - hash_disable = false; - } - } - if (hash_disable) ftrace_hash_rec_disable(ops, 1); - if (ops != &global_ops || !global_start_up) + if (!global_start_up) ops->flags &= ~FTRACE_OPS_FL_ENABLED; command |= FTRACE_UPDATE_CALLS; @@ -3524,10 +3426,6 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, struct ftrace_hash *hash; int ret; - /* All global ops uses the global ops filters */ - if (ops->flags & FTRACE_OPS_FL_GLOBAL) - ops = &global_ops; - if (unlikely(ftrace_disabled)) return -ENODEV; @@ -4462,6 +4360,34 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) #endif /* CONFIG_DYNAMIC_FTRACE */ +__init void ftrace_init_global_array_ops(struct trace_array *tr) +{ + tr->ops = &global_ops; + tr->ops->private = tr; +} + +void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func) +{ + /* If we filter on pids, update to use the pid function */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { + if (WARN_ON(tr->ops->func != ftrace_stub)) + printk("ftrace ops had %pS for function\n", + tr->ops->func); + /* Only the top level instance does pid tracing */ + if (!list_empty(&ftrace_pids)) { + set_ftrace_pid_function(func); + func = ftrace_pid_func; + } + } + tr->ops->func = func; + tr->ops->private = tr; +} + +void ftrace_reset_array_ops(struct trace_array *tr) +{ + tr->ops->func = ftrace_stub; +} + static void ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct 
pt_regs *regs) @@ -4520,9 +4446,16 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, */ preempt_disable_notrace(); do_for_each_ftrace_op(op, ftrace_ops_list) { - if (ftrace_ops_test(op, ip, regs)) + if (ftrace_ops_test(op, ip, regs)) { + if (WARN_ON(!op->func)) { + function_trace_stop = 1; + printk("op=%p %pS\n", op, op); + goto out; + } op->func(ip, parent_ip, op, regs); + } } while_for_each_ftrace_op(op); +out: preempt_enable_notrace(); trace_clear_recursion(bit); } @@ -5076,8 +5009,7 @@ ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state, /* Just a place holder for function graph */ static struct ftrace_ops fgraph_ops __read_mostly = { .func = ftrace_stub, - .flags = FTRACE_OPS_FL_STUB | FTRACE_OPS_FL_GLOBAL | - FTRACE_OPS_FL_RECURSION_SAFE, + .flags = FTRACE_OPS_FL_STUB | FTRACE_OPS_FL_RECURSION_SAFE, }; static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 737b0efa1a62..fdd33aacdf05 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6629,6 +6629,8 @@ __init static int tracer_alloc_buffers(void) */ global_trace.current_trace = &nop_trace; + ftrace_init_global_array_ops(&global_trace); + register_tracer(&nop_trace); /* All seems OK, enable tracing */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2e29d7ba5a52..df5256be64cd 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -416,13 +416,7 @@ enum { TRACE_FTRACE_IRQ_BIT, TRACE_FTRACE_SIRQ_BIT, - /* GLOBAL_BITs must be greater than FTRACE_BITs */ - TRACE_GLOBAL_BIT, - TRACE_GLOBAL_NMI_BIT, - TRACE_GLOBAL_IRQ_BIT, - TRACE_GLOBAL_SIRQ_BIT, - - /* INTERNAL_BITs must be greater than GLOBAL_BITs */ + /* INTERNAL_BITs must be greater than FTRACE_BITs */ TRACE_INTERNAL_BIT, TRACE_INTERNAL_NMI_BIT, TRACE_INTERNAL_IRQ_BIT, @@ -449,9 +443,6 @@ enum { #define TRACE_FTRACE_START TRACE_FTRACE_BIT #define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1) -#define TRACE_GLOBAL_START TRACE_GLOBAL_BIT -#define TRACE_GLOBAL_MAX ((1 << (TRACE_GLOBAL_START + TRACE_CONTEXT_BITS)) - 1) - #define TRACE_LIST_START TRACE_INTERNAL_BIT #define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) @@ -823,6 +814,9 @@ extern int ftrace_is_dead(void); int ftrace_create_function_files(struct trace_array *tr, struct dentry *parent); void ftrace_destroy_function_files(struct trace_array *tr); +void ftrace_init_global_array_ops(struct trace_array *tr); +void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); +void ftrace_reset_array_ops(struct trace_array *tr); #else static inline int ftrace_trace_task(struct task_struct *task) { @@ -836,6 +830,11 @@ ftrace_create_function_files(struct trace_array *tr, return 0; } static inline void ftrace_destroy_function_files(struct trace_array *tr) { } +static inline __init void +ftrace_init_global_array_ops(struct trace_array *tr) { } +static inline void ftrace_reset_array_ops(struct trace_array *tr) { } +/* ftace_func_t type is not defined, use macro instead of static inline */ +#define ftrace_init_array_ops(tr, func) do { } while (0) #endif /* CONFIG_FUNCTION_TRACER */ #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index ffd56351b521..2d9482b8f26a 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -26,8 +26,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, static void 
function_stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs); -static struct ftrace_ops trace_ops; -static struct ftrace_ops trace_stack_ops; static struct tracer_flags func_flags; /* Our option */ @@ -83,28 +81,24 @@ void ftrace_destroy_function_files(struct trace_array *tr) static int function_trace_init(struct trace_array *tr) { - struct ftrace_ops *ops; - - if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { - /* There's only one global tr */ - if (!trace_ops.private) { - trace_ops.private = tr; - trace_stack_ops.private = tr; - } + ftrace_func_t func; - if (func_flags.val & TRACE_FUNC_OPT_STACK) - ops = &trace_stack_ops; - else - ops = &trace_ops; - tr->ops = ops; - } else if (!tr->ops) { - /* - * Instance trace_arrays get their ops allocated - * at instance creation. Unless it failed - * the allocation. - */ + /* + * Instance trace_arrays get their ops allocated + * at instance creation. Unless it failed + * the allocation. + */ + if (!tr->ops) return -ENOMEM; - } + + /* Currently only the global instance can do stack tracing */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL && + func_flags.val & TRACE_FUNC_OPT_STACK) + func = function_stack_trace_call; + else + func = function_trace_call; + + ftrace_init_array_ops(tr, func); tr->trace_buffer.cpu = get_cpu(); put_cpu(); @@ -118,6 +112,7 @@ static void function_trace_reset(struct trace_array *tr) { tracing_stop_function_trace(tr); tracing_stop_cmdline_record(); + ftrace_reset_array_ops(tr); } static void function_trace_start(struct trace_array *tr) @@ -199,18 +194,6 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, local_irq_restore(flags); } -static struct ftrace_ops trace_ops __read_mostly = -{ - .func = function_trace_call, - .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, -}; - -static struct ftrace_ops trace_stack_ops __read_mostly = -{ - .func = function_stack_trace_call, - .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, -}; - static struct tracer_opt func_opts[] = { #ifdef CONFIG_STACKTRACE { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, @@ -248,10 +231,10 @@ func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) unregister_ftrace_function(tr->ops); if (set) { - tr->ops = &trace_stack_ops; + tr->ops->func = function_stack_trace_call; register_ftrace_function(tr->ops); } else { - tr->ops = &trace_ops; + tr->ops->func = function_trace_call; register_ftrace_function(tr->ops); } diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 8ff02cbb892f..b5cb047df3e9 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -151,12 +151,6 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, atomic_dec(&data->disabled); } - -static struct ftrace_ops trace_ops __read_mostly = -{ - .func = irqsoff_tracer_call, - .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, -}; #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -531,7 +525,7 @@ void trace_preempt_off(unsigned long a0, unsigned long a1) } #endif /* CONFIG_PREEMPT_TRACER */ -static int register_irqsoff_function(int graph, int set) +static int register_irqsoff_function(struct trace_array *tr, int graph, int set) { int ret; @@ -543,7 +537,7 @@ static int register_irqsoff_function(int graph, int set) ret = register_ftrace_graph(&irqsoff_graph_return, &irqsoff_graph_entry); else - ret = register_ftrace_function(&trace_ops); + ret = register_ftrace_function(tr->ops); if (!ret) 
function_enabled = true; @@ -551,7 +545,7 @@ static int register_irqsoff_function(int graph, int set) return ret; } -static void unregister_irqsoff_function(int graph) +static void unregister_irqsoff_function(struct trace_array *tr, int graph) { if (!function_enabled) return; @@ -559,17 +553,17 @@ static void unregister_irqsoff_function(int graph) if (graph) unregister_ftrace_graph(); else - unregister_ftrace_function(&trace_ops); + unregister_ftrace_function(tr->ops); function_enabled = false; } -static void irqsoff_function_set(int set) +static void irqsoff_function_set(struct trace_array *tr, int set) { if (set) - register_irqsoff_function(is_graph(), 1); + register_irqsoff_function(tr, is_graph(), 1); else - unregister_irqsoff_function(is_graph()); + unregister_irqsoff_function(tr, is_graph()); } static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) @@ -577,7 +571,7 @@ static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) struct tracer *tracer = tr->current_trace; if (mask & TRACE_ITER_FUNCTION) - irqsoff_function_set(set); + irqsoff_function_set(tr, set); return trace_keep_overwrite(tracer, mask, set); } @@ -586,7 +580,7 @@ static int start_irqsoff_tracer(struct trace_array *tr, int graph) { int ret; - ret = register_irqsoff_function(graph, 0); + ret = register_irqsoff_function(tr, graph, 0); if (!ret && tracing_is_enabled()) tracer_enabled = 1; @@ -600,7 +594,7 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph) { tracer_enabled = 0; - unregister_irqsoff_function(graph); + unregister_irqsoff_function(tr, graph); } static void __irqsoff_tracer_init(struct trace_array *tr) @@ -617,7 +611,11 @@ static void __irqsoff_tracer_init(struct trace_array *tr) smp_wmb(); tracing_reset_online_cpus(&tr->trace_buffer); - if (start_irqsoff_tracer(tr, is_graph())) + ftrace_init_array_ops(tr, irqsoff_tracer_call); + + /* Only toplevel instance supports graph tracing */ + if (start_irqsoff_tracer(tr, (tr->flags & TRACE_ARRAY_FL_GLOBAL && + is_graph()))) printk(KERN_ERR "failed to start irqsoff tracer\n"); } @@ -630,6 +628,7 @@ static void irqsoff_tracer_reset(struct trace_array *tr) set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); + ftrace_reset_array_ops(tr); } static void irqsoff_tracer_start(struct trace_array *tr) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index e14da5e97a69..4dd986defa60 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -130,15 +130,9 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, atomic_dec(&data->disabled); preempt_enable_notrace(); } - -static struct ftrace_ops trace_ops __read_mostly = -{ - .func = wakeup_tracer_call, - .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, -}; #endif /* CONFIG_FUNCTION_TRACER */ -static int register_wakeup_function(int graph, int set) +static int register_wakeup_function(struct trace_array *tr, int graph, int set) { int ret; @@ -150,7 +144,7 @@ static int register_wakeup_function(int graph, int set) ret = register_ftrace_graph(&wakeup_graph_return, &wakeup_graph_entry); else - ret = register_ftrace_function(&trace_ops); + ret = register_ftrace_function(tr->ops); if (!ret) function_enabled = true; @@ -158,7 +152,7 @@ static int register_wakeup_function(int graph, int set) return ret; } -static void unregister_wakeup_function(int graph) +static void unregister_wakeup_function(struct trace_array *tr, int graph) { if 
(!function_enabled) return; @@ -166,17 +160,17 @@ static void unregister_wakeup_function(int graph) if (graph) unregister_ftrace_graph(); else - unregister_ftrace_function(&trace_ops); + unregister_ftrace_function(tr->ops); function_enabled = false; } -static void wakeup_function_set(int set) +static void wakeup_function_set(struct trace_array *tr, int set) { if (set) - register_wakeup_function(is_graph(), 1); + register_wakeup_function(tr, is_graph(), 1); else - unregister_wakeup_function(is_graph()); + unregister_wakeup_function(tr, is_graph()); } static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) @@ -184,16 +178,16 @@ static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) struct tracer *tracer = tr->current_trace; if (mask & TRACE_ITER_FUNCTION) - wakeup_function_set(set); + wakeup_function_set(tr, set); return trace_keep_overwrite(tracer, mask, set); } -static int start_func_tracer(int graph) +static int start_func_tracer(struct trace_array *tr, int graph) { int ret; - ret = register_wakeup_function(graph, 0); + ret = register_wakeup_function(tr, graph, 0); if (!ret && tracing_is_enabled()) tracer_enabled = 1; @@ -203,11 +197,11 @@ static int start_func_tracer(int graph) return ret; } -static void stop_func_tracer(int graph) +static void stop_func_tracer(struct trace_array *tr, int graph) { tracer_enabled = 0; - unregister_wakeup_function(graph); + unregister_wakeup_function(tr, graph); } #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -221,12 +215,12 @@ wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) if (!(is_graph() ^ set)) return 0; - stop_func_tracer(!set); + stop_func_tracer(tr, !set); wakeup_reset(wakeup_trace); tracing_max_latency = 0; - return start_func_tracer(set); + return start_func_tracer(tr, set); } static int wakeup_graph_entry(struct ftrace_graph_ent *trace) @@ -587,7 +581,7 @@ static void start_wakeup_tracer(struct trace_array *tr) */ smp_wmb(); - if (start_func_tracer(is_graph())) + if (start_func_tracer(tr, is_graph())) printk(KERN_ERR "failed to start wakeup tracer\n"); return; @@ -600,7 +594,7 @@ fail_deprobe: static void stop_wakeup_tracer(struct trace_array *tr) { tracer_enabled = 0; - stop_func_tracer(is_graph()); + stop_func_tracer(tr, is_graph()); unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); unregister_trace_sched_wakeup_new(probe_wakeup, NULL); unregister_trace_sched_wakeup(probe_wakeup, NULL); @@ -617,6 +611,7 @@ static int __wakeup_tracer_init(struct trace_array *tr) tracing_max_latency = 0; wakeup_trace = tr; + ftrace_init_array_ops(tr, wakeup_tracer_call); start_wakeup_tracer(tr); return 0; } @@ -653,6 +648,7 @@ static void wakeup_tracer_reset(struct trace_array *tr) set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); + ftrace_reset_array_ops(tr); } static void wakeup_tracer_start(struct trace_array *tr) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index e98fca60974f..519d04affe38 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -161,11 +161,6 @@ static struct ftrace_ops test_probe3 = { .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; -static struct ftrace_ops test_global = { - .func = trace_selftest_test_global_func, - .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, -}; - static void print_counts(void) { printk("(%d %d %d %d %d) ", @@ -185,7 +180,7 @@ static void reset_counts(void) trace_selftest_test_dyn_cnt = 0; } -static int 
trace_selftest_ops(int cnt) +static int trace_selftest_ops(struct trace_array *tr, int cnt) { int save_ftrace_enabled = ftrace_enabled; struct ftrace_ops *dyn_ops; @@ -220,7 +215,11 @@ static int trace_selftest_ops(int cnt) register_ftrace_function(&test_probe1); register_ftrace_function(&test_probe2); register_ftrace_function(&test_probe3); - register_ftrace_function(&test_global); + /* First time we are running with main function */ + if (cnt > 1) { + ftrace_init_array_ops(tr, trace_selftest_test_global_func); + register_ftrace_function(tr->ops); + } DYN_FTRACE_TEST_NAME(); @@ -232,8 +231,10 @@ static int trace_selftest_ops(int cnt) goto out; if (trace_selftest_test_probe3_cnt != 1) goto out; - if (trace_selftest_test_global_cnt == 0) - goto out; + if (cnt > 1) { + if (trace_selftest_test_global_cnt == 0) + goto out; + } DYN_FTRACE_TEST_NAME2(); @@ -269,8 +270,10 @@ static int trace_selftest_ops(int cnt) goto out_free; if (trace_selftest_test_probe3_cnt != 3) goto out_free; - if (trace_selftest_test_global_cnt == 0) - goto out; + if (cnt > 1) { + if (trace_selftest_test_global_cnt == 0) + goto out; + } if (trace_selftest_test_dyn_cnt == 0) goto out_free; @@ -295,7 +298,9 @@ static int trace_selftest_ops(int cnt) unregister_ftrace_function(&test_probe1); unregister_ftrace_function(&test_probe2); unregister_ftrace_function(&test_probe3); - unregister_ftrace_function(&test_global); + if (cnt > 1) + unregister_ftrace_function(tr->ops); + ftrace_reset_array_ops(tr); /* Make sure everything is off */ reset_counts(); @@ -388,7 +393,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, } /* Test the ops with global tracing running */ - ret = trace_selftest_ops(1); + ret = trace_selftest_ops(tr, 1); trace->reset(tr); out: @@ -399,7 +404,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, /* Test the ops with global tracing off */ if (!ret) - ret = trace_selftest_ops(2); + ret = trace_selftest_ops(tr, 2); return ret; } -- cgit From 6d9b3fa5e7f663bbfb9d2d80d46136f75319cb28 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 14 Jan 2014 11:28:38 -0500 Subject: tracing: Move tracing_max_latency into trace_array In preparation for letting the latency tracers be used by instances, remove the global tracing_max_latency variable and add a max_latency field to the trace_array that the latency tracers will now use. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 14 ++++++-------- kernel/trace/trace.h | 3 +-- kernel/trace/trace_irqsoff.c | 14 +++++++------- kernel/trace/trace_sched_wakeup.c | 12 ++++++------ kernel/trace/trace_selftest.c | 26 +++++++++++++------------- 5 files changed, 33 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fdd33aacdf05..f5fc56bf0227 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -982,8 +982,6 @@ static arch_spinlock_t ftrace_max_lock = unsigned long __read_mostly tracing_thresh; #ifdef CONFIG_TRACER_MAX_TRACE -unsigned long __read_mostly tracing_max_latency; - /* * Copy the new maximum trace into the separate maximum-trace * structure. 
(this way the maximum trace is permanently saved, @@ -1000,7 +998,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) max_buf->cpu = cpu; max_buf->time_start = data->preempt_timestamp; - max_data->saved_latency = tracing_max_latency; + max_data->saved_latency = tr->max_latency; max_data->critical_start = data->critical_start; max_data->critical_end = data->critical_end; @@ -6328,6 +6326,11 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("tracing_on", 0644, d_tracer, tr, &rb_simple_fops); +#ifdef CONFIG_TRACER_MAX_TRACE + trace_create_file("tracing_max_latency", 0644, d_tracer, + &tr->max_latency, &tracing_max_lat_fops); +#endif + if (ftrace_create_function_files(tr, d_tracer)) WARN(1, "Could not allocate function filter files"); @@ -6353,11 +6356,6 @@ static __init int tracer_init_debugfs(void) init_tracer_debugfs(&global_trace, d_tracer); -#ifdef CONFIG_TRACER_MAX_TRACE - trace_create_file("tracing_max_latency", 0644, d_tracer, - &tracing_max_latency, &tracing_max_lat_fops); -#endif - trace_create_file("tracing_thresh", 0644, d_tracer, &tracing_thresh, &tracing_max_lat_fops); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index df5256be64cd..644a8b533e1d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -190,6 +190,7 @@ struct trace_array { */ struct trace_buffer max_buffer; bool allocated_snapshot; + unsigned long max_latency; #endif int buffer_disabled; #ifdef CONFIG_FTRACE_SYSCALLS @@ -599,8 +600,6 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs); extern unsigned long tracing_thresh; #ifdef CONFIG_TRACER_MAX_TRACE -extern unsigned long tracing_max_latency; - void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index b5cb047df3e9..40aa300d3491 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -170,7 +170,7 @@ irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) for_each_possible_cpu(cpu) per_cpu(tracing_cpu, cpu) = 0; - tracing_max_latency = 0; + tr->max_latency = 0; tracing_reset_online_cpus(&irqsoff_trace->trace_buffer); return start_irqsoff_tracer(irqsoff_trace, set); @@ -297,13 +297,13 @@ static void irqsoff_print_header(struct seq_file *s) /* * Should this new latency be reported/recorded? 
*/ -static int report_latency(cycle_t delta) +static int report_latency(struct trace_array *tr, cycle_t delta) { if (tracing_thresh) { if (delta < tracing_thresh) return 0; } else { - if (delta <= tracing_max_latency) + if (delta <= tr->max_latency) return 0; } return 1; @@ -327,13 +327,13 @@ check_critical_timing(struct trace_array *tr, pc = preempt_count(); - if (!report_latency(delta)) + if (!report_latency(tr, delta)) goto out; raw_spin_lock_irqsave(&max_trace_lock, flags); /* check if we are still the max latency */ - if (!report_latency(delta)) + if (!report_latency(tr, delta)) goto out_unlock; __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); @@ -346,7 +346,7 @@ check_critical_timing(struct trace_array *tr, data->critical_end = parent_ip; if (likely(!is_tracing_stopped())) { - tracing_max_latency = delta; + tr->max_latency = delta; update_max_tr_single(tr, current, cpu); } @@ -605,7 +605,7 @@ static void __irqsoff_tracer_init(struct trace_array *tr) set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); - tracing_max_latency = 0; + tr->max_latency = 0; irqsoff_trace = tr; /* make sure that the tracer is visible */ smp_wmb(); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 4dd986defa60..41e0b8aa78ed 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -218,7 +218,7 @@ wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) stop_func_tracer(tr, !set); wakeup_reset(wakeup_trace); - tracing_max_latency = 0; + tr->max_latency = 0; return start_func_tracer(tr, set); } @@ -344,13 +344,13 @@ static void wakeup_print_header(struct seq_file *s) /* * Should this new latency be reported/recorded? */ -static int report_latency(cycle_t delta) +static int report_latency(struct trace_array *tr, cycle_t delta) { if (tracing_thresh) { if (delta < tracing_thresh) return 0; } else { - if (delta <= tracing_max_latency) + if (delta <= tr->max_latency) return 0; } return 1; @@ -418,11 +418,11 @@ probe_wakeup_sched_switch(void *ignore, T1 = ftrace_now(cpu); delta = T1-T0; - if (!report_latency(delta)) + if (!report_latency(wakeup_trace, delta)) goto out_unlock; if (likely(!is_tracing_stopped())) { - tracing_max_latency = delta; + wakeup_trace->max_latency = delta; update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu); } @@ -609,7 +609,7 @@ static int __wakeup_tracer_init(struct trace_array *tr) set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); - tracing_max_latency = 0; + tr->max_latency = 0; wakeup_trace = tr; ftrace_init_array_ops(tr, wakeup_tracer_call); start_wakeup_tracer(tr); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 519d04affe38..ac3185892960 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -807,7 +807,7 @@ out: int trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) { - unsigned long save_max = tracing_max_latency; + unsigned long save_max = tr->max_latency; unsigned long count; int ret; @@ -819,7 +819,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) } /* reset the max latency */ - tracing_max_latency = 0; + tr->max_latency = 0; /* disable interrupts for a bit */ local_irq_disable(); udelay(100); @@ -846,7 +846,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) ret = -1; } - tracing_max_latency = save_max; + tr->max_latency = save_max; return ret; } @@ -856,7 
+856,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) int trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) { - unsigned long save_max = tracing_max_latency; + unsigned long save_max = tr->max_latency; unsigned long count; int ret; @@ -881,7 +881,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) } /* reset the max latency */ - tracing_max_latency = 0; + tr->max_latency = 0; /* disable preemption for a bit */ preempt_disable(); udelay(100); @@ -908,7 +908,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) ret = -1; } - tracing_max_latency = save_max; + tr->max_latency = save_max; return ret; } @@ -918,7 +918,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) int trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr) { - unsigned long save_max = tracing_max_latency; + unsigned long save_max = tr->max_latency; unsigned long count; int ret; @@ -943,7 +943,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * } /* reset the max latency */ - tracing_max_latency = 0; + tr->max_latency = 0; /* disable preemption and interrupts for a bit */ preempt_disable(); @@ -978,7 +978,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * } /* do the test by disabling interrupts first this time */ - tracing_max_latency = 0; + tr->max_latency = 0; tracing_start(); trace->start(tr); @@ -1009,7 +1009,7 @@ out: tracing_start(); out_no_start: trace->reset(tr); - tracing_max_latency = save_max; + tr->max_latency = save_max; return ret; } @@ -1062,7 +1062,7 @@ static int trace_wakeup_test_thread(void *data) int trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) { - unsigned long save_max = tracing_max_latency; + unsigned long save_max = tr->max_latency; struct task_struct *p; struct completion is_ready; unsigned long count; @@ -1088,7 +1088,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) } /* reset the max latency */ - tracing_max_latency = 0; + tr->max_latency = 0; while (p->on_rq) { /* @@ -1118,7 +1118,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) trace->reset(tr); tracing_start(); - tracing_max_latency = save_max; + tr->max_latency = save_max; /* kill the thread */ kthread_stop(p); -- cgit From 0b9b12c1b884eb34773312f15c194220025e0416 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 14 Jan 2014 10:04:59 -0500 Subject: tracing: Move ftrace_max_lock into trace_array In preparation for having tracers enabled in instances, the max_lock should be unique, as updating the max for one tracer is a separate operation from updating it for another tracer using a different max. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 40 ++++++++++++++-------------------------- kernel/trace/trace.h | 14 ++++++++++++++ kernel/trace/trace_selftest.c | 4 ++-- 3 files changed, 30 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f5fc56bf0227..bb5147a55be5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -963,22 +963,6 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) return cnt; } -/* - * ftrace_max_lock is used to protect the swapping of buffers - * when taking a max snapshot. The buffers themselves are - * protected by per_cpu spinlocks. 
But the action of the swap - * needs its own lock. - * - * This is defined as a arch_spinlock_t in order to help - * with performance when lockdep debugging is enabled. - * - * It is also used in other places outside the update_max_tr - * so it needs to be defined outside of the - * CONFIG_TRACER_MAX_TRACE. - */ -static arch_spinlock_t ftrace_max_lock = - (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; - unsigned long __read_mostly tracing_thresh; #ifdef CONFIG_TRACER_MAX_TRACE @@ -1046,14 +1030,14 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) return; } - arch_spin_lock(&ftrace_max_lock); + arch_spin_lock(&tr->max_lock); buf = tr->trace_buffer.buffer; tr->trace_buffer.buffer = tr->max_buffer.buffer; tr->max_buffer.buffer = buf; __update_max_tr(tr, tsk, cpu); - arch_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&tr->max_lock); } /** @@ -1079,7 +1063,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) return; } - arch_spin_lock(&ftrace_max_lock); + arch_spin_lock(&tr->max_lock); ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu); @@ -1097,7 +1081,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); __update_max_tr(tr, tsk, cpu); - arch_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&tr->max_lock); } #endif /* CONFIG_TRACER_MAX_TRACE */ @@ -1351,7 +1335,7 @@ void tracing_start(void) } /* Prevent the buffers from switching */ - arch_spin_lock(&ftrace_max_lock); + arch_spin_lock(&global_trace.max_lock); buffer = global_trace.trace_buffer.buffer; if (buffer) @@ -1363,7 +1347,7 @@ void tracing_start(void) ring_buffer_record_enable(buffer); #endif - arch_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&global_trace.max_lock); ftrace_start(); out: @@ -1418,7 +1402,7 @@ void tracing_stop(void) goto out; /* Prevent the buffers from switching */ - arch_spin_lock(&ftrace_max_lock); + arch_spin_lock(&global_trace.max_lock); buffer = global_trace.trace_buffer.buffer; if (buffer) @@ -1430,7 +1414,7 @@ void tracing_stop(void) ring_buffer_record_disable(buffer); #endif - arch_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&global_trace.max_lock); out: raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); @@ -3331,7 +3315,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, mutex_lock(&tracing_cpumask_update_lock); local_irq_disable(); - arch_spin_lock(&ftrace_max_lock); + arch_spin_lock(&tr->max_lock); for_each_tracing_cpu(cpu) { /* * Increase/decrease the disabled counter if we are @@ -3348,7 +3332,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu); } } - arch_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&tr->max_lock); local_irq_enable(); cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); @@ -6129,6 +6113,8 @@ static int new_instance_create(const char *name) raw_spin_lock_init(&tr->start_lock); + tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; + tr->current_trace = &nop_trace; INIT_LIST_HEAD(&tr->systems); @@ -6627,6 +6613,8 @@ __init static int tracer_alloc_buffers(void) */ global_trace.current_trace = &nop_trace; + global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; + ftrace_init_global_array_ops(&global_trace); register_tracer(&nop_trace); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 644a8b533e1d..5d2f07d6746c 100644 --- a/kernel/trace/trace.h +++ 
b/kernel/trace/trace.h @@ -192,6 +192,20 @@ struct trace_array { bool allocated_snapshot; unsigned long max_latency; #endif + /* + * max_lock is used to protect the swapping of buffers + * when taking a max snapshot. The buffers themselves are + * protected by per_cpu spinlocks. But the action of the swap + * needs its own lock. + * + * This is defined as a arch_spinlock_t in order to help + * with performance when lockdep debugging is enabled. + * + * It is also used in other places outside the update_max_tr + * so it needs to be defined outside of the + * CONFIG_TRACER_MAX_TRACE. + */ + arch_spinlock_t max_lock; int buffer_disabled; #ifdef CONFIG_FTRACE_SYSCALLS int sys_refcount_enter; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index ac3185892960..1c95cd7a98c8 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -65,7 +65,7 @@ static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count) /* Don't allow flipping of max traces now */ local_irq_save(flags); - arch_spin_lock(&ftrace_max_lock); + arch_spin_lock(&buf->tr->max_lock); cnt = ring_buffer_entries(buf->buffer); @@ -83,7 +83,7 @@ static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count) break; } tracing_on(); - arch_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&buf->tr->max_lock); local_irq_restore(flags); if (count) -- cgit From 65daaca7c6dac4db0ef64f2baac0e448cf5d847f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 14 Jan 2014 07:06:29 -0500 Subject: tracing: Allow wakeup tracers to be used by instances The wakeup and wakeup_rt tracers can now be used by instances. But they may only be used by one instance at a time (including the top level directory). This allows other tracers to run simultaneously with the wakeup tracer.
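The single-user restriction is enforced with a plain claim/release flag, as the diff below shows. Distilled (the -EBUSY path is exactly what the patch adds; the comments here are editorial):

        static bool wakeup_busy;

        static int wakeup_tracer_init(struct trace_array *tr)
        {
                /* Another instance (or the top level) already owns the tracer. */
                if (wakeup_busy)
                        return -EBUSY;

                wakeup_dl = 0;
                wakeup_rt = 0;
                return __wakeup_tracer_init(tr); /* sets wakeup_busy on success */
        }

wakeup_tracer_reset() clears the flag again, releasing the tracer for the next user.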
Signed-off-by: Steven Rostedt --- kernel/trace/trace_sched_wakeup.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 41e0b8aa78ed..1573c03640d2 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -601,6 +601,8 @@ static void stop_wakeup_tracer(struct trace_array *tr) unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL); } +static bool wakeup_busy; + static int __wakeup_tracer_init(struct trace_array *tr) { save_flags = trace_flags; @@ -613,11 +615,16 @@ static int __wakeup_tracer_init(struct trace_array *tr) wakeup_trace = tr; ftrace_init_array_ops(tr, wakeup_tracer_call); start_wakeup_tracer(tr); + + wakeup_busy = true; return 0; } static int wakeup_tracer_init(struct trace_array *tr) { + if (wakeup_busy) + return -EBUSY; + wakeup_dl = 0; wakeup_rt = 0; return __wakeup_tracer_init(tr); @@ -625,6 +632,9 @@ static int wakeup_tracer_init(struct trace_array *tr) static int wakeup_rt_tracer_init(struct trace_array *tr) { + if (wakeup_busy) + return -EBUSY; + wakeup_dl = 0; wakeup_rt = 1; return __wakeup_tracer_init(tr); @@ -632,6 +642,9 @@ static int wakeup_rt_tracer_init(struct trace_array *tr) static int wakeup_dl_tracer_init(struct trace_array *tr) { + if (wakeup_busy) + return -EBUSY; + wakeup_dl = 1; wakeup_rt = 0; return __wakeup_tracer_init(tr); @@ -649,6 +662,7 @@ static void wakeup_tracer_reset(struct trace_array *tr) set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); ftrace_reset_array_ops(tr); + wakeup_busy = false; } static void wakeup_tracer_start(struct trace_array *tr) @@ -680,6 +694,7 @@ static struct tracer wakeup_tracer __read_mostly = #endif .open = wakeup_trace_open, .close = wakeup_trace_close, + .allow_instances = true, .use_max_tr = true, }; @@ -702,6 +717,7 @@ static struct tracer wakeup_rt_tracer __read_mostly = #endif .open = wakeup_trace_open, .close = wakeup_trace_close, + .allow_instances = true, .use_max_tr = true, }; -- cgit From 02f2f7646fd5d0482e805d728601e8e4305ad56f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 14 Jan 2014 09:32:58 -0500 Subject: tracing: Allow irq/preempt tracers to be used by instances The irqsoff, preemptoff and preemptirqsoff tracers can now be used by instances. But they may only be used by one instance at a time (including the top level directory). This allows multiple tracers to run while the irqsoff (and friends) tracer is running simultaneously. 
Signed-off-by: Steven Rostedt --- kernel/trace/trace_sched_wakeup.c is covered above; kernel/trace/trace_irqsoff.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 40aa300d3491..9bb104f748d0 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -597,8 +597,13 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph) unregister_irqsoff_function(tr, graph); } -static void __irqsoff_tracer_init(struct trace_array *tr) +static bool irqsoff_busy; + +static int __irqsoff_tracer_init(struct trace_array *tr) { + if (irqsoff_busy) + return -EBUSY; + save_flags = trace_flags; /* non overwrite screws up the latency tracers */ @@ -617,6 +622,9 @@ static void __irqsoff_tracer_init(struct trace_array *tr) if (start_irqsoff_tracer(tr, (tr->flags & TRACE_ARRAY_FL_GLOBAL && is_graph()))) printk(KERN_ERR "failed to start irqsoff tracer\n"); + + irqsoff_busy = true; + return 0; } static void irqsoff_tracer_reset(struct trace_array *tr) @@ -629,6 +637,8 @@ static void irqsoff_tracer_reset(struct trace_array *tr) set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); ftrace_reset_array_ops(tr); + + irqsoff_busy = false; } static void irqsoff_tracer_start(struct trace_array *tr) @@ -646,8 +656,7 @@ static int irqsoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_IRQS_OFF; - __irqsoff_tracer_init(tr); - return 0; + return __irqsoff_tracer_init(tr); } static struct tracer irqsoff_tracer __read_mostly = { @@ -667,6 +676,7 @@ static struct tracer irqsoff_tracer __read_mostly = #endif .open = irqsoff_trace_open, .close = irqsoff_trace_close, + .allow_instances = true, .use_max_tr = true, }; # define register_irqsoff(trace) register_tracer(&trace) @@ -679,8 +689,7 @@ static int preemptoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_PREEMPT_OFF; - __irqsoff_tracer_init(tr); - return 0; + return __irqsoff_tracer_init(tr); } static struct tracer preemptoff_tracer __read_mostly = @@ -701,6 +710,7 @@ static struct tracer preemptoff_tracer __read_mostly = #endif .open = irqsoff_trace_open, .close = irqsoff_trace_close, + .allow_instances = true, .use_max_tr = true, }; # define register_preemptoff(trace) register_tracer(&trace) @@ -715,8 +725,7 @@ static int preemptirqsoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; - __irqsoff_tracer_init(tr); - return 0; + return __irqsoff_tracer_init(tr); } static struct tracer preemptirqsoff_tracer __read_mostly = @@ -737,6 +746,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly = #endif .open = irqsoff_trace_open, .close = irqsoff_trace_close, + .allow_instances = true, .use_max_tr = true, }; -- cgit From 8275f69f076d8b1ecc6d93305451e5a8f7336d4e Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Sun, 30 Mar 2014 15:31:50 +0200 Subject: ftrace: Statically initialize pm notifier block Instead of initializing the pm notifier block in register_ftrace_graph(), initialize it statically. This saves us some code. Found in the PaX patch, written by the PaX Team. 
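The idiom itself, shown as a hedged standalone sketch (the names here are invented, not taken from the patch): initializing the structure at compile time removes the run-time store and keeps the callback wiring next to the definition:

/* Sketch: compile-time vs. run-time initialization of a callback block. */
#include <stdio.h>

struct nb {
	int (*call)(int event);
};

static int on_event(int event)
{
	return event;
}

/* Fully formed by the compiler; no initialization code is emitted. */
static struct nb suspend_nb = {
	.call = on_event,
};

int main(void)
{
	/* The pattern being removed would instead assign
	 * suspend_nb.call = on_event; at registration time. */
	printf("%d\n", suspend_nb.call(42));
	return 0;
}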
Link: http://lkml.kernel.org/p/1396186310-3156-1-git-send-email-minipli@googlemail.com Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: PaX Team Signed-off-by: Mathias Krause Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8f61ef70a297..846888ea2ba4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4860,7 +4860,6 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_FUNCTION_GRAPH_TRACER static int ftrace_graph_active; -static struct notifier_block ftrace_suspend_notifier; int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) { @@ -5036,6 +5035,10 @@ static void update_function_graph_func(void) ftrace_graph_entry = ftrace_graph_entry_test; } +static struct notifier_block ftrace_suspend_notifier = { + .notifier_call = ftrace_suspend_notifier_call, +}; + int register_ftrace_graph(trace_func_graph_ret_t retfunc, trace_func_graph_ent_t entryfunc) { @@ -5049,7 +5052,6 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, goto out; } - ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call; register_pm_notifier(&ftrace_suspend_notifier); ftrace_graph_active++; -- cgit From ad1438a076e275b70d1a04de1364bc483e5a81db Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Thu, 17 Apr 2014 21:44:42 +0200 Subject: tracing: Add static to local functions This patch adds static to the following functions: -cycle_t buffer_ftrace_now -void free_snapshot -int trace_selftest_startup_dynamic_tracing Link: http://lkml.kernel.org/p/20140417214442.d7abc7c0b0e4b90e7fedecc9@skynet.be Signed-off-by: Fabian Frederick Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 ++-- kernel/trace/trace_selftest.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bb5147a55be5..cb41e98cc64b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -275,7 +275,7 @@ int call_filter_check_discard(struct ftrace_event_call *call, void *rec, } EXPORT_SYMBOL_GPL(call_filter_check_discard); -cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) +static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) { u64 ts; @@ -599,7 +599,7 @@ static int alloc_snapshot(struct trace_array *tr) return 0; } -void free_snapshot(struct trace_array *tr) +static void free_snapshot(struct trace_array *tr) { /* * We don't free the ring buffer. instead, resize it because diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 1c95cd7a98c8..5ef60499dc8e 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -320,9 +320,9 @@ static int trace_selftest_ops(struct trace_array *tr, int cnt) } /* Test dynamic code modification and ftrace filters */ -int trace_selftest_startup_dynamic_tracing(struct tracer *trace, - struct trace_array *tr, - int (*func)(void)) +static int trace_selftest_startup_dynamic_tracing(struct tracer *trace, + struct trace_array *tr, + int (*func)(void)) { int save_ftrace_enabled = ftrace_enabled; unsigned long count; -- cgit From c04ae71c9c264312a6f57d2665a79f7bbccf8758 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Tue, 8 Apr 2014 17:33:19 -0700 Subject: sched_clock: Remove deprecated setup_sched_clock() API Remove the 32-bit only setup_sched_clock() API now that all users have been converted to the 64-bit friendly sched_clock_register(). 
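For reference, a hedged driver-side sketch of the surviving API (the counter base, 32-bit width, and 24 MHz rate are invented for illustration; only sched_clock_register() is the real interface):

/* Sketch: registering a 64-bit-friendly sched_clock read callback. */
#include <linux/init.h>
#include <linux/io.h>
#include <linux/sched_clock.h>

static void __iomem *counter_base;	/* assumed to be mapped elsewhere */

static u64 notrace my_counter_read(void)
{
	/* free-running 32-bit up-counter; wrap handling is generic */
	return readl_relaxed(counter_base);
}

static void __init my_timer_init(void)
{
	sched_clock_register(my_counter_read, 32, 24000000);
}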
Signed-off-by: Stephen Boyd Signed-off-by: John Stultz --- include/linux/sched_clock.h | 1 - kernel/time/sched_clock.c | 13 ------------- 2 files changed, 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched_clock.h b/include/linux/sched_clock.h index cddf0c2940b6..efa931c5cef1 100644 --- a/include/linux/sched_clock.h +++ b/include/linux/sched_clock.h @@ -14,7 +14,6 @@ extern void sched_clock_postinit(void); static inline void sched_clock_postinit(void) { } #endif -extern void setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate); extern void sched_clock_register(u64 (*read)(void), int bits, unsigned long rate); diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 4d23dc4d8139..445106d2c729 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -49,13 +49,6 @@ static u64 notrace jiffy_sched_clock_read(void) return (u64)(jiffies - INITIAL_JIFFIES); } -static u32 __read_mostly (*read_sched_clock_32)(void); - -static u64 notrace read_sched_clock_32_wrapper(void) -{ - return read_sched_clock_32(); -} - static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) @@ -176,12 +169,6 @@ void __init sched_clock_register(u64 (*read)(void), int bits, pr_debug("Registered %pF as sched_clock source\n", read); } -void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) -{ - read_sched_clock_32 = read; - sched_clock_register(read_sched_clock_32_wrapper, bits, rate); -} - void __init sched_clock_postinit(void) { /* -- cgit From 3a101b8de0d39403b2c7e5c23fd0b005668acf48 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Tue, 22 Apr 2014 21:31:56 -0400 Subject: audit: add netlink audit protocol bind to check capabilities on multicast join Register a netlink per-protocol bind function for audit to check userspace process capabilities before allowing a multicast group connection. Signed-off-by: Richard Guy Briggs Signed-off-by: David S. Miller --- include/uapi/linux/capability.h | 7 ++++++- kernel/audit.c | 10 ++++++++++ security/selinux/include/classmap.h | 2 +- 3 files changed, 17 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index 154dd6d3c8fe..12c37a197d24 100644 --- a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -347,7 +347,12 @@ struct vfs_cap_data { #define CAP_BLOCK_SUSPEND 36 -#define CAP_LAST_CAP CAP_BLOCK_SUSPEND +/* Allow reading the audit log via multicast netlink socket */ + +#define CAP_AUDIT_READ 37 + + +#define CAP_LAST_CAP CAP_AUDIT_READ #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP) diff --git a/kernel/audit.c b/kernel/audit.c index 7c2893602d06..223cb746f141 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1076,10 +1076,20 @@ static void audit_receive(struct sk_buff *skb) mutex_unlock(&audit_cmd_mutex); } +/* Run custom bind function on netlink socket group connect or bind requests. 
*/ +static int audit_bind(int group) +{ + if (!capable(CAP_AUDIT_READ)) + return -EPERM; + + return 0; +} + static int __net_init audit_net_init(struct net *net) { struct netlink_kernel_cfg cfg = { .input = audit_receive, + .bind = audit_bind, }; struct audit_net *aunet = net_generic(net, audit_net_id); diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index 14d04e63b1f0..be491a74c1ed 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -147,7 +147,7 @@ struct security_class_mapping secclass_map[] = { { "peer", { "recv", NULL } }, { "capability2", { "mac_override", "mac_admin", "syslog", "wake_alarm", "block_suspend", - NULL } }, + "audit_read", NULL } }, { "kernel_service", { "use_as_override", "create_files_as", NULL } }, { "tun_socket", { COMMON_SOCK_PERMS, "attach_queue", NULL } }, -- cgit From 451f921639fea4600dfb9ab2889332bdcc7b48d3 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Tue, 22 Apr 2014 21:31:57 -0400 Subject: audit: add netlink multicast group for log read Add a netlink multicast socket with one group to kaudit for "best-effort" delivery to read-only userspace clients such as systemd, in addition to the existing bidirectional unicast auditd userspace client. Currently, auditd is intended to use the CAP_AUDIT_CONTROL and CAP_AUDIT_WRITE capabilities, but actually uses CAP_NET_ADMIN. The CAP_AUDIT_READ capability is added for use by read-only AUDIT_NLGRP_READLOG netlink multicast group clients to the kaudit subsystem. This will safely give access to services such as systemd to consume audit logs while ensuring write access remains restricted for integrity. Signed-off-by: Richard Guy Briggs Signed-off-by: David S. Miller --- include/uapi/linux/audit.h | 8 ++++++++ kernel/audit.c | 51 ++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 55 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 11917f747cb4..dfa4c860ccef 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -373,6 +373,14 @@ enum { */ #define AUDIT_MESSAGE_TEXT_MAX 8560 +/* Multicast Netlink socket groups (default up to 32) */ +enum audit_nlgrps { + AUDIT_NLGRP_NONE, /* Group 0 not used */ + AUDIT_NLGRP_READLOG, /* "best effort" read only socket */ + __AUDIT_NLGRP_MAX +}; +#define AUDIT_NLGRP_MAX (__AUDIT_NLGRP_MAX - 1) + struct audit_status { __u32 mask; /* Bit mask for valid entries */ __u32 enabled; /* 1 = enabled, 0 = disabled */ diff --git a/kernel/audit.c b/kernel/audit.c index 223cb746f141..d272cc11dff4 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -423,6 +423,35 @@ static void kauditd_send_skb(struct sk_buff *skb) consume_skb(skb); } +/* + * kauditd_send_multicast_skb - send the skb to multicast userspace listeners + * + * This function doesn't consume an skb as might be expected since it has to + * copy it anyways. + */ +static void kauditd_send_multicast_skb(struct sk_buff *skb) +{ + struct sk_buff *copy; + struct audit_net *aunet = net_generic(&init_net, audit_net_id); + struct sock *sock = aunet->nlsk; + + /* + * The seemingly wasteful skb_copy() rather than bumping the refcount + * using skb_get() is necessary because non-standard mods are made to + * the skb by the original kaudit unicast socket send routine. The + * existing auditd daemon assumes this breakage. 
Fixing this would + * require co-ordinating a change in the established protocol between + * the kaudit kernel subsystem and the auditd userspace code. There is + * no reason for new multicast clients to continue with this + * non-compliance. + */ + copy = skb_copy(skb, GFP_KERNEL); + if (!copy) + return; + + nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL); +} + /* * flush_hold_queue - empty the hold queue if auditd appears * @@ -1090,6 +1119,8 @@ static int __net_init audit_net_init(struct net *net) struct netlink_kernel_cfg cfg = { .input = audit_receive, .bind = audit_bind, + .flags = NL_CFG_F_NONROOT_RECV, + .groups = AUDIT_NLGRP_MAX, }; struct audit_net *aunet = net_generic(net, audit_net_id); @@ -1911,10 +1942,10 @@ out: * audit_log_end - end one audit record * @ab: the audit_buffer * - * The netlink_* functions cannot be called inside an irq context, so - * the audit buffer is placed on a queue and a tasklet is scheduled to - * remove them from the queue outside the irq context. May be called in - * any context. + * netlink_unicast() cannot be called inside an irq context because it blocks + * (last arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed + * on a queue and a tasklet is scheduled to remove them from the queue outside + * the irq context. May be called in any context. */ void audit_log_end(struct audit_buffer *ab) { @@ -1924,6 +1955,18 @@ void audit_log_end(struct audit_buffer *ab) audit_log_lost("rate limit exceeded"); } else { struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); + + kauditd_send_multicast_skb(ab->skb); + + /* + * The original kaudit unicast socket sends up messages with + * nlmsg_len set to the payload length rather than the entire + * message length. This breaks the standard set by netlink. + * The existing auditd daemon assumes this breakage. Fixing + * this would require co-ordinating a change in the established + * protocol between the kaudit kernel subsystem and the auditd + * userspace code. + */ nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN; if (audit_pid) { -- cgit From 7f74ecd788a8b2a122d4d8bdc4d517cc60b8b638 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Tue, 22 Apr 2014 21:31:57 -0400 Subject: audit: send multicast messages only if there are listeners Test first to see if there are any userspace multicast listeners bound to the socket before starting the multicast send work. Signed-off-by: Richard Guy Briggs Signed-off-by: David S. Miller --- kernel/audit.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index d272cc11dff4..33531d72e4a2 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -435,6 +435,9 @@ static void kauditd_send_multicast_skb(struct sk_buff *skb) struct audit_net *aunet = net_generic(&init_net, audit_net_id); struct sock *sock = aunet->nlsk; + if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG)) + return; + /* * The seemingly wasteful skb_copy() rather than bumping the refcount * using skb_get() is necessary because non-standard mods are made to -- cgit From ea8fd3b47ff4ed4b1b5942bf3e0cb8d8f590ec59 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:14 -0400 Subject: cgroup: cgroup_apply_cftypes() shouldn't skip the default hierarchy cgroup_apply_cftypes() skips creating or removing files if the subsystem is attached to the default hierarchy, which leads to missing files in the root of the default hierarchy. 
Skipping made sense when the default hierarchy was dummy; however, now that the default hierarchy is fully functional and planned to be used as the unified hierarchy, it shouldn't be skipped over. Reported-by: Li Zefan Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 11a03d67635a..a6894272353b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2436,10 +2436,6 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) lockdep_assert_held(&cgroup_tree_mutex); - /* don't bother if @ss isn't attached */ - if (ss->root == &cgrp_dfl_root) - return 0; - /* add/rm files for all cgroups created before */ css_for_each_descendant_pre(css, cgroup_css(root, ss)) { struct cgroup *cgrp = css->cgroup; -- cgit From f392e51cd6ae6f6ee5b9b6d611cdc282b4c1711e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:14 -0400 Subject: cgroup: update cgroup->subsys_mask to ->child_subsys_mask and restore cgroup_root->subsys_mask 944196278d3d ("cgroup: move ->subsys_mask from cgroupfs_root to cgroup") moved ->subsys_mask from cgroup_root to cgroup to prepare for the unified hierarchy; however, it turns out that carrying the subsys_mask of the children in the parent, instead of itself, is a lot more natural. This patch restores cgroup_root->subsys_mask and morphs cgroup->subsys_mask into cgroup->child_subsys_mask. * Uses of root->cgrp.subsys_mask are restored to root->subsys_mask. * Remove automatic setting and clearing of cgrp->subsys_mask and instead just inherit ->child_subsys_mask from the parent during cgroup creation. Note that this doesn't affect any current behaviors. * Undo __kill_css() separation. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 7 ++++-- kernel/cgroup.c | 64 +++++++++++++++++++++----------------------------- 2 files changed, 32 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c2515851c1aa..1b5b2fe1b228 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -173,8 +173,8 @@ struct cgroup { */ u64 serial_nr; - /* The bitmask of subsystems attached to this cgroup */ - unsigned long subsys_mask; + /* the bitmask of subsystems enabled on the child cgroups */ + unsigned long child_subsys_mask; /* Private pointers for each registered subsystem */ struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; @@ -282,6 +282,9 @@ enum { struct cgroup_root { struct kernfs_root *kf_root; + /* The bitmask of subsystems attached to this hierarchy */ + unsigned long subsys_mask; + /* Unique id for this hierarchy. */ int hierarchy_id; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a6894272353b..f944619077f4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -529,7 +529,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, * won't change, so no need for locking. */ for_each_subsys(ss, i) { - if (root->cgrp.subsys_mask & (1UL << i)) { + if (root->subsys_mask & (1UL << i)) { /* Subsystem is in this hierarchy. 
So we want * the subsystem state from the new * cgroup */ @@ -742,7 +742,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) BUG_ON(!list_empty(&cgrp->children)); /* Rebind all subsystems back to the default hierarchy */ - rebind_subsystems(&cgrp_dfl_root, cgrp->subsys_mask); + rebind_subsystems(&cgrp_dfl_root, root->subsys_mask); /* * Release all the links from cset_links to this hierarchy's @@ -1050,8 +1050,11 @@ static int rebind_subsystems(struct cgroup_root *dst_root, ss->root = dst_root; css->cgroup = &dst_root->cgrp; - src_root->cgrp.subsys_mask &= ~(1 << ssid); - dst_root->cgrp.subsys_mask |= 1 << ssid; + src_root->subsys_mask &= ~(1 << ssid); + src_root->cgrp.child_subsys_mask &= ~(1 << ssid); + + dst_root->subsys_mask |= 1 << ssid; + dst_root->cgrp.child_subsys_mask |= 1 << ssid; if (ss->bind) ss->bind(css); @@ -1069,7 +1072,7 @@ static int cgroup_show_options(struct seq_file *seq, int ssid; for_each_subsys(ss, ssid) - if (root->cgrp.subsys_mask & (1 << ssid)) + if (root->subsys_mask & (1 << ssid)) seq_printf(seq, ",%s", ss->name); if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) seq_puts(seq, ",sane_behavior"); @@ -1273,12 +1276,12 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) if (ret) goto out_unlock; - if (opts.subsys_mask != root->cgrp.subsys_mask || opts.release_agent) + if (opts.subsys_mask != root->subsys_mask || opts.release_agent) pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); - added_mask = opts.subsys_mask & ~root->cgrp.subsys_mask; - removed_mask = root->cgrp.subsys_mask & ~opts.subsys_mask; + added_mask = opts.subsys_mask & ~root->subsys_mask; + removed_mask = root->subsys_mask & ~opts.subsys_mask; /* Don't allow flags or name to change at remount */ if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || @@ -1535,7 +1538,7 @@ retry: * subsystems) then they must match. */ if ((opts.subsys_mask || opts.none) && - (opts.subsys_mask != root->cgrp.subsys_mask)) { + (opts.subsys_mask != root->subsys_mask)) { if (!name_match) continue; ret = -EBUSY; @@ -3658,8 +3661,6 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) cgroup_get(cgrp); css_get(css->parent); - cgrp->subsys_mask |= 1 << ss->id; - if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && parent->parent) { pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", @@ -3780,13 +3781,15 @@ static long cgroup_create(struct cgroup *parent, const char *name, /* let's create and online css's */ for_each_subsys(ss, ssid) { - if (root->cgrp.subsys_mask & (1 << ssid)) { + if (parent->child_subsys_mask & (1 << ssid)) { err = create_css(cgrp, ss); if (err) goto err_destroy; } } + cgrp->child_subsys_mask = parent->child_subsys_mask; + kernfs_activate(kn); mutex_unlock(&cgroup_mutex); @@ -3882,7 +3885,16 @@ static void css_killed_ref_fn(struct percpu_ref *ref) queue_work(cgroup_destroy_wq, &css->destroy_work); } -static void __kill_css(struct cgroup_subsys_state *css) +/** + * kill_css - destroy a css + * @css: css to destroy + * + * This function initiates destruction of @css by removing cgroup interface + * files and putting its base reference. ->css_offline() will be invoked + * asynchronously once css_tryget() is guaranteed to fail and when the + * reference count reaches zero, @css will be released. 
+ */ +static void kill_css(struct cgroup_subsys_state *css) { lockdep_assert_held(&cgroup_tree_mutex); @@ -3911,28 +3923,6 @@ static void __kill_css(struct cgroup_subsys_state *css) percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); } -/** - * kill_css - destroy a css - * @css: css to destroy - * - * This function initiates destruction of @css by removing cgroup interface - * files and putting its base reference. ->css_offline() will be invoked - * asynchronously once css_tryget() is guaranteed to fail and when the - * reference count reaches zero, @css will be released. - */ -static void kill_css(struct cgroup_subsys_state *css) -{ - struct cgroup *cgrp = css->cgroup; - - lockdep_assert_held(&cgroup_tree_mutex); - - /* if already killed, noop */ - if (cgrp->subsys_mask & (1 << css->ss->id)) { - cgrp->subsys_mask &= ~(1 << css->ss->id); - __kill_css(css); - } -} - /** * cgroup_destroy_locked - the first stage of cgroup destruction * @cgrp: cgroup to be destroyed @@ -4145,7 +4135,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) BUG_ON(online_css(css)); - cgrp_dfl_root.cgrp.subsys_mask |= 1 << ss->id; + cgrp_dfl_root.subsys_mask |= 1 << ss->id; mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); @@ -4302,7 +4292,7 @@ int proc_cgroup_show(struct seq_file *m, void *v) seq_printf(m, "%d:", root->hierarchy_id); for_each_subsys(ss, ssid) - if (root->cgrp.subsys_mask & (1 << ssid)) + if (root->subsys_mask & (1 << ssid)) seq_printf(m, "%s%s", count++ ? "," : "", ss->name); if (strlen(root->name)) seq_printf(m, "%sname=%s", count ? "," : "", -- cgit From aec3dfcb2e43892180ee053e8c260dcdeccf4392 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:14 -0400 Subject: cgroup: introduce effective cgroup_subsys_state In the planned default unified hierarchy, controllers may get dynamically attached to and detached from a cgroup and a cgroup may not have csses for all the controllers associated with the hierarchy. When a cgroup doesn't have its own css for a given controller, the css of the nearest ancestor with the controller enabled will be used, which is called the effective css. This patch introduces cgroup_e_css() and for_each_e_css() to access the effective csses and convert compare_css_sets(), find_existing_css_set() and cgroup_migrate() to use the effective csses so that they can handle cgroups with partial csses correctly. This means that for two css_sets to be considered identical, they should have both matching csses and cgroups. compare_css_sets() already compares both, not for correctness but for optimization. As this now becomes a matter of correctness, update the comments accordingly. For all !default hierarchies, cgroup_e_css() always equals cgroup_css(), so this patch doesn't change behavior. While at it, fix incorrect locking comment for for_each_css(). 
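To make the resolution rule concrete, here is a hedged userspace model (plain C with invented names, not kernel code) of the nearest-enabled-ancestor walk that an effective-css lookup performs:

#include <stdio.h>

struct cg {
	struct cg *parent;
	unsigned long child_subsys_mask;  /* controllers enabled on children */
	const char *name;
};

/* Climb until the parent enables the subsystem for its children; the
 * cgroup reached is the one whose css is effective for the start cgroup. */
static struct cg *effective_cg(struct cg *cgrp, int ssid)
{
	while (cgrp->parent &&
	       !(cgrp->parent->child_subsys_mask & (1UL << ssid)))
		cgrp = cgrp->parent;
	return cgrp;
}

int main(void)
{
	struct cg root = { NULL, 1UL << 0, "root" };  /* subsys 0 enabled on A */
	struct cg a = { &root, 0, "A" };              /* but not below A */
	struct cg b = { &a, 0, "B" };

	/* B has no css of its own for subsys 0, so A's css serves it. */
	printf("effective cgroup for B: %s\n", effective_cg(&b, 0)->name);
	return 0;
}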
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 83 ++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 64 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f944619077f4..4eb2dd1bb5b1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -208,6 +208,34 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, return &cgrp->dummy_css; } +/** + * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest (%NULL returns the dummy_css) + * + * Similar to cgroup_css() but returns the effctive css, which is defined + * as the matching css of the nearest ancestor including self which has @ss + * enabled. If @ss is associated with the hierarchy @cgrp is on, this + * function is guaranteed to return non-NULL css. + */ +static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + lockdep_assert_held(&cgroup_mutex); + + if (!ss) + return &cgrp->dummy_css; + + if (!(cgrp->root->subsys_mask & (1 << ss->id))) + return NULL; + + while (cgrp->parent && + !(cgrp->parent->child_subsys_mask & (1 << ss->id))) + cgrp = cgrp->parent; + + return cgroup_css(cgrp, ss); +} + /* convenient tests for these bits */ static inline bool cgroup_is_dead(const struct cgroup *cgrp) { @@ -273,7 +301,7 @@ static int notify_on_release(const struct cgroup *cgrp) * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end * @cgrp: the target cgroup to iterate css's of * - * Should be called under cgroup_mutex. + * Should be called under cgroup_[tree_]mutex. */ #define for_each_css(css, ssid, cgrp) \ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ @@ -283,6 +311,20 @@ static int notify_on_release(const struct cgroup *cgrp) lockdep_is_held(&cgroup_mutex)))) { } \ else +/** + * for_each_e_css - iterate all effective css's of a cgroup + * @css: the iteration cursor + * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end + * @cgrp: the target cgroup to iterate css's of + * + * Should be called under cgroup_[tree_]mutex. + */ +#define for_each_e_css(css, ssid, cgrp) \ + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ + if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \ + ; \ + else + /** * for_each_subsys - iterate all enabled cgroup subsystems * @ss: the iteration cursor @@ -452,20 +494,20 @@ static bool compare_css_sets(struct css_set *cset, { struct list_head *l1, *l2; - if (memcmp(template, cset->subsys, sizeof(cset->subsys))) { - /* Not all subsystems matched */ + /* + * On the default hierarchy, there can be csets which are + * associated with the same set of cgroups but different csses. + * Let's first ensure that csses match. + */ + if (memcmp(template, cset->subsys, sizeof(cset->subsys))) return false; - } /* * Compare cgroup pointers in order to distinguish between - * different cgroups in heirarchies with no subsystems. We - * could get by with just this check alone (and skip the - * memcmp above) but on most setups the memcmp check will - * avoid the need for this more expensive check on almost all - * candidates. + * different cgroups in hierarchies. As different cgroups may + * share the same effective css, this comparison is always + * necessary. 
*/ - l1 = &cset->cgrp_links; l2 = &old_cset->cgrp_links; while (1) { @@ -530,13 +572,16 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, */ for_each_subsys(ss, i) { if (root->subsys_mask & (1UL << i)) { - /* Subsystem is in this hierarchy. So we want - * the subsystem state from the new - * cgroup */ - template[i] = cgroup_css(cgrp, ss); + /* + * @ss is in this hierarchy, so we want the + * effective css from @cgrp. + */ + template[i] = cgroup_e_css(cgrp, ss); } else { - /* Subsystem is not in this hierarchy, so we - * don't want to change the subsystem state */ + /* + * @ss is not in this hierarchy, so we don't want + * to change the css. + */ template[i] = old_cset->subsys[i]; } } @@ -1969,7 +2014,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, return 0; /* check that we can legitimately attach to the cgroup */ - for_each_css(css, i, cgrp) { + for_each_e_css(css, i, cgrp) { if (css->ss->can_attach) { ret = css->ss->can_attach(css, &tset); if (ret) { @@ -1999,7 +2044,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, */ tset.csets = &tset.dst_csets; - for_each_css(css, i, cgrp) + for_each_e_css(css, i, cgrp) if (css->ss->attach) css->ss->attach(css, &tset); @@ -2007,7 +2052,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, goto out_release_tset; out_cancel_attach: - for_each_css(css, i, cgrp) { + for_each_e_css(css, i, cgrp) { if (css == failed_css) break; if (css->ss->cancel_attach) -- cgit From 2d8f243a5e6efa57fb7c46fe83fafa45b33d0ec2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:15 -0400 Subject: cgroup: implement cgroup->e_csets[] On the default unified hierarchy, a cgroup may be associated with csses of its ancestors, which means that a css of a given cgroup may be associated with css_sets of descendant cgroups. This means that we can't walk all tasks associated with a css by iterating the css_sets associated with the cgroup as there are css_sets which are pointing to the css but linked on the descendants. This patch adds per-subsystem list heads cgroup->e_csets[]. Any css_set which is pointing to a css is linked to css->cgroup->e_csets[$SUBSYS_ID] through css_set->e_cset_node[$SUBSYS_ID]. The lists are protected by css_set_rwsem and will allow us to walk all css_sets associated with a given css so that we can find out all associated tasks. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 18 ++++++++++++++++++ kernel/cgroup.c | 30 ++++++++++++++++++++++++++++-- 2 files changed, 46 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 1b5b2fe1b228..33a0043ef454 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -187,6 +187,15 @@ struct cgroup { */ struct list_head cset_links; + /* + * On the default hierarchy, a css_set for a cgroup with some + * susbsys disabled will point to css's which are associated with + * the closest ancestor which has the subsys enabled. The + * following lists all css_sets which point to this cgroup's css + * for the given subsystem. + */ + struct list_head e_csets[CGROUP_SUBSYS_COUNT]; + /* * Linked list running through all cgroups that can * potentially be reaped by the release agent. 
Protected by @@ -369,6 +378,15 @@ struct css_set { struct cgroup *mg_src_cgrp; struct css_set *mg_dst_cset; + /* + * On the default hierarhcy, ->subsys[ssid] may point to a css + * attached to an ancestor instead of the cgroup this css_set is + * associated with. The following node is anchored at + * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to + * iterate through all css's attached to a given cgroup. + */ + struct list_head e_cset_node[CGROUP_SUBSYS_COUNT]; + /* For RCU-protected deletion */ struct rcu_head rcu_head; }; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4eb2dd1bb5b1..37d966289978 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -425,6 +425,8 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) static void put_css_set_locked(struct css_set *cset, bool taskexit) { struct cgrp_cset_link *link, *tmp_link; + struct cgroup_subsys *ss; + int ssid; lockdep_assert_held(&css_set_rwsem); @@ -432,6 +434,8 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit) return; /* This css_set is dead. unlink it and release cgroup refcounts */ + for_each_subsys(ss, ssid) + list_del(&cset->e_cset_node[ssid]); hash_del(&cset->hlist); css_set_count--; @@ -673,7 +677,9 @@ static struct css_set *find_css_set(struct css_set *old_cset, struct css_set *cset; struct list_head tmp_links; struct cgrp_cset_link *link; + struct cgroup_subsys *ss; unsigned long key; + int ssid; lockdep_assert_held(&cgroup_mutex); @@ -724,10 +730,14 @@ static struct css_set *find_css_set(struct css_set *old_cset, css_set_count++; - /* Add this cgroup group to the hash table */ + /* Add @cset to the hash table */ key = css_set_hash(cset->subsys); hash_add(css_set_table, &cset->hlist, key); + for_each_subsys(ss, ssid) + list_add_tail(&cset->e_cset_node[ssid], + &cset->subsys[ssid]->cgroup->e_csets[ssid]); + up_write(&css_set_rwsem); return cset; @@ -1028,7 +1038,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned long ss_mask) { struct cgroup_subsys *ss; - int ssid, ret; + int ssid, i, ret; lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); @@ -1081,6 +1091,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, for_each_subsys(ss, ssid) { struct cgroup_root *src_root; struct cgroup_subsys_state *css; + struct css_set *cset; if (!(ss_mask & (1 << ssid))) continue; @@ -1095,6 +1106,12 @@ static int rebind_subsystems(struct cgroup_root *dst_root, ss->root = dst_root; css->cgroup = &dst_root->cgrp; + down_write(&css_set_rwsem); + hash_for_each(css_set_table, i, cset, hlist) + list_move_tail(&cset->e_cset_node[ss->id], + &dst_root->cgrp.e_csets[ss->id]); + up_write(&css_set_rwsem); + src_root->subsys_mask &= ~(1 << ssid); src_root->cgrp.child_subsys_mask &= ~(1 << ssid); @@ -1417,6 +1434,9 @@ out_unlock: static void init_cgroup_housekeeping(struct cgroup *cgrp) { + struct cgroup_subsys *ss; + int ssid; + atomic_set(&cgrp->refcnt, 1); INIT_LIST_HEAD(&cgrp->sibling); INIT_LIST_HEAD(&cgrp->children); @@ -1425,6 +1445,9 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); cgrp->dummy_css.cgroup = cgrp; + + for_each_subsys(ss, ssid) + INIT_LIST_HEAD(&cgrp->e_csets[ssid]); } static void init_cgroup_root(struct cgroup_root *root, @@ -4249,6 +4272,9 @@ int __init cgroup_init(void) if (!ss->early_init) cgroup_init_subsys(ss); + list_add_tail(&init_css_set.e_cset_node[ssid], + &cgrp_dfl_root.cgrp.e_csets[ssid]); + /* * cftype registration needs kmalloc and 
can't be done * during early_init. Register base cftypes separately. -- cgit From 3b281afbc3a06cd69c54e6db1a04a8e73997723f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:15 -0400 Subject: cgroup: make css_next_child() skip missing csses css_next_child() walks the children of the specified css. It does this by finding the next cgroup and then returning the requested css. On the default unified hierarchy, a cgroup may not have a css associated with it even if the hierarchy has the subsystem enabled. This patch updates css_next_child() so that it skips children without the requested css associated. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 37d966289978..0edc186cd545 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2708,10 +2708,19 @@ css_next_child(struct cgroup_subsys_state *pos_css, break; } - if (&next->sibling == &cgrp->children) - return NULL; + /* + * @next, if not pointing to the head, can be dereferenced and is + * the next sibling; however, it might have @ss disabled. If so, + * fast-forward to the next enabled one. + */ + while (&next->sibling != &cgrp->children) { + struct cgroup_subsys_state *next_css = cgroup_css(next, parent_css->ss); - return cgroup_css(next, parent_css->ss); + if (next_css) + return next_css; + next = list_entry_rcu(next->sibling.next, struct cgroup, sibling); + } + return NULL; } /** -- cgit From 0f0a2b4fa6210147131082999f1f16d7fb79abf8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:15 -0400 Subject: cgroup: reorganize css_task_iter This patch reorganizes css_task_iter so that adding effective css support is easier. * s/->cset_link/->cset_pos/ and s/->task/->task_pos/ for consistency * ->origin_css is used to determine whether the iteration reached the last css_set. Replace it with explicit ->cset_head so that css_advance_task_iter() doesn't have to know the termination condition directly. * css_task_iter_next() currently assumes that it's walking list of cgrp_cset_link and reaches into the current cset through the current link to determine the termination conditions for task walking. As this won't always be true for effective css walking, add ->tasks_head and ->mg_tasks_head and use them to control task walking so that css_task_iter_next() doesn't have to know how css_sets are being walked. This patch doesn't make any behavior changes. The iteration logic stays unchanged after the patch. 
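The resulting cursor discipline, in a hedged userspace miniature (arrays stand in for the kernel's linked lists; all names are invented): the advance step compares only against the recorded head markers, never against a particular container:

#include <stdio.h>

int main(void)
{
	int tasks[] = { 1, 2 };		/* stands in for cset->tasks */
	int mg_tasks[] = { 3 };		/* stands in for cset->mg_tasks */
	int *pos = tasks;		/* like it->task_pos */
	int *tasks_head = tasks + 2;	/* like it->tasks_head */
	int *mg_head = mg_tasks + 1;	/* like it->mg_tasks_head */

	/* assumes the first list is non-empty, as the kernel iterator does
	 * once css_advance_task_iter() has picked a non-empty cset */
	for (;;) {
		printf("task %d\n", *pos);
		pos++;
		if (pos == tasks_head)
			pos = mg_tasks;	/* chain into the second list */
		if (pos == mg_head)
			break;		/* would advance to the next cset */
	}
	return 0;
}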
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 9 ++++++--- kernel/cgroup.c | 33 +++++++++++++++++---------------- 2 files changed, 23 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 33a0043ef454..bee390586120 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -842,9 +842,12 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, /* A css_task_iter should be treated as an opaque object */ struct css_task_iter { - struct cgroup_subsys_state *origin_css; - struct list_head *cset_link; - struct list_head *task; + struct list_head *cset_pos; + struct list_head *cset_head; + + struct list_head *task_pos; + struct list_head *tasks_head; + struct list_head *mg_tasks_head; }; void css_task_iter_start(struct cgroup_subsys_state *css, diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0edc186cd545..d48163b26196 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2857,27 +2857,30 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, */ static void css_advance_task_iter(struct css_task_iter *it) { - struct list_head *l = it->cset_link; + struct list_head *l = it->cset_pos; struct cgrp_cset_link *link; struct css_set *cset; /* Advance to the next non-empty css_set */ do { l = l->next; - if (l == &it->origin_css->cgroup->cset_links) { - it->cset_link = NULL; + if (l == it->cset_head) { + it->cset_pos = NULL; return; } link = list_entry(l, struct cgrp_cset_link, cset_link); cset = link->cset; } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); - it->cset_link = l; + it->cset_pos = l; if (!list_empty(&cset->tasks)) - it->task = cset->tasks.next; + it->task_pos = cset->tasks.next; else - it->task = cset->mg_tasks.next; + it->task_pos = cset->mg_tasks.next; + + it->tasks_head = &cset->tasks; + it->mg_tasks_head = &cset->mg_tasks; } /** @@ -2903,8 +2906,8 @@ void css_task_iter_start(struct cgroup_subsys_state *css, down_read(&css_set_rwsem); - it->origin_css = css; - it->cset_link = &css->cgroup->cset_links; + it->cset_pos = &css->cgroup->cset_links; + it->cset_head = it->cset_pos; css_advance_task_iter(it); } @@ -2920,12 +2923,10 @@ void css_task_iter_start(struct cgroup_subsys_state *css, struct task_struct *css_task_iter_next(struct css_task_iter *it) { struct task_struct *res; - struct list_head *l = it->task; - struct cgrp_cset_link *link = list_entry(it->cset_link, - struct cgrp_cset_link, cset_link); + struct list_head *l = it->task_pos; /* If the iterator cg is NULL, we have no tasks */ - if (!it->cset_link) + if (!it->cset_pos) return NULL; res = list_entry(l, struct task_struct, cg_list); @@ -2936,13 +2937,13 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) */ l = l->next; - if (l == &link->cset->tasks) - l = link->cset->mg_tasks.next; + if (l == it->tasks_head) + l = it->mg_tasks_head->next; - if (l == &link->cset->mg_tasks) + if (l == it->mg_tasks_head) css_advance_task_iter(it); else - it->task = l; + it->task_pos = l; return res; } -- cgit From 3ebb2b6ef38875b866ec0118bfae7bc52afd0166 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:15 -0400 Subject: cgroup: teach css_task_iter about effective csses Currently, css_task_iter iterates tasks associated with a css by visiting each css_set associated with the owning cgroup and walking tasks of each of them. 
This works fine for !unified hierarchies as each cgroup has its own css for each associated subsystem on the hierarchy; however, on the planned unified hierarchy, a cgroup may not have csses associated and its tasks would be considered associated with the matching css of the nearest ancestor which has the subsystem enabled. This means that on the default unified hierarchy, just walking all tasks associated with a cgroup isn't enough to walk all tasks which are associated with the specified css. If any of its children doesn't have the matching css enabled, task iteration should also include all tasks from the subtree. We already added cgroup->e_csets[] to list all css_sets effectively associated with a given css and walk css_sets on that list instead to achieve such iteration. This patch updates css_task_iter iteration such that it walks css_sets on cgroup->e_csets[] instead of cgroup->cset_links if iteration is requested on an non-dummy css. Thanks to the previous iteration update, this change can be achieved with the addition of css_task_iter->ss and minimal updates to css_advance_task_iter() and css_task_iter_start(). Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 2 ++ kernel/cgroup.c | 18 +++++++++++++++--- 2 files changed, 17 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index bee390586120..18fcae39e63e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -842,6 +842,8 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, /* A css_task_iter should be treated as an opaque object */ struct css_task_iter { + struct cgroup_subsys *ss; + struct list_head *cset_pos; struct list_head *cset_head; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d48163b26196..ad28866ed44c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2868,8 +2868,14 @@ static void css_advance_task_iter(struct css_task_iter *it) it->cset_pos = NULL; return; } - link = list_entry(l, struct cgrp_cset_link, cset_link); - cset = link->cset; + + if (it->ss) { + cset = container_of(l, struct css_set, + e_cset_node[it->ss->id]); + } else { + link = list_entry(l, struct cgrp_cset_link, cset_link); + cset = link->cset; + } } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); it->cset_pos = l; @@ -2906,7 +2912,13 @@ void css_task_iter_start(struct cgroup_subsys_state *css, down_read(&css_set_rwsem); - it->cset_pos = &css->cgroup->cset_links; + it->ss = css->ss; + + if (it->ss) + it->cset_pos = &css->cgroup->e_csets[css->ss->id]; + else + it->cset_pos = &css->cgroup->cset_links; + it->cset_head = it->cset_pos; css_advance_task_iter(it); -- cgit From e32978031016f56be977a9a856ba4d9f447db51f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:15 -0400 Subject: cgroup: cgroup->subsys[] should be cleared after the css is offlined After a css finishes offlining, offline_css() mistakenly performs RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css) which just sets the cgroup->subsys[] pointer to the current value. The intention was to clear it after offline is complete, not reassign the same value. Update it to assign NULL instead of the current value. This makes cgroup_css() to return NULL once offline is complete. All the existing users of the function either can handle NULL return already or guarantee that the css doesn't get offlined. While this is a bugfix, as css lifetime is currently tied to the cgroup it belongs to, this bug doesn't cause any actual problems. 
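In hedged miniature (userspace stand-ins, no RCU semantics), the difference the fix makes is that lookups can use the NULL-ness of the slot to tell that offlining has finished:

#include <stdio.h>

struct css { const char *name; };

static struct css my_css = { "memory" };
static struct css *subsys[4] = { &my_css };

static struct css *lookup(int id)
{
	return subsys[id];	/* cgroup_css()-style accessor */
}

int main(void)
{
	/* buggy teardown: re-stores the current value, lookup still works */
	subsys[0] = subsys[0];
	printf("after buggy offline: %p\n", (void *)lookup(0));

	/* fixed teardown: clear the slot so callers see the css as gone */
	subsys[0] = NULL;
	printf("after fixed offline: %p\n", (void *)lookup(0));
	return 0;
}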
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ad28866ed44c..83a8fff43d68 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3710,7 +3710,7 @@ static void offline_css(struct cgroup_subsys_state *css) css->flags &= ~CSS_ONLINE; css->cgroup->nr_css--; - RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css); + RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); } /** -- cgit From bd53d617b34c781dac8e22dbc75e8f182d918ecf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:16 -0400 Subject: cgroup: allow cgroup creation and suppress automatic css creation in the unified hierarchy Now that effective css handling has been added and iterators updated accordingly, it's safe to allow cgroup creation in the default hierarchy. Unblock cgroup creation in the default hierarchy. As the default hierarchy will implement explicit enabling and disabling of controllers on each cgroup, suppress automatic css enabling on cgroup creation. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 83a8fff43d68..2a4f88db3205 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1115,8 +1115,10 @@ static int rebind_subsystems(struct cgroup_root *dst_root, src_root->subsys_mask &= ~(1 << ssid); src_root->cgrp.child_subsys_mask &= ~(1 << ssid); + /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; - dst_root->cgrp.child_subsys_mask |= 1 << ssid; + if (dst_root != &cgrp_dfl_root) + dst_root->cgrp.child_subsys_mask |= 1 << ssid; if (ss->bind) ss->bind(css); @@ -3786,13 +3788,6 @@ static long cgroup_create(struct cgroup *parent, const char *name, struct cgroup_subsys *ss; struct kernfs_node *kn; - /* - * XXX: The default hierarchy isn't fully implemented yet. Block - * !root cgroup creation on it for now. - */ - if (root == &cgrp_dfl_root) - return -EINVAL; - /* allocate the cgroup and its ID, 0 is reserved for the root */ cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); if (!cgrp) @@ -3878,7 +3873,12 @@ static long cgroup_create(struct cgroup *parent, const char *name, } } - cgrp->child_subsys_mask = parent->child_subsys_mask; + /* + * On the default hierarchy, a child doesn't automatically inherit + * child_subsys_mask from the parent. Each is configured manually. + */ + if (!cgroup_on_dfl(cgrp)) + cgrp->child_subsys_mask = parent->child_subsys_mask; kernfs_activate(kn); -- cgit From 6803c006282768ec850760766a6e4eb1a6ff87df Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:16 -0400 Subject: cgroup: add css_set->dfl_cgrp To implement the unified hierarchy behavior, we'll need to be able to determine the associated cgroup on the default hierarchy from css_set. Let's add css_set->dfl_cgrp so that it can be accessed conveniently and efficiently. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 3 +++ kernel/cgroup.c | 4 ++++ 2 files changed, 7 insertions(+) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 18fcae39e63e..c49d161a71cd 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -354,6 +354,9 @@ struct css_set { */ struct list_head cgrp_links; + /* the default cgroup associated with this css_set */ + struct cgroup *dfl_cgrp; + /* * Set of subsystem states, one for each subsystem. This array is * immutable after creation apart from the init_css_set during diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2a4f88db3205..c66bfc8ee8a7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -651,6 +651,10 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, struct cgrp_cset_link *link; BUG_ON(list_empty(tmp_links)); + + if (cgroup_on_dfl(cgrp)) + cset->dfl_cgrp = cgrp; + link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); link->cset = cset; link->cgrp = cgrp; -- cgit From 7fd8c565d8a501486d63d7ee07fd6582e97db437 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:16 -0400 Subject: cgroup: update subsystem rebind restrictions Because the default root couldn't have any non-root csses attached to it, rebinding away from it was always allowed; however, the default hierarchy will soon host the unified hierarchy and have non-root csses so the rebind restrictions need to be updated accordingly. Instead of special casing rebinding from the default hierarchy and then checking whether the source hierarchy has children cgroups, which implies non-root csses for !dfl hierarchies, simply check whether the source hierarchy has non-root csses for the subsystem using css_next_child(). Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c66bfc8ee8a7..15eb2273d80b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1051,16 +1051,12 @@ static int rebind_subsystems(struct cgroup_root *dst_root, if (!(ss_mask & (1 << ssid))) continue; - /* if @ss is on the dummy_root, we can always move it */ - if (ss->root == &cgrp_dfl_root) - continue; - - /* if @ss has non-root cgroups attached to it, can't move */ - if (!list_empty(&ss->root->cgrp.children)) + /* if @ss has non-root csses attached to it, can't move */ + if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss))) return -EBUSY; /* can't move between two non-dummy roots either */ - if (dst_root != &cgrp_dfl_root) + if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root) return -EBUSY; } -- cgit From f817de98513d060023be4fa1d061b29a6515273e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:16 -0400 Subject: cgroup: prepare migration path for unified hierarchy Unified hierarchy implementation would require re-migrating tasks onto the same cgroup on the default hierarchy to reflect updated effective csses. Update cgroup_migrate_prepare_dst() so that it accepts NULL as the destination cgrp. When NULL is specified, the destination is considered to be the cgroup on the default hierarchy associated with each css_set. After this change, the identity check in cgroup_migrate_add_src() isn't sufficient for noop detection as the associated csses may change without any cgroup association changing. 
The only way to tell whether a migration is noop or not is testing whether the source and destination csets are identical. The noop check in cgroup_migrate_add_src() is removed and cset identity test is added to cgroup_migrate_prepare_dst(). If it's detected that source and destination csets are identical, the cset is removed from @preloaded_csets and all the migration nodes are cleared which makes cgroup_migrate() ignore the cset. Also, make the function append the destination css_sets to @preloaded_csets so that destination css_sets always come after source css_sets. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 15eb2273d80b..8c2835a9e192 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1902,10 +1902,6 @@ static void cgroup_migrate_add_src(struct css_set *src_cset, src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); - /* nothing to do if this cset already belongs to the cgroup */ - if (src_cgrp == dst_cgrp) - return; - if (!list_empty(&src_cset->mg_preload_node)) return; @@ -1920,13 +1916,14 @@ static void cgroup_migrate_add_src(struct css_set *src_cset, /** * cgroup_migrate_prepare_dst - prepare destination css_sets for migration - * @dst_cgrp: the destination cgroup + * @dst_cgrp: the destination cgroup (may be %NULL) * @preloaded_csets: list of preloaded source css_sets * * Tasks are about to be moved to @dst_cgrp and all the source css_sets * have been preloaded to @preloaded_csets. This function looks up and - * pins all destination css_sets, links each to its source, and put them on - * @preloaded_csets. + * pins all destination css_sets, links each to its source, and append them + * to @preloaded_csets. If @dst_cgrp is %NULL, the destination of each + * source css_set is assumed to be its cgroup on the default hierarchy. * * This function must be called after cgroup_migrate_add_src() has been * called on each migration source css_set. After migration is performed @@ -1937,19 +1934,34 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, struct list_head *preloaded_csets) { LIST_HEAD(csets); - struct css_set *src_cset; + struct css_set *src_cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); /* look up the dst cset for each src cset and link it to src */ - list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) { + list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) { struct css_set *dst_cset; - dst_cset = find_css_set(src_cset, dst_cgrp); + dst_cset = find_css_set(src_cset, + dst_cgrp ?: src_cset->dfl_cgrp); if (!dst_cset) goto err; WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); + + /* + * If src cset equals dst, it's noop. Drop the src. + * cgroup_migrate() will skip the cset too. Note that we + * can't handle src == dst as some nodes are used by both. 
+ */ + if (src_cset == dst_cset) { + src_cset->mg_src_cgrp = NULL; + list_del_init(&src_cset->mg_preload_node); + put_css_set(src_cset, false); + put_css_set(dst_cset, false); + continue; + } + src_cset->mg_dst_cset = dst_cset; + if (list_empty(&dst_cset->mg_preload_node)) @@ -1958,7 +1970,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, put_css_set(dst_cset, false); } - list_splice(&csets, preloaded_csets); + list_splice_tail(&csets, preloaded_csets); return 0; err: cgroup_migrate_finish(&csets); -- cgit From f8f22e53a262ebee37fc98004f16b066cf5bc125 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:16 -0400 Subject: cgroup: implement dynamic subtree controller enable/disable on the default hierarchy cgroup is switching away from multiple hierarchies and will use one unified default hierarchy where controllers can be dynamically enabled and disabled per subtree. The default hierarchy will serve as the unified hierarchy to which all controllers are attached and a css on the default hierarchy would need to also serve the tasks of descendant cgroups which don't have the controller enabled - ie. the tree may be collapsed from leaf towards root when viewed from specific controllers. This has been implemented through effective css in the previous patches. This patch finally implements dynamic subtree controller enable/disable on the default hierarchy via a new knob - "cgroup.subtree_control" which controls which controllers are enabled on the child cgroups. Let's assume a hierarchy like the following.

  root - A - B - C
               \ D

root's "cgroup.subtree_control" determines which controllers are enabled on A. A's on B. B's on C and D. This coincides with the fact that controllers on the immediate sub-level are used to distribute the resources of the parent. In fact, it's natural to assume that resource control knobs of a child belong to its parent. Enabling a controller in "cgroup.subtree_control" declares that distribution of the respective resources of the cgroup will be controlled. Note that this means that controller enable states are shared among siblings. The default hierarchy has an extra restriction - only cgroups which don't contain any task may have controllers enabled in "cgroup.subtree_control". Combined with the other properties of the default hierarchy, this guarantees that, from the viewpoint of controllers, tasks are only on the leaf cgroups. In other words, only leaf csses may contain tasks. This rules out situations where child cgroups compete against internal tasks of the parent, which is a competition between two different types of entities without any clear way to determine resource distribution between the two. Different controllers handle it differently and all the implemented behaviors are ambiguous, ad-hoc, cumbersome and/or just wrong. Having these structural constraints imposed from cgroup core removes the burden from controller implementations and enables showing one consistent behavior across all controllers. When a controller is enabled or disabled, css associations for the controller in the subtrees of each child should be updated. After enabling, the whole subtree of a child should point to the new css of the child. After disabling, the whole subtree of a child should point to the cgroup's css.
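As a rough usage sketch from userland (the mount point, cgroup name, and controller names below are illustrative, assuming the default hierarchy is mounted at /sys/fs/cgroup):

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		/* enable memory and disable cpu for the children of A */
		const char *knob = "/sys/fs/cgroup/A/cgroup.subtree_control";
		const char *req = "+memory -cpu";
		int fd = open(knob, O_WRONLY);

		if (fd < 0 || write(fd, req, strlen(req)) < 0)
			perror(knob);	/* fails with EBUSY if A still has tasks */
		if (fd >= 0)
			close(fd);
		return 0;
	}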
This is implemented by first updating cgroup states such that the cgroup_e_css() result points to the appropriate css and then invoking cgroup_update_dfl_csses() which migrates all tasks in the affected subtrees to the self cgroup on the default hierarchy.

* When read, "cgroup.subtree_control" lists all the currently enabled controllers on the children of the cgroup.

* A white-space separated list of controller names prefixed with either '+' or '-' can be written to "cgroup.subtree_control". The ones prefixed with '+' are enabled and the ones prefixed with '-' are disabled.

* A controller can be enabled iff the parent's "cgroup.subtree_control" enables it and disabled iff no child's "cgroup.subtree_control" has it enabled.

* If a cgroup has tasks, no controller can be enabled via "cgroup.subtree_control". Likewise, if "cgroup.subtree_control" has some controllers enabled, tasks can't be migrated into the cgroup.

* All controllers which aren't bound on other hierarchies are automatically associated with the root cgroup of the default hierarchy. All the controllers which are bound to the default hierarchy are listed in the read-only file "cgroup.controllers" in the root directory.

* "cgroup.controllers" in all non-root cgroups is a read-only file whose content is equal to that of "cgroup.subtree_control" of the parent. This indicates which controllers can be used in the cgroup's "cgroup.subtree_control".

This is still experimental and there are some holes, one of which is that ->can_attach() failure during cgroup_update_dfl_csses() may leave the cgroups in an undefined state. The issues will be addressed by future patches. v2: Non-root cgroups now also have "cgroup.controllers". Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 5 + kernel/cgroup.c | 367 ++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 370 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c49d161a71cd..ada239253ec7 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -21,6 +21,7 @@ #include #include #include +#include <linux/wait.h> #ifdef CONFIG_CGROUPS @@ -164,6 +165,7 @@ struct cgroup { struct cgroup *parent; /* my parent */ struct kernfs_node *kn; /* cgroup kernfs entry */ + struct kernfs_node *control_kn; /* kn for "cgroup.subtree_control" */ /* * Monotonically increasing unique serial number which defines a @@ -216,6 +218,9 @@ struct cgroup { /* For css percpu_ref killing and RCU-protected deletion */ struct rcu_head rcu_head; struct work_struct destroy_work; + + /* used to wait for offlining of csses */ + wait_queue_head_t offline_waitq; }; #define MAX_CGROUP_ROOT_NAMELEN 64 diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8c2835a9e192..809dd903ceb8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -182,6 +182,8 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned long ss_mask); static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); +static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss); +static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); @@ -338,6 +340,14 @@ static int notify_on_release(const struct cgroup *cgrp) #define for_each_root(root) \ list_for_each_entry((root), &cgroup_roots, root_list) +/* iterate over child cgrps, lock should be held throughout iteration */ +#define 
cgroup_for_each_live_child(child, cgrp) \ + list_for_each_entry((child), &(cgrp)->children, sibling) \ + if (({ lockdep_assert_held(&cgroup_tree_mutex); \ + cgroup_is_dead(child); })) \ + ; \ + else + /** * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. * @cgrp: the cgroup to be checked for liveness @@ -1450,6 +1460,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); + + init_waitqueue_head(&cgrp->offline_waitq); } static void init_cgroup_root(struct cgroup_root *root, @@ -1938,6 +1950,14 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, lockdep_assert_held(&cgroup_mutex); + /* + * Except for the root, child_subsys_mask must be zero for a cgroup + * with tasks so that child cgroups don't compete against tasks. + */ + if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && dst_cgrp->parent && + dst_cgrp->child_subsys_mask) + return -EBUSY; + /* look up the dst cset for each src cset and link it to src */ list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) { struct css_set *dst_cset; @@ -2303,6 +2323,326 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) return 0; } +static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask) +{ + struct cgroup_subsys *ss; + bool printed = false; + int ssid; + + for_each_subsys(ss, ssid) { + if (ss_mask & (1 << ssid)) { + if (printed) + seq_putc(seq, ' '); + seq_printf(seq, "%s", ss->name); + printed = true; + } + } + if (printed) + seq_putc(seq, '\n'); +} + +/* show controllers which are currently attached to the default hierarchy */ +static int cgroup_root_controllers_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + cgroup_print_ss_mask(seq, cgrp->root->subsys_mask); + return 0; +} + +/* show controllers which are enabled from the parent */ +static int cgroup_controllers_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + cgroup_print_ss_mask(seq, cgrp->parent->child_subsys_mask); + return 0; +} + +/* show controllers which are enabled for a given cgroup's children */ +static int cgroup_subtree_control_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + cgroup_print_ss_mask(seq, cgrp->child_subsys_mask); + return 0; +} + +/** + * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy + * @cgrp: root of the subtree to update csses for + * + * @cgrp's child_subsys_mask has changed and its subtree's (self excluded) + * css associations need to be updated accordingly. This function looks up + * all css_sets which are attached to the subtree, creates the matching + * updated css_sets and migrates the tasks to the new ones. 
+ */ +static int cgroup_update_dfl_csses(struct cgroup *cgrp) +{ + LIST_HEAD(preloaded_csets); + struct cgroup_subsys_state *css; + struct css_set *src_cset; + int ret; + + lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); + + /* look up all csses currently attached to @cgrp's subtree */ + down_read(&css_set_rwsem); + css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { + struct cgrp_cset_link *link; + + /* self is not affected by child_subsys_mask change */ + if (css->cgroup == cgrp) + continue; + + list_for_each_entry(link, &css->cgroup->cset_links, cset_link) + cgroup_migrate_add_src(link->cset, cgrp, + &preloaded_csets); + } + up_read(&css_set_rwsem); + + /* NULL dst indicates self on default hierarchy */ + ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets); + if (ret) + goto out_finish; + + list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { + struct task_struct *last_task = NULL, *task; + + /* src_csets precede dst_csets, break on the first dst_cset */ + if (!src_cset->mg_src_cgrp) + break; + + /* + * All tasks in src_cset need to be migrated to the + * matching dst_cset. Empty it process by process. We + * walk tasks but migrate processes. The leader might even + * belong to a different cset but such src_cset would also + * be among the target src_csets because the default + * hierarchy enforces per-process membership. + */ + while (true) { + down_read(&css_set_rwsem); + task = list_first_entry_or_null(&src_cset->tasks, + struct task_struct, cg_list); + if (task) { + task = task->group_leader; + WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp); + get_task_struct(task); + } + up_read(&css_set_rwsem); + + if (!task) + break; + + /* guard against possible infinite loop */ + if (WARN(last_task == task, + "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n")) + goto out_finish; + last_task = task; + + threadgroup_lock(task); + /* raced against de_thread() from another thread? */ + if (!thread_group_leader(task)) { + threadgroup_unlock(task); + put_task_struct(task); + continue; + } + + ret = cgroup_migrate(src_cset->dfl_cgrp, task, true); + + threadgroup_unlock(task); + put_task_struct(task); + + if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret)) + goto out_finish; + } + } + +out_finish: + cgroup_migrate_finish(&preloaded_csets); + return ret; +} + +/* change the enabled child controllers for a cgroup in the default hierarchy */ +static int cgroup_subtree_control_write(struct cgroup_subsys_state *dummy_css, + struct cftype *cft, char *buffer) +{ + unsigned long enable_req = 0, disable_req = 0, enable, disable; + struct cgroup *cgrp = dummy_css->cgroup, *child; + struct cgroup_subsys *ss; + char *tok, *p; + int ssid, ret; + + /* + * Parse input - white space separated list of subsystem names + * prefixed with either + or -. + */ + p = buffer; + while ((tok = strsep(&p, " \t\n"))) { + for_each_subsys(ss, ssid) { + if (ss->disabled || strcmp(tok + 1, ss->name)) + continue; + + if (*tok == '+') { + enable_req |= 1 << ssid; + disable_req &= ~(1 << ssid); + } else if (*tok == '-') { + disable_req |= 1 << ssid; + enable_req &= ~(1 << ssid); + } else { + return -EINVAL; + } + break; + } + if (ssid == CGROUP_SUBSYS_COUNT) + return -EINVAL; + } + + /* + * We're gonna grab cgroup_tree_mutex which nests outside kernfs + * active_ref. cgroup_lock_live_group() already provides enough + * protection. 
Ensure @cgrp stays accessible and break the + * active_ref protection. + */ + cgroup_get(cgrp); + kernfs_break_active_protection(cgrp->control_kn); +retry: + enable = enable_req; + disable = disable_req; + + mutex_lock(&cgroup_tree_mutex); + + for_each_subsys(ss, ssid) { + if (enable & (1 << ssid)) { + if (cgrp->child_subsys_mask & (1 << ssid)) { + enable &= ~(1 << ssid); + continue; + } + + /* + * Because css offlining is asynchronous, userland + * might try to re-enable the same controller while + * the previous instance is still around. In such + * cases, wait till it's gone using offline_waitq. + */ + cgroup_for_each_live_child(child, cgrp) { + wait_queue_t wait; + + if (!cgroup_css(child, ss)) + continue; + + prepare_to_wait(&child->offline_waitq, &wait, + TASK_UNINTERRUPTIBLE); + mutex_unlock(&cgroup_tree_mutex); + schedule(); + finish_wait(&child->offline_waitq, &wait); + goto retry; + } + + /* unavailable or not enabled on the parent? */ + if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || + (cgrp->parent && + !(cgrp->parent->child_subsys_mask & (1 << ssid)))) { + ret = -ENOENT; + goto out_unlock_tree; + } + } else if (disable & (1 << ssid)) { + if (!(cgrp->child_subsys_mask & (1 << ssid))) { + disable &= ~(1 << ssid); + continue; + } + + /* a child has it enabled? */ + cgroup_for_each_live_child(child, cgrp) { + if (child->child_subsys_mask & (1 << ssid)) { + ret = -EBUSY; + goto out_unlock_tree; + } + } + } + } + + if (!enable && !disable) { + ret = 0; + goto out_unlock_tree; + } + + if (!cgroup_lock_live_group(cgrp)) { + ret = -ENODEV; + goto out_unlock_tree; + } + + /* + * Except for the root, child_subsys_mask must be zero for a cgroup + * with tasks so that child cgroups don't compete against tasks. + */ + if (enable && cgrp->parent && !list_empty(&cgrp->cset_links)) { + ret = -EBUSY; + goto out_unlock; + } + + /* + * Create csses for enables and update child_subsys_mask. This + * changes cgroup_e_css() results which in turn makes the + * subsequent cgroup_update_dfl_csses() associate all tasks in the + * subtree to the updated csses. 
+ */ + for_each_subsys(ss, ssid) { + if (!(enable & (1 << ssid))) + continue; + + cgroup_for_each_live_child(child, cgrp) { + ret = create_css(child, ss); + if (ret) + goto err_undo_css; + } + } + + cgrp->child_subsys_mask |= enable; + cgrp->child_subsys_mask &= ~disable; + + ret = cgroup_update_dfl_csses(cgrp); + if (ret) + goto err_undo_css; + + /* all tasks are now migrated away from the old csses, kill them */ + for_each_subsys(ss, ssid) { + if (!(disable & (1 << ssid))) + continue; + + cgroup_for_each_live_child(child, cgrp) + kill_css(cgroup_css(child, ss)); + } + + kernfs_activate(cgrp->kn); + ret = 0; +out_unlock: + mutex_unlock(&cgroup_mutex); +out_unlock_tree: + mutex_unlock(&cgroup_tree_mutex); + kernfs_unbreak_active_protection(cgrp->control_kn); + cgroup_put(cgrp); + return ret; + +err_undo_css: + cgrp->child_subsys_mask &= ~enable; + cgrp->child_subsys_mask |= disable; + + for_each_subsys(ss, ssid) { + if (!(enable & (1 << ssid))) + continue; + + cgroup_for_each_live_child(child, cgrp) { + struct cgroup_subsys_state *css = cgroup_css(child, ss); + if (css) + kill_css(css); + } + } + goto out_unlock; +} + static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { @@ -2462,9 +2802,14 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) return PTR_ERR(kn); ret = cgroup_kn_set_ugid(kn); - if (ret) + if (ret) { kernfs_remove(kn); - return ret; + return ret; + } + + if (cft->seq_show == cgroup_subtree_control_show) + cgrp->control_kn = kn; + return 0; } /** @@ -3557,6 +3902,22 @@ static struct cftype cgroup_base_files[] = { .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_sane_behavior_show, }, + { + .name = "cgroup.controllers", + .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT, + .seq_show = cgroup_root_controllers_show, + }, + { + .name = "cgroup.controllers", + .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_controllers_show, + }, + { + .name = "cgroup.subtree_control", + .flags = CFTYPE_ONLY_ON_DFL, + .seq_show = cgroup_subtree_control_show, + .write_string = cgroup_subtree_control_write, + }, /* * Historical crazy stuff. These don't have "cgroup." prefix and @@ -3725,6 +4086,8 @@ static void offline_css(struct cgroup_subsys_state *css) css->flags &= ~CSS_ONLINE; css->cgroup->nr_css--; RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); + + wake_up_all(&css->cgroup->offline_waitq); } /** -- cgit From db66d756c74acb886c51f11b501c2fe622018a0a Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Fri, 18 Apr 2014 01:59:15 +0900 Subject: sched/docbook: Fix 'make htmldocs' warnings caused by missing description When the 'flags' argument to the sched_{set,get}attr() syscalls was added in: 6d35ab48090b ("sched: Add 'flags' argument to sched_{set,get}attr() syscalls") no description for 'flags' was added.
It causes the following warnings on "make htmldocs": Warning(/kernel/sched/core.c:3645): No description found for parameter 'flags' Warning(/kernel/sched/core.c:3789): No description found for parameter 'flags' Signed-off-by: Masanari Iida Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/1397753955-2914-1-git-send-email-standby24x7@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 268a45ea238c..9fe2190005cb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3639,6 +3639,7 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) * sys_sched_setattr - same as above, but with extended sched_attr * @pid: the pid in question. * @uattr: structure containing the extended parameters. + * @flags: for future extension. */ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, unsigned int, flags) @@ -3783,6 +3784,7 @@ err_size: * @pid: the pid in question. * @uattr: structure containing the extended parameters. * @size: sizeof(attr) for fwd/bwd comp. + * @flags: for future extension. */ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, unsigned int, size, unsigned int, flags) -- cgit From be8f274323c26ddc7e6fd6c44254b7abcdbe6389 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 17 Apr 2014 17:16:58 +0900 Subject: kprobes: Prohibit probing on .entry.text code .entry.text is a code area which is used for interrupt/syscall entries and includes much sensitive code. Thus, it is better to prohibit probing on all such code instead of only a part of it. Since some symbols are already registered on the kprobe blacklist, this also removes them from the blacklist. Signed-off-by: Masami Hiramatsu Reviewed-by: Steven Rostedt Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: Borislav Petkov Cc: David S. Miller Cc: Frederic Weisbecker Cc: Jan Kiszka Cc: Jiri Kosina Cc: Jonathan Lebon Cc: Seiji Aguchi Link: http://lkml.kernel.org/r/20140417081658.26341.57354.stgit@ltc230.yrl.intra.hitachi.co.jp Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 33 --------------------------------- arch/x86/kernel/entry_64.S | 20 -------------------- arch/x86/kernel/kprobes/core.c | 8 ++++++++ include/linux/kprobes.h | 1 + kernel/kprobes.c | 13 ++++++++----- 5 files changed, 17 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index a2a4f4697889..0ca5bf1697bb 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -314,10 +314,6 @@ ENTRY(ret_from_kernel_thread) CFI_ENDPROC ENDPROC(ret_from_kernel_thread) -/* - * Interrupt exit functions should be protected against kprobes - */ - .pushsection .kprobes.text, "ax" /* * Return to user mode is not as complex as all this looks, * but we want the default path for a system call return to @@ -372,10 +368,6 @@ need_resched: END(resume_kernel) #endif CFI_ENDPROC -/* - * End of kprobes section - */ - .popsection /* SYSENTER_RETURN points to after the "sysenter" instruction in the vsyscall page. See vsyscall-sysentry.S, which defines the symbol.
*/ @@ -495,10 +487,6 @@ sysexit_audit: PTGS_TO_GS_EX ENDPROC(ia32_sysenter_target) -/* - * syscall stub including irq exit should be protected against kprobes - */ - .pushsection .kprobes.text, "ax" # system call handler stub ENTRY(system_call) RING0_INT_FRAME # can't unwind into user space anyway @@ -691,10 +679,6 @@ syscall_badsys: jmp resume_userspace END(syscall_badsys) CFI_ENDPROC -/* - * End of kprobes section - */ - .popsection .macro FIXUP_ESPFIX_STACK /* @@ -781,10 +765,6 @@ common_interrupt: ENDPROC(common_interrupt) CFI_ENDPROC -/* - * Irq entries should be protected against kprobes - */ - .pushsection .kprobes.text, "ax" #define BUILD_INTERRUPT3(name, nr, fn) \ ENTRY(name) \ RING0_INT_FRAME; \ @@ -961,10 +941,6 @@ ENTRY(spurious_interrupt_bug) jmp error_code CFI_ENDPROC END(spurious_interrupt_bug) -/* - * End of kprobes section - */ - .popsection #ifdef CONFIG_XEN /* Xen doesn't set %esp to be precisely what the normal sysenter @@ -1239,11 +1215,6 @@ return_to_handler: jmp *%ecx #endif -/* - * Some functions should be protected against kprobes - */ - .pushsection .kprobes.text, "ax" - #ifdef CONFIG_TRACING ENTRY(trace_page_fault) RING0_EC_FRAME @@ -1453,7 +1424,3 @@ ENTRY(async_page_fault) END(async_page_fault) #endif -/* - * End of kprobes section - */ - .popsection diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 1e96c3628bf2..43bb38951660 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -487,8 +487,6 @@ ENDPROC(native_usergs_sysret64) TRACE_IRQS_OFF .endm -/* save complete stack frame */ - .pushsection .kprobes.text, "ax" ENTRY(save_paranoid) XCPT_FRAME 1 RDI+8 cld @@ -517,7 +515,6 @@ ENTRY(save_paranoid) 1: ret CFI_ENDPROC END(save_paranoid) - .popsection /* * A newly forked process directly context switches into this address. @@ -975,10 +972,6 @@ END(interrupt) call \func .endm -/* - * Interrupt entry/exit should be protected against kprobes - */ - .pushsection .kprobes.text, "ax" /* * The interrupt stubs push (~vector+0x80) onto the stack and * then jump to common_interrupt. @@ -1113,10 +1106,6 @@ ENTRY(retint_kernel) CFI_ENDPROC END(common_interrupt) -/* - * End of kprobes section - */ - .popsection /* * APIC interrupts. 
@@ -1477,11 +1466,6 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ hyperv_callback_vector hyperv_vector_handler #endif /* CONFIG_HYPERV */ -/* - * Some functions should be protected against kprobes - */ - .pushsection .kprobes.text, "ax" - paranoidzeroentry_ist debug do_debug DEBUG_STACK paranoidzeroentry_ist int3 do_int3 DEBUG_STACK paranoiderrorentry stack_segment do_stack_segment @@ -1898,7 +1882,3 @@ ENTRY(ignore_sysret) CFI_ENDPROC END(ignore_sysret) -/* - * End of kprobes section - */ - .popsection diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index da7bdaa3ce15..7751b3dee53a 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -1065,6 +1065,14 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) return 0; } +bool arch_within_kprobe_blacklist(unsigned long addr) +{ + return (addr >= (unsigned long)__kprobes_text_start && + addr < (unsigned long)__kprobes_text_end) || + (addr >= (unsigned long)__entry_text_start && + addr < (unsigned long)__entry_text_end); +} + int __init arch_init_kprobes(void) { return 0; diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 925eaf28fca9..cdf9251f8249 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -265,6 +265,7 @@ extern void arch_disarm_kprobe(struct kprobe *p); extern int arch_init_kprobes(void); extern void show_registers(struct pt_regs *regs); extern void kprobes_inc_nmissed_count(struct kprobe *p); +extern bool arch_within_kprobe_blacklist(unsigned long addr); struct kprobe_insn_cache { struct mutex mutex; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ceeadfcabb76..5b5ac76671e7 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -96,9 +96,6 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) static struct kprobe_blackpoint kprobe_blacklist[] = { {"preempt_schedule",}, {"native_get_debugreg",}, - {"irq_entries_start",}, - {"common_interrupt",}, - {"mcount",}, /* mcount can be called from everywhere */ {NULL} /* Terminator */ }; @@ -1324,12 +1321,18 @@ out: return ret; } +bool __weak arch_within_kprobe_blacklist(unsigned long addr) +{ + /* The __kprobes marked functions and entry code must not be probed */ + return addr >= (unsigned long)__kprobes_text_start && + addr < (unsigned long)__kprobes_text_end; +} + static int __kprobes in_kprobes_functions(unsigned long addr) { struct kprobe_blackpoint *kb; - if (addr >= (unsigned long)__kprobes_text_start && - addr < (unsigned long)__kprobes_text_end) + if (arch_within_kprobe_blacklist(addr)) return -EINVAL; /* * If there exists a kprobe_blacklist, verify and -- cgit From 376e242429bf8539ef39a080ac113c8799840b13 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 17 Apr 2014 17:17:05 +0900 Subject: kprobes: Introduce NOKPROBE_SYMBOL() macro to maintain kprobes blacklist Introduce NOKPROBE_SYMBOL() macro which builds a kprobes blacklist at kernel build time. The usage of this macro is similar to EXPORT_SYMBOL(), placed after the function definition: NOKPROBE_SYMBOL(function); Since this macro will inhibit inlining of static/inline functions, this patch also introduces a nokprobe_inline macro for static/inline functions. In this case, we must use NOKPROBE_SYMBOL() for the inline function caller. When CONFIG_KPROBES=y, the macro stores the given function address in the "_kprobe_blacklist" section.
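As a usage sketch (the function names here are made up for illustration):

	#include <linux/kprobes.h>

	/* must not be probed: runs in the breakpoint handler path */
	static nokprobe_inline void fragile_helper(void)
	{
		/* forced inline, so the out-of-line caller is blacklisted instead */
	}

	void fragile_func(void)
	{
		fragile_helper();
	}
	/* records fragile_func's address in the _kprobe_blacklist section */
	NOKPROBE_SYMBOL(fragile_func);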
Since the data structures are not fully initialized by the macro (because there is no "size" information), those are re-initialized at boot time by using kallsyms. Signed-off-by: Masami Hiramatsu Link: http://lkml.kernel.org/r/20140417081705.26341.96719.stgit@ltc230.yrl.intra.hitachi.co.jp Cc: Alok Kataria Cc: Ananth N Mavinakayanahalli Cc: Andrew Morton Cc: Anil S Keshavamurthy Cc: Arnd Bergmann Cc: Christopher Li Cc: Chris Wright Cc: David S. Miller Cc: Jan-Simon Möller Cc: Jeremy Fitzhardinge Cc: Linus Torvalds Cc: Randy Dunlap Cc: Rusty Russell Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-sparse@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Signed-off-by: Ingo Molnar --- Documentation/kprobes.txt | 16 +++++- arch/x86/include/asm/asm.h | 7 +++ arch/x86/kernel/paravirt.c | 4 ++ include/asm-generic/vmlinux.lds.h | 9 ++++ include/linux/compiler.h | 2 + include/linux/kprobes.h | 20 ++++++-- kernel/kprobes.c | 100 ++++++++++++++++++++------------------ kernel/sched/core.c | 1 + 8 files changed, 107 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt index 0cfb00fd86ff..4bbeca8483ed 100644 --- a/Documentation/kprobes.txt +++ b/Documentation/kprobes.txt @@ -22,8 +22,9 @@ Appendix B: The kprobes sysctl interface Kprobes enables you to dynamically break into any kernel routine and collect debugging and performance information non-disruptively. You -can trap at almost any kernel code address, specifying a handler +can trap at almost any kernel code address(*), specifying a handler routine to be invoked when the breakpoint is hit. +(*: some parts of the kernel code can not be trapped, see 1.5 Blacklist) There are currently three types of probes: kprobes, jprobes, and kretprobes (also called return probes). A kprobe can be inserted @@ -273,6 +274,19 @@ using one of the following techniques: or - Execute 'sysctl -w debug.kprobes_optimization=n' +1.5 Blacklist + +Kprobes can probe most of the kernel except itself. This means +that there are some functions where kprobes cannot probe. Probing +(trapping) such functions can cause a recursive trap (e.g. double +fault) or the nested probe handler may never be called. +Kprobes manages such functions as a blacklist. +If you want to add a function into the blacklist, you just need +to (1) include linux/kprobes.h and (2) use NOKPROBE_SYMBOL() macro +to specify a blacklisted function. +Kprobes checks the given probe address against the blacklist and +rejects registering it, if the given address is in the blacklist. + 2. Architectures Supported Kprobes, jprobes, and return probes are implemented on the following diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 4582e8e1cd1a..7730c1c5c83a 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -57,6 +57,12 @@ .long (from) - . ; \ .long (to) - . + 0x7ffffff0 ; \ .popsection + +# define _ASM_NOKPROBE(entry) \ + .pushsection "_kprobe_blacklist","aw" ; \ + _ASM_ALIGN ; \ + _ASM_PTR (entry); \ + .popsection #else # define _ASM_EXTABLE(from,to) \ " .pushsection \"__ex_table\",\"a\"\n" \ @@ -71,6 +77,7 @@ " .long (" #from ") - .\n" \ " .long (" #to ") - . 
+ 0x7ffffff0\n" \ " .popsection\n" +/* For C file, we already have NOKPROBE_SYMBOL macro */ #endif #endif /* _ASM_X86_ASM_H */ diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 1b10af835c31..e136869ae42e 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -389,6 +390,9 @@ __visible struct pv_cpu_ops pv_cpu_ops = { .end_context_switch = paravirt_nop, }; +/* At this point, native_get_debugreg has a real function entry */ +NOKPROBE_SYMBOL(native_get_debugreg); + struct pv_apic_ops pv_apic_ops = { #ifdef CONFIG_X86_LOCAL_APIC .startup_ipi_hook = paravirt_nop, diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 146e4fffd710..40ceb3ceba79 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -109,6 +109,14 @@ #define BRANCH_PROFILE() #endif +#ifdef CONFIG_KPROBES +#define KPROBE_BLACKLIST() VMLINUX_SYMBOL(__start_kprobe_blacklist) = .; \ + *(_kprobe_blacklist) \ + VMLINUX_SYMBOL(__stop_kprobe_blacklist) = .; +#else +#define KPROBE_BLACKLIST() +#endif + #ifdef CONFIG_EVENT_TRACING #define FTRACE_EVENTS() . = ALIGN(8); \ VMLINUX_SYMBOL(__start_ftrace_events) = .; \ @@ -507,6 +515,7 @@ *(.init.rodata) \ FTRACE_EVENTS() \ TRACE_SYSCALLS() \ + KPROBE_BLACKLIST() \ MEM_DISCARD(init.rodata) \ CLK_OF_TABLES() \ RESERVEDMEM_OF_TABLES() \ diff --git a/include/linux/compiler.h b/include/linux/compiler.h index ee7239ea1583..0300c0f5c88b 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -374,7 +374,9 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); /* Ignore/forbid kprobes attach on very low level functions marked by this attribute: */ #ifdef CONFIG_KPROBES # define __kprobes __attribute__((__section__(".kprobes.text"))) +# define nokprobe_inline __always_inline #else # define __kprobes +# define nokprobe_inline inline #endif #endif /* __LINUX_COMPILER_H */ diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index cdf9251f8249..e059507c465d 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -205,10 +205,10 @@ struct kretprobe_blackpoint { void *addr; }; -struct kprobe_blackpoint { - const char *name; +struct kprobe_blacklist_entry { + struct list_head list; unsigned long start_addr; - unsigned long range; + unsigned long end_addr; }; #ifdef CONFIG_KPROBES @@ -477,4 +477,18 @@ static inline int enable_jprobe(struct jprobe *jp) return enable_kprobe(&jp->kp); } +#ifdef CONFIG_KPROBES +/* + * Blacklist ganerating macro. Specify functions which is not probed + * by using this macro. + */ +#define __NOKPROBE_SYMBOL(fname) \ +static unsigned long __used \ + __attribute__((section("_kprobe_blacklist"))) \ + _kbl_addr_##fname = (unsigned long)fname; +#define NOKPROBE_SYMBOL(fname) __NOKPROBE_SYMBOL(fname) +#else +#define NOKPROBE_SYMBOL(fname) +#endif + #endif /* _LINUX_KPROBES_H */ diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 5b5ac76671e7..5ffc6875d2a7 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -86,18 +86,8 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) return &(kretprobe_table_locks[hash].lock); } -/* - * Normally, functions that we'd want to prohibit kprobes in, are marked - * __kprobes. 
But, there are cases where such functions already belong to - * a different section (__sched for preempt_schedule) - * - * For such cases, we now have a blacklist - */ -static struct kprobe_blackpoint kprobe_blacklist[] = { - {"preempt_schedule",}, - {"native_get_debugreg",}, - {NULL} /* Terminator */ -}; +/* Blacklist -- list of struct kprobe_blacklist_entry */ +static LIST_HEAD(kprobe_blacklist); #ifdef __ARCH_WANT_KPROBES_INSN_SLOT /* @@ -1328,24 +1318,22 @@ bool __weak arch_within_kprobe_blacklist(unsigned long addr) addr < (unsigned long)__kprobes_text_end; } -static int __kprobes in_kprobes_functions(unsigned long addr) +static bool __kprobes within_kprobe_blacklist(unsigned long addr) { - struct kprobe_blackpoint *kb; + struct kprobe_blacklist_entry *ent; if (arch_within_kprobe_blacklist(addr)) - return -EINVAL; + return true; /* * If there exists a kprobe_blacklist, verify and * fail any probe registration in the prohibited area */ - for (kb = kprobe_blacklist; kb->name != NULL; kb++) { - if (kb->start_addr) { - if (addr >= kb->start_addr && - addr < (kb->start_addr + kb->range)) - return -EINVAL; - } + list_for_each_entry(ent, &kprobe_blacklist, list) { + if (addr >= ent->start_addr && addr < ent->end_addr) + return true; } - return 0; + + return false; } /* @@ -1436,7 +1424,7 @@ static __kprobes int check_kprobe_address_safe(struct kprobe *p, /* Ensure it is not in reserved area nor out of text */ if (!kernel_text_address((unsigned long) p->addr) || - in_kprobes_functions((unsigned long) p->addr) || + within_kprobe_blacklist((unsigned long) p->addr) || jump_label_text_reserved(p->addr, p->addr)) { ret = -EINVAL; goto out; @@ -2022,6 +2010,38 @@ void __kprobes dump_kprobe(struct kprobe *kp) kp->symbol_name, kp->addr, kp->offset); } +/* + * Lookup and populate the kprobe_blacklist. + * + * Unlike the kretprobe blacklist, we'll need to determine + * the range of addresses that belong to the said functions, + * since a kprobe need not necessarily be at the beginning + * of a function. + */ +static int __init populate_kprobe_blacklist(unsigned long *start, + unsigned long *end) +{ + unsigned long *iter; + struct kprobe_blacklist_entry *ent; + unsigned long offset = 0, size = 0; + + for (iter = start; iter < end; iter++) { + if (!kallsyms_lookup_size_offset(*iter, &size, &offset)) { + pr_err("Failed to find blacklist %p\n", (void *)*iter); + continue; + } + + ent = kmalloc(sizeof(*ent), GFP_KERNEL); + if (!ent) + return -ENOMEM; + ent->start_addr = *iter; + ent->end_addr = *iter + size; + INIT_LIST_HEAD(&ent->list); + list_add_tail(&ent->list, &kprobe_blacklist); + } + return 0; +} + /* Module notifier call back, checking kprobes on the module */ static int __kprobes kprobes_module_callback(struct notifier_block *nb, unsigned long val, void *data) @@ -2065,14 +2085,13 @@ static struct notifier_block kprobe_module_nb = { .priority = 0 }; +/* Markers of _kprobe_blacklist section */ +extern unsigned long __start_kprobe_blacklist[]; +extern unsigned long __stop_kprobe_blacklist[]; + static int __init init_kprobes(void) { int i, err = 0; - unsigned long offset = 0, size = 0; - char *modname, namebuf[KSYM_NAME_LEN]; - const char *symbol_name; - void *addr; - struct kprobe_blackpoint *kb; /* FIXME allocate the probe table, currently defined statically */ /* initialize all list heads */ @@ -2082,26 +2101,11 @@ static int __init init_kprobes(void) raw_spin_lock_init(&(kretprobe_table_locks[i].lock)); } - /* - * Lookup and populate the kprobe_blacklist. 
- * - * Unlike the kretprobe blacklist, we'll need to determine - * the range of addresses that belong to the said functions, - * since a kprobe need not necessarily be at the beginning - * of a function. - */ - for (kb = kprobe_blacklist; kb->name != NULL; kb++) { - kprobe_lookup_name(kb->name, addr); - if (!addr) - continue; - - kb->start_addr = (unsigned long)addr; - symbol_name = kallsyms_lookup(kb->start_addr, - &size, &offset, &modname, namebuf); - if (!symbol_name) - kb->range = 0; - else - kb->range = size; + err = populate_kprobe_blacklist(__start_kprobe_blacklist, + __stop_kprobe_blacklist); + if (err) { + pr_err("kprobes: failed to populate blacklist: %d\n", err); + pr_err("Please take care of using kprobes.\n"); } if (kretprobe_blacklist_size) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 268a45ea238c..6863631e8cd0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2804,6 +2804,7 @@ asmlinkage void __sched notrace preempt_schedule(void) barrier(); } while (need_resched()); } +NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); #endif /* CONFIG_PREEMPT */ -- cgit From 55479f64756fc508182a05e35e52f01395a50d4d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 17 Apr 2014 17:17:54 +0900 Subject: kprobes: Allow probe on some kprobe functions There is no need to prohibit probing on the functions used for preparation, registration, optimization, control etc. Those are safely probed because they are not invoked from breakpoint/fault/debug handlers, so there is no chance to cause recursive exceptions. Following functions are now removed from the kprobes blacklist: add_new_kprobe aggr_kprobe_disabled alloc_aggr_kprobe alloc_aggr_kprobe arm_all_kprobes __arm_kprobe arm_kprobe arm_kprobe_ftrace check_kprobe_address_safe collect_garbage_slots collect_garbage_slots collect_one_slot debugfs_kprobe_init __disable_kprobe disable_kprobe disarm_all_kprobes __disarm_kprobe disarm_kprobe disarm_kprobe_ftrace do_free_cleaned_kprobes do_optimize_kprobes do_unoptimize_kprobes enable_kprobe force_unoptimize_kprobe free_aggr_kprobe free_aggr_kprobe __free_insn_slot __get_insn_slot get_optimized_kprobe __get_valid_kprobe init_aggr_kprobe init_aggr_kprobe in_nokprobe_functions kick_kprobe_optimizer kill_kprobe kill_optimized_kprobe kprobe_addr kprobe_optimizer kprobe_queued kprobe_seq_next kprobe_seq_start kprobe_seq_stop kprobes_module_callback kprobes_open optimize_all_kprobes optimize_kprobe prepare_kprobe prepare_optimized_kprobe register_aggr_kprobe register_jprobe register_jprobes register_kprobe register_kprobes register_kretprobe register_kretprobe register_kretprobes register_kretprobes report_probe show_kprobe_addr try_to_optimize_kprobe unoptimize_all_kprobes unoptimize_kprobe unregister_jprobe unregister_jprobes unregister_kprobe __unregister_kprobe_bottom unregister_kprobes __unregister_kprobe_top unregister_kretprobe unregister_kretprobe unregister_kretprobes unregister_kretprobes wait_for_kprobe_optimizer I tested those functions by putting kprobes on all instructions in the functions with the bash script I sent to LKML. See: https://lkml.org/lkml/2014/3/27/33 Signed-off-by: Masami Hiramatsu Link: http://lkml.kernel.org/r/20140417081753.26341.57889.stgit@ltc230.yrl.intra.hitachi.co.jp Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S.
Miller Cc: fche@redhat.com Cc: systemtap@sourceware.org Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 153 +++++++++++++++++++++++++++---------------------------- 1 file changed, 76 insertions(+), 77 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 5ffc6875d2a7..4db2cc616f50 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -138,13 +138,13 @@ struct kprobe_insn_cache kprobe_insn_slots = { .insn_size = MAX_INSN_SIZE, .nr_garbage = 0, }; -static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c); +static int collect_garbage_slots(struct kprobe_insn_cache *c); /** * __get_insn_slot() - Find a slot on an executable page for an instruction. * We allocate an executable page if there's no room on existing ones. */ -kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) +kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) { struct kprobe_insn_page *kip; kprobe_opcode_t *slot = NULL; @@ -201,7 +201,7 @@ out: } /* Return 1 if all garbages are collected, otherwise 0. */ -static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) +static int collect_one_slot(struct kprobe_insn_page *kip, int idx) { kip->slot_used[idx] = SLOT_CLEAN; kip->nused--; @@ -222,7 +222,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) return 0; } -static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c) +static int collect_garbage_slots(struct kprobe_insn_cache *c) { struct kprobe_insn_page *kip, *next; @@ -244,8 +244,8 @@ static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c) return 0; } -void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, - kprobe_opcode_t *slot, int dirty) +void __free_insn_slot(struct kprobe_insn_cache *c, + kprobe_opcode_t *slot, int dirty) { struct kprobe_insn_page *kip; @@ -361,7 +361,7 @@ void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs) } /* Free optimized instructions and optimized_kprobe */ -static __kprobes void free_aggr_kprobe(struct kprobe *p) +static void free_aggr_kprobe(struct kprobe *p) { struct optimized_kprobe *op; @@ -399,7 +399,7 @@ static inline int kprobe_disarmed(struct kprobe *p) } /* Return true(!0) if the probe is queued on (un)optimizing lists */ -static int __kprobes kprobe_queued(struct kprobe *p) +static int kprobe_queued(struct kprobe *p) { struct optimized_kprobe *op; @@ -415,7 +415,7 @@ static int __kprobes kprobe_queued(struct kprobe *p) * Return an optimized kprobe whose optimizing code replaces * instructions including addr (exclude breakpoint). */ -static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) +static struct kprobe *get_optimized_kprobe(unsigned long addr) { int i; struct kprobe *p = NULL; @@ -447,7 +447,7 @@ static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); * Optimize (replace a breakpoint with a jump) kprobes listed on * optimizing_list. */ -static __kprobes void do_optimize_kprobes(void) +static void do_optimize_kprobes(void) { /* Optimization never be done when disarmed */ if (kprobes_all_disarmed || !kprobes_allow_optimization || @@ -475,7 +475,7 @@ static __kprobes void do_optimize_kprobes(void) * Unoptimize (replace a jump with a breakpoint and remove the breakpoint * if need) kprobes listed on unoptimizing_list. 
*/ -static __kprobes void do_unoptimize_kprobes(void) +static void do_unoptimize_kprobes(void) { struct optimized_kprobe *op, *tmp; @@ -507,7 +507,7 @@ static __kprobes void do_unoptimize_kprobes(void) } /* Reclaim all kprobes on the free_list */ -static __kprobes void do_free_cleaned_kprobes(void) +static void do_free_cleaned_kprobes(void) { struct optimized_kprobe *op, *tmp; @@ -519,13 +519,13 @@ static __kprobes void do_free_cleaned_kprobes(void) } /* Start optimizer after OPTIMIZE_DELAY passed */ -static __kprobes void kick_kprobe_optimizer(void) +static void kick_kprobe_optimizer(void) { schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); } /* Kprobe jump optimizer */ -static __kprobes void kprobe_optimizer(struct work_struct *work) +static void kprobe_optimizer(struct work_struct *work) { mutex_lock(&kprobe_mutex); /* Lock modules while optimizing kprobes */ @@ -561,7 +561,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) } /* Wait for completing optimization and unoptimization */ -static __kprobes void wait_for_kprobe_optimizer(void) +static void wait_for_kprobe_optimizer(void) { mutex_lock(&kprobe_mutex); @@ -580,7 +580,7 @@ static __kprobes void wait_for_kprobe_optimizer(void) } /* Optimize kprobe if p is ready to be optimized */ -static __kprobes void optimize_kprobe(struct kprobe *p) +static void optimize_kprobe(struct kprobe *p) { struct optimized_kprobe *op; @@ -614,7 +614,7 @@ static __kprobes void optimize_kprobe(struct kprobe *p) } /* Short cut to direct unoptimizing */ -static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op) +static void force_unoptimize_kprobe(struct optimized_kprobe *op) { get_online_cpus(); arch_unoptimize_kprobe(op); @@ -624,7 +624,7 @@ static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op) } /* Unoptimize a kprobe if p is optimized */ -static __kprobes void unoptimize_kprobe(struct kprobe *p, bool force) +static void unoptimize_kprobe(struct kprobe *p, bool force) { struct optimized_kprobe *op; @@ -684,7 +684,7 @@ static void reuse_unused_kprobe(struct kprobe *ap) } /* Remove optimized instructions */ -static void __kprobes kill_optimized_kprobe(struct kprobe *p) +static void kill_optimized_kprobe(struct kprobe *p) { struct optimized_kprobe *op; @@ -710,7 +710,7 @@ static void __kprobes kill_optimized_kprobe(struct kprobe *p) } /* Try to prepare optimized instructions */ -static __kprobes void prepare_optimized_kprobe(struct kprobe *p) +static void prepare_optimized_kprobe(struct kprobe *p) { struct optimized_kprobe *op; @@ -719,7 +719,7 @@ static __kprobes void prepare_optimized_kprobe(struct kprobe *p) } /* Allocate new optimized_kprobe and try to prepare optimized instructions */ -static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) +static struct kprobe *alloc_aggr_kprobe(struct kprobe *p) { struct optimized_kprobe *op; @@ -734,13 +734,13 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) return &op->kp; } -static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p); +static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p); /* * Prepare an optimized_kprobe and optimize it * NOTE: p must be a normal registered kprobe */ -static __kprobes void try_to_optimize_kprobe(struct kprobe *p) +static void try_to_optimize_kprobe(struct kprobe *p) { struct kprobe *ap; struct optimized_kprobe *op; @@ -774,7 +774,7 @@ out: } #ifdef CONFIG_SYSCTL -static void __kprobes optimize_all_kprobes(void) +static void optimize_all_kprobes(void) { 
struct hlist_head *head; struct kprobe *p; @@ -797,7 +797,7 @@ out: mutex_unlock(&kprobe_mutex); } -static void __kprobes unoptimize_all_kprobes(void) +static void unoptimize_all_kprobes(void) { struct hlist_head *head; struct kprobe *p; @@ -848,7 +848,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write, #endif /* CONFIG_SYSCTL */ /* Put a breakpoint for a probe. Must be called with text_mutex locked */ -static void __kprobes __arm_kprobe(struct kprobe *p) +static void __arm_kprobe(struct kprobe *p) { struct kprobe *_p; @@ -863,7 +863,7 @@ static void __kprobes __arm_kprobe(struct kprobe *p) } /* Remove the breakpoint of a probe. Must be called with text_mutex locked */ -static void __kprobes __disarm_kprobe(struct kprobe *p, bool reopt) +static void __disarm_kprobe(struct kprobe *p, bool reopt) { struct kprobe *_p; @@ -898,13 +898,13 @@ static void reuse_unused_kprobe(struct kprobe *ap) BUG_ON(kprobe_unused(ap)); } -static __kprobes void free_aggr_kprobe(struct kprobe *p) +static void free_aggr_kprobe(struct kprobe *p) { arch_remove_kprobe(p); kfree(p); } -static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) +static struct kprobe *alloc_aggr_kprobe(struct kprobe *p) { return kzalloc(sizeof(struct kprobe), GFP_KERNEL); } @@ -918,7 +918,7 @@ static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { static int kprobe_ftrace_enabled; /* Must ensure p->addr is really on ftrace */ -static int __kprobes prepare_kprobe(struct kprobe *p) +static int prepare_kprobe(struct kprobe *p) { if (!kprobe_ftrace(p)) return arch_prepare_kprobe(p); @@ -927,7 +927,7 @@ static int __kprobes prepare_kprobe(struct kprobe *p) } /* Caller must lock kprobe_mutex */ -static void __kprobes arm_kprobe_ftrace(struct kprobe *p) +static void arm_kprobe_ftrace(struct kprobe *p) { int ret; @@ -942,7 +942,7 @@ static void __kprobes arm_kprobe_ftrace(struct kprobe *p) } /* Caller must lock kprobe_mutex */ -static void __kprobes disarm_kprobe_ftrace(struct kprobe *p) +static void disarm_kprobe_ftrace(struct kprobe *p) { int ret; @@ -962,7 +962,7 @@ static void __kprobes disarm_kprobe_ftrace(struct kprobe *p) #endif /* Arm a kprobe with text_mutex */ -static void __kprobes arm_kprobe(struct kprobe *kp) +static void arm_kprobe(struct kprobe *kp) { if (unlikely(kprobe_ftrace(kp))) { arm_kprobe_ftrace(kp); @@ -979,7 +979,7 @@ static void __kprobes arm_kprobe(struct kprobe *kp) } /* Disarm a kprobe with text_mutex */ -static void __kprobes disarm_kprobe(struct kprobe *kp, bool reopt) +static void disarm_kprobe(struct kprobe *kp, bool reopt) { if (unlikely(kprobe_ftrace(kp))) { disarm_kprobe_ftrace(kp); @@ -1189,7 +1189,7 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp) * Add the new probe to ap->list. Fail if this is the * second jprobe at the address - two jprobes can't coexist */ -static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) +static int add_new_kprobe(struct kprobe *ap, struct kprobe *p) { BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); @@ -1213,7 +1213,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) * Fill in the required fields of the "manager kprobe". 
Replace the * earlier kprobe in the hlist with the manager kprobe */ -static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) +static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) { /* Copy p's insn slot to ap */ copy_kprobe(p, ap); @@ -1239,8 +1239,7 @@ static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) * This is the second or subsequent kprobe at the address - handle * the intricacies */ -static int __kprobes register_aggr_kprobe(struct kprobe *orig_p, - struct kprobe *p) +static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p) { int ret = 0; struct kprobe *ap = orig_p; @@ -1318,7 +1317,7 @@ bool __weak arch_within_kprobe_blacklist(unsigned long addr) addr < (unsigned long)__kprobes_text_end; } -static bool __kprobes within_kprobe_blacklist(unsigned long addr) +static bool within_kprobe_blacklist(unsigned long addr) { struct kprobe_blacklist_entry *ent; @@ -1342,7 +1341,7 @@ static bool __kprobes within_kprobe_blacklist(unsigned long addr) * This returns encoded errors if it fails to look up symbol or invalid * combination of parameters. */ -static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) +static kprobe_opcode_t *kprobe_addr(struct kprobe *p) { kprobe_opcode_t *addr = p->addr; @@ -1365,7 +1364,7 @@ invalid: } /* Check passed kprobe is valid and return kprobe in kprobe_table. */ -static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) +static struct kprobe *__get_valid_kprobe(struct kprobe *p) { struct kprobe *ap, *list_p; @@ -1397,8 +1396,8 @@ static inline int check_kprobe_rereg(struct kprobe *p) return ret; } -static __kprobes int check_kprobe_address_safe(struct kprobe *p, - struct module **probed_mod) +static int check_kprobe_address_safe(struct kprobe *p, + struct module **probed_mod) { int ret = 0; unsigned long ftrace_addr; @@ -1460,7 +1459,7 @@ out: return ret; } -int __kprobes register_kprobe(struct kprobe *p) +int register_kprobe(struct kprobe *p) { int ret; struct kprobe *old_p; @@ -1522,7 +1521,7 @@ out: EXPORT_SYMBOL_GPL(register_kprobe); /* Check if all probes on the aggrprobe are disabled */ -static int __kprobes aggr_kprobe_disabled(struct kprobe *ap) +static int aggr_kprobe_disabled(struct kprobe *ap) { struct kprobe *kp; @@ -1538,7 +1537,7 @@ static int __kprobes aggr_kprobe_disabled(struct kprobe *ap) } /* Disable one kprobe: Make sure called under kprobe_mutex is locked */ -static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p) +static struct kprobe *__disable_kprobe(struct kprobe *p) { struct kprobe *orig_p; @@ -1565,7 +1564,7 @@ static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p) /* * Unregister a kprobe without a scheduler synchronization. */ -static int __kprobes __unregister_kprobe_top(struct kprobe *p) +static int __unregister_kprobe_top(struct kprobe *p) { struct kprobe *ap, *list_p; @@ -1622,7 +1621,7 @@ disarmed: return 0; } -static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) +static void __unregister_kprobe_bottom(struct kprobe *p) { struct kprobe *ap; @@ -1638,7 +1637,7 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) /* Otherwise, do nothing. 
*/ } -int __kprobes register_kprobes(struct kprobe **kps, int num) +int register_kprobes(struct kprobe **kps, int num) { int i, ret = 0; @@ -1656,13 +1655,13 @@ int __kprobes register_kprobes(struct kprobe **kps, int num) } EXPORT_SYMBOL_GPL(register_kprobes); -void __kprobes unregister_kprobe(struct kprobe *p) +void unregister_kprobe(struct kprobe *p) { unregister_kprobes(&p, 1); } EXPORT_SYMBOL_GPL(unregister_kprobe); -void __kprobes unregister_kprobes(struct kprobe **kps, int num) +void unregister_kprobes(struct kprobe **kps, int num) { int i; @@ -1691,7 +1690,7 @@ unsigned long __weak arch_deref_entry_point(void *entry) return (unsigned long)entry; } -int __kprobes register_jprobes(struct jprobe **jps, int num) +int register_jprobes(struct jprobe **jps, int num) { struct jprobe *jp; int ret = 0, i; @@ -1722,19 +1721,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num) } EXPORT_SYMBOL_GPL(register_jprobes); -int __kprobes register_jprobe(struct jprobe *jp) +int register_jprobe(struct jprobe *jp) { return register_jprobes(&jp, 1); } EXPORT_SYMBOL_GPL(register_jprobe); -void __kprobes unregister_jprobe(struct jprobe *jp) +void unregister_jprobe(struct jprobe *jp) { unregister_jprobes(&jp, 1); } EXPORT_SYMBOL_GPL(unregister_jprobe); -void __kprobes unregister_jprobes(struct jprobe **jps, int num) +void unregister_jprobes(struct jprobe **jps, int num) { int i; @@ -1799,7 +1798,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, return 0; } -int __kprobes register_kretprobe(struct kretprobe *rp) +int register_kretprobe(struct kretprobe *rp) { int ret = 0; struct kretprobe_instance *inst; @@ -1852,7 +1851,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp) } EXPORT_SYMBOL_GPL(register_kretprobe); -int __kprobes register_kretprobes(struct kretprobe **rps, int num) +int register_kretprobes(struct kretprobe **rps, int num) { int ret = 0, i; @@ -1870,13 +1869,13 @@ int __kprobes register_kretprobes(struct kretprobe **rps, int num) } EXPORT_SYMBOL_GPL(register_kretprobes); -void __kprobes unregister_kretprobe(struct kretprobe *rp) +void unregister_kretprobe(struct kretprobe *rp) { unregister_kretprobes(&rp, 1); } EXPORT_SYMBOL_GPL(unregister_kretprobe); -void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) +void unregister_kretprobes(struct kretprobe **rps, int num) { int i; @@ -1899,24 +1898,24 @@ void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) EXPORT_SYMBOL_GPL(unregister_kretprobes); #else /* CONFIG_KRETPROBES */ -int __kprobes register_kretprobe(struct kretprobe *rp) +int register_kretprobe(struct kretprobe *rp) { return -ENOSYS; } EXPORT_SYMBOL_GPL(register_kretprobe); -int __kprobes register_kretprobes(struct kretprobe **rps, int num) +int register_kretprobes(struct kretprobe **rps, int num) { return -ENOSYS; } EXPORT_SYMBOL_GPL(register_kretprobes); -void __kprobes unregister_kretprobe(struct kretprobe *rp) +void unregister_kretprobe(struct kretprobe *rp) { } EXPORT_SYMBOL_GPL(unregister_kretprobe); -void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) +void unregister_kretprobes(struct kretprobe **rps, int num) { } EXPORT_SYMBOL_GPL(unregister_kretprobes); @@ -1930,7 +1929,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, #endif /* CONFIG_KRETPROBES */ /* Set the kprobe gone and remove its instruction buffer. 
*/ -static void __kprobes kill_kprobe(struct kprobe *p) +static void kill_kprobe(struct kprobe *p) { struct kprobe *kp; @@ -1954,7 +1953,7 @@ static void __kprobes kill_kprobe(struct kprobe *p) } /* Disable one kprobe */ -int __kprobes disable_kprobe(struct kprobe *kp) +int disable_kprobe(struct kprobe *kp) { int ret = 0; @@ -1970,7 +1969,7 @@ int __kprobes disable_kprobe(struct kprobe *kp) EXPORT_SYMBOL_GPL(disable_kprobe); /* Enable one kprobe */ -int __kprobes enable_kprobe(struct kprobe *kp) +int enable_kprobe(struct kprobe *kp) { int ret = 0; struct kprobe *p; @@ -2043,8 +2042,8 @@ static int __init populate_kprobe_blacklist(unsigned long *start, } /* Module notifier call back, checking kprobes on the module */ -static int __kprobes kprobes_module_callback(struct notifier_block *nb, - unsigned long val, void *data) +static int kprobes_module_callback(struct notifier_block *nb, + unsigned long val, void *data) { struct module *mod = data; struct hlist_head *head; @@ -2145,7 +2144,7 @@ static int __init init_kprobes(void) } #ifdef CONFIG_DEBUG_FS -static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, +static void report_probe(struct seq_file *pi, struct kprobe *p, const char *sym, int offset, char *modname, struct kprobe *pp) { char *kprobe_type; @@ -2174,12 +2173,12 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, (kprobe_ftrace(pp) ? "[FTRACE]" : "")); } -static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) +static void *kprobe_seq_start(struct seq_file *f, loff_t *pos) { return (*pos < KPROBE_TABLE_SIZE) ? pos : NULL; } -static void __kprobes *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos) +static void *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos) { (*pos)++; if (*pos >= KPROBE_TABLE_SIZE) @@ -2187,12 +2186,12 @@ static void __kprobes *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos) return pos; } -static void __kprobes kprobe_seq_stop(struct seq_file *f, void *v) +static void kprobe_seq_stop(struct seq_file *f, void *v) { /* Nothing to do */ } -static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) +static int show_kprobe_addr(struct seq_file *pi, void *v) { struct hlist_head *head; struct kprobe *p, *kp; @@ -2223,7 +2222,7 @@ static const struct seq_operations kprobes_seq_ops = { .show = show_kprobe_addr }; -static int __kprobes kprobes_open(struct inode *inode, struct file *filp) +static int kprobes_open(struct inode *inode, struct file *filp) { return seq_open(filp, &kprobes_seq_ops); } @@ -2235,7 +2234,7 @@ static const struct file_operations debugfs_kprobes_operations = { .release = seq_release, }; -static void __kprobes arm_all_kprobes(void) +static void arm_all_kprobes(void) { struct hlist_head *head; struct kprobe *p; @@ -2263,7 +2262,7 @@ already_enabled: return; } -static void __kprobes disarm_all_kprobes(void) +static void disarm_all_kprobes(void) { struct hlist_head *head; struct kprobe *p; @@ -2347,7 +2346,7 @@ static const struct file_operations fops_kp = { .llseek = default_llseek, }; -static int __kprobes debugfs_kprobe_init(void) +static int __init debugfs_kprobe_init(void) { struct dentry *dir, *file; unsigned int value = 1; -- cgit From fbc1963d2c1c4eb4651132a2c5c9d6111ada17d3 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 17 Apr 2014 17:18:00 +0900 Subject: kprobes, ftrace: Allow probing on some functions There is no need to prohibit probing on the functions used for preparation and uprobe only fetch functions. 
They can be probed safely because they are not invoked from kprobes' breakpoint/fault/debug handlers, so there is no chance of causing recursive exceptions. The following functions are now removed from the kprobes blacklist: update_bitfield_fetch_param free_bitfield_fetch_param kprobe_register FETCH_FUNC_NAME(stack, type) in trace_uprobe.c FETCH_FUNC_NAME(memory, type) in trace_uprobe.c FETCH_FUNC_NAME(memory, string) in trace_uprobe.c FETCH_FUNC_NAME(memory, string_size) in trace_uprobe.c FETCH_FUNC_NAME(file_offset, type) in trace_uprobe.c Signed-off-by: Masami Hiramatsu Cc: Frederic Weisbecker Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20140417081800.26341.56504.stgit@ltc230.yrl.intra.hitachi.co.jp Signed-off-by: Ingo Molnar --- kernel/trace/trace_kprobe.c | 5 ++--- kernel/trace/trace_probe.c | 4 ++-- kernel/trace/trace_uprobe.c | 20 ++++++++++---------- 3 files changed, 14 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 903ae28962be..aa5f0bfcdf7b 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1196,9 +1196,8 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, * kprobe_trace_self_tests_init() does enable_trace_probe/disable_trace_probe * lockless, but we can't race with this __init function. */ -static __kprobes -int kprobe_register(struct ftrace_event_call *event, - enum trace_reg type, void *data) +static int kprobe_register(struct ftrace_event_call *event, + enum trace_reg type, void *data) { struct trace_kprobe *tk = (struct trace_kprobe *)event->data; struct ftrace_event_file *file = data; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 8364a421b4df..d3a91e40a659 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -183,7 +183,7 @@ DEFINE_BASIC_FETCH_FUNCS(bitfield) #define fetch_bitfield_string NULL #define fetch_bitfield_string_size NULL -static __kprobes void +static void update_bitfield_fetch_param(struct bitfield_fetch_param *data) { /* @@ -196,7 +196,7 @@ update_bitfield_fetch_param(struct bitfield_fetch_param *data) update_symbol_cache(data->orig.data); } -static __kprobes void +static void free_bitfield_fetch_param(struct bitfield_fetch_param *data) { /* diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c082a7441345..991e3b7c4edb 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -108,8 +108,8 @@ static unsigned long get_user_stack_nth(struct pt_regs *regs, unsigned int n) * Uprobes-specific fetch functions */ #define DEFINE_FETCH_stack(type) \ -static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ - void *offset, void *dest) \ +static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \ + void *offset, void *dest) \ { \ *(type *)dest = (type)get_user_stack_nth(regs, \ ((unsigned long)offset)); \ @@ -120,8 +120,8 @@ DEFINE_BASIC_FETCH_FUNCS(stack) #define fetch_stack_string_size NULL #define DEFINE_FETCH_memory(type) \ -static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ - void *addr, void *dest) \ +static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \ + void *addr, void *dest) \ { \ type retval; \ void __user *vaddr = (void __force __user *) addr; \ @@ -136,8 +136,8 @@ DEFINE_BASIC_FETCH_FUNCS(memory) * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max * length and relative data location.
*/ -static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, - void *addr, void *dest) +static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, + void *addr, void *dest) { long ret; u32 rloc = *(u32 *)dest; @@ -158,8 +158,8 @@ static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, } } -static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, - void *addr, void *dest) +static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, + void *addr, void *dest) { int len; void __user *vaddr = (void __force __user *) addr; @@ -184,8 +184,8 @@ static unsigned long translate_user_vaddr(void *file_offset) } #define DEFINE_FETCH_file_offset(type) \ -static __kprobes void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs,\ - void *offset, void *dest) \ +static void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs, \ + void *offset, void *dest)\ { \ void *vaddr = (void *)translate_user_vaddr(offset); \ \ -- cgit From 820aede0209a51549e8a014c8030e29229920e4e Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 17 Apr 2014 17:18:21 +0900 Subject: kprobes: Use NOKPROBE_SYMBOL macro instead of __kprobes Use NOKPROBE_SYMBOL macro to protect functions from kprobes instead of __kprobes annotation. Signed-off-by: Masami Hiramatsu Reviewed-by: Steven Rostedt Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S. Miller Link: http://lkml.kernel.org/r/20140417081821.26341.40362.stgit@ltc230.yrl.intra.hitachi.co.jp Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 67 ++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 41 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 4db2cc616f50..a21b4e67fd97 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -301,7 +301,7 @@ static inline void reset_kprobe_instance(void) * OR * - with preemption disabled - from arch/xxx/kernel/kprobes.c */ -struct kprobe __kprobes *get_kprobe(void *addr) +struct kprobe *get_kprobe(void *addr) { struct hlist_head *head; struct kprobe *p; @@ -314,8 +314,9 @@ struct kprobe __kprobes *get_kprobe(void *addr) return NULL; } +NOKPROBE_SYMBOL(get_kprobe); -static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs); +static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs); /* Return true if the kprobe is an aggregator */ static inline int kprobe_aggrprobe(struct kprobe *p) @@ -347,7 +348,7 @@ static bool kprobes_allow_optimization; * Call all pre_handler on the list, but ignores its return value. * This must be called from arch-dep optimized caller. 
*/ -void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs) +void opt_pre_handler(struct kprobe *p, struct pt_regs *regs) { struct kprobe *kp; @@ -359,6 +360,7 @@ void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs) reset_kprobe_instance(); } } +NOKPROBE_SYMBOL(opt_pre_handler); /* Free optimized instructions and optimized_kprobe */ static void free_aggr_kprobe(struct kprobe *p) @@ -995,7 +997,7 @@ static void disarm_kprobe(struct kprobe *kp, bool reopt) * Aggregate handlers for multiple kprobes support - these handlers * take care of invoking the individual kprobe handlers on p->list */ -static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) +static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) { struct kprobe *kp; @@ -1009,9 +1011,10 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) } return 0; } +NOKPROBE_SYMBOL(aggr_pre_handler); -static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, - unsigned long flags) +static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, + unsigned long flags) { struct kprobe *kp; @@ -1023,9 +1026,10 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, } } } +NOKPROBE_SYMBOL(aggr_post_handler); -static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, - int trapnr) +static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, + int trapnr) { struct kprobe *cur = __this_cpu_read(kprobe_instance); @@ -1039,8 +1043,9 @@ static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, } return 0; } +NOKPROBE_SYMBOL(aggr_fault_handler); -static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) +static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs) { struct kprobe *cur = __this_cpu_read(kprobe_instance); int ret = 0; @@ -1052,9 +1057,10 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) reset_kprobe_instance(); return ret; } +NOKPROBE_SYMBOL(aggr_break_handler); /* Walks the list and increments nmissed count for multiprobe case */ -void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) +void kprobes_inc_nmissed_count(struct kprobe *p) { struct kprobe *kp; if (!kprobe_aggrprobe(p)) { @@ -1065,9 +1071,10 @@ void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) } return; } +NOKPROBE_SYMBOL(kprobes_inc_nmissed_count); -void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, - struct hlist_head *head) +void recycle_rp_inst(struct kretprobe_instance *ri, + struct hlist_head *head) { struct kretprobe *rp = ri->rp; @@ -1082,8 +1089,9 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, /* Unregistering */ hlist_add_head(&ri->hlist, head); } +NOKPROBE_SYMBOL(recycle_rp_inst); -void __kprobes kretprobe_hash_lock(struct task_struct *tsk, +void kretprobe_hash_lock(struct task_struct *tsk, struct hlist_head **head, unsigned long *flags) __acquires(hlist_lock) { @@ -1094,17 +1102,19 @@ __acquires(hlist_lock) hlist_lock = kretprobe_table_lock_ptr(hash); raw_spin_lock_irqsave(hlist_lock, *flags); } +NOKPROBE_SYMBOL(kretprobe_hash_lock); -static void __kprobes kretprobe_table_lock(unsigned long hash, - unsigned long *flags) +static void kretprobe_table_lock(unsigned long hash, + unsigned long *flags) __acquires(hlist_lock) { raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); raw_spin_lock_irqsave(hlist_lock, *flags); } +NOKPROBE_SYMBOL(kretprobe_table_lock); 
-void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, - unsigned long *flags) +void kretprobe_hash_unlock(struct task_struct *tsk, + unsigned long *flags) __releases(hlist_lock) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); @@ -1113,14 +1123,16 @@ __releases(hlist_lock) hlist_lock = kretprobe_table_lock_ptr(hash); raw_spin_unlock_irqrestore(hlist_lock, *flags); } +NOKPROBE_SYMBOL(kretprobe_hash_unlock); -static void __kprobes kretprobe_table_unlock(unsigned long hash, - unsigned long *flags) +static void kretprobe_table_unlock(unsigned long hash, + unsigned long *flags) __releases(hlist_lock) { raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); raw_spin_unlock_irqrestore(hlist_lock, *flags); } +NOKPROBE_SYMBOL(kretprobe_table_unlock); /* * This function is called from finish_task_switch when task tk becomes dead, @@ -1128,7 +1140,7 @@ __releases(hlist_lock) * with this task. These left over instances represent probed functions * that have been called but will never return. */ -void __kprobes kprobe_flush_task(struct task_struct *tk) +void kprobe_flush_task(struct task_struct *tk) { struct kretprobe_instance *ri; struct hlist_head *head, empty_rp; @@ -1153,6 +1165,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) kfree(ri); } } +NOKPROBE_SYMBOL(kprobe_flush_task); static inline void free_rp_inst(struct kretprobe *rp) { @@ -1165,7 +1178,7 @@ static inline void free_rp_inst(struct kretprobe *rp) } } -static void __kprobes cleanup_rp_inst(struct kretprobe *rp) +static void cleanup_rp_inst(struct kretprobe *rp) { unsigned long flags, hash; struct kretprobe_instance *ri; @@ -1184,6 +1197,7 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp) } free_rp_inst(rp); } +NOKPROBE_SYMBOL(cleanup_rp_inst); /* * Add the new probe to ap->list. Fail if this is the @@ -1758,8 +1772,7 @@ EXPORT_SYMBOL_GPL(unregister_jprobes); * This kprobe pre_handler is registered with every kretprobe. When probe * hits it will set up the return probe. */ -static int __kprobes pre_handler_kretprobe(struct kprobe *p, - struct pt_regs *regs) +static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) { struct kretprobe *rp = container_of(p, struct kretprobe, kp); unsigned long hash, flags = 0; @@ -1797,6 +1810,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, } return 0; } +NOKPROBE_SYMBOL(pre_handler_kretprobe); int register_kretprobe(struct kretprobe *rp) { @@ -1920,11 +1934,11 @@ void unregister_kretprobes(struct kretprobe **rps, int num) } EXPORT_SYMBOL_GPL(unregister_kretprobes); -static int __kprobes pre_handler_kretprobe(struct kprobe *p, - struct pt_regs *regs) +static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) { return 0; } +NOKPROBE_SYMBOL(pre_handler_kretprobe); #endif /* CONFIG_KRETPROBES */ @@ -2002,12 +2016,13 @@ out: } EXPORT_SYMBOL_GPL(enable_kprobe); -void __kprobes dump_kprobe(struct kprobe *kp) +void dump_kprobe(struct kprobe *kp) { printk(KERN_WARNING "Dumping kprobe:\n"); printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n", kp->symbol_name, kp->addr, kp->offset); } +NOKPROBE_SYMBOL(dump_kprobe); /* * Lookup and populate the kprobe_blacklist. -- cgit From 3da0f18007e5b87b573cf6ae8c445d59e757d274 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 17 Apr 2014 17:18:28 +0900 Subject: kprobes, ftrace: Use NOKPROBE_SYMBOL macro in ftrace Use NOKPROBE_SYMBOL macro to protect functions from kprobes instead of __kprobes annotation in ftrace. 
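For illustration, the conversion pattern applied throughout this series looks like the sketch below; my_handler is a hypothetical function used only as an example, not code from this patch:

	/* Before: __kprobes moves the function into the .kprobes.text section */
	static int __kprobes my_handler(struct kprobe *p, struct pt_regs *regs)
	{
		return 0;
	}

	/* After: the definition stays plain C; NOKPROBE_SYMBOL() records the
	 * function's address in the _kprobe_blacklist section, which
	 * populate_kprobe_blacklist() walks at init time.
	 */
	static int my_handler(struct kprobe *p, struct pt_regs *regs)
	{
		return 0;
	}
	NOKPROBE_SYMBOL(my_handler);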
This applies nokprobe_inline annotation for some cases, because NOKPROBE_SYMBOL() will inhibit inlining by referring the symbol address. Signed-off-by: Masami Hiramatsu Cc: Frederic Weisbecker Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20140417081828.26341.55152.stgit@ltc230.yrl.intra.hitachi.co.jp Signed-off-by: Ingo Molnar --- kernel/trace/trace_event_perf.c | 5 ++-- kernel/trace/trace_kprobe.c | 66 ++++++++++++++++++++++++----------------- kernel/trace/trace_probe.c | 61 ++++++++++++++++++++----------------- kernel/trace/trace_probe.h | 15 +++++----- 4 files changed, 82 insertions(+), 65 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index c894614de14d..5d12bb407b44 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -248,8 +248,8 @@ void perf_trace_del(struct perf_event *p_event, int flags) tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event); } -__kprobes void *perf_trace_buf_prepare(int size, unsigned short type, - struct pt_regs *regs, int *rctxp) +void *perf_trace_buf_prepare(int size, unsigned short type, + struct pt_regs *regs, int *rctxp) { struct trace_entry *entry; unsigned long flags; @@ -281,6 +281,7 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, return raw_data; } EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); +NOKPROBE_SYMBOL(perf_trace_buf_prepare); #ifdef CONFIG_FUNCTION_TRACER static void diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index aa5f0bfcdf7b..242e4ec97d94 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -40,27 +40,27 @@ struct trace_kprobe { (sizeof(struct probe_arg) * (n))) -static __kprobes bool trace_kprobe_is_return(struct trace_kprobe *tk) +static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) { return tk->rp.handler != NULL; } -static __kprobes const char *trace_kprobe_symbol(struct trace_kprobe *tk) +static nokprobe_inline const char *trace_kprobe_symbol(struct trace_kprobe *tk) { return tk->symbol ? 
tk->symbol : "unknown"; } -static __kprobes unsigned long trace_kprobe_offset(struct trace_kprobe *tk) +static nokprobe_inline unsigned long trace_kprobe_offset(struct trace_kprobe *tk) { return tk->rp.kp.offset; } -static __kprobes bool trace_kprobe_has_gone(struct trace_kprobe *tk) +static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk) { return !!(kprobe_gone(&tk->rp.kp)); } -static __kprobes bool trace_kprobe_within_module(struct trace_kprobe *tk, +static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk, struct module *mod) { int len = strlen(mod->name); @@ -68,7 +68,7 @@ static __kprobes bool trace_kprobe_within_module(struct trace_kprobe *tk, return strncmp(mod->name, name, len) == 0 && name[len] == ':'; } -static __kprobes bool trace_kprobe_is_on_module(struct trace_kprobe *tk) +static nokprobe_inline bool trace_kprobe_is_on_module(struct trace_kprobe *tk) { return !!strchr(trace_kprobe_symbol(tk), ':'); } @@ -132,19 +132,21 @@ struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) * Kprobes-specific fetch functions */ #define DEFINE_FETCH_stack(type) \ -static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ +static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \ void *offset, void *dest) \ { \ *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ (unsigned int)((unsigned long)offset)); \ -} +} \ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(stack, type)); + DEFINE_BASIC_FETCH_FUNCS(stack) /* No string on the stack entry */ #define fetch_stack_string NULL #define fetch_stack_string_size NULL #define DEFINE_FETCH_memory(type) \ -static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ +static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \ void *addr, void *dest) \ { \ type retval; \ @@ -152,14 +154,16 @@ static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ *(type *)dest = 0; \ else \ *(type *)dest = retval; \ -} +} \ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, type)); + DEFINE_BASIC_FETCH_FUNCS(memory) /* * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max * length and relative data location. 
*/ -static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, - void *addr, void *dest) +static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, + void *addr, void *dest) { long ret; int maxlen = get_rloc_len(*(u32 *)dest); @@ -193,10 +197,11 @@ static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, get_rloc_offs(*(u32 *)dest)); } } +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string)); /* Return the length of string -- including null terminal byte */ -static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, - void *addr, void *dest) +static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, + void *addr, void *dest) { mm_segment_t old_fs; int ret, len = 0; @@ -219,17 +224,19 @@ static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, else *(u32 *)dest = len; } +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string_size)); #define DEFINE_FETCH_symbol(type) \ -__kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, \ - void *data, void *dest) \ +void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, void *data, void *dest)\ { \ struct symbol_cache *sc = data; \ if (sc->addr) \ fetch_memory_##type(regs, (void *)sc->addr, dest); \ else \ *(type *)dest = 0; \ -} +} \ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(symbol, type)); + DEFINE_BASIC_FETCH_FUNCS(symbol) DEFINE_FETCH_symbol(string) DEFINE_FETCH_symbol(string_size) @@ -907,7 +914,7 @@ static const struct file_operations kprobe_profile_ops = { }; /* Kprobe handler */ -static __kprobes void +static nokprobe_inline void __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, struct ftrace_event_file *ftrace_file) { @@ -943,7 +950,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, entry, irq_flags, pc, regs); } -static __kprobes void +static void kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct event_file_link *link; @@ -951,9 +958,10 @@ kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs) list_for_each_entry_rcu(link, &tk->tp.files, list) __kprobe_trace_func(tk, regs, link->file); } +NOKPROBE_SYMBOL(kprobe_trace_func); /* Kretprobe handler */ -static __kprobes void +static nokprobe_inline void __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs, struct ftrace_event_file *ftrace_file) @@ -991,7 +999,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, entry, irq_flags, pc, regs); } -static __kprobes void +static void kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs) { @@ -1000,6 +1008,7 @@ kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, list_for_each_entry_rcu(link, &tk->tp.files, list) __kretprobe_trace_func(tk, ri, regs, link->file); } +NOKPROBE_SYMBOL(kretprobe_trace_func); /* Event entry printers */ static enum print_line_t @@ -1131,7 +1140,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) #ifdef CONFIG_PERF_EVENTS /* Kprobe profile handler */ -static __kprobes void +static void kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct ftrace_event_call *call = &tk->tp.call; @@ -1158,9 +1167,10 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); } +NOKPROBE_SYMBOL(kprobe_perf_func); /* Kretprobe profile handler */ 
-static __kprobes void +static void kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs) { @@ -1188,6 +1198,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); } +NOKPROBE_SYMBOL(kretprobe_perf_func); #endif /* CONFIG_PERF_EVENTS */ /* @@ -1223,8 +1234,7 @@ static int kprobe_register(struct ftrace_event_call *event, return 0; } -static __kprobes -int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) +static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) { struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); @@ -1238,9 +1248,10 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) #endif return 0; /* We don't tweek kernel, so just return 0 */ } +NOKPROBE_SYMBOL(kprobe_dispatcher); -static __kprobes -int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) +static int +kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) { struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp); @@ -1254,6 +1265,7 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) #endif return 0; /* We don't tweek kernel, so just return 0 */ } +NOKPROBE_SYMBOL(kretprobe_dispatcher); static struct trace_event_functions kretprobe_funcs = { .trace = print_kretprobe_event diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index d3a91e40a659..d4b9fc22cd27 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -37,13 +37,13 @@ const char *reserved_field_names[] = { /* Printing in basic type function template */ #define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt) \ -__kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ - const char *name, \ - void *data, void *ent) \ +int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \ + void *data, void *ent) \ { \ return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ } \ -const char PRINT_TYPE_FMT_NAME(type)[] = fmt; +const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \ +NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type)); DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x") DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x") @@ -55,9 +55,8 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%d") DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld") /* Print type function for string type */ -__kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, - const char *name, - void *data, void *ent) +int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name, + void *data, void *ent) { int len = *(u32 *)data >> 16; @@ -67,6 +66,7 @@ __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, return trace_seq_printf(s, " %s=\"%s\"", name, (const char *)get_loc_data(data, ent)); } +NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string)); const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; @@ -81,23 +81,24 @@ const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; /* Data fetch function templates */ #define DEFINE_FETCH_reg(type) \ -__kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ - void *offset, void *dest) \ +void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, void *offset, void *dest) \ { \ *(type *)dest = (type)regs_get_register(regs, \ (unsigned int)((unsigned long)offset)); \ -} +} \ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(reg, type)); DEFINE_BASIC_FETCH_FUNCS(reg) /* No string on the register */ #define 
fetch_reg_string NULL #define fetch_reg_string_size NULL #define DEFINE_FETCH_retval(type) \ -__kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs, \ - void *dummy, void *dest) \ +void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs, \ + void *dummy, void *dest) \ { \ *(type *)dest = (type)regs_return_value(regs); \ -} +} \ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(retval, type)); DEFINE_BASIC_FETCH_FUNCS(retval) /* No string on the retval */ #define fetch_retval_string NULL @@ -112,8 +113,8 @@ struct deref_fetch_param { }; #define DEFINE_FETCH_deref(type) \ -__kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \ - void *data, void *dest) \ +void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \ + void *data, void *dest) \ { \ struct deref_fetch_param *dprm = data; \ unsigned long addr; \ @@ -123,12 +124,13 @@ __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \ dprm->fetch(regs, (void *)addr, dest); \ } else \ *(type *)dest = 0; \ -} +} \ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, type)); DEFINE_BASIC_FETCH_FUNCS(deref) DEFINE_FETCH_deref(string) -__kprobes void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs, - void *data, void *dest) +void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs, + void *data, void *dest) { struct deref_fetch_param *dprm = data; unsigned long addr; @@ -140,16 +142,18 @@ __kprobes void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs, } else *(string_size *)dest = 0; } +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, string_size)); -static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data) +static void update_deref_fetch_param(struct deref_fetch_param *data) { if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) update_deref_fetch_param(data->orig.data); else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) update_symbol_cache(data->orig.data); } +NOKPROBE_SYMBOL(update_deref_fetch_param); -static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) +static void free_deref_fetch_param(struct deref_fetch_param *data) { if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) free_deref_fetch_param(data->orig.data); @@ -157,6 +161,7 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) free_symbol_cache(data->orig.data); kfree(data); } +NOKPROBE_SYMBOL(free_deref_fetch_param); /* Bitfield fetch function */ struct bitfield_fetch_param { @@ -166,8 +171,8 @@ struct bitfield_fetch_param { }; #define DEFINE_FETCH_bitfield(type) \ -__kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \ - void *data, void *dest) \ +void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \ + void *data, void *dest) \ { \ struct bitfield_fetch_param *bprm = data; \ type buf = 0; \ @@ -177,8 +182,8 @@ __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \ buf >>= bprm->low_shift; \ } \ *(type *)dest = buf; \ -} - +} \ +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(bitfield, type)); DEFINE_BASIC_FETCH_FUNCS(bitfield) #define fetch_bitfield_string NULL #define fetch_bitfield_string_size NULL @@ -255,17 +260,17 @@ fail: } /* Special function : only accept unsigned long */ -static __kprobes void fetch_kernel_stack_address(struct pt_regs *regs, - void *dummy, void *dest) +static void fetch_kernel_stack_address(struct pt_regs *regs, void *dummy, void *dest) { *(unsigned long *)dest = kernel_stack_pointer(regs); } +NOKPROBE_SYMBOL(fetch_kernel_stack_address); -static __kprobes void fetch_user_stack_address(struct pt_regs *regs, - void *dummy, void *dest) +static void 
fetch_user_stack_address(struct pt_regs *regs, void *dummy, void *dest) { *(unsigned long *)dest = user_stack_pointer(regs); } +NOKPROBE_SYMBOL(fetch_user_stack_address); static fetch_func_t get_fetch_size_function(const struct fetch_type *type, fetch_func_t orig_fn, diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index fb1ab5dfbd42..4f815fbce16d 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -81,13 +81,13 @@ */ #define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) -static inline void *get_rloc_data(u32 *dl) +static nokprobe_inline void *get_rloc_data(u32 *dl) { return (u8 *)dl + get_rloc_offs(*dl); } /* For data_loc conversion */ -static inline void *get_loc_data(u32 *dl, void *ent) +static nokprobe_inline void *get_loc_data(u32 *dl, void *ent) { return (u8 *)ent + get_rloc_offs(*dl); } @@ -136,9 +136,8 @@ typedef u32 string_size; /* Printing in basic type function template */ #define DECLARE_BASIC_PRINT_TYPE_FUNC(type) \ -__kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ - const char *name, \ - void *data, void *ent); \ +int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \ + void *data, void *ent); \ extern const char PRINT_TYPE_FMT_NAME(type)[] DECLARE_BASIC_PRINT_TYPE_FUNC(u8); @@ -303,7 +302,7 @@ static inline bool trace_probe_is_registered(struct trace_probe *tp) return !!(tp->flags & TP_FLAG_REGISTERED); } -static inline __kprobes void call_fetch(struct fetch_param *fprm, +static nokprobe_inline void call_fetch(struct fetch_param *fprm, struct pt_regs *regs, void *dest) { return fprm->fn(regs, fprm->data, dest); @@ -351,7 +350,7 @@ extern ssize_t traceprobe_probes_write(struct file *file, extern int traceprobe_command(const char *buf, int (*createfn)(int, char**)); /* Sum up total data length for dynamic arraies (strings) */ -static inline __kprobes int +static nokprobe_inline int __get_data_size(struct trace_probe *tp, struct pt_regs *regs) { int i, ret = 0; @@ -367,7 +366,7 @@ __get_data_size(struct trace_probe *tp, struct pt_regs *regs) } /* Store the value of each argument */ -static inline __kprobes void +static nokprobe_inline void store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs, u8 *data, int maxlen) { -- cgit From b40a2cb6e0c218c6d0f12c7f7e683e75972973c1 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 17 Apr 2014 17:18:35 +0900 Subject: kprobes, notifier: Use NOKPROBE_SYMBOL macro in notifier Use NOKPROBE_SYMBOL macro to protect functions from kprobes instead of __kprobes annotation in notifier. Signed-off-by: Masami Hiramatsu Reviewed-by: Steven Rostedt Reviewed-by: Josh Triplett Cc: Paul E. McKenney Link: http://lkml.kernel.org/r/20140417081835.26341.56128.stgit@ltc230.yrl.intra.hitachi.co.jp Signed-off-by: Ingo Molnar --- kernel/notifier.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/notifier.c b/kernel/notifier.c index db4c8b08a50c..4803da6eab62 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -71,9 +71,9 @@ static int notifier_chain_unregister(struct notifier_block **nl, * @returns: notifier_call_chain returns the value returned by the * last notifier function called. 
*/ -static int __kprobes notifier_call_chain(struct notifier_block **nl, - unsigned long val, void *v, - int nr_to_call, int *nr_calls) +static int notifier_call_chain(struct notifier_block **nl, + unsigned long val, void *v, + int nr_to_call, int *nr_calls) { int ret = NOTIFY_DONE; struct notifier_block *nb, *next_nb; @@ -102,6 +102,7 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl, } return ret; } +NOKPROBE_SYMBOL(notifier_call_chain); /* * Atomic notifier chain routines. Registration and unregistration @@ -172,9 +173,9 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); * Otherwise the return value is the return value * of the last notifier function called. */ -int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh, - unsigned long val, void *v, - int nr_to_call, int *nr_calls) +int __atomic_notifier_call_chain(struct atomic_notifier_head *nh, + unsigned long val, void *v, + int nr_to_call, int *nr_calls) { int ret; @@ -184,13 +185,15 @@ int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh, return ret; } EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain); +NOKPROBE_SYMBOL(__atomic_notifier_call_chain); -int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh, - unsigned long val, void *v) +int atomic_notifier_call_chain(struct atomic_notifier_head *nh, + unsigned long val, void *v) { return __atomic_notifier_call_chain(nh, val, v, -1, NULL); } EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); +NOKPROBE_SYMBOL(atomic_notifier_call_chain); /* * Blocking notifier chain routines. All access to the chain is @@ -527,7 +530,7 @@ EXPORT_SYMBOL_GPL(srcu_init_notifier_head); static ATOMIC_NOTIFIER_HEAD(die_chain); -int notrace __kprobes notify_die(enum die_val val, const char *str, +int notrace notify_die(enum die_val val, const char *str, struct pt_regs *regs, long err, int trap, int sig) { struct die_args args = { @@ -540,6 +543,7 @@ int notrace __kprobes notify_die(enum die_val val, const char *str, }; return atomic_notifier_call_chain(&die_chain, val, &args); } +NOKPROBE_SYMBOL(notify_die); int register_die_notifier(struct notifier_block *nb) { -- cgit From edafe3a56dbd42c499245b222e9f7e80099356e5 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 17 Apr 2014 17:18:42 +0900 Subject: kprobes, sched: Use NOKPROBE_SYMBOL macro in sched Use NOKPROBE_SYMBOL macro to protect functions from kprobes instead of __kprobes annotation in sched/core.c. 
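For background, these two functions must stay unprobeable because the kprobes breakpoint handler itself disables and re-enables preemption; with CONFIG_DEBUG_PREEMPT that path calls preempt_count_add()/preempt_count_sub(), so a probe placed on either of them would recurse into the handler. The effect can be checked with a trivial test module; the sketch below is illustrative only and is not part of this patch:

	#include <linux/module.h>
	#include <linux/kprobes.h>

	/* Hypothetical sanity check: registering a kprobe on a
	 * NOKPROBE_SYMBOL()-protected function should be rejected. */
	static struct kprobe test_kp = {
		.symbol_name	= "preempt_count_add",
	};

	static int __init blacklist_test_init(void)
	{
		int ret = register_kprobe(&test_kp);

		/* Expect an error (-EINVAL once the address resolves into
		 * the kprobe blacklist). */
		pr_info("register_kprobe() returned %d\n", ret);
		if (!ret)
			unregister_kprobe(&test_kp);
		return 0;
	}

	static void __exit blacklist_test_exit(void)
	{
	}

	module_init(blacklist_test_init);
	module_exit(blacklist_test_exit);
	MODULE_LICENSE("GPL");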
Signed-off-by: Masami Hiramatsu Reviewed-by: Steven Rostedt Cc: Peter Zijlstra Signed-off-by: Ingo Molnar Link: http://lkml.kernel.org/r/20140417081842.26341.83959.stgit@ltc230.yrl.intra.hitachi.co.jp --- kernel/sched/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6863631e8cd0..00781cc38047 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2480,7 +2480,7 @@ notrace unsigned long get_parent_ip(unsigned long addr) #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_PREEMPT_TRACER)) -void __kprobes preempt_count_add(int val) +void preempt_count_add(int val) { #ifdef CONFIG_DEBUG_PREEMPT /* @@ -2506,8 +2506,9 @@ void __kprobes preempt_count_add(int val) } } EXPORT_SYMBOL(preempt_count_add); +NOKPROBE_SYMBOL(preempt_count_add); -void __kprobes preempt_count_sub(int val) +void preempt_count_sub(int val) { #ifdef CONFIG_DEBUG_PREEMPT /* @@ -2528,6 +2529,7 @@ void __kprobes preempt_count_sub(int val) __preempt_count_sub(val); } EXPORT_SYMBOL(preempt_count_sub); +NOKPROBE_SYMBOL(preempt_count_sub); #endif -- cgit From 637247403abff8c963bc7be8002b3f49ea604563 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 17 Apr 2014 17:18:49 +0900 Subject: kprobes: Show blacklist entries via debugfs Show blacklist entries (function names with the address range) via /sys/kernel/debug/kprobes/blacklist. Note that at this point the blacklist supports only in vmlinux, not module. So the list is fixed and not updated. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S. Miller Link: http://lkml.kernel.org/r/20140417081849.26341.11609.stgit@ltc230.yrl.intra.hitachi.co.jp Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 53 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index a21b4e67fd97..3214289df5a7 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2249,6 +2249,46 @@ static const struct file_operations debugfs_kprobes_operations = { .release = seq_release, }; +/* kprobes/blacklist -- shows which functions can not be probed */ +static void *kprobe_blacklist_seq_start(struct seq_file *m, loff_t *pos) +{ + return seq_list_start(&kprobe_blacklist, *pos); +} + +static void *kprobe_blacklist_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &kprobe_blacklist, pos); +} + +static int kprobe_blacklist_seq_show(struct seq_file *m, void *v) +{ + struct kprobe_blacklist_entry *ent = + list_entry(v, struct kprobe_blacklist_entry, list); + + seq_printf(m, "0x%p-0x%p\t%ps\n", (void *)ent->start_addr, + (void *)ent->end_addr, (void *)ent->start_addr); + return 0; +} + +static const struct seq_operations kprobe_blacklist_seq_ops = { + .start = kprobe_blacklist_seq_start, + .next = kprobe_blacklist_seq_next, + .stop = kprobe_seq_stop, /* Reuse void function */ + .show = kprobe_blacklist_seq_show, +}; + +static int kprobe_blacklist_open(struct inode *inode, struct file *filp) +{ + return seq_open(filp, &kprobe_blacklist_seq_ops); +} + +static const struct file_operations debugfs_kprobe_blacklist_ops = { + .open = kprobe_blacklist_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + static void arm_all_kprobes(void) { struct hlist_head *head; @@ -2372,19 +2412,24 @@ static int __init debugfs_kprobe_init(void) file = debugfs_create_file("list", 
0444, dir, NULL, &debugfs_kprobes_operations); - if (!file) { - debugfs_remove(dir); - return -ENOMEM; - } + if (!file) + goto error; file = debugfs_create_file("enabled", 0600, dir, &value, &fops_kp); - if (!file) { - debugfs_remove(dir); - return -ENOMEM; - } + if (!file) + goto error; + + file = debugfs_create_file("blacklist", 0444, dir, NULL, + &debugfs_kprobe_blacklist_ops); + if (!file) + goto error; return 0; + +error: + debugfs_remove(dir); + return -ENOMEM; } late_initcall(debugfs_kprobe_init); -- cgit From 7eea4fce0246fe3a15ad7f3bb8d0a56d1f9440e6 Mon Sep 17 00:00:00 2001 From: Jiaxing Wang Date: Sun, 20 Apr 2014 23:10:43 +0800 Subject: tracing/stack_trace: Skip 4 instead of 3 when using ftrace_ops_list_func When using ftrace_ops_list_func, we should skip 4 instead of 3, to avoid ftrace_call+0x5/0xb appearing in the stack trace: Depth Size Location (110 entries) ----- ---- -------- 0) 2956 0 update_curr+0xe/0x1e0 1) 2956 68 ftrace_call+0x5/0xb 2) 2888 92 enqueue_entity+0x53/0xe80 3) 2796 80 enqueue_task_fair+0x47/0x7e0 4) 2716 28 enqueue_task+0x45/0x70 5) 2688 12 activate_task+0x22/0x30 Add a function using_ftrace_ops_list_func() to test for this while keeping ftrace_ops_list_func to remain static. Link: http://lkml.kernel.org/p/1398006644-5935-2-git-send-email-wangjiaxing@insigma.com.cn Signed-off-by: Jiaxing Wang Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 5 +++++ kernel/trace/trace.h | 1 + kernel/trace/trace_stack.c | 8 ++++++-- 3 files changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 846888ea2ba4..3cfeb66bda67 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -313,6 +313,11 @@ static void update_ftrace_function(void) ftrace_trace_function = func; } +int using_ftrace_ops_list_func(void) +{ + return ftrace_trace_function == ftrace_ops_list_func; +} + static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) { ops->next = *list; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5d2f07d6746c..8624b5041466 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -830,6 +830,7 @@ void ftrace_destroy_function_files(struct trace_array *tr); void ftrace_init_global_array_ops(struct trace_array *tr); void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); void ftrace_reset_array_ops(struct trace_array *tr); +int using_ftrace_ops_list_func(void); #else static inline int ftrace_trace_task(struct task_struct *task) { diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 21b320e5d163..5aa9a5b9b6e2 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -85,8 +85,12 @@ check_stack(unsigned long ip, unsigned long *stack) max_stack_size = this_size; - max_stack_trace.nr_entries = 0; - max_stack_trace.skip = 3; + max_stack_trace.nr_entries = 0; + + if (using_ftrace_ops_list_func()) + max_stack_trace.skip = 4; + else + max_stack_trace.skip = 3; save_stack_trace(&max_stack_trace); -- cgit From 8d1b065d47ff1424b1aef7a6aa605b467694c120 Mon Sep 17 00:00:00 2001 From: Jiaxing Wang Date: Sun, 20 Apr 2014 23:10:44 +0800 Subject: tracing: Fix documentation of ftrace_set_global_{filter,notrace}() The functions ftrace_set_global_filter() and ftrace_set_global_notrace() still have their old names in the kernel doc (ftrace_set_filter and ftrace_set_notrace respectively). Replace these with the real names. 
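For reference, a hypothetical in-kernel caller (assuming CONFIG_DYNAMIC_FTRACE; this snippet is not from the patch) would use the renamed helpers like this:

	unsigned char buf[] = "schedule*";
	unsigned char skip[] = "printk";

	/* Restrict the global function tracer to schedule*() ... */
	ftrace_set_global_filter(buf, sizeof(buf) - 1, 1 /* reset old filters */);
	/* ... and keep printk() out of the trace. */
	ftrace_set_global_notrace(skip, sizeof(skip) - 1, 1);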
Link: http://lkml.kernel.org/p/1398006644-5935-3-git-send-email-wangjiaxing@insigma.com.cn Signed-off-by: Jiaxing Wang Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3cfeb66bda67..34b098bfded4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3542,8 +3542,7 @@ int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, } EXPORT_SYMBOL_GPL(ftrace_set_notrace); /** - * ftrace_set_filter - set a function to filter on in ftrace - * @ops - the ops to set the filter with + * ftrace_set_global_filter - set a function to filter on with global tracers * @buf - the string that holds the function filter text. * @len - the length of the string. * @reset - non zero to reset all filters before applying this filter. @@ -3558,8 +3557,7 @@ void ftrace_set_global_filter(unsigned char *buf, int len, int reset) EXPORT_SYMBOL_GPL(ftrace_set_global_filter); /** - * ftrace_set_notrace - set a function to not trace in ftrace - * @ops - the ops to set the notrace filter with + * ftrace_set_global_notrace - set a function to not trace with global tracers * @buf - the string that holds the function notrace text. * @len - the length of the string. * @reset - non zero to reset all filters before applying this filter. -- cgit From 86d56134f1b67d0c18025ba5cade95c048ed528d Mon Sep 17 00:00:00 2001 From: Michael Marineau Date: Thu, 10 Apr 2014 14:09:31 -0700 Subject: kobject: Make support for uevent_helper optional. Support for uevent_helper, aka hotplug, is not required on many systems these days but it can still be enabled via sysfs or sysctl. Reported-by: Darren Shepherd Signed-off-by: Michael Marineau Signed-off-by: Greg Kroah-Hartman --- drivers/base/Kconfig | 17 +++++++++++------ include/linux/kobject.h | 2 ++ kernel/ksysfs.c | 5 ++++- kernel/sysctl.c | 4 ++-- lib/kobject_uevent.c | 6 ++++++ 5 files changed, 25 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index 8fa8deab6449..4b7b4522b64f 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@ -1,10 +1,10 @@ menu "Generic Driver Options" -config UEVENT_HELPER_PATH - string "path to uevent helper" - default "" +config UEVENT_HELPER + bool "Support for uevent helper" + default y help - Path to uevent helper program forked by the kernel for + The uevent helper program is forked by the kernel for every uevent. Before the switch to the netlink-based uevent source, this was used to hook hotplug scripts into kernel device events. It @@ -15,8 +15,13 @@ config UEVENT_HELPER_PATH that it creates a high system load, or on smaller systems it is known to create out-of-memory situations during bootup. - To disable user space helper program execution at early boot - time specify an empty string here. This setting can be altered +config UEVENT_HELPER_PATH + string "path to uevent helper" + depends on UEVENT_HELPER + default "" + help + To disable user space helper program execution at by default + specify an empty string here. This setting can still be altered via /proc/sys/kernel/hotplug or via /sys/kernel/uevent_helper later at runtime. 
diff --git a/include/linux/kobject.h b/include/linux/kobject.h index f896a33e8341..2d61b909f414 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -32,8 +32,10 @@ #define UEVENT_NUM_ENVP 32 /* number of env pointers */ #define UEVENT_BUFFER_SIZE 2048 /* buffer for the variables */ +#ifdef CONFIG_UEVENT_HELPER /* path to the userspace helper executed on an event */ extern char uevent_helper[]; +#endif /* counter to tag the uevent, read only except for the kobject core */ extern u64 uevent_seqnum; diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 2495a9b14ac8..6683ccef9fff 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -37,6 +37,7 @@ static ssize_t uevent_seqnum_show(struct kobject *kobj, } KERNEL_ATTR_RO(uevent_seqnum); +#ifdef CONFIG_UEVENT_HELPER /* uevent helper program, used during early boot */ static ssize_t uevent_helper_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -56,7 +57,7 @@ static ssize_t uevent_helper_store(struct kobject *kobj, return count; } KERNEL_ATTR_RW(uevent_helper); - +#endif #ifdef CONFIG_PROFILING static ssize_t profiling_show(struct kobject *kobj, @@ -189,7 +190,9 @@ EXPORT_SYMBOL_GPL(kernel_kobj); static struct attribute * kernel_attrs[] = { &fscaps_attr.attr, &uevent_seqnum_attr.attr, +#ifdef CONFIG_UEVENT_HELPER &uevent_helper_attr.attr, +#endif #ifdef CONFIG_PROFILING &profiling_attr.attr, #endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 74f5b580fe34..bc966a8ffc3e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -643,7 +643,7 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, #endif - +#ifdef CONFIG_UEVENT_HELPER { .procname = "hotplug", .data = &uevent_helper, @@ -651,7 +651,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dostring, }, - +#endif #ifdef CONFIG_CHR_DEV_SG { .procname = "sg-big-buff", diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index 4e3bd71bd949..9ebf9e20de53 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -29,7 +29,9 @@ u64 uevent_seqnum; +#ifdef CONFIG_UEVENT_HELPER char uevent_helper[UEVENT_HELPER_PATH_LEN] = CONFIG_UEVENT_HELPER_PATH; +#endif #ifdef CONFIG_NET struct uevent_sock { struct list_head list; @@ -109,6 +111,7 @@ static int kobj_bcast_filter(struct sock *dsk, struct sk_buff *skb, void *data) } #endif +#ifdef CONFIG_UEVENT_HELPER static int kobj_usermode_filter(struct kobject *kobj) { const struct kobj_ns_type_operations *ops; @@ -147,6 +150,7 @@ static void cleanup_uevent_env(struct subprocess_info *info) { kfree(info->data); } +#endif /** * kobject_uevent_env - send an uevent with environmental data @@ -323,6 +327,7 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, #endif mutex_unlock(&uevent_sock_mutex); +#ifdef CONFIG_UEVENT_HELPER /* call uevent_helper, usually only enabled during early boot */ if (uevent_helper[0] && !kobj_usermode_filter(kobj)) { struct subprocess_info *info; @@ -347,6 +352,7 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, env = NULL; /* freed by cleanup_uevent_env */ } } +#endif exit: kfree(devpath); -- cgit From 842b597ee0a7e1aa5a3148164ffdba00ec17f614 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 25 Apr 2014 18:28:02 -0400 Subject: cgroup: implement cgroup.populated for the default hierarchy cgroup users often need a way to determine when a cgroup's subhierarchy becomes empty so that it can be cleaned up. cgroup currently provides release_agent for it; unfortunately, this mechanism is riddled with issues. 
* It delivers events by forking and execing a userland binary specified as the release_agent. This is a long deprecated method of notification delivery. It's extremely heavy, slow and cumbersome to integrate with larger infrastructure. * There is a single monitoring point at the root. There's no way to delegate management of a subtree. * The event isn't recursive. It triggers when a cgroup doesn't have any tasks or child cgroups. Events for internal nodes trigger only after all children are removed. This again makes it impossible to delegate management of a subtree. * Events are filtered from the kernel side. The "notify_on_release" file is used to subscribe to or suppress the release event. This is unnecessarily complicated and probably done this way because event delivery itself was expensive. This patch implements the interface file "cgroup.populated" which can be used to monitor whether the cgroup's subhierarchy has tasks in it or not. Its value is 0 if there is no task in the cgroup and its descendants; otherwise, 1, and a kernfs_notify() notification is triggered when the value changes, which can be monitored through poll and [di]notify. This is a lot lighter and simpler and trivially allows delegating management of a subhierarchy - subhierarchy monitoring can block further propagation simply by putting itself or another process in the root of the subhierarchy and monitoring events that it's interested in from there without interfering with monitoring higher in the tree. v2: Patch description updated as per Serge. v3: "cgroup.subtree_populated" renamed to "cgroup.populated". The subtree_ prefix was a bit confusing because "cgroup.subtree_control" uses it to denote the tree rooted at the cgroup sans the cgroup itself while the populated state includes the cgroup itself. Signed-off-by: Tejun Heo Acked-by: Serge Hallyn Acked-by: Li Zefan Cc: Lennart Poettering --- include/linux/cgroup.h | 15 ++++++++++++ kernel/cgroup.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 76 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ada239253ec7..4b38e2d6110d 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -154,6 +154,14 @@ struct cgroup { /* the number of attached css's */ int nr_css; + /* + * If this cgroup contains any tasks, it contributes one to + * populated_cnt. All children with non-zero popuplated_cnt of + * their own contribute one. The count is zero iff there's no task + * in this cgroup or its subtree. + */ + int populated_cnt; + atomic_t refcnt; /* @@ -166,6 +174,7 @@ struct cgroup { struct cgroup *parent; /* my parent */ struct kernfs_node *kn; /* cgroup kernfs entry */ struct kernfs_node *control_kn; /* kn for "cgroup.subtree_control" */ + struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */ /* * Monotonically increasing unique serial number which defines a @@ -264,6 +273,12 @@ enum { * * - "cgroup.clone_children" is removed. * + * - "cgroup.subtree_populated" is available. Its value is 0 if + * the cgroup and its descendants contain no task; otherwise, 1. + * The file also generates kernfs notification which can be + * monitored through poll and [di]notify when the value of the + * file changes. + * * - If mount is requested with sane_behavior but without any * subsystem, the default unified hierarchy is mounted.
* diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 809dd903ceb8..0f986f7afee4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -411,6 +411,43 @@ static struct css_set init_css_set = { static int css_set_count = 1; /* 1 for init_css_set */ +/** + * cgroup_update_populated - updated populated count of a cgroup + * @cgrp: the target cgroup + * @populated: inc or dec populated count + * + * @cgrp is either getting the first task (css_set) or losing the last. + * Update @cgrp->populated_cnt accordingly. The count is propagated + * towards root so that a given cgroup's populated_cnt is zero iff the + * cgroup and all its descendants are empty. + * + * @cgrp's interface file "cgroup.populated" is zero if + * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt + * changes from or to zero, userland is notified that the content of the + * interface file has changed. This can be used to detect when @cgrp and + * its descendants become populated or empty. + */ +static void cgroup_update_populated(struct cgroup *cgrp, bool populated) +{ + lockdep_assert_held(&css_set_rwsem); + + do { + bool trigger; + + if (populated) + trigger = !cgrp->populated_cnt++; + else + trigger = !--cgrp->populated_cnt; + + if (!trigger) + break; + + if (cgrp->populated_kn) + kernfs_notify(cgrp->populated_kn); + cgrp = cgrp->parent; + } while (cgrp); +} + /* * hash table for cgroup groups. This improves the performance to find * an existing css_set. This hash doesn't (currently) take into @@ -456,10 +493,13 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit) list_del(&link->cgrp_link); /* @cgrp can't go away while we're holding css_set_rwsem */ - if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) { - if (taskexit) - set_bit(CGRP_RELEASABLE, &cgrp->flags); - check_for_release(cgrp); + if (list_empty(&cgrp->cset_links)) { + cgroup_update_populated(cgrp, false); + if (notify_on_release(cgrp)) { + if (taskexit) + set_bit(CGRP_RELEASABLE, &cgrp->flags); + check_for_release(cgrp); + } } kfree(link); @@ -668,7 +708,11 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); link->cset = cset; link->cgrp = cgrp; + + if (list_empty(&cgrp->cset_links)) + cgroup_update_populated(cgrp, true); list_move(&link->cset_link, &cgrp->cset_links); + /* * Always add links to the tail of the list so that the list * is sorted by order of hierarchy creation @@ -2643,6 +2687,12 @@ err_undo_css: goto out_unlock; } +static int cgroup_populated_show(struct seq_file *seq, void *v) +{ + seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt); + return 0; +} + static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { @@ -2809,6 +2859,8 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) if (cft->seq_show == cgroup_subtree_control_show) cgrp->control_kn = kn; + else if (cft->seq_show == cgroup_populated_show) + cgrp->populated_kn = kn; return 0; } @@ -3918,6 +3970,11 @@ static struct cftype cgroup_base_files[] = { .seq_show = cgroup_subtree_control_show, .write_string = cgroup_subtree_control_write, }, + { + .name = "cgroup.populated", + .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_populated_show, + }, /* * Historical crazy stuff. These don't have "cgroup." 
prefix and -- cgit From 2f0edc04e702fc07d29621f9e361b9120a7594d0 Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Fri, 25 Apr 2014 18:28:03 -0400 Subject: cgroup: clean up obsolete comment for parse_cgroupfs_options() 1d5be6b287c8efc87 ("cgroup: move module ref handling into rebind_subsystems()") means parse_cgroupfs_options() no longer takes refcounts on subsystems. And the unified hierarchy means parse_cgroupfs_options() no longer needs to be called with cgroup_mutex held to protect the cgroup_subsys[] array. So this patch removes the BUG_ON() and the comment; as the comment no longer contains any useful information, it is removed as a whole. Signed-off-by: Jianyu Zhan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0f986f7afee4..fb848be0ea7b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1221,12 +1221,6 @@ struct cgroup_sb_opts { bool none; }; -/* - * Convert a hierarchy specifier into a bitmask of subsystems and - * flags. Call with cgroup_mutex held to protect the cgroup_subsys[] - * array. This function takes refcounts on subsystems to be used, unless it - * returns error, in which case no refcounts are taken. - */ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) { char *token, *o = data; @@ -1235,8 +1229,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) struct cgroup_subsys *ss; int i; - BUG_ON(!mutex_is_locked(&cgroup_mutex)); - #ifdef CONFIG_CPUSETS mask = ~(1UL << cpuset_cgrp_id); #endif -- cgit From f8719ccf7bc0858384c7e93d8c57fe69ae8c9eac Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Fri, 25 Apr 2014 18:28:03 -0400 Subject: cgroup: remove orphaned cgroup_pidlist_seq_operations 6612f05b88fa309c9 ("cgroup: unify pidlist and other file handling") has removed the only user of cgroup_pidlist_seq_operations: cgroup_pidlist_open(). This patch removes it. Signed-off-by: Jianyu Zhan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fb848be0ea7b..3849d3d2dfe1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3880,17 +3880,6 @@ static int cgroup_pidlist_show(struct seq_file *s, void *v) return seq_printf(s, "%d\n", *(int *)v); } -/* - * seq_operations functions for iterating on pidlists through seq_file - - * independent of whether it's tasks or procs - */ -static const struct seq_operations cgroup_pidlist_seq_operations = { - .start = cgroup_pidlist_start, - .stop = cgroup_pidlist_stop, - .next = cgroup_pidlist_next, - .show = cgroup_pidlist_show, -}; - static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, struct cftype *cft) { -- cgit From a2a1f9eaf945c46b5b2bc0e439cba68888e3d540 Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Fri, 25 Apr 2014 18:28:03 -0400 Subject: cgroup: replace pr_warning with preferred pr_warn As suggested by scripts/checkpatch.pl, replace all pr_warning() calls with pr_warn(). No functional change. Signed-off-by: Jianyu Zhan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3849d3d2dfe1..cb453e9954c1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1126,9 +1126,9 @@ static int rebind_subsystems(struct cgroup_root *dst_root, * Just warn about it and continue.
*/ if (cgrp_dfl_root_visible) { - pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n", - ret, ss_mask); - pr_warning("cgroup: you may retry by moving them to a different hierarchy and unbinding\n"); + pr_warn("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n", + ret, ss_mask); + pr_warn("cgroup: you may retry by moving them to a different hierarchy and unbinding\n"); } } @@ -1323,7 +1323,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) /* Consistency checks */ if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { - pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); + pr_warn("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || opts->cpuset_clone_children || opts->release_agent || @@ -1387,8 +1387,8 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) goto out_unlock; if (opts.subsys_mask != root->subsys_mask || opts.release_agent) - pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", - task_tgid_nr(current), current->comm); + pr_warn("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", + task_tgid_nr(current), current->comm); added_mask = opts.subsys_mask & ~root->subsys_mask; removed_mask = root->subsys_mask & ~opts.subsys_mask; @@ -1669,7 +1669,7 @@ retry: ret = -EINVAL; goto out_unlock; } else { - pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); + pr_warn("cgroup: new mount options do not match the existing superblock, will be ignored\n"); } } @@ -4168,10 +4168,10 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && parent->parent) { - pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", - current->comm, current->pid, ss->name); + pr_warn("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", + current->comm, current->pid, ss->name); if (!strcmp(ss->name, "memory")) - pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); + pr_warn("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); ss->warned_broken_hierarchy = true; } -- cgit From ed3d261b53f51c9505822d757d1800c79fb68788 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 25 Apr 2014 18:28:03 -0400 Subject: cgroup: Use more current logging style Use pr_fmt and remove embedded prefixes. Realign modified multi-line statements to open parenthesis. Convert embedded function name to "%s: ", __func__ Signed-off-by: Joe Perches Signed-off-by: Tejun Heo --- kernel/cgroup.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index cb453e9954c1..3873267c9ee3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -26,6 +26,8 @@ * distribution for more details. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -1126,9 +1128,9 @@ static int rebind_subsystems(struct cgroup_root *dst_root, * Just warn about it and continue. 
*/ if (cgrp_dfl_root_visible) { - pr_warn("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n", + pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n", ret, ss_mask); - pr_warn("cgroup: you may retry by moving them to a different hierarchy and unbinding\n"); + pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); } } @@ -1323,12 +1325,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) /* Consistency checks */ if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { - pr_warn("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); + pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || opts->cpuset_clone_children || opts->release_agent || opts->name) { - pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); + pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); return -EINVAL; } } else { @@ -1374,7 +1376,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) unsigned long added_mask, removed_mask; if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { - pr_err("cgroup: sane_behavior: remount is not allowed\n"); + pr_err("sane_behavior: remount is not allowed\n"); return -EINVAL; } @@ -1387,7 +1389,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) goto out_unlock; if (opts.subsys_mask != root->subsys_mask || opts.release_agent) - pr_warn("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", + pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); added_mask = opts.subsys_mask & ~root->subsys_mask; @@ -1396,7 +1398,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) /* Don't allow flags or name to change at remount */ if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || (opts.name && strcmp(opts.name, root->name))) { - pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n", + pr_err("option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n", opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", root->flags & CGRP_ROOT_OPTION_MASK, root->name); ret = -EINVAL; @@ -1665,11 +1667,11 @@ retry: if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { - pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); + pr_err("sane_behavior: new mount options should match the existing superblock\n"); ret = -EINVAL; goto out_unlock; } else { - pr_warn("cgroup: new mount options do not match the existing superblock, will be ignored\n"); + pr_warn("new mount options do not match the existing superblock, will be ignored\n"); } } @@ -2889,8 +2891,8 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], if (is_add) { ret = cgroup_add_file(cgrp, cft); if (ret) { - pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", - cft->name, ret); + pr_warn("%s: failed to add %s, err=%d\n", + __func__, cft->name, ret); return ret; } } else { @@ -4168,10 +4170,10 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && parent->parent) { - pr_warn("cgroup: %s (%d) 
created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", + pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", current->comm, current->pid, ss->name); if (!strcmp(ss->name, "memory")) - pr_warn("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); + pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n"); ss->warned_broken_hierarchy = true; } -- cgit From 51e158c12aca3c9ac63988611a97c05109b14dc9 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 28 Apr 2014 11:34:33 +0930 Subject: param: hand arguments after -- straight to init The kernel passes any args it doesn't need through to init, except it assumes anything containing '.' belongs to the kernel (for a module). This change means all users can clearly distinguish which arguments are for init. For example, the kernel uses debug ("dee-bug") to mean log everything to the console, where systemd uses the debug from the Scandinavian "day-boog" meaning "fail to boot". If a future version uses argv[] instead of reading /proc/cmdline, this confusion will be avoided. e.g.: test 'FOO="this is --foo"' -- 'systemd.debug="true true true"' Gives: argv[0] = '/debug-init' argv[1] = 'test' argv[2] = 'systemd.debug=true true true' envp[0] = 'HOME=/' envp[1] = 'TERM=linux' envp[2] = 'FOO=this is --foo' Signed-off-by: Rusty Russell --- include/linux/moduleparam.h | 2 +- init/main.c | 33 +++++++++++++++++++++++++++++---- kernel/module.c | 12 +++++++++--- kernel/params.c | 25 ++++++++++++++----------- 4 files changed, 53 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 204a67743804..b1990c5524e1 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -321,7 +321,7 @@ extern bool parameq(const char *name1, const char *name2); extern bool parameqn(const char *name1, const char *name2, size_t n); /* Called on module insert or kernel boot */ -extern int parse_args(const char *name, +extern char *parse_args(const char *name, char *args, const struct kernel_param *params, unsigned num, diff --git a/init/main.c b/init/main.c index 9c7fd4c9249f..e9d458b5d77b 100644 --- a/init/main.c +++ b/init/main.c @@ -252,6 +252,27 @@ static int __init repair_env_string(char *param, char *val, const char *unused) return 0; } +/* Anything after -- gets handed straight to init. */ +static int __init set_init_arg(char *param, char *val, const char *unused) +{ + unsigned int i; + + if (panic_later) + return 0; + + repair_env_string(param, val, unused); + + for (i = 0; argv_init[i]; i++) { + if (i == MAX_INIT_ARGS) { + panic_later = "init"; + panic_param = param; + return 0; + } + } + argv_init[i] = param; + return 0; +} + /* * Unknown boot options get handed to init, unless they look like * unused parameters (modprobe will find them in /proc/cmdline).
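The stop-at-"--" rule can be illustrated with the standalone sketch below; its next_arg() is a simplified stand-in that splits only on spaces and ignores the kernel's quote and '=' handling:

	#include <stdio.h>
	#include <string.h>

	/* simplified stand-in for the kernel's next_arg() */
	static char *next_arg(char *args, char **param)
	{
		char *sp;

		*param = args;
		sp = strchr(args, ' ');
		if (!sp)
			return args + strlen(args);
		*sp = '\0';
		return sp + 1;
	}

	int main(void)
	{
		char cmdline[] = "root=/dev/sda1 quiet -- systemd.debug=true";
		char *args = cmdline, *param;

		while (*args) {
			args = next_arg(args, &param);
			if (strcmp(param, "--") == 0) {
				/* everything left is handed to init untouched */
				printf("init args: \"%s\"\n", args);
				return 0;
			}
			printf("kernel/module param: \"%s\"\n", param);
		}
		return 0;
	}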
@@ -478,7 +499,7 @@ static void __init mm_init(void) asmlinkage void __init start_kernel(void) { - char * command_line; + char * command_line, *after_dashes; extern const struct kernel_param __start___param[], __stop___param[]; /* @@ -519,9 +540,13 @@ asmlinkage void __init start_kernel(void) pr_notice("Kernel command line: %s\n", boot_command_line); parse_early_param(); - parse_args("Booting kernel", static_command_line, __start___param, - __stop___param - __start___param, - -1, -1, &unknown_bootoption); + after_dashes = parse_args("Booting kernel", + static_command_line, __start___param, + __stop___param - __start___param, + -1, -1, &unknown_bootoption); + if (after_dashes) + parse_args("Setting init args", after_dashes, NULL, 0, -1, -1, + set_init_arg); jump_label_init(); diff --git a/kernel/module.c b/kernel/module.c index 11869408f79b..66e4e0d260a9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3193,6 +3193,7 @@ static int load_module(struct load_info *info, const char __user *uargs, { struct module *mod; long err; + char *after_dashes; err = module_sig_check(info); if (err) @@ -3277,10 +3278,15 @@ static int load_module(struct load_info *info, const char __user *uargs, goto ddebug_cleanup; /* Module is ready to execute: parsing args may do that. */ - err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, - -32768, 32767, unknown_module_param_cb); - if (err < 0) + after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, + -32768, 32767, unknown_module_param_cb); + if (IS_ERR(after_dashes)) { + err = PTR_ERR(after_dashes); goto bug_cleanup; + } else if (after_dashes) { + pr_warn("%s: parameters '%s' after `--' ignored\n", + mod->name, after_dashes); + } /* Link in to syfs. */ err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); diff --git a/kernel/params.c b/kernel/params.c index b00142e7f3ba..1e52ca233fd9 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -177,13 +177,13 @@ static char *next_arg(char *args, char **param, char **val) } /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ -int parse_args(const char *doing, - char *args, - const struct kernel_param *params, - unsigned num, - s16 min_level, - s16 max_level, - int (*unknown)(char *param, char *val, const char *doing)) +char *parse_args(const char *doing, + char *args, + const struct kernel_param *params, + unsigned num, + s16 min_level, + s16 max_level, + int (*unknown)(char *param, char *val, const char *doing)) { char *param, *val; @@ -198,6 +198,9 @@ int parse_args(const char *doing, int irq_was_disabled; args = next_arg(args, ¶m, &val); + /* Stop at -- */ + if (!val && strcmp(param, "--") == 0) + return args; irq_was_disabled = irqs_disabled(); ret = parse_one(param, val, doing, params, num, min_level, max_level, unknown); @@ -208,22 +211,22 @@ int parse_args(const char *doing, switch (ret) { case -ENOENT: pr_err("%s: Unknown parameter `%s'\n", doing, param); - return ret; + return ERR_PTR(ret); case -ENOSPC: pr_err("%s: `%s' too large for parameter `%s'\n", doing, val ?: "", param); - return ret; + return ERR_PTR(ret); case 0: break; default: pr_err("%s: `%s' invalid for parameter `%s'\n", doing, val ?: "", param); - return ret; + return ERR_PTR(ret); } } /* All parsed OK. */ - return 0; + return NULL; } /* Lazy bastard, eh? 
*/ -- cgit From 4881f603d7b82df2bc15efd2a272f973a3bf8df1 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Fri, 25 Apr 2014 08:44:59 +0800 Subject: PM / hibernate: use unsigned local variables in swsusp_show_speed() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit do_div() needs a 'u64' type, or it reports a warning. And a negative number is meaningless for "speed", so change all signed local variables to unsigned within swsusp_show_speed(). The related warning (with allmodconfig for unicore32): CC kernel/power/hibernate.o kernel/power/hibernate.c: In function ‘swsusp_show_speed’: kernel/power/hibernate.c:237: warning: comparison of distinct pointer types lacks a cast Signed-off-by: Chen Gang [rjw: Subject] Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index f4f2073711d3..de4b989cc8fd 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -228,19 +228,23 @@ static void platform_recover(int platform_mode) void swsusp_show_speed(struct timeval *start, struct timeval *stop, unsigned nr_pages, char *msg) { - s64 elapsed_centisecs64; - int centisecs; - int k; - int kps; + u64 elapsed_centisecs64; + unsigned int centisecs; + unsigned int k; + unsigned int kps; elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); + /* + * If "(s64)elapsed_centisecs64 < 0", it will print a long elapsed time, + * which is obvious enough to show what went wrong. + */ do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); centisecs = elapsed_centisecs64; if (centisecs == 0) centisecs = 1; /* avoid div-by-zero */ k = nr_pages * (PAGE_SIZE / 1024); kps = (k * 100) / centisecs; - printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", + printk(KERN_INFO "PM: %s %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n", msg, k, centisecs / 100, centisecs % 100, kps / 1000, (kps % 1000) / 10); -- cgit From 91dc95427a0d30ac2c58d6e943c7f40a3f25d908 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 18 Feb 2014 09:47:13 -0800 Subject: rcu: Protect ->gp_flags accesses with ACCESS_ONCE() A number of ->gp_flags accesses don't have ACCESS_ONCE(), but all of them can race against other loads or stores. This commit therefore applies ACCESS_ONCE() to the unprotected ->gp_flags accesses. Reported-by: Alexey Roytman Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0c47e300210a..2c53ac924cab 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1403,12 +1403,12 @@ static int rcu_gp_init(struct rcu_state *rsp) rcu_bind_gp_kthread(); raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); - if (rsp->gp_flags == 0) { /* Spurious wakeup, tell caller to go back to sleep. */ raw_spin_unlock_irq(&rnp->lock); return 0; } - rsp->gp_flags = 0; /* Clear all flags: New grace period.
*/ if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) { /* @@ -1501,7 +1501,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); - rsp->gp_flags &= ~RCU_GP_FLAG_FQS; + ACCESS_ONCE(rsp->gp_flags) &= ~RCU_GP_FLAG_FQS; raw_spin_unlock_irq(&rnp->lock); } return fqs_state; @@ -1566,7 +1566,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rdp = this_cpu_ptr(rsp->rda); rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ if (cpu_needs_another_gp(rsp, rdp)) { - rsp->gp_flags = RCU_GP_FLAG_INIT; + ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT; trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), TPS("newreq")); @@ -1695,7 +1695,7 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, */ return; } - rsp->gp_flags = RCU_GP_FLAG_INIT; + ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT; trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), TPS("newreq")); @@ -2320,7 +2320,7 @@ static void force_quiescent_state(struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp_old->lock, flags); return; /* Someone beat us to it. */ } - rsp->gp_flags |= RCU_GP_FLAG_FQS; + ACCESS_ONCE(rsp->gp_flags) |= RCU_GP_FLAG_FQS; raw_spin_unlock_irqrestore(&rnp_old->lock, flags); wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ } -- cgit From 24342c963a7fc04ec8f199778427943509f0bd5e Mon Sep 17 00:00:00 2001 From: Liu Ping Fan Date: Mon, 24 Feb 2014 06:18:09 -0800 Subject: rcu: Fix incorrect notes for code Signed-off-by: Liu Ping Fan Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 962d1d589929..f9c9057239a9 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2109,7 +2109,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) } #ifndef CONFIG_RCU_NOCB_CPU_ALL -/* Is the specified CPU a no-CPUs CPU? */ +/* Is the specified CPU a no-CBs CPU? */ bool rcu_is_nocb_cpu(int cpu) { if (have_rcu_nocb_mask) -- cgit From 83ebe63ead0fe60e4b548730800cb68293ce098b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 6 Mar 2014 11:09:10 -0800 Subject: rcu: Print negatives for stall-warning counter wraparound The print_other_cpu_stall() and print_cpu_stall() functions print grace-period numbers using an unsigned format, which means that the number one less than zero is a very large number. This commit therefore causes these numbers to be printed with a signed format in order to improve readability of the RCU CPU stall-warning output. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2c53ac924cab..b33c29a99df3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -932,9 +932,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp) print_cpu_stall_info_end(); for_each_possible_cpu(cpu) totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; - pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu, q=%lu)\n", + pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start), - rsp->gpnum, rsp->completed, totqlen); + (long)rsp->gpnum, (long)rsp->completed, totqlen); if (ndetected == 0) pr_err("INFO: Stall ended before state dump start\n"); else if (!trigger_all_cpu_backtrace()) @@ -971,8 +971,9 @@ static void print_cpu_stall(struct rcu_state *rsp) print_cpu_stall_info_end(); for_each_possible_cpu(cpu) totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; - pr_cont(" (t=%lu jiffies g=%lu c=%lu q=%lu)\n", - jiffies - rsp->gp_start, rsp->gpnum, rsp->completed, totqlen); + pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", + jiffies - rsp->gp_start, + (long)rsp->gpnum, (long)rsp->completed, totqlen); if (!trigger_all_cpu_backtrace()) dump_stack(); -- cgit From 365187fbc04fd55766bf6a94e37e558505bf480a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 10 Mar 2014 10:55:52 -0700 Subject: rcu: Update cpu_needs_another_gp() for futures from non-NOCB CPUs In the old days, the only source of requests for future grace periods was NOCB CPUs. This has changed: CPUs routinely post requests for future grace periods in order to promote power efficiency and reduce OS jitter with minimal impact on grace-period latency. This commit therefore updates cpu_needs_another_gp() to invoke rcu_future_needs_gp() instead of rcu_nocb_needs_gp(). The latter is no longer used, so is now removed. This commit also adds tracing for the irq_work_queue() wakeup case. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 39 +++++++++++++++++++++++++++++---------- kernel/rcu/tree.h | 1 - kernel/rcu/tree_plugin.h | 18 ------------------ 3 files changed, 29 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b33c29a99df3..b4688993e956 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -323,6 +323,28 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) rdp->nxttail[RCU_DONE_TAIL] != NULL; } +/* + * Return the root node of the specified rcu_state structure. + */ +static struct rcu_node *rcu_get_root(struct rcu_state *rsp) +{ + return &rsp->node[0]; +} + +/* + * Is there any need for future grace periods? + * Interrupts must be disabled. If the caller does not hold the root + * rnp_node structure's ->lock, the results are advisory only. + */ +static int rcu_future_needs_gp(struct rcu_state *rsp) +{ + struct rcu_node *rnp = rcu_get_root(rsp); + int idx = (ACCESS_ONCE(rnp->completed) + 1) & 0x1; + int *fp = &rnp->need_future_gp[idx]; + + return ACCESS_ONCE(*fp); +} + /* * Does the current CPU require a not-yet-started grace period? * The caller must have disabled interrupts to prevent races with @@ -335,7 +357,7 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) if (rcu_gp_in_progress(rsp)) return 0; /* No, a grace period is already in progress. */ - if (rcu_nocb_needs_gp(rsp)) + if (rcu_future_needs_gp(rsp)) return 1; /* Yes, a no-CBs CPU needs one. 
*/ if (!rdp->nxttail[RCU_NEXT_TAIL]) return 0; /* No, this is a no-CBs (or offline) CPU. */ @@ -349,14 +371,6 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) return 0; /* No grace period needed. */ } -/* - * Return the root node of the specified rcu_state structure. - */ -static struct rcu_node *rcu_get_root(struct rcu_state *rsp) -{ - return &rsp->node[0]; -} - /* * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state * @@ -1672,6 +1686,8 @@ static void rsp_wakeup(struct irq_work *work) /* Wake up rcu_gp_kthread() to start the grace period. */ wake_up(&rsp->gp_wq); + trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), + "Workqueuewoken"); } /* @@ -1706,8 +1722,11 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, * the wakeup to interrupt context. And don't bother waking * up the running kthread. */ - if (current != rsp->gp_kthread) + if (current != rsp->gp_kthread) { + trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), + "Workqueuewake"); irq_work_queue(&rsp->wakeup_work); + } } /* diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 75dc3c39a02a..7b572c5c65e1 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -547,7 +547,6 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); static void print_cpu_stall_info_end(void); static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); -static int rcu_nocb_needs_gp(struct rcu_state *rsp); static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); static void rcu_init_one_nocb(struct rcu_node *rnp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index f9c9057239a9..f60dd6ea8333 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2067,19 +2067,6 @@ static int __init parse_rcu_nocb_poll(char *arg) } early_param("rcu_nocb_poll", parse_rcu_nocb_poll); -/* - * Do any no-CBs CPUs need another grace period? - * - * Interrupts must be disabled. If the caller does not hold the root - * rnp_node structure's ->lock, the results are advisory only. - */ -static int rcu_nocb_needs_gp(struct rcu_state *rsp) -{ - struct rcu_node *rnp = rcu_get_root(rsp); - - return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1]; -} - /* * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended * grace period. @@ -2402,11 +2389,6 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) #else /* #ifdef CONFIG_RCU_NOCB_CPU */ -static int rcu_nocb_needs_gp(struct rcu_state *rsp) -{ - return 0; -} - static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { } -- cgit From 9b67122ae3da3018c966148233739116ed89502a Mon Sep 17 00:00:00 2001 From: Iulia Manda Date: Tue, 11 Mar 2014 13:18:22 +0200 Subject: rcu: Remove unused rcu_data structure field The ->preemptible field in rcu_data is only initialized in the function rcu_init_percpu_data(), and never used. This commit therefore removes this field. Signed-off-by: Iulia Manda Reviewed-by: Josh Triplett Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 6 ++---- kernel/rcu/tree.h | 1 - 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b4688993e956..2cc39c781085 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3180,7 +3180,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) * that this CPU cannot possibly have any RCU callbacks in flight yet. */ static void -rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) +rcu_init_percpu_data(int cpu, struct rcu_state *rsp) { unsigned long flags; unsigned long mask; @@ -3193,7 +3193,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave(&rnp->lock, flags); rdp->beenonline = 1; /* We have now been online. */ - rdp->preemptible = preemptible; rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; @@ -3237,8 +3236,7 @@ static void rcu_prepare_cpu(int cpu) struct rcu_state *rsp; for_each_rcu_flavor(rsp) - rcu_init_percpu_data(cpu, rsp, - strcmp(rsp->name, "rcu_preempt") == 0); + rcu_init_percpu_data(cpu, rsp); } /* diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 7b572c5c65e1..13766ad2f4ee 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -252,7 +252,6 @@ struct rcu_data { bool passed_quiesce; /* User-mode/idle loop etc. */ bool qs_pending; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ - bool preemptible; /* Preemptible RCU? */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ #ifdef CONFIG_RCU_CPU_STALL_INFO -- cgit From 4fc5b75537d4f56577ad00355b4cd09627deb3c3 Mon Sep 17 00:00:00 2001 From: Iulia Manda Date: Tue, 11 Mar 2014 15:22:28 +0200 Subject: rcu: Protect uses of jiffies_stall field with ACCESS_ONCE() Some of the uses of the rcu_state structure's ->jiffies_stall field do not use ACCESS_ONCE(), despite there being unprotected accesses. This commit therefore uses the ACCESS_ONCE() macro to protect this field. Signed-off-by: Iulia Manda Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2cc39c781085..c624415f8386 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -865,7 +865,7 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) rsp->gp_start = j; smp_wmb(); /* Record start time before stall time. */ j1 = rcu_jiffies_till_stall_check(); - rsp->jiffies_stall = j + j1; + ACCESS_ONCE(rsp->jiffies_stall) = j + j1; rsp->jiffies_resched = j + j1 / 2; } @@ -904,12 +904,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp) /* Only let one CPU complain about others per time interval. 
*/ raw_spin_lock_irqsave(&rnp->lock, flags); - delta = jiffies - rsp->jiffies_stall; + delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall); if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } - rsp->jiffies_stall = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; + ACCESS_ONCE(rsp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; raw_spin_unlock_irqrestore(&rnp->lock, flags); /* @@ -992,8 +992,8 @@ static void print_cpu_stall(struct rcu_state *rsp) dump_stack(); raw_spin_lock_irqsave(&rnp->lock, flags); - if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) - rsp->jiffies_stall = jiffies + + if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall))) + ACCESS_ONCE(rsp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -1077,7 +1077,7 @@ void rcu_cpu_stall_reset(void) struct rcu_state *rsp; for_each_rcu_flavor(rsp) - rsp->jiffies_stall = jiffies + ULONG_MAX / 2; + ACCESS_ONCE(rsp->jiffies_stall) = jiffies + ULONG_MAX / 2; } /* -- cgit From 48a7639ce80cf279834d0d44865e49ecd714f37d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 11 Mar 2014 13:02:16 -0700 Subject: rcu: Make callers awaken grace-period kthread The rcu_start_gp_advanced() function currently uses irq_work_queue() to defer wakeups of the RCU grace-period kthread. This deferring is necessary to avoid RCU-scheduler deadlocks involving the rcu_node structure's lock, meaning that RCU cannot call any of the scheduler's wake-up functions while holding one of these locks. Unfortunately, the second and subsequent calls to irq_work_queue() are ignored, and the first call will be ignored (aside from queuing the work item) if the scheduler-clock tick is turned off. This is OK for many uses, especially those where irq_work_queue() is called from an interrupt or softirq handler, because in those cases the scheduler-clock-tick state will be re-evaluated, which will turn the scheduler-clock tick back on. On the next tick, any deferred work will then be processed. However, this strategy does not always work for RCU, which can be invoked at process level from idle CPUs. In this case, the tick might never be turned back on, indefinitely deferring a grace-period start request. Note that the RCU CPU stall detector cannot see this condition, because there is no RCU grace period in progress. Therefore, we can (and do!) see long tens-of-seconds stalls in grace-period handling. In theory, we could see a full grace-period hang, but rcutorture testing to date has seen only the tens-of-seconds stalls. Event tracing demonstrates that irq_work_queue() is being called repeatedly to no effect during these stalls: The "newreq" event appears repeatedly from a task that is not one of the grace-period kthreads. In theory, irq_work_queue() might be fixed to avoid this sort of issue, but RCU's requirements are unusual and it is quite straightforward to pass wake-up responsibility up through RCU's call chain, so that the wakeup happens when the offending locks are released. This commit therefore makes this change. The rcu_start_gp_advanced(), rcu_start_future_gp(), rcu_accelerate_cbs(), rcu_advance_cbs(), __note_gp_changes(), and rcu_start_gp() functions now return a boolean which indicates when a wake-up is needed.
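Schematically, each caller now follows the pattern in this sketch (an illustration of the convention the patch introduces, not a verbatim excerpt from it):

	bool needwake;
	unsigned long flags;

	raw_spin_lock_irqsave(&rnp->lock, flags);
	needwake = rcu_start_gp(rsp);		/* only records the request */
	raw_spin_unlock_irqrestore(&rnp->lock, flags);
	if (needwake)
		rcu_gp_kthread_wake(rsp);	/* safe: rnp->lock no longer held */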
A new rcu_gp_kthread_wake() does the wakeup when it is necessary and safe to do so: No self-wakes, no wake-ups if the ->gp_flags field indicates there is no need (as in someone else did the wake-up before we got around to it), and no wake-ups before the grace-period kthread has been created. Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Frederic Weisbecker Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 137 +++++++++++++++++++++++++++++------------------ kernel/rcu/tree.h | 1 - kernel/rcu/tree_plugin.h | 10 +++- 3 files changed, 94 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c624415f8386..fca911b6b29c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -243,7 +243,7 @@ static ulong jiffies_till_next_fqs = ULONG_MAX; module_param(jiffies_till_first_fqs, ulong, 0644); module_param(jiffies_till_next_fqs, ulong, 0644); -static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, +static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp); static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp, bool *isidle, @@ -1138,15 +1138,18 @@ static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, /* * Start some future grace period, as needed to handle newly arrived * callbacks. The required future grace periods are recorded in each - * rcu_node structure's ->need_future_gp field. + * rcu_node structure's ->need_future_gp field. Returns true if there + * is reason to awaken the grace-period kthread. * * The caller must hold the specified rcu_node structure's ->lock. */ -static unsigned long __maybe_unused -rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) +static bool __maybe_unused +rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, + unsigned long *c_out) { unsigned long c; int i; + bool ret = false; struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); /* @@ -1157,7 +1160,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf")); if (rnp->need_future_gp[c & 0x1]) { trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf")); - return c; + goto out; } /* @@ -1171,7 +1174,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { rnp->need_future_gp[c & 0x1]++; trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); - return c; + goto out; } /* @@ -1212,12 +1215,15 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot")); } else { trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot")); - rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); + ret = rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); } unlock_out: if (rnp != rnp_root) raw_spin_unlock(&rnp_root->lock); - return c; +out: + if (c_out != NULL) + *c_out = c; + return ret; } /* @@ -1240,6 +1246,22 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) return needmore; } +/* + * Awaken the grace-period kthread for the specified flavor of RCU. + * Don't do a self-awaken, and don't bother awakening when there is + * nothing for the grace-period kthread to do (as in several CPUs + * raced to awaken, and we lost), and finally don't try to awaken + * a kthread that has not yet been created. 
+ */ +static void rcu_gp_kthread_wake(struct rcu_state *rsp) +{ + if (current == rsp->gp_kthread || + !ACCESS_ONCE(rsp->gp_flags) || + !rsp->gp_kthread) + return; + wake_up(&rsp->gp_wq); +} + /* * If there is room, assign a ->completed number to any callbacks on * this CPU that have not already been assigned. Also accelerate any @@ -1247,19 +1269,21 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) * since proven to be too conservative, which can happen if callbacks get * assigned a ->completed number while RCU is idle, but with reference to * a non-root rcu_node structure. This function is idempotent, so it does - * not hurt to call it repeatedly. + * not hurt to call it repeatedly. Returns a flag saying that we should + * awaken the RCU grace-period kthread. * * The caller must hold rnp->lock with interrupts disabled. */ -static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, +static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { unsigned long c; int i; + bool ret; /* If the CPU has no callbacks, nothing to do. */ if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) - return; + return false; /* * Starting from the sublist containing the callbacks most @@ -1288,7 +1312,7 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, * be grouped into. */ if (++i >= RCU_NEXT_TAIL) - return; + return false; /* * Assign all subsequent callbacks' ->completed number to the next @@ -1300,13 +1324,14 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, rdp->nxtcompleted[i] = c; } /* Record any needed additional grace periods. */ - rcu_start_future_gp(rnp, rdp); + ret = rcu_start_future_gp(rnp, rdp, NULL); /* Trace depending on how much we were able to accelerate. */ if (!*rdp->nxttail[RCU_WAIT_TAIL]) trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); else trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); + return ret; } /* @@ -1315,17 +1340,18 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL * sublist. This function is idempotent, so it does not hurt to * invoke it repeatedly. As long as it is not invoked -too- often... + * Returns true if the RCU grace-period kthread needs to be awakened. * * The caller must hold rnp->lock with interrupts disabled. */ -static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, +static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { int i, j; /* If the CPU has no callbacks, nothing to do. */ if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) - return; + return false; /* * Find all callbacks whose ->completed numbers indicate that they @@ -1349,26 +1375,30 @@ static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, } /* Classify any remaining callbacks. */ - rcu_accelerate_cbs(rsp, rnp, rdp); + return rcu_accelerate_cbs(rsp, rnp, rdp); } /* * Update CPU-local rcu_data state to record the beginnings and ends of * grace periods. The caller must hold the ->lock of the leaf rcu_node * structure corresponding to the current CPU, and must have irqs disabled. + * Returns true if the grace-period kthread needs to be awakened.
*/ -static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) +static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, + struct rcu_data *rdp) { + bool ret; + /* Handle the ends of any preceding grace periods first. */ if (rdp->completed == rnp->completed) { /* No grace period end, so just accelerate recent callbacks. */ - rcu_accelerate_cbs(rsp, rnp, rdp); + ret = rcu_accelerate_cbs(rsp, rnp, rdp); } else { /* Advance callbacks. */ - rcu_advance_cbs(rsp, rnp, rdp); + ret = rcu_advance_cbs(rsp, rnp, rdp); /* Remember that we saw this grace-period completion. */ rdp->completed = rnp->completed; @@ -1387,11 +1417,13 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); zero_cpu_stall_ticks(rdp); } + return ret; } static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; + bool needwake; struct rcu_node *rnp; local_irq_save(flags); @@ -1403,8 +1435,10 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) return; } smp_mb__after_unlock_lock(); - __note_gp_changes(rsp, rnp, rdp); + needwake = __note_gp_changes(rsp, rnp, rdp); raw_spin_unlock_irqrestore(&rnp->lock, flags); + if (needwake) + rcu_gp_kthread_wake(rsp); } /* @@ -1468,7 +1502,7 @@ static int rcu_gp_init(struct rcu_state *rsp) WARN_ON_ONCE(rnp->completed != rsp->completed); ACCESS_ONCE(rnp->completed) = rsp->completed; if (rnp == rdp->mynode) - __note_gp_changes(rsp, rnp, rdp); + (void)__note_gp_changes(rsp, rnp, rdp); rcu_preempt_boost_start_gp(rnp); trace_rcu_grace_period_init(rsp->name, rnp->gpnum, rnp->level, rnp->grplo, @@ -1528,6 +1562,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) static void rcu_gp_cleanup(struct rcu_state *rsp) { unsigned long gp_duration; + bool needgp = false; int nocb = 0; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); @@ -1563,7 +1598,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) ACCESS_ONCE(rnp->completed) = rsp->gpnum; rdp = this_cpu_ptr(rsp->rda); if (rnp == rdp->mynode) - __note_gp_changes(rsp, rnp, rdp); + needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; /* smp_mb() provided by prior unlock-lock pair. */ nocb += rcu_future_gp_cleanup(rsp, rnp); raw_spin_unlock_irq(&rnp->lock); @@ -1579,8 +1614,9 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); rsp->fqs_state = RCU_GP_IDLE; rdp = this_cpu_ptr(rsp->rda); - rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ - if (cpu_needs_another_gp(rsp, rdp)) { + /* Advance CBs to reduce false positives below. */ + needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp; + if (needgp || cpu_needs_another_gp(rsp, rdp)) { ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT; trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), @@ -1680,16 +1716,6 @@ static int __noreturn rcu_gp_kthread(void *arg) } } -static void rsp_wakeup(struct irq_work *work) -{ - struct rcu_state *rsp = container_of(work, struct rcu_state, wakeup_work); - - /* Wake up rcu_gp_kthread() to start the grace period. */ - wake_up(&rsp->gp_wq); - trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), - "Workqueuewoken"); -} - /* * Start a new RCU grace period if warranted, re-initializing the hierarchy * in preparation for detecting the next grace period. 
The caller must hold @@ -1698,8 +1724,10 @@ static void rsp_wakeup(struct irq_work *work) * Note that it is legal for a dying CPU (which is marked as offline) to * invoke this function. This can happen when the dying CPU reports its * quiescent state. + * + * Returns true if the grace-period kthread must be awakened. */ -static void +static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { @@ -1710,7 +1738,7 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, * or a grace period is already in progress. * Either way, don't start a new grace period. */ - return; + return false; } ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT; trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), @@ -1719,14 +1747,9 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, /* * We can't do wakeups while holding the rnp->lock, as that * could cause possible deadlocks with the rq->lock. Defer - * the wakeup to interrupt context. And don't bother waking - * up the running kthread. + * the wakeup to our caller. */ - if (current != rsp->gp_kthread) { - trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), - "Workqueuewake"); - irq_work_queue(&rsp->wakeup_work); - } + return true; } /* @@ -1735,12 +1758,14 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, * is invoked indirectly from rcu_advance_cbs(), which would result in * endless recursion -- or would do so if it wasn't for the self-deadlock * that is encountered beforehand. + * + * Returns true if the grace-period kthread needs to be awakened. */ -static void -rcu_start_gp(struct rcu_state *rsp) +static bool rcu_start_gp(struct rcu_state *rsp) { struct rcu_data *rdp = this_cpu_ptr(rsp->rda); struct rcu_node *rnp = rcu_get_root(rsp); + bool ret = false; /* * If there is no grace period in progress right now, any @@ -1750,8 +1775,9 @@ rcu_start_gp(struct rcu_state *rsp) * resulting in pointless grace periods. So, advance callbacks * then start the grace period! */ - rcu_advance_cbs(rsp, rnp, rdp); - rcu_start_gp_advanced(rsp, rnp, rdp); + ret = rcu_advance_cbs(rsp, rnp, rdp) || ret; + ret = rcu_start_gp_advanced(rsp, rnp, rdp) || ret; + return ret; } /* @@ -1840,6 +1866,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; unsigned long mask; + bool needwake; struct rcu_node *rnp; rnp = rdp->mynode; @@ -1868,9 +1895,11 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) * This GP can't end until cpu checks in, so all of our * callbacks can be processed during the next GP. */ - rcu_accelerate_cbs(rsp, rnp, rdp); + needwake = rcu_accelerate_cbs(rsp, rnp, rdp); rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ + if (needwake) + rcu_gp_kthread_wake(rsp); } } @@ -2354,6 +2383,7 @@ static void __rcu_process_callbacks(struct rcu_state *rsp) { unsigned long flags; + bool needwake; struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); WARN_ON_ONCE(rdp->beenonline == 0); @@ -2365,8 +2395,10 @@ __rcu_process_callbacks(struct rcu_state *rsp) local_irq_save(flags); if (cpu_needs_another_gp(rsp, rdp)) { raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. 
*/ - rcu_start_gp(rsp); + needwake = rcu_start_gp(rsp); raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); + if (needwake) + rcu_gp_kthread_wake(rsp); } else { local_irq_restore(flags); } @@ -2424,6 +2456,8 @@ static void invoke_rcu_core(void) static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, struct rcu_head *head, unsigned long flags) { + bool needwake; + /* * If called from an extended quiescent state, invoke the RCU * core in order to force a re-evaluation of RCU's idleness. @@ -2453,8 +2487,10 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, raw_spin_lock(&rnp_root->lock); smp_mb__after_unlock_lock(); - rcu_start_gp(rsp); + needwake = rcu_start_gp(rsp); raw_spin_unlock(&rnp_root->lock); + if (needwake) + rcu_gp_kthread_wake(rsp); } else { /* Give the grace period a kick. */ rdp->blimit = LONG_MAX; @@ -3440,7 +3476,6 @@ static void __init rcu_init_one(struct rcu_state *rsp, rsp->rda = rda; init_waitqueue_head(&rsp->gp_wq); - init_irq_work(&rsp->wakeup_work, rsp_wakeup); rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 13766ad2f4ee..cdbb392d5b4b 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -461,7 +461,6 @@ struct rcu_state { const char *name; /* Name of structure. */ char abbr; /* Abbreviated name. */ struct list_head flavors; /* List of RCU flavors. */ - struct irq_work wakeup_work; /* Postponed wakeups */ }; /* Values for rcu_state structure's gp_flags field. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index f60dd6ea8333..0cb0816036c5 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1744,6 +1744,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj) static void rcu_prepare_for_idle(int cpu) { #ifndef CONFIG_RCU_NOCB_CPU_ALL + bool needwake; struct rcu_data *rdp; struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); struct rcu_node *rnp; @@ -1792,8 +1793,10 @@ static void rcu_prepare_for_idle(int cpu) rnp = rdp->mynode; raw_spin_lock(&rnp->lock); /* irqs already disabled. */ smp_mb__after_unlock_lock(); - rcu_accelerate_cbs(rsp, rnp, rdp); + needwake = rcu_accelerate_cbs(rsp, rnp, rdp); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + if (needwake) + rcu_gp_kthread_wake(rsp); } #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ } @@ -2230,12 +2233,15 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) unsigned long c; bool d; unsigned long flags; + bool needwake; struct rcu_node *rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - c = rcu_start_future_gp(rnp, rdp); + needwake = rcu_start_future_gp(rnp, rdp, &c); raw_spin_unlock_irqrestore(&rnp->lock, flags); + if (needwake) + rcu_gp_kthread_wake(rdp->rsp); /* * Wait for the grace period. Do so interruptibly to avoid messing -- cgit From af952b919bf9e2cf3c4e839359cfd033d98aa011 Mon Sep 17 00:00:00 2001 From: Himangi Saraogi Date: Wed, 12 Mar 2014 21:46:46 +0530 Subject: rcu: Protect uses of ->jiffies_stall with ACCESS_ONCE() Some of the accesses to the rcu_state structure's ->jiffies_stall field are unprotected. This patch protects them with ACCESS_ONCE(). The following coccinelle script was used to achieve this: /* coccinelle script to protect uses of ->jiffies_stall with ACCESS_ONCE() */ @@ identifier a; @@ ( ACCESS_ONCE(a->jiffies_stall) | - a->jiffies_stall + ACCESS_ONCE(a->jiffies_stall) ) Signed-off-by: Himangi Saraogi Signed-off-by: Paul E.
McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tiny_plugin.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index 431528520562..858c56569127 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -144,7 +144,7 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp) return; rcp->ticks_this_gp++; j = jiffies; - js = rcp->jiffies_stall; + js = ACCESS_ONCE(rcp->jiffies_stall); if (*rcp->curtail && ULONG_CMP_GE(j, js)) { pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, @@ -152,17 +152,17 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp) dump_stack(); } if (*rcp->curtail && ULONG_CMP_GE(j, js)) - rcp->jiffies_stall = jiffies + + ACCESS_ONCE(rcp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; else if (ULONG_CMP_GE(j, js)) - rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); + ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check(); } static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) { rcp->ticks_this_gp = 0; rcp->gp_start = jiffies; - rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); + ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check(); } static void check_cpu_stalls(void) -- cgit From 7941dbdebe2a1fda31aefa033794510f95720a5a Mon Sep 17 00:00:00 2001 From: Andreea-Cristina Bernat Date: Mon, 17 Mar 2014 18:33:28 +0200 Subject: rcu: Add event tracing to dyntick_save_progress_counter(). This patch adds event tracing to dyntick_save_progress_counter() in the case where it returns 1. I used the tracepoint string "dti" because this function returns 1 in case the CPU is in dynticks idle mode. Signed-off-by: Andreea-Cristina Bernat Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index fca911b6b29c..c6fd0f15425c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -772,7 +772,12 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, { rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); rcu_sysidle_check_cpu(rdp, isidle, maxj); - return (rdp->dynticks_snap & 0x1) == 0; + if ((rdp->dynticks_snap & 0x1) == 0) { + trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); + return 1; + } else { + return 0; + } } /* -- cgit From 595f3900f6b4221403493c530138b8dad2bd87f3 Mon Sep 17 00:00:00 2001 From: Himangi Saraogi Date: Tue, 18 Mar 2014 22:52:26 +0530 Subject: rcu: Replace NR_CPUS with nr_cpu_ids This patch replaces NR_CPUS with nr_cpu_ids as NR_CPUS should consider cpumask_var_t. Signed-off-by: Himangi Saraogi Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c6fd0f15425c..6724bd98da7d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3461,8 +3461,8 @@ static void __init rcu_init_one(struct rcu_state *rsp, rnp->qsmaskinit = 0; rnp->grplo = j * cpustride; rnp->grphi = (j + 1) * cpustride - 1; - if (rnp->grphi >= NR_CPUS) - rnp->grphi = NR_CPUS - 1; + if (rnp->grphi >= nr_cpu_ids) + rnp->grphi = nr_cpu_ids - 1; if (i == 0) { rnp->grpnum = 0; rnp->grpmask = 0; -- cgit From 495aa969dbaef2e3d28094a2b3c752d069932748 Mon Sep 17 00:00:00 2001 From: Andreea-Cristina Bernat Date: Tue, 18 Mar 2014 20:48:48 +0200 Subject: rcu: Consolidate kfree_call_rcu() to use rcu_state pointer kfree_call_rcu() is defined twice. When defined under CONFIG_TREE_PREEMPT_RCU, it uses rcu_preempt_state. Otherwise, it uses rcu_sched_state. This patch uses the rcu_state pointer to combine the two definitions into one. The resulting function is placed after the closing of the preprocessor conditional CONFIG_TREE_PREEMPT_RCU. Signed-off-by: Andreea-Cristina Bernat Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 14 ++++++++++++++ kernel/rcu/tree_plugin.h | 30 ------------------------------ 2 files changed, 14 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6724bd98da7d..f3317c18b354 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2597,6 +2597,20 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu_bh); +/* + * Queue an RCU callback for lazy invocation after a grace period. + * This will likely be later named something like "call_rcu_lazy()", + * but this change will require some way of tagging the lazy RCU + * callbacks in the list of pending callbacks. Until then, this + * function may only be called from __kfree_rcu(). + */ +void kfree_call_rcu(struct rcu_head *head, + void (*func)(struct rcu_head *rcu)) +{ + __call_rcu(head, func, rcu_state, -1, 1); +} +EXPORT_SYMBOL_GPL(kfree_call_rcu); + /* * Because a context switch is a grace period for RCU-sched and RCU-bh, * any blocking grace-period wait automatically implies a grace period diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0cb0816036c5..4918a1243efb 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -688,20 +688,6 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu); -/* - * Queue an RCU callback for lazy invocation after a grace period. - * This will likely be later named something like "call_rcu_lazy()", - * but this change will require some way of tagging the lazy RCU - * callbacks in the list of pending callbacks. Until then, this - * function may only be called from __kfree_rcu(). - */ -void kfree_call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu)) -{ - __call_rcu(head, func, &rcu_preempt_state, -1, 1); -} -EXPORT_SYMBOL_GPL(kfree_call_rcu); - /** * synchronize_rcu - wait until a grace period has elapsed. * @@ -1079,22 +1065,6 @@ static void rcu_preempt_check_callbacks(int cpu) { } -/* - * Queue an RCU callback for lazy invocation after a grace period. - * This will likely be later named something like "call_rcu_lazy()", - * but this change will require some way of tagging the lazy RCU - * callbacks in the list of pending callbacks.
Until then, this - * function may only be called from __kfree_rcu(). - * - * Because there is no preemptible RCU, we use RCU-sched instead. - */ -void kfree_call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu)) -{ - __call_rcu(head, func, &rcu_sched_state, -1, 1); -} -EXPORT_SYMBOL_GPL(kfree_call_rcu); - /* * Wait for an rcu-preempt grace period, but make it happen quickly. * But because preemptible RCU does not exist, map to rcu-sched. -- cgit From a381d757d93f6e43063f74888676edd6216d0aff Mon Sep 17 00:00:00 2001 From: Andreea-Cristina Bernat Date: Wed, 19 Mar 2014 11:18:31 +0200 Subject: rcu: Merge rcu_sched_force_quiescent_state() with rcu_force_quiescent_state() This patch merges the function rcu_force_quiescent_state() with rcu_sched_force_quiescent_state(), using the rcu_state pointer. Firstly, the rcu_sched_force_quiescent_state() function is deleted from the file kernel/rcu/tree.c. Also, the rcu_force_quiescent_state() function that was calling force_quiescent_state with the argument rcu_preempt_state pointer was deleted as well. The new function that combines the old ones uses the rcu_state pointer and is located after rcu_batches_completed_bh() in kernel/rcu/tree.c. Signed-off-by: Andreea-Cristina Bernat Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 9 +++++++++ kernel/rcu/tree_plugin.h | 19 ------------------- 2 files changed, 9 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f3317c18b354..dbf43f5c86fa 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -270,6 +270,15 @@ long rcu_batches_completed_bh(void) } EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); +/* + * Force a quiescent state. + */ +void rcu_force_quiescent_state(void) +{ + force_quiescent_state(rcu_state); +} +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); + /* * Force a quiescent state for RCU BH. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 4918a1243efb..1af19e006899 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -148,15 +148,6 @@ long rcu_batches_completed(void) } EXPORT_SYMBOL_GPL(rcu_batches_completed); -/* - * Force a quiescent state for preemptible RCU. - */ -void rcu_force_quiescent_state(void) -{ - force_quiescent_state(&rcu_preempt_state); -} -EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); - /* * Record a preemptible-RCU quiescent state for the specified CPU. Note * that this just means that the task currently running on the CPU is @@ -976,16 +967,6 @@ long rcu_batches_completed(void) } EXPORT_SYMBOL_GPL(rcu_batches_completed); -/* - * Force a quiescent state for RCU, which, because there is no preemptible - * RCU, becomes the same as rcu-sched. - */ -void rcu_force_quiescent_state(void) -{ - rcu_sched_force_quiescent_state(); -} -EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); - /* * Because preemptible RCU does not exist, we never have to check for * CPUs being in quiescent states. -- cgit From 5057f55e543b7859cfd26bc281291795eac93f8a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 Apr 2014 11:20:36 -0700 Subject: rcu: Bind RCU grace-period kthreads if NO_HZ_FULL Currently, RCU binds the grace-period kthreads to the timekeeping CPU only if CONFIG_NO_HZ_FULL_SYSIDLE=y. This means that these kthreads must be bound manually when CONFIG_NO_HZ_FULL_SYSIDLE=n and CONFIG_NO_HZ_FULL=y: Otherwise, these kthreads will induce OS jitter on random CPUs. 
Given that we are trying to reduce the amount of manual tweaking required to make CONFIG_NO_HZ_FULL=y work nicely, this commit makes this binding happen when CONFIG_NO_HZ_FULL=y, even in cases where CONFIG_NO_HZ_FULL_SYSIDLE=n. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree_plugin.h | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 1af19e006899..045c6d04ed3f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2595,20 +2595,6 @@ static bool is_sysidle_rcu_state(struct rcu_state *rsp) return rsp == rcu_sysidle_state; } -/* - * Bind the grace-period kthread for the sysidle flavor of RCU to the - * timekeeping CPU. - */ -static void rcu_bind_gp_kthread(void) -{ - int cpu = ACCESS_ONCE(tick_do_timer_cpu); - - if (cpu < 0 || cpu >= nr_cpu_ids) - return; - if (raw_smp_processor_id() != cpu) - set_cpus_allowed_ptr(current, cpumask_of(cpu)); -} - /* * Return a delay in jiffies based on the number of CPUs, rcu_node * leaf fanout, and jiffies tick rate. The idea is to allow larger @@ -2819,10 +2805,6 @@ static bool is_sysidle_rcu_state(struct rcu_state *rsp) return false; } -static void rcu_bind_gp_kthread(void) -{ -} - static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, unsigned long maxj) { @@ -2853,3 +2835,19 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp) #endif /* #ifdef CONFIG_NO_HZ_FULL */ return 0; } + +/* + * Bind the grace-period kthread for the sysidle flavor of RCU to the + * timekeeping CPU. + */ +static void rcu_bind_gp_kthread(void) +{ +#ifdef CONFIG_NO_HZ_FULL + int cpu = ACCESS_ONCE(tick_do_timer_cpu); + + if (cpu < 0 || cpu >= nr_cpu_ids) + return; + if (raw_smp_processor_id() != cpu) + set_cpus_allowed_ptr(current, cpumask_of(cpu)); +#endif /* #ifdef CONFIG_NO_HZ_FULL */ +} -- cgit From becb41bfe0544f1f7f494f48d6f68cbdb2e1ed0e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 7 Apr 2014 13:34:07 -0700 Subject: rcu: Make large and small sysidle systems use same state machine Currently, small systems move back into RCU_SYSIDLE_NOT from RCU_SYSIDLE_SHORT and large systems do not. This works because moving aggressively to RCU_SYSIDLE_NOT affects only performance, not correctness, and on small systems, the performance impact should be negligible. That said, this difference does make RCU a bit more complex, and RCU does not seem to be suffering from any lack of complexity. This commit therefore adjusts small-system operation to match that of large systems, so that the state never moves back to RCU_SYSIDLE_NOT from RCU_SYSIDLE_SHORT. Reported-by: Frederic Weisbecker Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree_plugin.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 045c6d04ed3f..f7593c2536b0 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2659,7 +2659,8 @@ static void rcu_sysidle(unsigned long j) static void rcu_sysidle_cancel(void) { smp_mb(); - ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT; + if (full_sysidle_state > RCU_SYSIDLE_SHORT) + ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT; } /* -- cgit From 8c96ae1dfa1608b92ddbf8bb285d7269832e4ac0 Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Mon, 14 Apr 2014 10:25:43 -0700 Subject: rcu: Remove duplicate resched_cpu() declaration Signed-off-by: Pranith Kumar Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index dbf43f5c86fa..ecd7e046de76 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -975,12 +975,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp) force_quiescent_state(rsp); /* Kick them all. */ } -/* - * This function really isn't for public consumption, but RCU is special in - * that context switches can allow the state machine to make progress. - */ -extern void resched_cpu(int cpu); - static void print_cpu_stall(struct rcu_state *rsp) { int cpu; -- cgit From fa07a58f71ee23a82597ce337126982d0cc60b72 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 15 Apr 2014 12:20:12 -0700 Subject: rcu: Replace __this_cpu_ptr() uses with raw_cpu_ptr() __this_cpu_ptr is being phased out. One special case is increment_cpu_stall_ticks(). A per cpu variable is incremented so use raw_cpu_inc(). Cc: Dipankar Sarma Signed-off-by: Christoph Lameter Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 6 +++--- kernel/rcu/tree_plugin.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ecd7e046de76..d3e023e24c6f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2008,7 +2008,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) { int i; - struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); + struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); /* No-CBs CPUs are handled specially. 
*/ if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) @@ -2392,7 +2392,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) { unsigned long flags; bool needwake; - struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); + struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); WARN_ON_ONCE(rdp->beenonline == 0); @@ -3066,7 +3066,7 @@ static void rcu_barrier_callback(struct rcu_head *rhp) static void rcu_barrier_func(void *type) { struct rcu_state *rsp = type; - struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); + struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done); atomic_inc(&rsp->barrier_cpu_count); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index f7593c2536b0..2e579c38bd91 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1809,7 +1809,7 @@ static void rcu_oom_notify_cpu(void *unused) struct rcu_data *rdp; for_each_rcu_flavor(rsp) { - rdp = __this_cpu_ptr(rsp->rda); + rdp = raw_cpu_ptr(rsp->rda); if (rdp->qlen_lazy != 0) { atomic_inc(&oom_callback_count); rsp->call(&rdp->oom_head, rcu_oom_callback); @@ -1951,7 +1951,7 @@ static void increment_cpu_stall_ticks(void) struct rcu_state *rsp; for_each_rcu_flavor(rsp) - __this_cpu_ptr(rsp->rda)->ticks_this_gp++; + raw_cpu_inc(rsp->rda->ticks_this_gp); } #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ -- cgit From a5d6d3a1b00a0ad88f07c3a727c79b27915278e3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Apr 2014 09:06:24 -0700 Subject: softirq: A single rcu_bh_qs() call per softirq set is enough Calling rcu_bh_qs() after every softirq action is not really needed. What RCU needs is at least one rcu_bh_qs() per softirq round to note a quiescent state was passed for rcu_bh. Note for Paul and myself: this could be inlined as a single instruction and avoid smp_processor_id() (some this_cpu_write(rcu_bh_data.passed_quiesce, 1)) Signed-off-by: Eric Dumazet Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/softirq.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index b50990a5bea0..b9b2d4906848 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -232,7 +232,6 @@ asmlinkage void __do_softirq(void) bool in_hardirq; __u32 pending; int softirq_bit; - int cpu; /* * Mask out PF_MEMALLOC s current task context is borrowed for the @@ -247,7 +246,6 @@ asmlinkage void __do_softirq(void) __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); in_hardirq = lockdep_softirq_start(); - cpu = smp_processor_id(); restart: /* Reset the pending bitmask before enabling irqs */ set_softirq_pending(0); @@ -276,11 +274,11 @@ restart: prev_count, preempt_count()); preempt_count_set(prev_count); } - rcu_bh_qs(cpu); h++; pending >>= softirq_bit; } + rcu_bh_qs(smp_processor_id()); local_irq_disable(); pending = local_softirq_pending(); -- cgit From f4874261049e3abdd481359d82cafa5068369ebd Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 29 Apr 2014 16:07:28 -0400 Subject: tracing: Break out of tracing_wait_pipe() before wait_pipe() is called When reading from trace_pipe, if tracing is off but nothing was read it should block. If something is read and tracing is off, then EOF is returned. If tracing is on and there's nothing to read, it will block. But because the check of whether tracing is off and something was read is done after the block on the pipe, it is hit or miss whether the EOF is returned, leading to inconsistent behavior.
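For illustration, the corrected ordering can be condensed into a few lines of userspace C (a sketch, not the kernel code; the struct, its field names, and the stubbed wait_for_producer() helper are invented for the example):

#include <stdbool.h>

struct pipe_model {
	bool tracing_on;	/* stands in for tracing_is_on() */
	long pos;		/* bytes already handed to the reader */
	bool data_ready;	/* buffered trace data available */
};

/* Stubbed out so the model compiles; a real implementation would
 * block until the producer queues data or a signal arrives. */
static void wait_for_producer(struct pipe_model *p)
{
	p->data_ready = true;
}

/* Returns 1 when the reader should attempt a read, 0 to report EOF. */
static int wait_pipe_model(struct pipe_model *p)
{
	while (!p->data_ready) {
		/*
		 * Decide EOF *before* blocking: tracing is off and the
		 * reader has already consumed something. Doing this check
		 * only after the wait is what made the old behavior hit
		 * or miss.
		 */
		if (!p->tracing_on && p->pos)
			return 0;
		wait_for_producer(p);
	}
	return 1;
}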
Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index cb41e98cc64b..e058c6091e45 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4237,15 +4237,6 @@ static int tracing_wait_pipe(struct file *filp) return -EAGAIN; } - mutex_unlock(&iter->mutex); - - iter->trace->wait_pipe(iter); - - mutex_lock(&iter->mutex); - - if (signal_pending(current)) - return -EINTR; - /* * We block until we read something and tracing is disabled. * We still block if tracing is disabled, but we have never @@ -4257,6 +4248,15 @@ static int tracing_wait_pipe(struct file *filp) */ if (!tracing_is_on() && iter->pos) break; + + mutex_unlock(&iter->mutex); + + iter->trace->wait_pipe(iter); + + mutex_lock(&iter->mutex); + + if (signal_pending(current)) + return -EINTR; } return 1; -- cgit From b1169cc69ba96b124df820904a6d3eb775491d7f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 29 Apr 2014 17:54:37 -0400 Subject: tracing: Remove mock up poll wait function Now that the ring buffer has a built-in way to wake up readers when there's data, using irq_work so that it is safe to do in any context, the old "poor man's" wait polling that checks every 1/10 of a second to see if it should wake up a waiter is no longer needed. That polling made the latency for a wake up excruciatingly long. Completely remove the different wait_poll types from the tracers and have them all use the default one now. Reported-by: Johannes Berg Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 29 ++++------------------------- kernel/trace/trace.h | 4 ---- kernel/trace/trace_functions.c | 1 - kernel/trace/trace_functions_graph.c | 1 - kernel/trace/trace_nop.c | 1 - kernel/trace/trace_sched_wakeup.c | 2 -- 6 files changed, 4 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e058c6091e45..4c392c8238bf 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1085,7 +1085,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) } #endif /* CONFIG_TRACER_MAX_TRACE */ -static void default_wait_pipe(struct trace_iterator *iter) +static void wait_on_pipe(struct trace_iterator *iter) { /* Iterators are static, they should be filled or empty */ if (trace_buffer_iter(iter, iter->cpu_file)) @@ -1202,8 +1202,6 @@ int register_tracer(struct tracer *type) else if (!type->flags->opts) type->flags->opts = dummy_tracer_opt; - if (!type->wait_pipe) - type->wait_pipe = default_wait_pipe; ret = run_tracer_selftest(type); if (ret < 0) @@ -4207,25 +4205,6 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table) return trace_poll(iter, filp, poll_table); } -/* - * This is a make-shift waitqueue. - * A tracer might use this callback on some rare cases: - * - * 1) the current tracer might hold the runqueue lock when it wakes up - * a reader, hence a deadlock (sched, function, and function graph tracers) - * 2) the function tracers, trace all functions, we don't want - * the overhead of calling wake_up and friends - * (and tracing them too) - * - * Anyway, this is really very primitive wakeup. - */ -void poll_wait_pipe(struct trace_iterator *iter) -{ - set_current_state(TASK_INTERRUPTIBLE); - /* sleep for 100 msecs, and try again. */ - schedule_timeout(HZ / 10); -} - /* Must be called with trace_types_lock mutex held.
*/ static int tracing_wait_pipe(struct file *filp) { @@ -4251,7 +4230,7 @@ static int tracing_wait_pipe(struct file *filp) mutex_unlock(&iter->mutex); - iter->trace->wait_pipe(iter); + wait_on_pipe(iter); mutex_lock(&iter->mutex); @@ -5179,7 +5158,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, goto out_unlock; } mutex_unlock(&trace_types_lock); - iter->trace->wait_pipe(iter); + wait_on_pipe(iter); mutex_lock(&trace_types_lock); if (signal_pending(current)) { size = -EINTR; @@ -5390,7 +5369,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, goto out; } mutex_unlock(&trace_types_lock); - iter->trace->wait_pipe(iter); + wait_on_pipe(iter); mutex_lock(&trace_types_lock); if (signal_pending(current)) { ret = -EINTR; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8624b5041466..3b3e09e61f33 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -338,7 +338,6 @@ struct tracer_flags { * @stop: called when tracing is paused (echo 0 > tracing_enabled) * @open: called when the trace file is opened * @pipe_open: called when the trace_pipe file is opened - * @wait_pipe: override how the user waits for traces on trace_pipe * @close: called when the trace file is released * @pipe_close: called when the trace_pipe file is released * @read: override the default read callback on trace_pipe @@ -357,7 +356,6 @@ struct tracer { void (*stop)(struct trace_array *tr); void (*open)(struct trace_iterator *iter); void (*pipe_open)(struct trace_iterator *iter); - void (*wait_pipe)(struct trace_iterator *iter); void (*close)(struct trace_iterator *iter); void (*pipe_close)(struct trace_iterator *iter); ssize_t (*read)(struct trace_iterator *iter, @@ -566,8 +564,6 @@ void trace_init_global_iter(struct trace_iterator *iter); void tracing_iter_reset(struct trace_iterator *iter, int cpu); -void poll_wait_pipe(struct trace_iterator *iter); - void tracing_sched_switch_trace(struct trace_array *tr, struct task_struct *prev, struct task_struct *next, diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 2d9482b8f26a..57f0ec962d2c 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -252,7 +252,6 @@ static struct tracer function_trace __tracer_data = .init = function_trace_init, .reset = function_trace_reset, .start = function_trace_start, - .wait_pipe = poll_wait_pipe, .flags = &func_flags, .set_flag = func_set_flag, .allow_instances = true, diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index deff11200261..b86dd4d8c6a6 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1505,7 +1505,6 @@ static struct tracer graph_trace __tracer_data = { .pipe_open = graph_trace_open, .close = graph_trace_close, .pipe_close = graph_trace_close, - .wait_pipe = poll_wait_pipe, .init = graph_trace_init, .reset = graph_trace_reset, .print_line = print_graph_function, diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index 69a5cc94c01a..fcf0a9e48916 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -91,7 +91,6 @@ struct tracer nop_trace __read_mostly = .name = "nop", .init = nop_trace_init, .reset = nop_trace_reset, - .wait_pipe = poll_wait_pipe, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_nop, #endif diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 1573c03640d2..19bd8928ce94 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ 
b/kernel/trace/trace_sched_wakeup.c @@ -705,7 +705,6 @@ static struct tracer wakeup_rt_tracer __read_mostly = .reset = wakeup_tracer_reset, .start = wakeup_tracer_start, .stop = wakeup_tracer_stop, - .wait_pipe = poll_wait_pipe, .print_max = true, .print_header = wakeup_print_header, .print_line = wakeup_print_line, @@ -728,7 +727,6 @@ static struct tracer wakeup_dl_tracer __read_mostly = .reset = wakeup_tracer_reset, .start = wakeup_tracer_start, .stop = wakeup_tracer_stop, - .wait_pipe = poll_wait_pipe, .print_max = true, .print_header = wakeup_print_header, .print_line = wakeup_print_line, -- cgit From ce5f36a58fd1d92cb945cf2568751d40e8598508 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 24 Apr 2014 13:26:01 +0200 Subject: uprobes/tracing: Make uprobe_perf_close() visible to uprobe_perf_open() Preparation. Move uprobe_perf_close() up before uprobe_perf_open() to avoid the forward declaration in the next patch and make it readable. Signed-off-by: Oleg Nesterov Acked-by: Steven Rostedt Acked-by: Srikar Dronamraju --- kernel/trace/trace_uprobe.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c082a7441345..51bd071eaca0 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1009,54 +1009,54 @@ uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event) return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm); } -static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) +static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) { bool done; write_lock(&tu->filter.rwlock); if (event->hw.tp_target) { - /* - * event->parent != NULL means copy_process(), we can avoid - * uprobe_apply(). current->mm must be probed and we can rely - * on dup_mmap() which preserves the already installed bp's. - * - * attr.enable_on_exec means that exec/mmap will install the - * breakpoints we need. - */ + list_del(&event->hw.tp_list); done = tu->filter.nr_systemwide || - event->parent || event->attr.enable_on_exec || + (event->hw.tp_target->flags & PF_EXITING) || uprobe_filter_event(tu, event); - list_add(&event->hw.tp_list, &tu->filter.perf_events); } else { + tu->filter.nr_systemwide--; done = tu->filter.nr_systemwide; - tu->filter.nr_systemwide++; } write_unlock(&tu->filter.rwlock); if (!done) - uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); + uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); return 0; } -static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) +static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) { bool done; write_lock(&tu->filter.rwlock); if (event->hw.tp_target) { - list_del(&event->hw.tp_list); + /* + * event->parent != NULL means copy_process(), we can avoid + * uprobe_apply(). current->mm must be probed and we can rely + * on dup_mmap() which preserves the already installed bp's. + * + * attr.enable_on_exec means that exec/mmap will install the + * breakpoints we need. 
+ */ done = tu->filter.nr_systemwide || - (event->hw.tp_target->flags & PF_EXITING) || + event->parent || event->attr.enable_on_exec || uprobe_filter_event(tu, event); + list_add(&event->hw.tp_list, &tu->filter.perf_events); } else { - tu->filter.nr_systemwide--; done = tu->filter.nr_systemwide; + tu->filter.nr_systemwide++; } write_unlock(&tu->filter.rwlock); if (!done) - uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); + uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); return 0; } -- cgit From 927d687480ab7e43d73a003bab58803fc67717d9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 24 Apr 2014 13:33:31 +0200 Subject: uprobes/tracing: Fix uprobe_perf_open() on uprobe_apply() failure uprobe_perf_open()->uprobe_apply() can fail, but this error is wrongly ignored. Change uprobe_perf_open() to do uprobe_perf_close() and return the error code in this case. Change uprobe_perf_close() to propagate the error from uprobe_apply() as well, although it should not fail. Signed-off-by: Oleg Nesterov Acked-by: Steven Rostedt Acked-by: Srikar Dronamraju --- kernel/trace/trace_uprobe.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 51bd071eaca0..5a7f1a6b3b8b 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1026,7 +1026,7 @@ static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) write_unlock(&tu->filter.rwlock); if (!done) - uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); + return uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); return 0; } @@ -1034,6 +1034,7 @@ static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) { bool done; + int err; write_lock(&tu->filter.rwlock); if (event->hw.tp_target) { @@ -1055,10 +1056,13 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) } write_unlock(&tu->filter.rwlock); - if (!done) - uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); - - return 0; + err = 0; + if (!done) { + err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); + if (err) + uprobe_perf_close(tu, event); + } + return err; } static bool uprobe_perf_filter(struct uprobe_consumer *uc, -- cgit From 13f59c5e45be59665c11ddde19799b6295543b7d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 28 Apr 2014 20:15:43 +0200 Subject: uprobes: Refuse to insert a probe into MAP_SHARED vma valid_vma() rejects the VM_SHARED vmas, but this still allows inserting a probe into a MAP_SHARED but not VM_MAYWRITE vma. Currently this is fine, such a mapping doesn't really differ from the private read-only mmap except mprotect(PROT_WRITE) won't work. However, get_user_pages(FOLL_WRITE | FOLL_FORCE) doesn't allow COW in this case, and it would be safer to follow the same conventions as mm even if currently this happens to work. After the recent cda540ace6a1 "mm: get_user_pages(write,force) refuse to COW in shared areas" only uprobes can insert an anon page into the shared file-backed area, let's stop this and change valid_vma() to check VM_MAYSHARE instead.
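The problematic mapping is easy to construct from userspace: on mmap(MAP_SHARED) of a file descriptor opened without write permission, the kernel clears VM_SHARED and VM_MAYWRITE for the vma but keeps VM_MAYSHARE, so a check for VM_SHARED alone lets it through while the VM_MAYSHARE check rejects it. A small illustrative program (file path and length are arbitrary):

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/bin/true", O_RDONLY);	/* any readable file */
	if (fd < 0)
		return 1;
	/*
	 * MAP_SHARED + read-only fd: the resulting vma has VM_MAYSHARE
	 * set but VM_SHARED (and VM_MAYWRITE) cleared, which is exactly
	 * the case the old VM_SHARED test failed to reject.
	 */
	void *p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
	if (p != MAP_FAILED)
		munmap(p, 4096);
	close(fd);
	return 0;
}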
Signed-off-by: Oleg Nesterov --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d1edc5e6fd03..7716c40f2c50 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -127,7 +127,7 @@ struct xol_area { */ static bool valid_vma(struct vm_area_struct *vma, bool is_register) { - vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_SHARED; + vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE; if (is_register) flags |= VM_WRITE; -- cgit From 52c324f8a87b336496d0f5e9d8dff1aa32bb08cd Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 1 May 2014 00:13:47 +0200 Subject: cpuidle: Combine cpuidle_enabled() with cpuidle_select() Since both cpuidle_enabled() and cpuidle_select() are only called by cpuidle_idle_call(), it is not really useful to keep them separate and combining them will help to avoid complicating cpuidle_idle_call() even further if governors are changed to return error codes sometimes. This code modification shouldn't lead to any functional changes. Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/cpuidle.c | 26 ++++++-------------------- include/linux/cpuidle.h | 5 ----- kernel/sched/idle.c | 20 +++++++------------- 3 files changed, 13 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 8236746e46bb..f38359f64cc6 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -64,26 +64,6 @@ int cpuidle_play_dead(void) return -ENODEV; } -/** - * cpuidle_enabled - check if the cpuidle framework is ready - * @dev: cpuidle device for this cpu - * @drv: cpuidle driver for this cpu - * - * Return 0 on success, otherwise: - * -NODEV : the cpuidle framework is not available - * -EBUSY : the cpuidle framework is not initialized - */ -int cpuidle_enabled(struct cpuidle_driver *drv, struct cpuidle_device *dev) -{ - if (off || !initialized) - return -ENODEV; - - if (!drv || !dev || !dev->enabled) - return -EBUSY; - - return 0; -} - /** * cpuidle_enter_state - enter the state and update stats * @dev: cpuidle device for this cpu @@ -138,6 +118,12 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, */ int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) { + if (off || !initialized) + return -ENODEV; + + if (!drv || !dev || !dev->enabled) + return -EBUSY; + return cpuidle_curr_governor->select(drv, dev); } diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index b0238cba440b..a8d5bd391a26 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -120,8 +120,6 @@ struct cpuidle_driver { #ifdef CONFIG_CPU_IDLE extern void disable_cpuidle(void); -extern int cpuidle_enabled(struct cpuidle_driver *drv, - struct cpuidle_device *dev); extern int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev); extern int cpuidle_enter(struct cpuidle_driver *drv, @@ -149,9 +147,6 @@ extern int cpuidle_play_dead(void); extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev); #else static inline void disable_cpuidle(void) { } -static inline int cpuidle_enabled(struct cpuidle_driver *drv, - struct cpuidle_device *dev) -{return -ENODEV; } static inline int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) {return -ENODEV; } diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 8f4390a079c7..a8f12247ce7c 100644 --- a/kernel/sched/idle.c +++ 
b/kernel/sched/idle.c @@ -101,19 +101,13 @@ static int cpuidle_idle_call(void) rcu_idle_enter(); /* - * Check if the cpuidle framework is ready, otherwise fallback - * to the default arch specific idle method + * Ask the cpuidle framework to choose a convenient idle state. + * Fall back to the default arch specific idle method on errors. */ - ret = cpuidle_enabled(drv, dev); - - if (!ret) { - /* - * Ask the governor to choose an idle state it thinks - * it is convenient to go to. There is *always* a - * convenient idle state - */ - next_state = cpuidle_select(drv, dev); + next_state = cpuidle_select(drv, dev); + ret = next_state; + if (ret >= 0) { /* * The idle task must be scheduled, it is pointless to * go to idle, just update no idle residency and get @@ -140,7 +134,7 @@ CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu); - if (!ret) { + if (ret >= 0) { trace_cpu_idle_rcuidle(next_state, dev->cpu); /* @@ -175,7 +169,7 @@ * We can't use the cpuidle framework, let's use the default * idle routine */ - if (ret) + if (ret < 0) arch_cpu_idle(); __current_set_polling(); -- cgit From 2c730785d9532d2a9c46e059bd6a6c9a764c539f Mon Sep 17 00:00:00 2001 From: Sebastian Capella Date: Mon, 21 Apr 2014 17:30:46 -0700 Subject: PM / hibernate: no kernel_power_off when pm_power_off NULL Reboot logic in kernel/reboot will avoid calling kernel_power_off when pm_power_off is null, and instead uses kernel_halt. Change hibernate's power_down to follow the behavior in the reboot call. Calling the notifier twice (once for SYS_POWER_OFF and again for SYS_HALT) causes a panic during hibernation on Kirkwood Openblocks A6 board. Signed-off-by: Sebastian Capella Reported-by: Ezequiel Garcia Reviewed-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index de4b989cc8fd..1f08ac7f55d8 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -599,7 +599,8 @@ static void power_down(void) case HIBERNATION_PLATFORM: hibernation_platform_enter(); case HIBERNATION_SHUTDOWN: - kernel_power_off(); + if (pm_power_off) + kernel_power_off(); break; #ifdef CONFIG_SUSPEND case HIBERNATION_SUSPEND: @@ -627,7 +628,8 @@ static void power_down(void) * corruption after resume. */ printk(KERN_CRIT "PM: Please power down manually\n"); - while(1); + while (1) + cpu_relax(); } /** -- cgit From fd06a54990e94c7f40ca21cf82b9c83106ccb94b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 1 May 2014 23:05:31 -0400 Subject: ftrace: Have function graph tracer use global_ops for filtering Commit 4104d326b670 "ftrace: Remove global function list and call function directly" cleaned up the global_ops filtering and made the code simpler. But it left out function graph filtering which also depended on that code. The function graph filtering still needs to use global_ops as the filter otherwise it won't filter at all.
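The failure mode is easy to see in a simplified model of the filter test (a plain C sketch, not the kernel's hash-based implementation; names are invented): an ops whose filter is never populated behaves like an empty filter, and by convention an empty filter traces everything.

struct ops_model {
	const unsigned long *filter;	/* addresses allowed to trace */
	unsigned int nr;		/* 0 means no filter installed */
};

static int ops_test_model(const struct ops_model *ops, unsigned long ip)
{
	unsigned int i;

	if (!ops->nr)
		return 1;	/* empty filter: trace everything */
	for (i = 0; i < ops->nr; i++)
		if (ops->filter[i] == ip)
			return 1;
	return 0;
}

/*
 * set_ftrace_filter updates global_ops, so a graph entry test wired to a
 * private stub ops always sees an empty filter and accepts every function;
 * testing against global_ops restores the filtering.
 */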
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 34b098bfded4..9eb1aa03a18d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5008,12 +5008,6 @@ ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state, return NOTIFY_DONE; } -/* Just a place holder for function graph */ -static struct ftrace_ops fgraph_ops __read_mostly = { - .func = ftrace_stub, - .flags = FTRACE_OPS_FL_STUB | FTRACE_OPS_FL_RECURSION_SAFE, -}; - static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) { if (!ftrace_ops_test(&global_ops, trace->func, NULL)) @@ -5076,7 +5070,10 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, ftrace_graph_entry = ftrace_graph_entry_test; update_function_graph_func(); - ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET); + /* Function graph doesn't use the .func field of global_ops */ + global_ops.flags |= FTRACE_OPS_FL_STUB; + + ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); out: mutex_unlock(&ftrace_lock); @@ -5094,7 +5091,8 @@ void unregister_ftrace_graph(void) ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; __ftrace_graph_entry = ftrace_graph_entry_stub; - ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET); + ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); + global_ops.flags &= ~FTRACE_OPS_FL_STUB; unregister_pm_notifier(&ftrace_suspend_notifier); unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); -- cgit From 1e77d0a1ed7417d2a5a52a7b8d32aea1833faa6c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 7 Mar 2013 14:53:45 +0100 Subject: genirq: Sanitize spurious interrupt detection of threaded irqs Till reported that the spurious interrupt detection of threaded interrupts is broken in two ways: - note_interrupt() is called for each action thread of a shared interrupt line. That's wrong as we are only interested whether none of the device drivers felt responsible for the interrupt, but by calling multiple times for a single interrupt line we account IRQ_NONE even if one of the drivers felt responsible. - note_interrupt() when called from the thread handler is not serialized. That leaves the members of irq_desc which are used for the spurious detection unprotected. To solve this we need to defer the spurious detection of a threaded interrupt to the next hardware interrupt context where we have implicit serialization. If note_interrupt is called with action_ret == IRQ_WAKE_THREAD, we check whether the previous interrupt requested a deferred check. If not, we request a deferred check for the next hardware interrupt and return. If set, we check whether one of the interrupt threads signaled success. Depending on this information we feed the result into the spurious detector. If one primary handler of a shared interrupt returns IRQ_HANDLED we disable the deferred check of irq threads on the same line, as we have found at least one device driver who cared. 
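The deferral itself can be condensed into a small userspace model (illustrative only; in the real code threads_handled is an atomic_t in struct irq_desc and this logic runs in hard interrupt context):

#define SPURIOUS_DEFERRED	0x80000000U

struct desc_model {
	unsigned int threads_handled;	   /* bumped by threads on IRQ_HANDLED */
	unsigned int threads_handled_last; /* snapshot plus deferred marker */
};

/*
 * Called when the primary handlers only woke a thread. Returns 1 if the
 * previous wakeup is now known to have been handled, 0 if it should be
 * fed to the spurious detector as IRQ_NONE, -1 if judgement is deferred
 * to the next hard interrupt.
 */
static int deferred_check_model(struct desc_model *d)
{
	unsigned int handled;

	if (!(d->threads_handled_last & SPURIOUS_DEFERRED)) {
		/* First wakeup: arm the deferred check and wait. */
		d->threads_handled_last |= SPURIOUS_DEFERRED;
		return -1;
	}
	/* Bit 31 is set on both sides, so only the count is compared. */
	handled = d->threads_handled | SPURIOUS_DEFERRED;
	if (handled != d->threads_handled_last) {
		d->threads_handled_last = handled;	/* keep the marker */
		return 1;	/* some thread returned IRQ_HANDLED */
	}
	return 0;
}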
Reported-by: Till Straumann Signed-off-by: Thomas Gleixner Tested-by: Austin Schuh Cc: Oliver Hartkopp Cc: Wolfgang Grandegger Cc: Pavel Pisa Cc: Marc Kleine-Budde Cc: linux-can@vger.kernel.org Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1303071450130.22263@ionos --- include/linux/irqdesc.h | 4 ++ kernel/irq/manage.c | 4 +- kernel/irq/spurious.c | 106 ++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 108 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 26e2661d3935..472c021a2d4f 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -27,6 +27,8 @@ struct irq_desc; * @irq_count: stats field to detect stalled irqs * @last_unhandled: aging timer for unhandled count * @irqs_unhandled: stats field for spurious unhandled interrupts + * @threads_handled: stats field for deferred spurious detection of threaded handlers + * @threads_handled_last: comparator field for deferred spurious detection of theraded handlers * @lock: locking for SMP * @affinity_hint: hint to user space for preferred irq affinity * @affinity_notify: context for notification of affinity changes @@ -52,6 +54,8 @@ struct irq_desc { unsigned int irq_count; /* For detecting broken IRQs */ unsigned long last_unhandled; /* Aging timer for unhandled count */ unsigned int irqs_unhandled; + atomic_t threads_handled; + int threads_handled_last; raw_spinlock_t lock; struct cpumask *percpu_enabled; #ifdef CONFIG_SMP diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d34131ca372b..3dc6a61bf06a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -886,8 +886,8 @@ static int irq_thread(void *data) irq_thread_check_affinity(desc, action); action_ret = handler_fn(desc, action); - if (!noirqdebug) - note_interrupt(action->irq, desc, action_ret); + if (action_ret == IRQ_HANDLED) + atomic_inc(&desc->threads_handled); wake_threads_waitq(desc); } diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index a1d8cc63b56e..e2514b0e439e 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -270,6 +270,8 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc, return action && (action->flags & IRQF_IRQPOLL); } +#define SPURIOUS_DEFERRED 0x80000000 + void note_interrupt(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) { @@ -277,15 +279,111 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, irq_settings_is_polled(desc)) return; - /* we get here again via the threaded handler */ - if (action_ret == IRQ_WAKE_THREAD) - return; - if (bad_action_ret(action_ret)) { report_bad_irq(irq, desc, action_ret); return; } + /* + * We cannot call note_interrupt from the threaded handler + * because we need to look at the compound of all handlers + * (primary and threaded). Aside of that in the threaded + * shared case we have no serialization against an incoming + * hardware interrupt while we are dealing with a threaded + * result. + * + * So in case a thread is woken, we just note the fact and + * defer the analysis to the next hardware interrupt. + * + * The threaded handlers store whether they sucessfully + * handled an interrupt and we check whether that number + * changed versus the last invocation. + * + * We could handle all interrupts with the delayed by one + * mechanism, but for the non forced threaded case we'd just + * add pointless overhead to the straight hardirq interrupts + * for the sake of a few lines less code. 
+ */ + if (action_ret & IRQ_WAKE_THREAD) { + /* + * There is a thread woken. Check whether one of the + * shared primary handlers returned IRQ_HANDLED. If + * not we defer the spurious detection to the next + * interrupt. + */ + if (action_ret == IRQ_WAKE_THREAD) { + int handled; + /* + * We use bit 31 of thread_handled_last to + * denote the deferred spurious detection + * active. No locking necessary as + * thread_handled_last is only accessed here + * and we have the guarantee that hard + * interrupts are not reentrant. + */ + if (!(desc->threads_handled_last & SPURIOUS_DEFERRED)) { + desc->threads_handled_last |= SPURIOUS_DEFERRED; + return; + } + /* + * Check whether one of the threaded handlers + * returned IRQ_HANDLED since the last + * interrupt happened. + * + * For simplicity we just set bit 31, as it is + * set in threads_handled_last as well. So we + * avoid extra masking. And we really do not + * care about the high bits of the handled + * count. We just care about the count being + * different than the one we saw before. + */ + handled = atomic_read(&desc->threads_handled); + handled |= SPURIOUS_DEFERRED; + if (handled != desc->threads_handled_last) { + action_ret = IRQ_HANDLED; + /* + * Note: We keep the SPURIOUS_DEFERRED + * bit set. We are handling the + * previous invocation right now. + * Keep it for the current one, so the + * next hardware interrupt will + * account for it. + */ + desc->threads_handled_last = handled; + } else { + /* + * None of the threaded handlers felt + * responsible for the last interrupt + * + * We keep the SPURIOUS_DEFERRED bit + * set in threads_handled_last as we + * need to account for the current + * interrupt as well. + */ + action_ret = IRQ_NONE; + } + } else { + /* + * One of the primary handlers returned + * IRQ_HANDLED. So we don't care about the + * threaded handlers on the same line. Clear + * the deferred detection bit. + * + * In theory we could/should check whether the + * deferred bit is set and take the result of + * the previous run into account here as + * well. But it's really not worth the + * trouble. If every other interrupt is + * handled we never trigger the spurious + * detector. And if this is just the one out + * of 100k unhandled ones which is handled + * then we merily delay the spurious detection + * by one hard interrupt. Not a real problem. + */ + desc->threads_handled_last &= ~SPURIOUS_DEFERRED; + } + } + if (unlikely(action_ret == IRQ_NONE)) { /* * If we are seeing only the odd spurious IRQ caused by -- cgit From 3cf2f34e1a3d4d5ff209d087925cf950e52f4805 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Fri, 2 May 2014 12:53:57 -0700 Subject: rwsem: Add comments to explain the meaning of the rwsem's count field It took me quite a while to understand how rwsem's count field manifested itself in different scenarios. Add comments to provide a quick reference to the rwsem's count field for each scenario where readers and writers are contending for the lock. Hopefully it will be useful for future maintenance of the code and for people to get up to speed on how the logic in the code works. Signed-off-by: Tim Chen Cc: Davidlohr Bueso Cc: Alex Shi Cc: Michel Lespinasse Cc: Rik van Riel Cc: Peter Hurley Cc: Paul E. McKenney Cc: Jason Low Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Paul E.
McKenney Link: http://lkml.kernel.org/r/1399060437.2970.146.camel@schen9-DESK Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 49 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 1d66e08e897d..b4219ff87b8c 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -11,6 +11,55 @@ #include #include +/* + * Guide to the rw_semaphore's count field for common values. + * (32-bit case illustrated, similar for 64-bit) + * + * 0x0000000X (1) X readers active or attempting lock, no writer waiting + * X = #active_readers + #readers attempting to lock + * (X*ACTIVE_BIAS) + * + * 0x00000000 rwsem is unlocked, and no one is waiting for the lock or + * attempting to read lock or write lock. + * + * 0xffff000X (1) X readers active or attempting lock, with waiters for lock + * X = #active readers + # readers attempting lock + * (X*ACTIVE_BIAS + WAITING_BIAS) + * (2) 1 writer attempting lock, no waiters for lock + * X-1 = #active readers + #readers attempting lock + * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS) + * (3) 1 writer active, no waiters for lock + * X-1 = #active readers + #readers attempting lock + * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS) + * + * 0xffff0001 (1) 1 reader active or attempting lock, waiters for lock + * (WAITING_BIAS + ACTIVE_BIAS) + * (2) 1 writer active or attempting lock, no waiters for lock + * (ACTIVE_WRITE_BIAS) + * + * 0xffff0000 (1) There are writers or readers queued but none active + * or in the process of attempting lock. + * (WAITING_BIAS) + * Note: writer can attempt to steal lock for this count by adding + * ACTIVE_WRITE_BIAS in cmpxchg and checking the old count + * + * 0xfffe0001 (1) 1 writer active, or attempting lock. Waiters on queue. + * (ACTIVE_WRITE_BIAS + WAITING_BIAS) + * + * Note: Readers attempt to lock by adding ACTIVE_BIAS in down_read and checking + * the count becomes more than 0 for successful lock acquisition, + * i.e. the case where there are only readers or nobody has lock. + * (1st and 2nd case above). + * + * Writers attempt to lock by adding ACTIVE_WRITE_BIAS in down_write and + * checking the count becomes ACTIVE_WRITE_BIAS for successful lock + * acquisition (i.e. nobody else has lock or attempts lock). If + * unsuccessful, in rwsem_down_write_failed, we'll check to see if there + * are only waiters but none active (5th case above), and attempt to + * steal the lock. + * + */ + /* * Initialize an rwsem: */ -- cgit From 69dfa00ccb72a37f3810687ca110e5a8154c6eed Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 4 May 2014 15:09:13 -0400 Subject: cgroup: make flags and subsys_masks unsigned int There's no reason to use atomic bitops for cgroup_subsys_state->flags, cgroup_root->flags and various subsys_masks. This patch updates those to use bitwise and/or operations instead and converts them from unsigned long to unsigned int. This makes the fields occupy (marginally) smaller space and makes it clear that they don't require atomicity. This patch doesn't cause any behavior difference.
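In sketch form the conversion looks like this (illustrative helpers, not the patch itself); dropping the atomic bitops is safe because the masks are only manipulated under cgroup_mutex:

/* before: atomic bitops forced unsigned long fields
 *	set_bit(ssid, &subsys_mask);
 *	if (test_bit(ssid, &subsys_mask)) ...
 */

/* after: plain operations on an unsigned int, serialized by the lock */
static inline void ss_set(unsigned int *mask, int ssid)
{
	*mask |= 1U << ssid;
}

static inline void ss_clear(unsigned int *mask, int ssid)
{
	*mask &= ~(1U << ssid);
}

static inline int ss_test(unsigned int mask, int ssid)
{
	return !!(mask & (1U << ssid));
}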
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 8 ++++---- kernel/cgroup.c | 37 ++++++++++++++++++------------------- 2 files changed, 22 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4b38e2d6110d..c6c703f2486b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -62,7 +62,7 @@ struct cgroup_subsys_state { /* the parent css */ struct cgroup_subsys_state *parent; - unsigned long flags; + unsigned int flags; /* percpu_ref killing and RCU release */ struct rcu_head rcu_head; @@ -185,7 +185,7 @@ struct cgroup { u64 serial_nr; /* the bitmask of subsystems enabled on the child cgroups */ - unsigned long child_subsys_mask; + unsigned int child_subsys_mask; /* Private pointers for each registered subsystem */ struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; @@ -312,7 +312,7 @@ struct cgroup_root { struct kernfs_root *kf_root; /* The bitmask of subsystems attached to this hierarchy */ - unsigned long subsys_mask; + unsigned int subsys_mask; /* Unique id for this hierarchy. */ int hierarchy_id; @@ -327,7 +327,7 @@ struct cgroup_root { struct list_head root_list; /* Hierarchy-specific flags */ - unsigned long flags; + unsigned int flags; /* IDs for cgroups in this hierarchy */ struct idr cgroup_idr; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3873267c9ee3..21667f396a1e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -181,7 +181,7 @@ static struct cftype cgroup_base_files[]; static void cgroup_put(struct cgroup *cgrp); static int rebind_subsystems(struct cgroup_root *dst_root, - unsigned long ss_mask); + unsigned int ss_mask); static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss); @@ -963,7 +963,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * update of a tasks cgroup pointer by cgroup_attach_task() */ -static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); +static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask); static struct kernfs_syscall_ops cgroup_kf_syscall_ops; static const struct file_operations proc_cgroupstats_operations; @@ -1079,7 +1079,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) * @cgrp: target cgroup * @subsys_mask: mask of the subsystem ids whose files should be removed */ -static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) +static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask) { struct cgroup_subsys *ss; int i; @@ -1087,15 +1087,14 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) for_each_subsys(ss, i) { struct cftype *cfts; - if (!test_bit(i, &subsys_mask)) + if (!(subsys_mask & (1 << i))) continue; list_for_each_entry(cfts, &ss->cfts, node) cgroup_addrm_files(cgrp, cfts, false); } } -static int rebind_subsystems(struct cgroup_root *dst_root, - unsigned long ss_mask) +static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) { struct cgroup_subsys *ss; int ssid, i, ret; @@ -1128,7 +1127,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, * Just warn about it and continue. 
*/ if (cgrp_dfl_root_visible) { - pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n", + pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n", ret, ss_mask); pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); } @@ -1214,8 +1213,8 @@ static int cgroup_show_options(struct seq_file *seq, } struct cgroup_sb_opts { - unsigned long subsys_mask; - unsigned long flags; + unsigned int subsys_mask; + unsigned int flags; char *release_agent; bool cpuset_clone_children; char *name; @@ -1227,12 +1226,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) { char *token, *o = data; bool all_ss = false, one_ss = false; - unsigned long mask = (unsigned long)-1; + unsigned int mask = -1U; struct cgroup_subsys *ss; int i; #ifdef CONFIG_CPUSETS - mask = ~(1UL << cpuset_cgrp_id); + mask = ~(1U << cpuset_cgrp_id); #endif memset(opts, 0, sizeof(*opts)); @@ -1313,7 +1312,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) /* Mutually exclusive option 'all' + subsystem name */ if (all_ss) return -EINVAL; - set_bit(i, &opts->subsys_mask); + opts->subsys_mask |= (1 << i); one_ss = true; break; @@ -1342,7 +1341,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) if (all_ss || (!one_ss && !opts->none && !opts->name)) for_each_subsys(ss, i) if (!ss->disabled) - set_bit(i, &opts->subsys_mask); + opts->subsys_mask |= (1 << i); /* * We either have to specify by name or by subsystems. (So @@ -1373,7 +1372,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) int ret = 0; struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_sb_opts opts; - unsigned long added_mask, removed_mask; + unsigned int added_mask, removed_mask; if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { pr_err("sane_behavior: remount is not allowed\n"); @@ -1398,7 +1397,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) /* Don't allow flags or name to change at remount */ if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || (opts.name && strcmp(opts.name, root->name))) { - pr_err("option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n", + pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", root->flags & CGRP_ROOT_OPTION_MASK, root->name); ret = -EINVAL; @@ -1522,7 +1521,7 @@ static void init_cgroup_root(struct cgroup_root *root, set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) +static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; @@ -2507,7 +2506,7 @@ out_finish: static int cgroup_subtree_control_write(struct cgroup_subsys_state *dummy_css, struct cftype *cft, char *buffer) { - unsigned long enable_req = 0, disable_req = 0, enable, disable; + unsigned int enable_req = 0, disable_req = 0, enable, disable; struct cgroup *cgrp = dummy_css->cgroup, *child; struct cgroup_subsys *ss; char *tok, *p; @@ -3998,7 +3997,7 @@ static struct cftype cgroup_base_files[] = { * * On failure, no file is added. 
*/ -static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) +static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask) { struct cgroup_subsys *ss; int i, ret = 0; @@ -4007,7 +4006,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask) for_each_subsys(ss, i) { struct cftype *cfts; - if (!test_bit(i, &subsys_mask)) + if (!(subsys_mask & (1 << i))) continue; list_for_each_entry(cfts, &ss->cfts, node) { -- cgit From 7d699ddb2b181a2c76e5ea18b1bdf102c4bebe4b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 4 May 2014 15:09:13 -0400 Subject: cgroup, memcg: allocate cgroup ID from 1 Currently, cgroup->id is allocated from 0, which is always assigned to the root cgroup; unfortunately, memcg wants to use ID 0 to indicate invalid IDs and ends up incrementing all IDs by one. It's reasonable to reserve 0 for special purposes. This patch updates cgroup core so that ID 0 is not used and the root cgroups get ID 1. The ID incrementing is removed from memcg. Signed-off-by: Tejun Heo Acked-by: Michal Hocko Cc: Johannes Weiner Acked-by: Li Zefan --- include/linux/cgroup.h | 4 ++-- kernel/cgroup.c | 4 ++-- mm/memcontrol.c | 8 ++------ 3 files changed, 6 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c6c703f2486b..793f70a48820 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -144,8 +144,8 @@ struct cgroup { /* * idr allocated in-hierarchy ID. * - * The ID of the root cgroup is always 0, and a new cgroup - * will be assigned with a smallest available ID. + * ID 0 is not used, the ID of the root cgroup is always 1, and a + * new cgroup will be assigned with a smallest available ID. * * Allocating/Removing ID must be protected by cgroup_mutex. */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 21667f396a1e..3fa0463e74bb 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1531,7 +1531,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); - ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); + ret = idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL); if (ret < 0) goto out; root_cgrp->id = ret; @@ -4225,7 +4225,7 @@ static long cgroup_create(struct cgroup *parent, const char *name, * Temporarily set the pointer to NULL, so idr_find() won't return * a half-baked cgroup. */ - cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); + cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL); if (cgrp->id < 0) { err = -ENOMEM; goto err_unlock; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 29501f040568..1d0b29715b73 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -527,18 +527,14 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) { - /* - * The ID of the root cgroup is 0, but memcg treat 0 as an - * invalid ID, so we return (cgroup_id + 1).
- */ - return memcg->css.cgroup->id + 1; + return memcg->css.cgroup->id; } static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) { struct cgroup_subsys_state *css; - css = css_from_id(id - 1, &memory_cgrp_subsys); + css = css_from_id(id, &memory_cgrp_subsys); return mem_cgroup_from_css(css); } -- cgit From 6fa4918d03c39351aef3573ac3e7958d6a5ad9b6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 4 May 2014 15:09:13 -0400 Subject: cgroup: protect cgroup_root->cgroup_idr with a spinlock Currently, cgroup_root->cgroup_idr is protected by cgroup_mutex, which ends up requiring cgroup_put() to be invoked under sleepable context. This is okay for now but is an unusual requirement and we'll soon add css->id which will have the same problem but won't be able to simply grab cgroup_mutex as removal will have to happen from css_release() which can't sleep. Introduce cgroup_idr_lock and idr_alloc/replace/remove() wrappers which protects the idr operations with the lock and use them for cgroup_root->cgroup_idr. cgroup_put() no longer needs to grab cgroup_mutex and css_from_id() is updated to always require RCU read lock instead of either RCU read lock or cgroup_mutex, which doesn't affect the existing users. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 51 +++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3fa0463e74bb..7cb9c0847445 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -99,6 +99,12 @@ static DEFINE_MUTEX(cgroup_mutex); static DECLARE_RWSEM(css_set_rwsem); #endif +/* + * Protects cgroup_idr so that IDs can be released without grabbing + * cgroup_mutex. + */ +static DEFINE_SPINLOCK(cgroup_idr_lock); + /* * Protects cgroup_subsys->release_agent_path. Modifying it also requires * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. @@ -190,6 +196,37 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); +/* IDR wrappers which synchronize using cgroup_idr_lock */ +static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, + gfp_t gfp_mask) +{ + int ret; + + idr_preload(gfp_mask); + spin_lock(&cgroup_idr_lock); + ret = idr_alloc(idr, ptr, start, end, gfp_mask); + spin_unlock(&cgroup_idr_lock); + idr_preload_end(); + return ret; +} + +static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id) +{ + void *ret; + + spin_lock(&cgroup_idr_lock); + ret = idr_replace(idr, ptr, id); + spin_unlock(&cgroup_idr_lock); + return ret; +} + +static void cgroup_idr_remove(struct idr *idr, int id) +{ + spin_lock(&cgroup_idr_lock); + idr_remove(idr, id); + spin_unlock(&cgroup_idr_lock); +} + /** * cgroup_css - obtain a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest @@ -1058,9 +1095,7 @@ static void cgroup_put(struct cgroup *cgrp) * per-subsystem and moved to css->id so that lookups are * successful until the target css is released. 
*/ - mutex_lock(&cgroup_mutex); - idr_remove(&cgrp->root->cgroup_idr, cgrp->id); - mutex_unlock(&cgroup_mutex); + cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; call_rcu(&cgrp->rcu_head, cgroup_free_rcu); @@ -1531,7 +1566,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); - ret = idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL); + ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT); if (ret < 0) goto out; root_cgrp->id = ret; @@ -4225,7 +4260,7 @@ static long cgroup_create(struct cgroup *parent, const char *name, * Temporarily set the pointer to NULL, so idr_find() won't return * a half-baked cgroup. */ - cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL); + cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT); if (cgrp->id < 0) { err = -ENOMEM; goto err_unlock; @@ -4268,7 +4303,7 @@ static long cgroup_create(struct cgroup *parent, const char *name, * @cgrp is now fully operational. If something fails after this * point, it'll be released via the normal destruction path. */ - idr_replace(&root->cgroup_idr, cgrp, cgrp->id); + cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); err = cgroup_kn_set_ugid(kn); if (err) @@ -4302,7 +4337,7 @@ static long cgroup_create(struct cgroup *parent, const char *name, return 0; err_free_id: - idr_remove(&root->cgroup_idr, cgrp->id); + cgroup_idr_remove(&root->cgroup_idr, cgrp->id); err_unlock: mutex_unlock(&cgroup_mutex); err_unlock_tree: @@ -5162,7 +5197,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) { struct cgroup *cgrp; - cgroup_assert_mutexes_or_rcu_locked(); + WARN_ON_ONCE(!rcu_read_lock_held()); cgrp = idr_find(&ss->root->cgroup_idr, id); if (cgrp) -- cgit From a2bed8209a3afc3b2cf1c28383fb48155c1fea46 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 4 May 2014 15:09:14 -0400 Subject: cgroup: use RCU free in create_css() failure path Currently, when create_css() fails in the middle, the half-initialized css is freed by invoking cgroup_subsys->css_free() directly. This patch updates the function so that it invokes RCU free path instead. As the RCU free path puts the parent css and owning cgroup, their references are now acquired right after a new css is successfully allocated. This doesn't make any visible difference now but is to enable implementing css->id and RCU protected lookup by such IDs. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7cb9c0847445..0e2c401ed7b9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4185,12 +4185,14 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) if (IS_ERR(css)) return PTR_ERR(css); + init_css(css, ss, cgrp); + cgroup_get(cgrp); + css_get(css->parent); + err = percpu_ref_init(&css->refcnt, css_release); if (err) goto err_free_css; - init_css(css, ss, cgrp); - err = cgroup_populate_dir(cgrp, 1 << ss->id); if (err) goto err_free_percpu_ref; @@ -4199,9 +4201,6 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) if (err) goto err_clear_dir; - cgroup_get(cgrp); - css_get(css->parent); - if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && parent->parent) { pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. 
Nested cgroups may change behavior in the future.\n", @@ -4218,7 +4217,7 @@ err_clear_dir: err_free_percpu_ref: percpu_ref_cancel_init(&css->refcnt); err_free_css: - ss->css_free(css); + call_rcu(&css->rcu_head, css_free_rcu_fn); return err; } -- cgit From ddfcadab35dda6e5bc23ccf1c3055ecb63a71e49 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 4 May 2014 15:09:14 -0400 Subject: cgroup: update init_css() into init_and_link_css() init_css() takes the cgroup the new css belongs to as an argument and initializes the new css's ->cgroup and ->parent pointers but doesn't acquire the matching reference counts. After the previous patch, create_css() puts init_css() and reference acquisition right next to each other. Let's move reference acquisition into init_css() and rename the function to init_and_link_css(). This makes sense and is easier to follow. This makes the root csses hold a reference on cgrp_dfl_root.cgrp, which is harmless. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0e2c401ed7b9..f1c98c527b2d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4109,17 +4109,21 @@ static void css_release(struct percpu_ref *ref) call_rcu(&css->rcu_head, css_free_rcu_fn); } -static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, - struct cgroup *cgrp) +static void init_and_link_css(struct cgroup_subsys_state *css, + struct cgroup_subsys *ss, struct cgroup *cgrp) { + cgroup_get(cgrp); + css->cgroup = cgrp; css->ss = ss; css->flags = 0; - if (cgrp->parent) + if (cgrp->parent) { css->parent = cgroup_css(cgrp->parent, ss); - else + css_get(css->parent); + } else { css->flags |= CSS_ROOT; + } BUG_ON(cgroup_css(cgrp, ss)); } @@ -4185,9 +4189,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) if (IS_ERR(css)) return PTR_ERR(css); - init_css(css, ss, cgrp); - cgroup_get(cgrp); - css_get(css->parent); + init_and_link_css(css, ss, cgrp); err = percpu_ref_init(&css->refcnt, css_release); if (err) @@ -4656,7 +4658,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss)); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); - init_css(css, ss, &cgrp_dfl_root.cgrp); + init_and_link_css(css, ss, &cgrp_dfl_root.cgrp); /* Update the init_css_set to contain a subsys * pointer to this state - since the subsystem is -- cgit From 15a4c835e4ed3e60dd68727cd1907e3dd89563f4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 4 May 2014 15:09:14 -0400 Subject: cgroup, memcg: implement css->id and convert css_from_id() to use it Until now, cgroup->id has been used to identify all the associated csses and css_from_id() takes cgroup ID and returns the matching css by looking up the cgroup and then dereferencing the css associated with it; however, now that the lifetimes of cgroup and css are separate, this is incorrect and breaks on the unified hierarchy when a controller is disabled and enabled back again before the previous instance is released. This patch adds css->id which is a subsystem-unique ID and converts css_from_id() to look up by the new css->id instead. memcg is the only user of css_from_id() and is also converted to use css->id instead. For traditional hierarchies, this shouldn't make any functional difference. 
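To illustrate the intended usage (a sketch, not part of this patch; the tryget step is shown only to make the lifetime rules explicit), a lookup by css ID is expected to look roughly like:

	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = css_from_id(id, &memory_cgrp_subsys);	/* idr_find() on ss->css_idr */
	if (css && !css_tryget(css))
		css = NULL;	/* css is being released; treat as not found */
	rcu_read_unlock();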
Signed-off-by: Tejun Heo Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Jianyu Zhan Acked-by: Li Zefan --- include/linux/cgroup.h | 9 ++++++++ kernel/cgroup.c | 59 ++++++++++++++++++++++++++++++++------------------ mm/memcontrol.c | 4 ++-- 3 files changed, 49 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 793f70a48820..2dfabb3b749a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -62,6 +62,12 @@ struct cgroup_subsys_state { /* the parent css */ struct cgroup_subsys_state *parent; + /* + * Subsys-unique ID. 0 is unused and root is always 1. The + * matching css can be looked up using css_from_id(). + */ + int id; + unsigned int flags; /* percpu_ref killing and RCU release */ @@ -655,6 +661,9 @@ struct cgroup_subsys { /* link to parent, protected by cgroup_lock() */ struct cgroup_root *root; + /* idr for css->id */ + struct idr css_idr; + /* * List of cftypes. Each entry is the first entry of an array * terminated by zero length name. diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f1c98c527b2d..a1a20e8c973a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -100,8 +100,8 @@ static DECLARE_RWSEM(css_set_rwsem); #endif /* - * Protects cgroup_idr so that IDs can be released without grabbing - * cgroup_mutex. + * Protects cgroup_idr and css_idr so that IDs can be released without + * grabbing cgroup_mutex. */ static DEFINE_SPINLOCK(cgroup_idr_lock); @@ -1089,12 +1089,6 @@ static void cgroup_put(struct cgroup *cgrp) if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp))) return; - /* - * XXX: cgrp->id is only used to look up css's. As cgroup and - * css's lifetimes will be decoupled, it should be made - * per-subsystem and moved to css->id so that lookups are - * successful until the target css is released. 
- */ cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; @@ -4104,8 +4098,11 @@ static void css_release(struct percpu_ref *ref) { struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); + struct cgroup_subsys *ss = css->ss; + + RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); + cgroup_idr_remove(&ss->css_idr, css->id); - RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL); call_rcu(&css->rcu_head, css_free_rcu_fn); } @@ -4195,9 +4192,17 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) if (err) goto err_free_css; + err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT); + if (err < 0) + goto err_free_percpu_ref; + css->id = err; + err = cgroup_populate_dir(cgrp, 1 << ss->id); if (err) - goto err_free_percpu_ref; + goto err_free_id; + + /* @css is ready to be brought online now, make it visible */ + cgroup_idr_replace(&ss->css_idr, css, css->id); err = online_css(css); if (err) @@ -4216,6 +4221,8 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) err_clear_dir: cgroup_clear_dir(css->cgroup, 1 << css->ss->id); +err_free_id: + cgroup_idr_remove(&ss->css_idr, css->id); err_free_percpu_ref: percpu_ref_cancel_init(&css->refcnt); err_free_css: @@ -4642,7 +4649,7 @@ static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { .rename = cgroup_rename, }; -static void __init cgroup_init_subsys(struct cgroup_subsys *ss) +static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) { struct cgroup_subsys_state *css; @@ -4651,6 +4658,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); + idr_init(&ss->css_idr); INIT_LIST_HEAD(&ss->cfts); /* Create the root cgroup state for this subsystem */ @@ -4659,6 +4667,13 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_and_link_css(css, ss, &cgrp_dfl_root.cgrp); + if (early) { + /* idr_alloc() can't be called safely during early init */ + css->id = 1; + } else { + css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); + BUG_ON(css->id < 0); + } /* Update the init_css_set to contain a subsys * pointer to this state - since the subsystem is @@ -4709,7 +4724,7 @@ int __init cgroup_init_early(void) ss->name = cgroup_subsys_name[i]; if (ss->early_init) - cgroup_init_subsys(ss); + cgroup_init_subsys(ss, true); } return 0; } @@ -4741,8 +4756,16 @@ int __init cgroup_init(void) mutex_unlock(&cgroup_tree_mutex); for_each_subsys(ss, ssid) { - if (!ss->early_init) - cgroup_init_subsys(ss); + if (ss->early_init) { + struct cgroup_subsys_state *css = + init_css_set.subsys[ss->id]; + + css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, + GFP_KERNEL); + BUG_ON(css->id < 0); + } else { + cgroup_init_subsys(ss, false); + } list_add_tail(&init_css_set.e_cset_node[ssid], &cgrp_dfl_root.cgrp.e_csets[ssid]); @@ -5196,14 +5219,8 @@ struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, */ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) { - struct cgroup *cgrp; - WARN_ON_ONCE(!rcu_read_lock_held()); - - cgrp = idr_find(&ss->root->cgroup_idr, id); - if (cgrp) - return cgroup_css(cgrp, ss); - return NULL; + return idr_find(&ss->css_idr, id); } #ifdef CONFIG_CGROUP_DEBUG diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1d0b29715b73..c3f82f69ef58 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -527,7 +527,7 @@ static inline bool 
mem_cgroup_is_root(struct mem_cgroup *memcg) static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) { - return memcg->css.cgroup->id; + return memcg->css.id; } static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) @@ -6401,7 +6401,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css)); - if (css->cgroup->id > MEM_CGROUP_ID_MAX) + if (css->id > MEM_CGROUP_ID_MAX) return -ENOSPC; if (!parent) -- cgit From 60106946ca7f63396680130b25511ccf6b7d5bff Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 5 May 2014 20:08:13 +0200 Subject: kernel/cgroup.c: fix 2 kernel-doc warnings Fix typo and variable name. tj: Updated @cgrp argument description in cgroup_destroy_css_killed() Cc: Andrew Morton Signed-off-by: Fabian Frederick Signed-off-by: Tejun Heo --- kernel/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a1a20e8c973a..07815ef7b1f6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1888,7 +1888,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) /** * cgroup_task_migrate - move a task from one cgroup to another. - * @old_cgrp; the cgroup @tsk is being migrated from + * @old_cgrp: the cgroup @tsk is being migrated from * @tsk: the task being migrated * @new_cset: the new css_set @tsk is being attached to * @@ -4586,7 +4586,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /** * cgroup_destroy_css_killed - the second step of cgroup destruction - * @work: cgroup->destroy_free_work + * @cgrp: the cgroup whose csses have just finished offlining * * This function is invoked from a work item for a cgroup which is being * destroyed after all css's are offlined and performs the rest of -- cgit From 3d7ee969bffcc984c8aeaffc6ac6816fd929ace1 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 May 2014 12:19:32 -0700 Subject: x86, vdso: Clean up 32-bit vs 64-bit vdso params Rather than using 'vdso_enabled' and an awful #define, just call the parameters vdso32_enabled and vdso64_enabled. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/87913de56bdcbae3d93917938302fc369b05caee.1399317206.git.luto@amacapital.net Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/elf.h | 20 +++++++++++++------- arch/x86/um/vdso/vma.c | 2 +- arch/x86/vdso/vdso32-setup.c | 19 ++++++++----------- arch/x86/vdso/vma.c | 6 +++--- kernel/sysctl.c | 5 +++++ 5 files changed, 30 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 2c71182d30ef..e96df2c0dd69 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -75,7 +75,12 @@ typedef struct user_fxsr_struct elf_fpxregset_t; #include -extern unsigned int vdso_enabled; +#ifdef CONFIG_X86_64 +extern unsigned int vdso64_enabled; +#endif +#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) +extern unsigned int vdso32_enabled; +#endif /* * This is used to ensure we don't load something for the wrong architecture. 
@@ -269,9 +274,9 @@ extern int force_personality32; struct task_struct; -#define ARCH_DLINFO_IA32(vdso_enabled) \ +#define ARCH_DLINFO_IA32 \ do { \ - if (vdso_enabled) { \ + if (vdso32_enabled) { \ NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY); \ NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE); \ } \ @@ -281,7 +286,7 @@ do { \ #define STACK_RND_MASK (0x7ff) -#define ARCH_DLINFO ARCH_DLINFO_IA32(vdso_enabled) +#define ARCH_DLINFO ARCH_DLINFO_IA32 /* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */ @@ -292,14 +297,15 @@ do { \ #define ARCH_DLINFO \ do { \ - if (vdso_enabled) \ + if (vdso64_enabled) \ NEW_AUX_ENT(AT_SYSINFO_EHDR, \ (unsigned long)current->mm->context.vdso); \ } while (0) +/* As a historical oddity, the x32 and x86_64 vDSOs are controlled together. */ #define ARCH_DLINFO_X32 \ do { \ - if (vdso_enabled) \ + if (vdso64_enabled) \ NEW_AUX_ENT(AT_SYSINFO_EHDR, \ (unsigned long)current->mm->context.vdso); \ } while (0) @@ -310,7 +316,7 @@ do { \ if (test_thread_flag(TIF_X32)) \ ARCH_DLINFO_X32; \ else \ - ARCH_DLINFO_IA32(sysctl_vsyscall32) + ARCH_DLINFO_IA32 #define COMPAT_ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000) diff --git a/arch/x86/um/vdso/vma.c b/arch/x86/um/vdso/vma.c index af91901babb8..916cda4cd5b4 100644 --- a/arch/x86/um/vdso/vma.c +++ b/arch/x86/um/vdso/vma.c @@ -12,7 +12,7 @@ #include #include -unsigned int __read_mostly vdso_enabled = 1; +static unsigned int __read_mostly vdso_enabled = 1; unsigned long um_vdso_addr; extern unsigned long task_size; diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 00348980a3a6..5a657d93c6e0 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -37,7 +37,6 @@ #endif #ifdef CONFIG_X86_64 -#define vdso_enabled sysctl_vsyscall32 #define arch_setup_additional_pages syscall32_setup_pages #endif @@ -45,13 +44,13 @@ * Should the kernel map a VDSO page into processes and pass its * address down to glibc upon exec()? */ -unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT; +unsigned int __read_mostly vdso32_enabled = VDSO_DEFAULT; -static int __init vdso_setup(char *s) +static int __init vdso32_setup(char *s) { - vdso_enabled = simple_strtoul(s, NULL, 0); + vdso32_enabled = simple_strtoul(s, NULL, 0); - if (vdso_enabled > 1) + if (vdso32_enabled > 1) pr_warn("vdso32 values other than 0 and 1 are no longer allowed; vdso disabled\n"); return 1; @@ -62,12 +61,10 @@ static int __init vdso_setup(char *s) * behavior on both 64-bit and 32-bit kernels. * On 32-bit kernels, vdso=[012] means the same thing. 
*/ -__setup("vdso32=", vdso_setup); +__setup("vdso32=", vdso32_setup); #ifdef CONFIG_X86_32 -__setup_param("vdso=", vdso32_setup, vdso_setup, 0); - -EXPORT_SYMBOL_GPL(vdso_enabled); +__setup_param("vdso=", vdso_setup, vdso32_setup, 0); #endif static struct page **vdso32_pages; @@ -160,7 +157,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) return x32_setup_additional_pages(bprm, uses_interp); #endif - if (vdso_enabled != 1) /* Other values all mean "disabled" */ + if (vdso32_enabled != 1) /* Other values all mean "disabled" */ return 0; down_write(&mm->mmap_sem); @@ -244,7 +241,7 @@ subsys_initcall(sysenter_setup); static struct ctl_table abi_table2[] = { { .procname = "vsyscall32", - .data = &sysctl_vsyscall32, + .data = &vdso32_enabled, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 1ad102613127..8b790398ed1d 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -17,7 +17,7 @@ #include #if defined(CONFIG_X86_64) -unsigned int __read_mostly vdso_enabled = 1; +unsigned int __read_mostly vdso64_enabled = 1; DECLARE_VDSO_IMAGE(vdso); extern unsigned short vdso_sync_cpuid; @@ -160,7 +160,7 @@ static int setup_additional_pages(struct linux_binprm *bprm, unsigned long addr; int ret; - if (!vdso_enabled) + if (!vdso64_enabled) return 0; down_write(&mm->mmap_sem); @@ -203,7 +203,7 @@ int x32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) static __init int vdso_setup(char *s) { - vdso_enabled = simple_strtoul(s, NULL, 0); + vdso64_enabled = simple_strtoul(s, NULL, 0); return 0; } __setup("vdso=", vdso_setup); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 74f5b580fe34..420d77afa8fd 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1418,8 +1418,13 @@ static struct ctl_table vm_table[] = { (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) { .procname = "vdso_enabled", +#ifdef CONFIG_X86_32 + .data = &vdso32_enabled, + .maxlen = sizeof(vdso32_enabled), +#else .data = &vdso_enabled, .maxlen = sizeof(vdso_enabled), +#endif .mode = 0644, .proc_handler = proc_dointvec, .extra1 = &zero, -- cgit From bdffd893a0e9c431304142d12d9a0a21d365c502 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 29 Apr 2014 14:17:40 -0500 Subject: tracing: Replace __get_cpu_var uses with this_cpu_ptr Replace uses of &__get_cpu_var for address calculation with this_cpu_ptr. 
Link: http://lkml.kernel.org/p/alpine.DEB.2.10.1404291415560.18364@gentwo.org Acked-by: Masami Hiramatsu Signed-off-by: Christoph Lameter Signed-off-by: Steven Rostedt --- include/linux/kprobes.h | 2 +- kernel/trace/ftrace.c | 4 ++-- kernel/trace/trace.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 925eaf28fca9..7bd2ad01e39c 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -355,7 +355,7 @@ static inline void reset_current_kprobe(void) static inline struct kprobe_ctlblk *get_kprobe_ctlblk(void) { - return (&__get_cpu_var(kprobe_ctlblk)); + return this_cpu_ptr(&kprobe_ctlblk); } int register_kprobe(struct kprobe *p); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9eb1aa03a18d..38e5cf73b9ae 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -822,7 +822,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip, local_irq_save(flags); - stat = &__get_cpu_var(ftrace_profile_stats); + stat = this_cpu_ptr(&ftrace_profile_stats); if (!stat->hash || !ftrace_profile_enabled) goto out; @@ -853,7 +853,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace) unsigned long flags; local_irq_save(flags); - stat = &__get_cpu_var(ftrace_profile_stats); + stat = this_cpu_ptr(&ftrace_profile_stats); if (!stat->hash || !ftrace_profile_enabled) goto out; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4c392c8238bf..05431696b10c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1726,7 +1726,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, */ barrier(); if (use_stack == 1) { - trace.entries = &__get_cpu_var(ftrace_stack).calls[0]; + trace.entries = this_cpu_ptr(ftrace_stack.calls); trace.max_entries = FTRACE_STACK_MAX_ENTRIES; if (regs) -- cgit From fc34ac1dc52f02a7cf1862960c24771e4883e39b Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 5 May 2014 19:46:55 +0200 Subject: kernel/cpuset.c: kernel-doc fixes This patch also converts seq_printf to seq_puts Cc: Andrew Morton Signed-off-by: Fabian Frederick Acked-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3d54c418bd06..1d8c0472bfb0 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -890,6 +890,7 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root) /** * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it * @cs: the cpuset to consider + * @trialcs: trial cpuset * @buf: buffer of cpu numbers written to this cpuset */ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, @@ -2536,7 +2537,7 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, /** * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed - * @task: pointer to task_struct of some task. + * @tsk: pointer to task_struct of some task. * * Description: Prints @task's name, cpuset name, and cached copy of its * mems_allowed to the kernel log. @@ -2646,10 +2647,10 @@ out: /* Display task mems_allowed in /proc/<pid>/status file. 
*/ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) { - seq_printf(m, "Mems_allowed:\t"); + seq_puts(m, "Mems_allowed:\t"); seq_nodemask(m, &task->mems_allowed); - seq_printf(m, "\n"); - seq_printf(m, "Mems_allowed_list:\t"); + seq_puts(m, "\n"); + seq_puts(m, "Mems_allowed_list:\t"); seq_nodemask_list(m, &task->mems_allowed); - seq_printf(m, "\n"); + seq_puts(m, "\n"); } -- cgit From 12d3089c192c3a762fed098988d99f4b8ff33266 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 5 May 2014 19:49:00 +0200 Subject: kernel/cpuset.c: convert printk to pr_foo() Cc: Andrew Morton Signed-off-by: Fabian Frederick Acked-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1d8c0472bfb0..7c0e8da59e26 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -696,11 +696,8 @@ restart: if (nslot == ndoms) { static int warnings = 10; if (warnings) { - printk(KERN_WARNING - "rebuild_sched_domains confused:" - " nslot %d, ndoms %d, csn %d, i %d," - " apn %d\n", - nslot, ndoms, csn, i, apn); + pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n", + nslot, ndoms, csn, i, apn); warnings--; } continue; @@ -2018,7 +2015,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) parent = parent_cs(parent); if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { - printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset "); + pr_err("cpuset: failed to transfer tasks out of empty cpuset "); pr_cont_cgroup_name(cs->css.cgroup); pr_cont("\n"); } @@ -2555,7 +2552,7 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk) cgrp = task_cs(tsk)->css.cgroup; nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, tsk->mems_allowed); - printk(KERN_INFO "%s cpuset=", tsk->comm); + pr_info("%s cpuset=", tsk->comm); pr_cont_cgroup_name(cgrp); pr_cont(" mems_allowed=%s\n", cpuset_nodelist); -- cgit From a6220fc19afc07fe77cfd16f5b8e568615517091 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 5 May 2014 00:51:54 +0200 Subject: PM / suspend: Always use deepest C-state in the "freeze" sleep state If freeze_enter() is called, we want to bypass the current cpuidle governor and always use the deepest available (that is, not disabled) C-state, because we want to save as much energy as reasonably possible then and runtime latency constraints don't matter at that point, since the system is in a sleep state anyway. Signed-off-by: Rafael J. Wysocki Tested-by: Aubrey Li --- drivers/cpuidle/cpuidle.c | 45 ++++++++++++++++++++++++++++++++++++++++++++- include/linux/cpuidle.h | 2 ++ kernel/power/suspend.c | 2 ++ 3 files changed, 48 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index f38359f64cc6..cb7019977c50 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -32,6 +32,7 @@ LIST_HEAD(cpuidle_detected_devices); static int enabled_devices; static int off __read_mostly; static int initialized __read_mostly; +static bool use_deepest_state __read_mostly; int cpuidle_disabled(void) { @@ -64,6 +65,45 @@ int cpuidle_play_dead(void) return -ENODEV; } +/** + * cpuidle_use_deepest_state - Enable/disable the "deepest idle" mode. + * @enable: Whether enable or disable the feature. 
+ * + * If the "deepest idle" mode is enabled, cpuidle will ignore the governor and + * always use the state with the greatest exit latency (out of the states that + * are not disabled). + * + * This function can only be called after cpuidle_pause() to avoid races. + */ +void cpuidle_use_deepest_state(bool enable) +{ + use_deepest_state = enable; +} + +/** + * cpuidle_find_deepest_state - Find the state of the greatest exit latency. + * @drv: cpuidle driver for a given CPU. + * @dev: cpuidle device for a given CPU. + */ +static int cpuidle_find_deepest_state(struct cpuidle_driver *drv, + struct cpuidle_device *dev) +{ + unsigned int latency_req = 0; + int i, ret = CPUIDLE_DRIVER_STATE_START - 1; + + for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) { + struct cpuidle_state *s = &drv->states[i]; + struct cpuidle_state_usage *su = &dev->states_usage[i]; + + if (s->disabled || su->disable || s->exit_latency <= latency_req) + continue; + + latency_req = s->exit_latency; + ret = i; + } + return ret; +} + /** * cpuidle_enter_state - enter the state and update stats * @dev: cpuidle device for this cpu @@ -124,6 +164,9 @@ int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) if (!drv || !dev || !dev->enabled) return -EBUSY; + if (unlikely(use_deepest_state)) + return cpuidle_find_deepest_state(drv, dev); + return cpuidle_curr_governor->select(drv, dev); } @@ -155,7 +198,7 @@ int cpuidle_enter(struct cpuidle_driver *drv, struct cpuidle_device *dev, */ void cpuidle_reflect(struct cpuidle_device *dev, int index) { - if (cpuidle_curr_governor->reflect) + if (cpuidle_curr_governor->reflect && !unlikely(use_deepest_state)) cpuidle_curr_governor->reflect(dev, index); } diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index a8d5bd391a26..c51a436135c4 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -143,6 +143,7 @@ extern void cpuidle_resume(void); extern int cpuidle_enable_device(struct cpuidle_device *dev); extern void cpuidle_disable_device(struct cpuidle_device *dev); extern int cpuidle_play_dead(void); +extern void cpuidle_use_deepest_state(bool enable); extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev); #else @@ -175,6 +176,7 @@ static inline int cpuidle_enable_device(struct cpuidle_device *dev) {return -ENODEV; } static inline void cpuidle_disable_device(struct cpuidle_device *dev) { } static inline int cpuidle_play_dead(void) {return -ENODEV; } +static inline void cpuidle_use_deepest_state(bool enable) {} static inline struct cpuidle_driver *cpuidle_get_cpu_driver( struct cpuidle_device *dev) {return NULL; } #endif diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 8233cd4047d7..155721f7f909 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -54,9 +54,11 @@ static void freeze_begin(void) static void freeze_enter(void) { + cpuidle_use_deepest_state(true); cpuidle_resume(); wait_event(suspend_freeze_wait_head, suspend_freeze_wake); cpuidle_pause(); + cpuidle_use_deepest_state(false); } void freeze_wake(void) -- cgit From 46ce0fe97a6be7532ce6126bb26ce89fed81528c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 2 May 2014 16:56:01 +0200 Subject: perf: Fix race in removing an event When removing a (sibling) event we do: raw_spin_lock_irq(&ctx->lock); perf_group_detach(event); raw_spin_unlock_irq(&ctx->lock); perf_remove_from_context(event); raw_spin_lock_irq(&ctx->lock); ... 
raw_spin_unlock_irq(&ctx->lock); Now, assuming the event is a sibling, it will be 'unreachable' for things like ctx_sched_out() because that iterates the groups->siblings, and we just unhooked the sibling. So, if we get ctx_sched_out() during that window, it will miss the event and not call event_sched_out() on it, leaving it programmed on the PMU. The subsequent perf_remove_from_context() call will find the ctx is inactive and only call list_del_event() to remove the event from all other lists. Hereafter we can proceed to free the event, while it is still programmed! Close this hole by moving perf_group_detach() inside the same ctx->lock region(s) perf_remove_from_context() has. The condition on inherited events only in __perf_event_exit_task() is likely complete crap because non-inherited events are part of groups too and we're tearing down just the same. But leave that for another patch. Most-likely-Fixes: e03a9a55b4e ("perf: Change close() semantics for group events") Reported-by: Vince Weaver Tested-by: Vince Weaver Much-staring-at-traces-by: Vince Weaver Much-staring-at-traces-by: Thomas Gleixner Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140505093124.GN17778@laptop.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 47 ++++++++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index f83a71a3e46d..ea899e2b5593 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1443,6 +1443,11 @@ group_sched_out(struct perf_event *group_event, cpuctx->exclusive = 0; } +struct remove_event { + struct perf_event *event; + bool detach_group; +}; + /* * Cross CPU call to remove a performance event * @@ -1451,12 +1456,15 @@ group_sched_out(struct perf_event *group_event, */ static int __perf_remove_from_context(void *info) { - struct perf_event *event = info; + struct remove_event *re = info; + struct perf_event *event = re->event; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); raw_spin_lock(&ctx->lock); event_sched_out(event, cpuctx, ctx); + if (re->detach_group) + perf_group_detach(event); list_del_event(event, ctx); if (!ctx->nr_events && cpuctx->task_ctx == ctx) { ctx->is_active = 0; @@ -1481,10 +1489,14 @@ static int __perf_remove_from_context(void *info) * When called from perf_event_exit_task, it's OK because the * context has been detached from its task. */ -static void perf_remove_from_context(struct perf_event *event) +static void perf_remove_from_context(struct perf_event *event, bool detach_group) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; + struct remove_event re = { + .event = event, + .detach_group = detach_group, + }; lockdep_assert_held(&ctx->mutex); @@ -1493,12 +1505,12 @@ static void perf_remove_from_context(struct perf_event *event) * Per cpu events are removed via an smp call and * the removal is always successful. 
*/ - cpu_function_call(event->cpu, __perf_remove_from_context, event); + cpu_function_call(event->cpu, __perf_remove_from_context, &re); return; } retry: - if (!task_function_call(task, __perf_remove_from_context, event)) + if (!task_function_call(task, __perf_remove_from_context, &re)) return; raw_spin_lock_irq(&ctx->lock); @@ -1515,6 +1527,8 @@ retry: * Since the task isn't running, its safe to remove the event, us * holding the ctx->lock ensures the task won't get scheduled in. */ + if (detach_group) + perf_group_detach(event); list_del_event(event, ctx); raw_spin_unlock_irq(&ctx->lock); } @@ -3281,10 +3295,7 @@ int perf_event_release_kernel(struct perf_event *event) * to trigger the AB-BA case. */ mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); - raw_spin_lock_irq(&ctx->lock); - perf_group_detach(event); - raw_spin_unlock_irq(&ctx->lock); - perf_remove_from_context(event); + perf_remove_from_context(event, true); mutex_unlock(&ctx->mutex); free_event(event); @@ -7165,7 +7176,7 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event_context *gctx = group_leader->ctx; mutex_lock(&gctx->mutex); - perf_remove_from_context(group_leader); + perf_remove_from_context(group_leader, false); /* * Removing from the context ends up with disabled @@ -7175,7 +7186,7 @@ SYSCALL_DEFINE5(perf_event_open, perf_event__state_init(group_leader); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { - perf_remove_from_context(sibling); + perf_remove_from_context(sibling, false); perf_event__state_init(sibling); put_ctx(gctx); } @@ -7305,7 +7316,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) mutex_lock(&src_ctx->mutex); list_for_each_entry_safe(event, tmp, &src_ctx->event_list, event_entry) { - perf_remove_from_context(event); + perf_remove_from_context(event, false); unaccount_event_cpu(event, src_cpu); put_ctx(src_ctx); list_add(&event->migrate_entry, &events); @@ -7367,13 +7378,7 @@ __perf_event_exit_task(struct perf_event *child_event, struct perf_event_context *child_ctx, struct task_struct *child) { - if (child_event->parent) { - raw_spin_lock_irq(&child_ctx->lock); - perf_group_detach(child_event); - raw_spin_unlock_irq(&child_ctx->lock); - } - - perf_remove_from_context(child_event); + perf_remove_from_context(child_event, !!child_event->parent); /* * It can happen that the parent exits first, and has events @@ -7857,14 +7862,14 @@ static void perf_pmu_rotate_stop(struct pmu *pmu) static void __perf_event_exit_context(void *__info) { + struct remove_event re = { .detach_group = false }; struct perf_event_context *ctx = __info; - struct perf_event *event; perf_pmu_rotate_stop(ctx->pmu); rcu_read_lock(); - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) - __perf_remove_from_context(event); + list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) + __perf_remove_from_context(&re); rcu_read_unlock(); } -- cgit From ffb4ef21ac4308c2e738e6f83b6741bbc9b4fa3b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 5 May 2014 19:12:20 +0200 Subject: perf: Fix perf_event_init_context() perf_pin_task_context() can return NULL but perf_event_init_context() assumes it will not, correct this. 
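The resulting flow is simply an early bail-out when the parent has no context for this ctxn slot (sketch of the fixed path; the actual two-line change is in the diff below):

	parent_ctx = perf_pin_task_context(parent, ctxn);
	if (!parent_ctx)
		return 0;	/* nothing to inherit from */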
Reported-by: Vince Weaver Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/20140505171428.GU26782@laptop.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index ea899e2b5593..71232844f235 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7729,6 +7729,8 @@ int perf_event_init_context(struct task_struct *child, int ctxn) * swapped under us. */ parent_ctx = perf_pin_task_context(parent, ctxn); + if (!parent_ctx) + return 0; /* * No need to check if parent_ctx != NULL here; since we saw -- cgit From 2d513868e2a33e1d5315490ef4c861ee65babd65 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 2 May 2014 23:26:24 +0200 Subject: sched: Sanitize irq accounting madness Russell reported that irqtime_account_idle_ticks() takes ages due to: for (i = 0; i < ticks; i++) irqtime_account_process_tick(current, 0, rq); It's sad that this code was written way _AFTER_ the NOHZ idle functionality was available. I charge myself guilty for not paying attention when that crap got merged with commit abb74cefa ("sched: Export ns irqtimes through /proc/stat"). So instead of looping nr_ticks times just apply the whole thing at once. As a side note: The whole cputime_t vs. u64 business in that context wants to be cleaned up as well. There is no point in having all these back and forth conversions. Let's standardise on u64 nsec for all kernel internal accounting and be done with it. Everything else does not make sense at all for fine grained accounting. Frederic, can you please take care of that? Reported-by: Russell King Signed-off-by: Thomas Gleixner Reviewed-by: Paul E. McKenney Signed-off-by: Peter Zijlstra Cc: Venkatesh Pallipadi Cc: Shaun Ruffell Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1405022307000.6261@ionos.tec.linutronix.de Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 32 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index a95097cb4591..72fdf06ef865 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -332,50 +332,50 @@ out: * softirq as those do not count in task exec_runtime any more. */ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, - struct rq *rq) + struct rq *rq, int ticks) { - cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + cputime_t scaled = cputime_to_scaled(cputime_one_jiffy); + u64 cputime = (__force u64) cputime_one_jiffy; u64 *cpustat = kcpustat_this_cpu->cpustat; if (steal_account_process_tick()) return; + cputime *= ticks; + scaled *= ticks; + if (irqtime_account_hi_update()) { - cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; + cpustat[CPUTIME_IRQ] += cputime; } else if (irqtime_account_si_update()) { - cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; + cpustat[CPUTIME_SOFTIRQ] += cputime; } else if (this_cpu_ksoftirqd() == p) { /* * ksoftirqd time do not get accounted in cpu_softirq_time. * So, we have to handle it separately here. * Also, p->stime needs to be updated for ksoftirqd. 
 */ - __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, - CPUTIME_SOFTIRQ); + __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ); } else if (user_tick) { - account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); + account_user_time(p, cputime, scaled); } else if (p == rq->idle) { - account_idle_time(cputime_one_jiffy); + account_idle_time(cputime); } else if (p->flags & PF_VCPU) { /* System time or guest time */ - account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); + account_guest_time(p, cputime, scaled); } else { - __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, - CPUTIME_SYSTEM); + __account_system_time(p, cputime, scaled, CPUTIME_SYSTEM); } } static void irqtime_account_idle_ticks(int ticks) { - int i; struct rq *rq = this_rq(); - for (i = 0; i < ticks; i++) - irqtime_account_process_tick(current, 0, rq); + irqtime_account_process_tick(current, 0, rq, ticks); } #else /* CONFIG_IRQ_TIME_ACCOUNTING */ static inline void irqtime_account_idle_ticks(int ticks) {} static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, - struct rq *rq) {} + struct rq *rq, int nr_ticks) {} #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ /* @@ -464,7 +464,7 @@ void account_process_tick(struct task_struct *p, int user_tick) return; if (sched_clock_irqtime) { - irqtime_account_process_tick(p, user_tick, rq); + irqtime_account_process_tick(p, user_tick, rq, 1); return; } -- cgit From 5bfd126e80dca70431aef8fdbc1cf14535f3c338 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 15 Apr 2014 13:49:04 +0200 Subject: sched/deadline: Fix sched_yield() behavior yield_task_dl() is broken: o it forces current to be throttled, setting its runtime to zero; o it sets current's dl_se->dl_new to one, expecting that dl_task_timer() will queue it back with proper parameters at replenish time. Unfortunately, dl_task_timer() has this check at the very beginning: if (!dl_task(p) || dl_se->dl_new) goto unlock; So, it just bails out and the task is never replenished; it effectively yields forever. To fix this, introduce a new flag indicating that the task properly yielded the CPU before its current runtime expired. While this is a little more than needed at the moment, the flag would be useful in the future to discriminate between "good" jobs (of which remaining runtime could be reclaimed, i.e. recycled) and "bad" jobs (for which dl_throttled has been set) that needed to be stopped. Reported-by: yjay.kim Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140429103953.e68eba1b2ac3309214e3dc5a@gmail.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 7 +++++-- kernel/sched/core.c | 1 + kernel/sched/deadline.c | 5 +++-- 3 files changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 25f54c79f757..2a4298fb0d85 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1153,9 +1153,12 @@ struct sched_dl_entity { * * @dl_boosted tells if we are boosted due to DI. If so we are * outside bandwidth enforcement mechanism (but only until we - * exit the critical section). + * exit the critical section); + * + * @dl_yielded tells if task gave up the cpu before consuming + * all its available runtime during the last job. */ - int dl_throttled, dl_new, dl_boosted; + int dl_throttled, dl_new, dl_boosted, dl_yielded; /* * Bandwidth enforcement timer. 
Each -deadline task has its diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9fe2190005cb..e62c65a12d5b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3124,6 +3124,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr) dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); dl_se->dl_throttled = 0; dl_se->dl_new = 1; + dl_se->dl_yielded = 0; } static void __setscheduler_params(struct task_struct *p, diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index b08095786cb8..800e99b99075 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -528,6 +528,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) sched_clock_tick(); update_rq_clock(rq); dl_se->dl_throttled = 0; + dl_se->dl_yielded = 0; if (p->on_rq) { enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); if (task_has_dl_policy(rq->curr)) @@ -893,10 +894,10 @@ static void yield_task_dl(struct rq *rq) * We make the task go to sleep until its current deadline by * forcing its runtime to zero. This way, update_curr_dl() stops * it and the bandwidth timer will wake it up and will give it - * new scheduling parameters (thanks to dl_new=1). + * new scheduling parameters (thanks to dl_yielded=1). */ if (p->dl.runtime > 0) { - rq->curr->dl.dl_new = 1; + rq->curr->dl.dl_yielded = 1; p->dl.runtime = 0; } update_curr_dl(rq); -- cgit From 6a7cd273dc4bc3246f37ebe874754a54ccb29141 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 17 Apr 2014 10:05:02 +0800 Subject: sched/deadline: Fix memory leak Free cpudl->free_cpus allocated in cpudl_init(). Signed-off-by: Li Zefan Acked-by: Juri Lelli Signed-off-by: Peter Zijlstra Cc: stable@vger.kernel.org # 3.14+ Link: http://lkml.kernel.org/r/534F36CE.2000409@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/cpudeadline.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 5b9bb42b2d47..ab001b5d5048 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -210,7 +210,5 @@ int cpudl_init(struct cpudl *cp) */ void cpudl_cleanup(struct cpudl *cp) { - /* - * nothing to do for the moment - */ + free_cpumask_var(cp->free_cpus); } -- cgit From 6227cb00cc120f9a43ce8313bb0475ddabcb7d01 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Sun, 13 Apr 2014 09:34:53 -0400 Subject: sched: Use CPUPRI_NR_PRIORITIES instead of MAX_RT_PRIO in cpupri check The check at the beginning of cpupri_find() makes sure that the task_pri variable does not exceed the cp->pri_to_cpu array length. But that length is CPUPRI_NR_PRIORITIES, not MAX_RT_PRIO, so it will miss the last two priorities in that array. As task_pri is computed by convert_prio(), which should never return a value as large as CPUPRI_NR_PRIORITIES, turn the check into a BUG_ON() so that we panic if it is ever hit. 
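For reference, the sizing that makes the old check wrong (as defined in kernel/sched/cpupri.h at the time; reproduced from memory, so treat it as illustrative):

	#define CPUPRI_NR_PRIORITIES	(MAX_RT_PRIO + 2)

so cp->pri_to_cpu[] has two more entries than MAX_RT_PRIO, and the old early return made cpupri_find() bail out for the two highest convert_prio() values instead of scanning them.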
Reported-by: Mike Galbraith Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1397015410.5212.13.camel@marge.simpson.net Signed-off-by: Ingo Molnar --- kernel/sched/cpupri.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 8b836b376d91..3031bac8aa3e 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -70,8 +70,7 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, int idx = 0; int task_pri = convert_prio(p->prio); - if (task_pri >= MAX_RT_PRIO) - return 0; + BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES); for (idx = 0; idx < task_pri; idx++) { struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; -- cgit From 6ccdc84b81a0a6c09a7f0427761d2f8cecfc2218 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Apr 2014 12:00:47 +0200 Subject: sched: Skip double execution of pick_next_task_fair() Tim wrote: "The current code will call pick_next_task_fair a second time in the slow path if we did not pull any task in our first try. This is really unnecessary as we already know no task can be pulled and it doubles the delay for the cpu to enter idle. We instrumented some network workloads and saw that pick_next_task_fair is frequently called twice before a cpu enters idle. The call to pick_next_task_fair can add non trivial latency as it calls load_balance which runs find_busiest_group on an hierarchy of sched domains spanning the cpus for a large system. For some 4 socket systems, we saw almost 0.25 msec spent per call of pick_next_task_fair before a cpu can be idled." Optimize the second call away for the common case and document the dependency. Reported-by: Tim Chen Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Len Brown Link: http://lkml.kernel.org/r/20140424100047.GP11096@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e62c65a12d5b..28921ec91b3d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2592,8 +2592,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev) if (likely(prev->sched_class == class && rq->nr_running == rq->cfs.h_nr_running)) { p = fair_sched_class.pick_next_task(rq, prev); - if (likely(p && p != RETRY_TASK)) - return p; + if (unlikely(p == RETRY_TASK)) + goto again; + + /* assumes fair_sched_class->next == idle_sched_class */ + if (unlikely(!p)) + p = idle_sched_class.pick_next_task(rq, prev); + + return p; } again: -- cgit From 0e5b5337f0da073e1f17aec3c322ea7826975d0d Mon Sep 17 00:00:00 2001 From: Jason Low Date: Mon, 28 Apr 2014 15:45:54 -0700 Subject: sched: Fix updating rq->max_idle_balance_cost and rq->next_balance in idle_balance() The following commit: e5fc66119ec9 ("sched: Fix race in idle_balance()") can potentially cause rq->max_idle_balance_cost to not be updated, even when load_balance(NEWLY_IDLE) is attempted and the per-sd max cost value is updated. Preeti noticed a similar issue with updating rq->next_balance. In this patch, we fix this by making sure we still check/update those values even if a task gets enqueued while browsing the domains. 
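The shape of the fix is to update the statistics unconditionally while the rq lock is held, and only then apply the "pretend we pulled a task" logic (simplified sketch; the full diff follows):

	raw_spin_lock(&this_rq->lock);

	if (curr_cost > this_rq->max_idle_balance_cost)
		this_rq->max_idle_balance_cost = curr_cost;

	/* a task may have been enqueued while the lock was dropped */
	if (this_rq->cfs.h_nr_running && !pulled_task)
		pulled_task = 1;

	if (pulled_task || time_after(jiffies, this_rq->next_balance))
		this_rq->next_balance = next_balance;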
Signed-off-by: Jason Low Reviewed-by: Preeti U Murthy Signed-off-by: Peter Zijlstra Cc: morten.rasmussen@arm.com Cc: aswin@hp.com Cc: daniel.lezcano@linaro.org Cc: alex.shi@linaro.org Cc: efault@gmx.de Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1398725155-7591-2-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7570dd969c28..0fdb96de81a5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6653,6 +6653,7 @@ static int idle_balance(struct rq *this_rq) int this_cpu = this_rq->cpu; idle_enter_fair(this_rq); + /* * We must set idle_stamp _before_ calling idle_balance(), such that we * measure the duration of idle_balance() as idle time. @@ -6705,14 +6706,16 @@ static int idle_balance(struct rq *this_rq) raw_spin_lock(&this_rq->lock); + if (curr_cost > this_rq->max_idle_balance_cost) + this_rq->max_idle_balance_cost = curr_cost; + /* - * While browsing the domains, we released the rq lock. - * A task could have be enqueued in the meantime + * While browsing the domains, we released the rq lock, a task could + * have been enqueued in the meantime. Since we're not going idle, + * pretend we pulled a task. */ - if (this_rq->cfs.h_nr_running && !pulled_task) { + if (this_rq->cfs.h_nr_running && !pulled_task) pulled_task = 1; - goto out; - } if (pulled_task || time_after(jiffies, this_rq->next_balance)) { /* @@ -6722,9 +6725,6 @@ static int idle_balance(struct rq *this_rq) this_rq->next_balance = next_balance; } - if (curr_cost > this_rq->max_idle_balance_cost) - this_rq->max_idle_balance_cost = curr_cost; - out: /* Is there a task of a high priority class? */ if (this_rq->nr_running != this_rq->cfs.h_nr_running && -- cgit From 2b4cfe64dee0d84506b951d81bf55d9891744d25 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Wed, 23 Apr 2014 18:30:34 -0700 Subject: sched/numa: Initialize newidle balance stats in sd_numa_init() Also initialize the per-sd variables for newidle load balancing in sd_numa_init(). Signed-off-by: Jason Low Acked-by: morten.rasmussen@arm.com Cc: daniel.lezcano@linaro.org Cc: alex.shi@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: efault@gmx.de Cc: vincent.guittot@linaro.org Cc: aswin@hp.com Cc: chegu_vinod@hp.com Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1398303035-18255-3-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 28921ec91b3d..13584f1cccfc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6026,6 +6026,8 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu) , .last_balance = jiffies, .balance_interval = sd_weight, + .max_newidle_lb_cost = 0, + .next_decay_max_lb_cost = jiffies, }; SD_INIT_NAME(sd, NUMA); sd->private = &tl->data; -- cgit From 792568ec6a31ca560ca4d528782cbc6cd2cea8b0 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 11 Apr 2014 13:00:27 -0400 Subject: sched/numa: Count pages on active node as local The NUMA code is smart enough to distribute the memory of workloads that span multiple NUMA nodes across those NUMA nodes. However, it still has a pretty high scan rate for such workloads, because any memory that is left on a node other than the node of the CPU that faulted on the memory is counted as non-local, which causes the scan rate to go up. 
Counting the memory on any node where the task's numa group is actively running as local allows the scan rate to slow down once the application has settled in. This should reduce the overhead of the automatic NUMA placement code when a workload spans multiple NUMA nodes. Signed-off-by: Rik van Riel Tested-by: Vinod Chegu Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1397235629-16328-2-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5d859ec975c2..f6457b63c95c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1738,6 +1738,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) struct task_struct *p = current; bool migrated = flags & TNF_MIGRATED; int cpu_node = task_node(current); + int local = !!(flags & TNF_FAULT_LOCAL); int priv; if (!numabalancing_enabled) @@ -1786,6 +1787,17 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) task_numa_group(p, last_cpupid, flags, &priv); } + /* + * If a workload spans multiple NUMA nodes, a shared fault that + * occurs wholly within the set of nodes that the workload is + * actively using should be counted as local. This allows the + * scan rate to slow down when a workload has settled down. + */ + if (!priv && !local && p->numa_group && + node_isset(cpu_node, p->numa_group->active_nodes) && + node_isset(mem_node, p->numa_group->active_nodes)) + local = 1; + task_numa_placement(p); /* @@ -1800,7 +1812,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; - p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; + p->numa_faults_locality[local] += pages; } static void reset_ptenuma_scan(struct task_struct *p) -- cgit From 5085e2a328849bdee6650b32d52c87c3788ab01c Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 11 Apr 2014 13:00:28 -0400 Subject: sched/numa: Retry placement more frequently when misplaced When tasks have not converged on their preferred nodes yet, we want to retry fairly often, to make sure we do not migrate a task's memory to an undesirable location, only to have to move it again later. This patch reduces the interval at which migration is retried when the task's numa_scan_period is small. Signed-off-by: Rik van Riel Tested-by: Vinod Chegu Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1397235629-16328-3-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f6457b63c95c..ecea8d9f957c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1326,12 +1326,15 @@ static int task_numa_migrate(struct task_struct *p) /* Attempt to migrate a task to a CPU on the preferred node. 
 */ static void numa_migrate_preferred(struct task_struct *p) { + unsigned long interval = HZ; + /* This task has no NUMA fault statistics yet */ if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) return; /* Periodically retry migrating the task to the preferred node */ - p->numa_migrate_retry = jiffies + HZ; + interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); + p->numa_migrate_retry = jiffies + interval; /* Success if task is already running on preferred CPU */ if (task_node(p) == p->numa_preferred_nid) -- cgit From 68d1b02a58f5d9f584c1fb2923ed60ec68cbbd9b Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 11 Apr 2014 13:00:29 -0400 Subject: sched/numa: Do not set preferred_node on migration to a second choice node Setting the numa_preferred_node for a task in task_numa_migrate does nothing on a 2-node system. Either we migrate to the node that already was our preferred node, or we stay where we were. On a 4-node system, it can slightly decrease overhead by not calling the NUMA code as much. Since every node tends to be directly connected to every other node, running on the wrong node for a while does not do much damage. However, on an 8-node system, there are far more bad nodes than there are good ones, and pretending that a second choice is actually the preferred node can greatly delay, or even prevent, a workload from converging. The only time we can safely pretend that a second choice node is the preferred node is when the task is part of a workload that spans multiple NUMA nodes. Signed-off-by: Rik van Riel Tested-by: Vinod Chegu Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1397235629-16328-4-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ecea8d9f957c..051903f33eec 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1301,7 +1301,16 @@ static int task_numa_migrate(struct task_struct *p) if (env.best_cpu == -1) return -EAGAIN; - sched_setnuma(p, env.dst_nid); + /* + * If the task is part of a workload that spans multiple NUMA nodes, + * and is migrating into one of the workload's active nodes, remember + * this node as the task's preferred numa node, so the workload can + * settle down. + * A task that migrated to a second choice node will be better off + * trying for a better one later. Do not set the preferred node here. + */ + if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes)) + sched_setnuma(p, env.dst_nid); /* * Reset the scan period if the task is being rescheduled on an -- cgit From 143e1e28cb40bed836b0a06567208bd7347c9672 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 11 Apr 2014 11:44:37 +0200 Subject: sched: Rework sched_domain topology definition We replace the old way to configure the scheduler topology with a new method which enables a platform to declare additional levels (if needed). We still have a default topology table definition that can be used by platforms that don't want more levels than the SMT, MC, CPU and NUMA ones. This table can be overwritten by an arch which either wants to add new levels where load balancing makes sense, like a BOOK or power-gating level, or wants to change the flags configuration of some levels. 
For each level, we need a function pointer that returns the cpumask for each cpu, a function pointer that returns the flags for the level, and a name. Only flags that describe the topology can be set by an architecture. The current topology flags are: SD_SHARE_CPUPOWER SD_SHARE_PKG_RESOURCES SD_NUMA SD_ASYM_PACKING Then, each level must be a subset of the next one. The build sequence of the sched_domain will take care of removing useless levels like those with 1 CPU and those with the same CPU span and no more relevant load-balancing information than their children. Signed-off-by: Vincent Guittot Tested-by: Dietmar Eggemann Reviewed-by: Preeti U Murthy Reviewed-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra Cc: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Bjorn Helgaas Cc: Chris Metcalf Cc: Christoph Lameter Cc: David S. Miller Cc: Fenghua Yu Cc: Greg Kroah-Hartman Cc: Hanjun Guo Cc: Heiko Carstens Cc: Jason Low Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Tony Luck Cc: linux390@de.ibm.com Cc: linux-ia64@vger.kernel.org Cc: linux-s390@vger.kernel.org Link: http://lkml.kernel.org/r/1397209481-28542-2-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- arch/ia64/include/asm/topology.h | 24 ---- arch/s390/include/asm/topology.h | 2 - arch/tile/include/asm/topology.h | 33 ------ include/linux/sched.h | 53 +++++++++ include/linux/topology.h | 128 +++------------------ kernel/sched/core.c | 233 ++++++++++++++++++++------------------- 6 files changed, 186 insertions(+), 287 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h index 5cb55a1e606b..3202aa74e0d6 100644 --- a/arch/ia64/include/asm/topology.h +++ b/arch/ia64/include/asm/topology.h @@ -46,30 +46,6 @@ void build_cpu_to_node_map(void); -#define SD_CPU_INIT (struct sched_domain) { \ - .parent = NULL, \ - .child = NULL, \ - .groups = NULL, \ - .min_interval = 1, \ - .max_interval = 4, \ - .busy_factor = 64, \ - .imbalance_pct = 125, \ - .cache_nice_tries = 2, \ - .busy_idx = 2, \ - .idle_idx = 1, \ - .newidle_idx = 0, \ - .wake_idx = 0, \ - .forkexec_idx = 0, \ - .flags = SD_LOAD_BALANCE \ - | SD_BALANCE_NEWIDLE \ - | SD_BALANCE_EXEC \ - | SD_BALANCE_FORK \ - | SD_WAKE_AFFINE, \ - .last_balance = jiffies, \ - .balance_interval = 1, \ - .nr_balance_failed = 0, \ -} - #endif /* CONFIG_NUMA */ #ifdef CONFIG_SMP diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h index 05425b18c0aa..07763bdb408d 100644 --- a/arch/s390/include/asm/topology.h +++ b/arch/s390/include/asm/topology.h @@ -64,8 +64,6 @@ static inline void s390_init_cpu_topology(void) }; #endif -#define SD_BOOK_INIT SD_CPU_INIT - #include #endif /* _ASM_S390_TOPOLOGY_H */ diff --git a/arch/tile/include/asm/topology.h b/arch/tile/include/asm/topology.h index d15c0d8d550f..938311844233 100644 --- a/arch/tile/include/asm/topology.h +++ b/arch/tile/include/asm/topology.h @@ -44,39 +44,6 @@ static inline const struct cpumask *cpumask_of_node(int node) /* For now, use numa node -1 for global allocation. */ #define pcibus_to_node(bus) ((void)(bus), -1) -/* - * TILE architecture has many cores integrated in one processor, so we need - * setup bigger balance_interval for both CPU/NODE scheduling domains to - * reduce process scheduling costs.
- */ - -/* sched_domains SD_CPU_INIT for TILE architecture */ -#define SD_CPU_INIT (struct sched_domain) { \ - .min_interval = 4, \ - .max_interval = 128, \ - .busy_factor = 64, \ - .imbalance_pct = 125, \ - .cache_nice_tries = 1, \ - .busy_idx = 2, \ - .idle_idx = 1, \ - .newidle_idx = 0, \ - .wake_idx = 0, \ - .forkexec_idx = 0, \ - \ - .flags = 1*SD_LOAD_BALANCE \ - | 1*SD_BALANCE_NEWIDLE \ - | 1*SD_BALANCE_EXEC \ - | 1*SD_BALANCE_FORK \ - | 0*SD_BALANCE_WAKE \ - | 0*SD_WAKE_AFFINE \ - | 0*SD_SHARE_CPUPOWER \ - | 0*SD_SHARE_PKG_RESOURCES \ - | 0*SD_SERIALIZE \ - , \ - .last_balance = jiffies, \ - .balance_interval = 32, \ -} - /* By definition, we create nodes based on online memory. */ #define node_has_online_mem(nid) 1 diff --git a/include/linux/sched.h b/include/linux/sched.h index 2a4298fb0d85..656b035c30e5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -879,6 +879,27 @@ enum cpu_idle_type { extern int __weak arch_sd_sibiling_asym_packing(void); +#ifdef CONFIG_SCHED_SMT +static inline const int cpu_smt_flags(void) +{ + return SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES; +} +#endif + +#ifdef CONFIG_SCHED_MC +static inline const int cpu_core_flags(void) +{ + return SD_SHARE_PKG_RESOURCES; +} +#endif + +#ifdef CONFIG_NUMA +static inline const int cpu_numa_flags(void) +{ + return SD_NUMA; +} +#endif + struct sched_domain_attr { int relax_domain_level; }; @@ -985,6 +1006,38 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms); bool cpus_share_cache(int this_cpu, int that_cpu); +typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); +typedef const int (*sched_domain_flags_f)(void); + +#define SDTL_OVERLAP 0x01 + +struct sd_data { + struct sched_domain **__percpu sd; + struct sched_group **__percpu sg; + struct sched_group_power **__percpu sgp; +}; + +struct sched_domain_topology_level { + sched_domain_mask_f mask; + sched_domain_flags_f sd_flags; + int flags; + int numa_level; + struct sd_data data; +#ifdef CONFIG_SCHED_DEBUG + char *name; +#endif +}; + +extern struct sched_domain_topology_level *sched_domain_topology; + +extern void set_sched_topology(struct sched_domain_topology_level *tl); + +#ifdef CONFIG_SCHED_DEBUG +# define SD_INIT_NAME(type) .name = #type +#else +# define SD_INIT_NAME(type) +#endif + #else /* CONFIG_SMP */ struct sched_domain_attr; diff --git a/include/linux/topology.h b/include/linux/topology.h index 7062330a1329..973671ff9e7d 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -66,121 +66,6 @@ int arch_update_cpu_topology(void); #define PENALTY_FOR_NODE_WITH_CPUS (1) #endif -/* - * Below are the 3 major initializers used in building sched_domains: - * SD_SIBLING_INIT, for SMT domains - * SD_CPU_INIT, for SMP domains - * - * Any architecture that cares to do any tuning to these values should do so - * by defining their own arch-specific initializer in include/asm/topology.h. - * A definition there will automagically override these default initializers - * and allow arch-specific performance tuning of sched_domains. - * (Only non-zero and non-null fields need be specified.) - */ - -#ifdef CONFIG_SCHED_SMT -/* MCD - Do we really need this? It is always on if CONFIG_SCHED_SMT is, - * so can't we drop this in favor of CONFIG_SCHED_SMT? 
- */ -#define ARCH_HAS_SCHED_WAKE_IDLE -/* Common values for SMT siblings */ -#ifndef SD_SIBLING_INIT -#define SD_SIBLING_INIT (struct sched_domain) { \ - .min_interval = 1, \ - .max_interval = 2, \ - .busy_factor = 64, \ - .imbalance_pct = 110, \ - \ - .flags = 1*SD_LOAD_BALANCE \ - | 1*SD_BALANCE_NEWIDLE \ - | 1*SD_BALANCE_EXEC \ - | 1*SD_BALANCE_FORK \ - | 0*SD_BALANCE_WAKE \ - | 1*SD_WAKE_AFFINE \ - | 1*SD_SHARE_CPUPOWER \ - | 1*SD_SHARE_PKG_RESOURCES \ - | 0*SD_SERIALIZE \ - | 0*SD_PREFER_SIBLING \ - | arch_sd_sibling_asym_packing() \ - , \ - .last_balance = jiffies, \ - .balance_interval = 1, \ - .smt_gain = 1178, /* 15% */ \ - .max_newidle_lb_cost = 0, \ - .next_decay_max_lb_cost = jiffies, \ -} -#endif -#endif /* CONFIG_SCHED_SMT */ - -#ifdef CONFIG_SCHED_MC -/* Common values for MC siblings. for now mostly derived from SD_CPU_INIT */ -#ifndef SD_MC_INIT -#define SD_MC_INIT (struct sched_domain) { \ - .min_interval = 1, \ - .max_interval = 4, \ - .busy_factor = 64, \ - .imbalance_pct = 125, \ - .cache_nice_tries = 1, \ - .busy_idx = 2, \ - .wake_idx = 0, \ - .forkexec_idx = 0, \ - \ - .flags = 1*SD_LOAD_BALANCE \ - | 1*SD_BALANCE_NEWIDLE \ - | 1*SD_BALANCE_EXEC \ - | 1*SD_BALANCE_FORK \ - | 0*SD_BALANCE_WAKE \ - | 1*SD_WAKE_AFFINE \ - | 0*SD_SHARE_CPUPOWER \ - | 1*SD_SHARE_PKG_RESOURCES \ - | 0*SD_SERIALIZE \ - , \ - .last_balance = jiffies, \ - .balance_interval = 1, \ - .max_newidle_lb_cost = 0, \ - .next_decay_max_lb_cost = jiffies, \ -} -#endif -#endif /* CONFIG_SCHED_MC */ - -/* Common values for CPUs */ -#ifndef SD_CPU_INIT -#define SD_CPU_INIT (struct sched_domain) { \ - .min_interval = 1, \ - .max_interval = 4, \ - .busy_factor = 64, \ - .imbalance_pct = 125, \ - .cache_nice_tries = 1, \ - .busy_idx = 2, \ - .idle_idx = 1, \ - .newidle_idx = 0, \ - .wake_idx = 0, \ - .forkexec_idx = 0, \ - \ - .flags = 1*SD_LOAD_BALANCE \ - | 1*SD_BALANCE_NEWIDLE \ - | 1*SD_BALANCE_EXEC \ - | 1*SD_BALANCE_FORK \ - | 0*SD_BALANCE_WAKE \ - | 1*SD_WAKE_AFFINE \ - | 0*SD_SHARE_CPUPOWER \ - | 0*SD_SHARE_PKG_RESOURCES \ - | 0*SD_SERIALIZE \ - | 1*SD_PREFER_SIBLING \ - , \ - .last_balance = jiffies, \ - .balance_interval = 1, \ - .max_newidle_lb_cost = 0, \ - .next_decay_max_lb_cost = jiffies, \ -} -#endif - -#ifdef CONFIG_SCHED_BOOK -#ifndef SD_BOOK_INIT -#error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!! 
-#endif -#endif /* CONFIG_SCHED_BOOK */ - #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DECLARE_PER_CPU(int, numa_node); @@ -295,4 +180,17 @@ static inline int cpu_to_mem(int cpu) #define topology_core_cpumask(cpu) cpumask_of(cpu) #endif +#ifdef CONFIG_SCHED_SMT +static inline const struct cpumask *cpu_smt_mask(int cpu) +{ + return topology_thread_cpumask(cpu); +} +#endif + +static inline const struct cpumask *cpu_cpu_mask(int cpu) +{ + return cpumask_of_node(cpu_to_node(cpu)); +} + + #endif /* _LINUX_TOPOLOGY_H */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 13584f1cccfc..7d332b7899cc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5566,17 +5566,6 @@ static int __init isolated_cpu_setup(char *str) __setup("isolcpus=", isolated_cpu_setup); -static const struct cpumask *cpu_cpu_mask(int cpu) -{ - return cpumask_of_node(cpu_to_node(cpu)); -} - -struct sd_data { - struct sched_domain **__percpu sd; - struct sched_group **__percpu sg; - struct sched_group_power **__percpu sgp; -}; - struct s_data { struct sched_domain ** __percpu sd; struct root_domain *rd; @@ -5589,21 +5578,6 @@ enum s_alloc { sa_none, }; -struct sched_domain_topology_level; - -typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); -typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); - -#define SDTL_OVERLAP 0x01 - -struct sched_domain_topology_level { - sched_domain_init_f init; - sched_domain_mask_f mask; - int flags; - int numa_level; - struct sd_data data; -}; - /* * Build an iteration mask that can exclude certain CPUs from the upwards * domain traversal. @@ -5832,34 +5806,6 @@ int __weak arch_sd_sibling_asym_packing(void) * Non-inlined to reduce accumulated stack pressure in build_sched_domains() */ -#ifdef CONFIG_SCHED_DEBUG -# define SD_INIT_NAME(sd, type) sd->name = #type -#else -# define SD_INIT_NAME(sd, type) do { } while (0) -#endif - -#define SD_INIT_FUNC(type) \ -static noinline struct sched_domain * \ -sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ -{ \ - struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ - *sd = SD_##type##_INIT; \ - SD_INIT_NAME(sd, type); \ - sd->private = &tl->data; \ - return sd; \ -} - -SD_INIT_FUNC(CPU) -#ifdef CONFIG_SCHED_SMT - SD_INIT_FUNC(SIBLING) -#endif -#ifdef CONFIG_SCHED_MC - SD_INIT_FUNC(MC) -#endif -#ifdef CONFIG_SCHED_BOOK - SD_INIT_FUNC(BOOK) -#endif - static int default_relax_domain_level = -1; int sched_domain_level_max; @@ -5947,99 +5893,156 @@ static void claim_allocations(int cpu, struct sched_domain *sd) *per_cpu_ptr(sdd->sgp, cpu) = NULL; } -#ifdef CONFIG_SCHED_SMT -static const struct cpumask *cpu_smt_mask(int cpu) -{ - return topology_thread_cpumask(cpu); -} -#endif - -/* - * Topology list, bottom-up. 
- */ -static struct sched_domain_topology_level default_topology[] = { -#ifdef CONFIG_SCHED_SMT - { sd_init_SIBLING, cpu_smt_mask, }, -#endif -#ifdef CONFIG_SCHED_MC - { sd_init_MC, cpu_coregroup_mask, }, -#endif -#ifdef CONFIG_SCHED_BOOK - { sd_init_BOOK, cpu_book_mask, }, -#endif - { sd_init_CPU, cpu_cpu_mask, }, - { NULL, }, -}; - -static struct sched_domain_topology_level *sched_domain_topology = default_topology; - -#define for_each_sd_topology(tl) \ - for (tl = sched_domain_topology; tl->init; tl++) - #ifdef CONFIG_NUMA - static int sched_domains_numa_levels; static int *sched_domains_numa_distance; static struct cpumask ***sched_domains_numa_masks; static int sched_domains_curr_level; +#endif -static inline int sd_local_flags(int level) -{ - if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) - return 0; - - return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; -} +/* + * SD_flags allowed in topology descriptions. + * + * SD_SHARE_CPUPOWER - describes SMT topologies + * SD_SHARE_PKG_RESOURCES - describes shared caches + * SD_NUMA - describes NUMA topologies + * + * Odd one out: + * SD_ASYM_PACKING - describes SMT quirks + */ +#define TOPOLOGY_SD_FLAGS \ + (SD_SHARE_CPUPOWER | \ + SD_SHARE_PKG_RESOURCES | \ + SD_NUMA | \ + SD_ASYM_PACKING) static struct sched_domain * -sd_numa_init(struct sched_domain_topology_level *tl, int cpu) +sd_init(struct sched_domain_topology_level *tl, int cpu) { struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); - int level = tl->numa_level; - int sd_weight = cpumask_weight( - sched_domains_numa_masks[level][cpu_to_node(cpu)]); + int sd_weight, sd_flags = 0; + +#ifdef CONFIG_NUMA + /* + * Ugly hack to pass state to sd_numa_mask()... + */ + sched_domains_curr_level = tl->numa_level; +#endif + + sd_weight = cpumask_weight(tl->mask(cpu)); + + if (tl->sd_flags) + sd_flags = (*tl->sd_flags)(); + if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, + "wrong sd_flags in topology description\n")) + sd_flags &= ~TOPOLOGY_SD_FLAGS; *sd = (struct sched_domain){ .min_interval = sd_weight, .max_interval = 2*sd_weight, .busy_factor = 32, .imbalance_pct = 125, - .cache_nice_tries = 2, - .busy_idx = 3, - .idle_idx = 2, + + .cache_nice_tries = 0, + .busy_idx = 0, + .idle_idx = 0, .newidle_idx = 0, .wake_idx = 0, .forkexec_idx = 0, .flags = 1*SD_LOAD_BALANCE | 1*SD_BALANCE_NEWIDLE - | 0*SD_BALANCE_EXEC - | 0*SD_BALANCE_FORK + | 1*SD_BALANCE_EXEC + | 1*SD_BALANCE_FORK | 0*SD_BALANCE_WAKE - | 0*SD_WAKE_AFFINE + | 1*SD_WAKE_AFFINE | 0*SD_SHARE_CPUPOWER | 0*SD_SHARE_PKG_RESOURCES - | 1*SD_SERIALIZE + | 0*SD_SERIALIZE | 0*SD_PREFER_SIBLING - | 1*SD_NUMA - | sd_local_flags(level) + | 0*SD_NUMA + | sd_flags , + .last_balance = jiffies, .balance_interval = sd_weight, + .smt_gain = 0, .max_newidle_lb_cost = 0, .next_decay_max_lb_cost = jiffies, +#ifdef CONFIG_SCHED_DEBUG + .name = tl->name, +#endif }; - SD_INIT_NAME(sd, NUMA); - sd->private = &tl->data; /* - * Ugly hack to pass state to sd_numa_mask()... + * Convert topological properties into behaviour. 
*/ - sched_domains_curr_level = tl->numa_level; + + if (sd->flags & SD_SHARE_CPUPOWER) { + sd->imbalance_pct = 110; + sd->smt_gain = 1178; /* ~15% */ + sd->flags |= arch_sd_sibling_asym_packing(); + + } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { + sd->imbalance_pct = 117; + sd->cache_nice_tries = 1; + sd->busy_idx = 2; + +#ifdef CONFIG_NUMA + } else if (sd->flags & SD_NUMA) { + sd->cache_nice_tries = 2; + sd->busy_idx = 3; + sd->idle_idx = 2; + + sd->flags |= SD_SERIALIZE; + if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { + sd->flags &= ~(SD_BALANCE_EXEC | + SD_BALANCE_FORK | + SD_WAKE_AFFINE); + } + +#endif + } else { + sd->flags |= SD_PREFER_SIBLING; + sd->cache_nice_tries = 1; + sd->busy_idx = 2; + sd->idle_idx = 1; + } + + sd->private = &tl->data; return sd; } +/* + * Topology list, bottom-up. + */ +static struct sched_domain_topology_level default_topology[] = { +#ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, +#endif +#ifdef CONFIG_SCHED_MC + { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, +#endif +#ifdef CONFIG_SCHED_BOOK + { cpu_book_mask, SD_INIT_NAME(BOOK) }, +#endif + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + +struct sched_domain_topology_level *sched_domain_topology = default_topology; + +#define for_each_sd_topology(tl) \ + for (tl = sched_domain_topology; tl->mask; tl++) + +void set_sched_topology(struct sched_domain_topology_level *tl) +{ + sched_domain_topology = tl; +} + +#ifdef CONFIG_NUMA + static const struct cpumask *sd_numa_mask(int cpu) { return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; @@ -6183,7 +6186,10 @@ static void sched_init_numa(void) } } - tl = kzalloc((ARRAY_SIZE(default_topology) + level) * + /* Compute default topology size */ + for (i = 0; sched_domain_topology[i].mask; i++); + + tl = kzalloc((i + level) * sizeof(struct sched_domain_topology_level), GFP_KERNEL); if (!tl) return; @@ -6191,18 +6197,19 @@ static void sched_init_numa(void) /* * Copy the default topology bits.. */ - for (i = 0; default_topology[i].init; i++) - tl[i] = default_topology[i]; + for (i = 0; sched_domain_topology[i].mask; i++) + tl[i] = sched_domain_topology[i]; /* * .. and append 'j' levels of NUMA goodness. */ for (j = 0; j < level; i++, j++) { tl[i] = (struct sched_domain_topology_level){ - .init = sd_numa_init, .mask = sd_numa_mask, + .sd_flags = cpu_numa_flags, .flags = SDTL_OVERLAP, .numa_level = j, + SD_INIT_NAME(NUMA) }; } @@ -6360,7 +6367,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *child, int cpu) { - struct sched_domain *sd = tl->init(tl, cpu); + struct sched_domain *sd = sd_init(tl, cpu); if (!sd) return child; -- cgit From 2dfd747629e65f89d2dbceb92fffc763f66228b2 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 11 Apr 2014 11:44:38 +0200 Subject: sched, s390: Create a dedicated topology table BOOK level is only relevant for s390 so we create a dedicated topology table with BOOK level and remove it from default table. 
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Philipp Hachtmann Cc: cmetcalf@tilera.com Cc: benh@kernel.crashing.org Cc: dietmar.eggemann@arm.com Cc: preeti@linux.vnet.ibm.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: linux390@de.ibm.com Cc: linux-s390@vger.kernel.org Link: http://lkml.kernel.org/r/1397209481-28542-3-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- arch/s390/include/asm/topology.h | 11 +---------- arch/s390/kernel/topology.c | 20 ++++++++++++++++++++ kernel/sched/core.c | 3 --- 3 files changed, 21 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h index 07763bdb408d..56af53093d24 100644 --- a/arch/s390/include/asm/topology.h +++ b/arch/s390/include/asm/topology.h @@ -26,21 +26,12 @@ extern struct cpu_topology_s390 cpu_topology[NR_CPUS]; #define mc_capable() 1 -static inline const struct cpumask *cpu_coregroup_mask(int cpu) -{ - return &cpu_topology[cpu].core_mask; -} - -static inline const struct cpumask *cpu_book_mask(int cpu) -{ - return &cpu_topology[cpu].book_mask; -} - int topology_cpu_init(struct cpu *); int topology_set_cpu_management(int fc); void topology_schedule_update(void); void store_topology(struct sysinfo_15_1_x *info); void topology_expect_change(void); +const struct cpumask *cpu_coregroup_mask(int cpu); #else /* CONFIG_SCHED_BOOK */ diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 6298fed11ced..1860ad3cbc04 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -443,6 +443,23 @@ int topology_cpu_init(struct cpu *cpu) return sysfs_create_group(&cpu->dev.kobj, &topology_cpu_attr_group); } +const struct cpumask *cpu_coregroup_mask(int cpu) +{ + return &cpu_topology[cpu].core_mask; +} + +static const struct cpumask *cpu_book_mask(int cpu) +{ + return &cpu_topology[cpu].book_mask; +} + +static struct sched_domain_topology_level s390_topology[] = { + { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, + { cpu_book_mask, SD_INIT_NAME(BOOK) }, + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + static int __init topology_init(void) { if (!MACHINE_HAS_TOPOLOGY) { @@ -451,6 +468,9 @@ static int __init topology_init(void) } set_topology_timer(); out: + + set_sched_topology(s390_topology); + return device_create_file(cpu_subsys.dev_root, &dev_attr_dispatching); } device_initcall(topology_init); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7d332b7899cc..e59e5aec745a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6023,9 +6023,6 @@ static struct sched_domain_topology_level default_topology[] = { #endif #ifdef CONFIG_SCHED_MC { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, -#endif -#ifdef CONFIG_SCHED_BOOK - { cpu_book_mask, SD_INIT_NAME(BOOK) }, #endif { cpu_cpu_mask, SD_INIT_NAME(DIE) }, { NULL, }, -- cgit From 607b45e9a216e89a63351556e488eea06be0ff48 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 11 Apr 2014 11:44:39 +0200 Subject: sched, powerpc: Create a dedicated topology table Create a dedicated topology table for handling the asymmetric SMT feature of powerpc. Signed-off-by: Vincent Guittot Reviewed-by: Preeti U Murthy Signed-off-by: Peter Zijlstra Cc: Andy Fleming Cc: Anton Blanchard Cc: Benjamin Herrenschmidt Cc: Grant Likely Cc: Linus Torvalds Cc: Michael Ellerman Cc: Paul Gortmaker Cc: Paul Mackerras Cc: Preeti U. Murthy Cc: Rob Herring Cc: Srivatsa S.
Bhat Cc: Toshi Kani Cc: Vasant Hegde Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: schwidefsky@de.ibm.com Cc: cmetcalf@tilera.com Cc: dietmar.eggemann@arm.com Cc: devicetree@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/1397209481-28542-4-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/smp.c | 31 +++++++++++++++++++++++-------- include/linux/sched.h | 2 -- kernel/sched/core.c | 6 ------ 3 files changed, 23 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index e2a4232c5871..10ffffef0414 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -766,6 +766,28 @@ int setup_profiling_timer(unsigned int multiplier) return 0; } +#ifdef CONFIG_SCHED_SMT +/* cpumask of CPUs with asymetric SMT dependancy */ +static const int powerpc_smt_flags(void) +{ + int flags = SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES; + + if (cpu_has_feature(CPU_FTR_ASYM_SMT)) { + printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n"); + flags |= SD_ASYM_PACKING; + } + return flags; +} +#endif + +static struct sched_domain_topology_level powerpc_topology[] = { +#ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) }, +#endif + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + void __init smp_cpus_done(unsigned int max_cpus) { cpumask_var_t old_mask; @@ -790,15 +812,8 @@ void __init smp_cpus_done(unsigned int max_cpus) dump_numa_cpu_topology(); -} + set_sched_topology(powerpc_topology); -int arch_sd_sibling_asym_packing(void) -{ - if (cpu_has_feature(CPU_FTR_ASYM_SMT)) { - printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n"); - return SD_ASYM_PACKING; - } - return 0; } #ifdef CONFIG_HOTPLUG_CPU diff --git a/include/linux/sched.h b/include/linux/sched.h index 656b035c30e5..439a153b8403 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -877,8 +877,6 @@ enum cpu_idle_type { #define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ #define SD_NUMA 0x4000 /* cross-node balancing */ -extern int __weak arch_sd_sibiling_asym_packing(void); - #ifdef CONFIG_SCHED_SMT static inline const int cpu_smt_flags(void) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e59e5aec745a..7e348e238bf1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5796,11 +5796,6 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); } -int __weak arch_sd_sibling_asym_packing(void) -{ - return 0*SD_ASYM_PACKING; -} - /* * Initializers for schedule domains * Non-inlined to reduce accumulated stack pressure in build_sched_domains() @@ -5981,7 +5976,6 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) if (sd->flags & SD_SHARE_CPUPOWER) { sd->imbalance_pct = 110; sd->smt_gain = 1178; /* ~15% */ - sd->flags |= arch_sd_sibling_asym_packing(); } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { sd->imbalance_pct = 117; -- cgit From d77b3ed5c9f8ebedf154b52b5e943c461f3d37e6 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 11 Apr 2014 11:44:40 +0200 Subject: sched: Add a new SD_SHARE_POWERDOMAIN for sched_domain A new flag SD_SHARE_POWERDOMAIN is created to reflect whether groups of CPUs in a sched_domain level can or cannot reach different power states. As an example, the flag should be cleared at CPU level if groups of cores can be power gated independently.
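As a sketch (hypothetical platform and function name), an arch whose cores can only be power gated together with the rest of their package could advertise the shared power domain at the MC level through its topology flags function:

        static const int my_arch_core_flags(void)
        {
                /* assumption: cores share both a cache and a single power domain */
                return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN;
        }

At the DIE level the flag would be left clear, since separate packages can usually reach power states independently.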
This information can be used in the load balance decision or to add a load balancing level between groups of CPUs that can power gate independently. This flag is part of the topology flags that can be set by the arch. Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: schwidefsky@de.ibm.com Cc: cmetcalf@tilera.com Cc: benh@kernel.crashing.org Cc: preeti@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1397209481-28542-5-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched/core.c | 10 +++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 439a153b8403..accb66bfd722 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -870,6 +870,7 @@ enum cpu_idle_type { #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ #define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */ +#define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */ #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ #define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7e348e238bf1..1c9c3b7b26af 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5261,7 +5261,8 @@ static int sd_degenerate(struct sched_domain *sd) SD_BALANCE_FORK | SD_BALANCE_EXEC | SD_SHARE_CPUPOWER | - SD_SHARE_PKG_RESOURCES)) { + SD_SHARE_PKG_RESOURCES | + SD_SHARE_POWERDOMAIN)) { if (sd->groups != sd->groups->next) return 0; } @@ -5292,7 +5293,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_BALANCE_EXEC | SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES | - SD_PREFER_SIBLING); + SD_PREFER_SIBLING | + SD_SHARE_POWERDOMAIN); if (nr_node_ids == 1) pflags &= ~SD_SERIALIZE; } @@ -5901,6 +5903,7 @@ static int sched_domains_curr_level; * SD_SHARE_CPUPOWER - describes SMT topologies * SD_SHARE_PKG_RESOURCES - describes shared caches * SD_NUMA - describes NUMA topologies + * SD_SHARE_POWERDOMAIN - describes shared power domain * * Odd one out: * SD_ASYM_PACKING - describes SMT quirks @@ -5909,7 +5912,8 @@ static int sched_domains_curr_level; (SD_SHARE_CPUPOWER | \ SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ - SD_ASYM_PACKING) + SD_ASYM_PACKING | \ + SD_SHARE_POWERDOMAIN) static struct sched_domain * sd_init(struct sched_domain_topology_level *tl, int cpu) -- cgit From 39a4d9ca77a31503c6317e49742341d0859d5cb2 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Wed, 23 Apr 2014 18:30:35 -0700 Subject: sched/fair: Stop searching for tasks in newidle balance if there are runnable tasks It was found that when running some workloads (such as AIM7) on large systems with many cores, CPUs do not remain idle for long. Thus, tasks can wake/get enqueued while doing idle balancing. In this patch, while traversing the domains in idle balance, in addition to checking for pulled_task, we add an extra check for this_rq->nr_running for determining if we should stop searching for tasks to pull. If there are runnable tasks on this rq, then we will stop traversing the domains. This reduces the chance that idle balance delays a task from running.
This patch resulted in approximately a 6% performance improvement when running a Java Server workload on an 8 socket machine. Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: daniel.lezcano@linaro.org Cc: alex.shi@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: efault@gmx.de Cc: vincent.guittot@linaro.org Cc: morten.rasmussen@arm.com Cc: aswin@hp.com Cc: chegu_vinod@hp.com Link: http://lkml.kernel.org/r/1398303035-18255-4-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 051903f33eec..28ccf502c63c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6713,7 +6713,6 @@ static int idle_balance(struct rq *this_rq) if (sd->flags & SD_BALANCE_NEWIDLE) { t0 = sched_clock_cpu(this_cpu); - /* If we've pulled tasks over stop searching: */ pulled_task = load_balance(this_cpu, this_rq, sd, CPU_NEWLY_IDLE, &continue_balancing); @@ -6728,7 +6727,12 @@ static int idle_balance(struct rq *this_rq) interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) next_balance = sd->last_balance + interval; - if (pulled_task) + + /* + * Stop searching for tasks to pull if there are + * now runnable tasks on this rq. + */ + if (pulled_task || this_rq->nr_running > 0) break; } rcu_read_unlock(); -- cgit From 1f4ee5038f0c1ef95f8e6d47ad6623e006b5bce1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 6 May 2014 09:59:34 +0200 Subject: perf: Ensure consistent inherit state in groups Make sure all events in a group have the same inherit state. It was possible for group leaders to have inherit set while sibling events would not have inherit set. In this case we'd still inherit the siblings, leading to some non-fatal weirdness. Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: Vince Weaver Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-r32tt8yldvic3jlcghd3g35u@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 09866a330af8..1de0d709f69f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7081,20 +7081,26 @@ SYSCALL_DEFINE5(perf_event_open, } } + if (task && group_leader && + group_leader->attr.inherit != attr.inherit) { + err = -EINVAL; + goto err_task; + } + get_online_cpus(); event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL, NULL); if (IS_ERR(event)) { err = PTR_ERR(event); - goto err_task; + goto err_cpus; } if (flags & PERF_FLAG_PID_CGROUP) { err = perf_cgroup_connect(pid, event, &attr, group_leader); if (err) { __free_event(event); - goto err_task; + goto err_cpus; } } @@ -7256,8 +7262,9 @@ err_context: put_ctx(ctx); err_alloc: free_event(event); -err_task: +err_cpus: put_online_cpus(); +err_task: if (task) put_task_struct(task); err_group_fd: -- cgit From 15a2d4de0eab533a76bee9e68d7e1063dd25401c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 5 May 2014 11:41:02 +0200 Subject: perf: Always destroy groups on exit Commit 38b435b16c36 ("perf: Fix tear-down of inherited group events") states that we need to destroy groups for inherited events, but it doesn't make any sense to not also destroy groups for normal events. 
And while it usually makes no difference (the normal events won't leak, and it's very likely all the group events will die in quick succession) it does make the code more consistent and closes a potential hole for trouble. Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: Vince Weaver Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-426egt8zmsm12d2q8k2xz4tt@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 1de0d709f69f..819ffc006d67 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7400,7 +7400,7 @@ __perf_event_exit_task(struct perf_event *child_event, struct perf_event_context *child_ctx, struct task_struct *child) { - perf_remove_from_context(child_event, !!child_event->parent); + perf_remove_from_context(child_event, true); /* * It can happen that the parent exits first, and has events -- cgit From 63342411efd2d9350ad405205da036cd45ed1640 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 5 May 2014 11:49:16 +0200 Subject: perf: Validate locking assumption Document and validate the locking assumption of event_sched_in(). Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: Vince Weaver Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-sybq1publ9xt5no77cwvi0eo@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 819ffc006d67..0de199729f04 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1678,6 +1678,8 @@ event_sched_in(struct perf_event *event, u64 tstamp = perf_event_time(event); int ret = 0; + lockdep_assert_held(&ctx->lock); + if (event->state <= PERF_EVENT_STATE_OFF) return 0; -- cgit From 683ede43dd412c6cad0d23578a018409ac9c683e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 5 May 2014 12:11:24 +0200 Subject: perf: Rework free paths Primarily make perf_event_release_kernel() into put_event(); this will allow kernel space to create per-task inherited events, and is safer in general. Also, document the free_event() assumptions. Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: Vince Weaver Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-rk9pvr6e1d0559lxstltbztc@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 66 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 0de199729f04..83505a35afba 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3251,7 +3251,8 @@ static void __free_event(struct perf_event *event) call_rcu(&event->rcu_head, free_event_rcu); } -static void free_event(struct perf_event *event) + +static void _free_event(struct perf_event *event) { irq_work_sync(&event->pending); @@ -3279,42 +3280,31 @@ static void free_event(struct perf_event *event) if (is_cgroup_event(event)) perf_detach_cgroup(event); - __free_event(event); } +/* + * Used to free events which have a known refcount of 1, such as in error paths + * where the event isn't exposed yet and inherited events.
+ */ +static void free_event(struct perf_event *event) { - struct perf_event_context *ctx = event->ctx; - - WARN_ON_ONCE(ctx->parent_ctx); - /* - * There are two ways this annotation is useful: - * - * 1) there is a lock recursion from perf_event_exit_task - * see the comment there. - * - * 2) there is a lock-inversion with mmap_sem through - * perf_event_read_group(), which takes faults while - * holding ctx->mutex, however this is called after - * the last filedesc died, so there is no possibility - * to trigger the AB-BA case. - */ - mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); - perf_remove_from_context(event, true); - mutex_unlock(&ctx->mutex); - - free_event(event); + if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1, + "unexpected event refcount: %ld; ptr=%p\n", + atomic_long_read(&event->refcount), event)) { + /* leak to avoid use-after-free */ + return; + } - return 0; + _free_event(event); } -EXPORT_SYMBOL_GPL(perf_event_release_kernel); /* * Called when the last reference to the file is gone. */ static void put_event(struct perf_event *event) { + struct perf_event_context *ctx = event->ctx; struct task_struct *owner; if (!atomic_long_dec_and_test(&event->refcount)) @@ -3353,9 +3343,33 @@ static void put_event(struct perf_event *event) put_task_struct(owner); } - perf_event_release_kernel(event); + WARN_ON_ONCE(ctx->parent_ctx); + /* + * There are two ways this annotation is useful: + * + * 1) there is a lock recursion from perf_event_exit_task + * see the comment there. + * + * 2) there is a lock-inversion with mmap_sem through + * perf_event_read_group(), which takes faults while + * holding ctx->mutex, however this is called after + * the last filedesc died, so there is no possibility + * to trigger the AB-BA case. + */ + mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); + perf_remove_from_context(event, true); + mutex_unlock(&ctx->mutex); + + _free_event(event); } +int perf_event_release_kernel(struct perf_event *event) +{ + put_event(event); + return 0; +} +EXPORT_SYMBOL_GPL(perf_event_release_kernel); + static int perf_release(struct inode *inode, struct file *file) { put_event(file->private_data); -- cgit From 3a497f48637e2aac17eabb84a17f8ac5216028fc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 5 May 2014 12:42:29 +0200 Subject: perf: Simplify perf_event_exit_task_context() Instead of jumping through hoops to make sure to find (and exit) each event, do it the simple straight fwd way. 
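The free_event() guard introduced by the previous patch encodes a reusable pattern: free an object only when its refcount is provably exactly 1, and loudly leak otherwise. A minimal userspace model of that guard, using C11 atomics in place of the kernel's atomic_long_t (illustrative only; names are made up):

        #include <stdatomic.h>
        #include <stdio.h>
        #include <stdlib.h>

        struct obj {
                atomic_long refcount;
        };

        static void free_single_ref(struct obj *o)
        {
                long expected = 1;

                /* the count must go 1 -> 0 here; anything else is a logic bug */
                if (!atomic_compare_exchange_strong(&o->refcount, &expected, 0)) {
                        fprintf(stderr, "unexpected refcount: %ld; ptr=%p\n",
                                expected, (void *)o);
                        return; /* leak rather than risk a use-after-free */
                }
                free(o);
        }

        int main(void)
        {
                struct obj *o = malloc(sizeof(*o));

                if (!o)
                        return 1;
                atomic_init(&o->refcount, 1);
                free_single_ref(o); /* fine: refcount was exactly 1 */
                return 0;
        }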
Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: Vince Weaver Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-tij931199thfkys8vbnokdpf@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 83505a35afba..7ab734fbaeeb 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7485,24 +7485,9 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) */ mutex_lock(&child_ctx->mutex); -again: - list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups, - group_entry) - __perf_event_exit_task(child_event, child_ctx, child); - - list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups, - group_entry) + list_for_each_entry_rcu(child_event, &child_ctx->event_list, event_entry) __perf_event_exit_task(child_event, child_ctx, child); - /* - * If the last event was a group event, it will have appended all - * its siblings to the list, but we obtained 'tmp' before that which - * will still point to the list head terminating the iteration. - */ - if (!list_empty(&child_ctx->pinned_groups) || - !list_empty(&child_ctx->flexible_groups)) - goto again; - mutex_unlock(&child_ctx->mutex); put_ctx(child_ctx); -- cgit From fd99f91aa007ba255aac44fe6cf21c1db398243a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 9 Apr 2014 15:35:08 +0200 Subject: sched/idle: Avoid spurious wakeup IPIs Because mwait_idle_with_hints() gets called from !idle context it must call current_clr_polling(). This however means that resched_task() is very likely to send an IPI even when we were polling: CPU0 CPU1 if (current_set_polling_and_test()) goto out; __monitor(&ti->flags); if (!need_resched()) __mwait(eax, ecx); set_tsk_need_resched(p); smp_mb(); out: current_clr_polling(); if (!tsk_is_polling(p)) smp_send_reschedule(cpu); So while it is correct (extra IPIs aren't a problem, whereas a missed IPI would be) it is a performance problem (for some). Avoid this issue by using fetch_or() to atomically set NEED_RESCHED and test if POLLING_NRFLAG is set. Since a CPU stuck in mwait is unlikely to modify the flags word, contention on the cmpxchg is unlikely and thus we should mostly succeed in a single go. Signed-off-by: Peter Zijlstra Acked-by: Nicolas Pitre Cc: Andy Lutomirski Cc: Linus Torvalds Link: http://lkml.kernel.org/n/tip-kf5suce6njh5xf5d3od13rr0@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 41 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1c9c3b7b26af..4b82622b6252 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -505,6 +505,39 @@ static inline void init_hrtick(void) } #endif /* CONFIG_SCHED_HRTICK */ +/* + * cmpxchg based fetch_or, macro so it works for different integer types + */ +#define fetch_or(ptr, val) \ +({ typeof(*(ptr)) __old, __val = *(ptr); \ + for (;;) { \ + __old = cmpxchg((ptr), __val, __val | (val)); \ + if (__old == __val) \ + break; \ + __val = __old; \ + } \ + __old; \ +}) + +#ifdef TIF_POLLING_NRFLAG +/* + * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, + * this avoids any races wrt polling state changes and thereby avoids + * spurious IPIs. 
+ */ +static bool set_nr_and_not_polling(struct task_struct *p) +{ + struct thread_info *ti = task_thread_info(p); + return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); +} +#else +static bool set_nr_and_not_polling(struct task_struct *p) +{ + set_tsk_need_resched(p); + return true; +} +#endif + /* * resched_task - mark a task 'to be rescheduled now'. * @@ -521,17 +554,15 @@ void resched_task(struct task_struct *p) if (test_tsk_need_resched(p)) return; - set_tsk_need_resched(p); - cpu = task_cpu(p); + if (cpu == smp_processor_id()) { + set_tsk_need_resched(p); set_preempt_need_resched(); return; } - /* NEED_RESCHED must be visible before we test polling */ - smp_mb(); - if (!tsk_is_polling(p)) + if (set_nr_and_not_polling(p)) smp_send_reschedule(cpu); } -- cgit From c444117f0f39d59733ec23da67c44424df529230 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 Apr 2014 13:47:16 +0200 Subject: sched/idle: Delay clearing the polling bit With the generic idle functions assuming !polling we should only clear the polling bit at the very last opportunity in order to avoid spurious IPIs. Ideally we'd flip the default to polling, but that means auditing all arch idle functions. Signed-off-by: Peter Zijlstra Acked-by: Nicolas Pitre Cc: Daniel Lezcano Cc: Linus Torvalds Link: http://lkml.kernel.org/n/tip-vq7719foqzf6z5h4j7eh7f9e@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/idle.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 8f4390a079c7..ed67f0cd2906 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -78,12 +78,10 @@ static int cpuidle_idle_call(void) /* * Check if the idle task must be rescheduled. If it is the - * case, exit the function after re-enabling the local irq and - * set again the polling flag + * case, exit the function after re-enabling the local irq. */ - if (current_clr_polling_and_test()) { + if (need_resched()) { local_irq_enable(); - __current_set_polling(); return 0; } @@ -127,7 +125,7 @@ static int cpuidle_idle_call(void) broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP); - if (broadcast) + if (broadcast) { /* * Tell the time framework to switch * to a broadcast timer because our @@ -139,6 +137,7 @@ static int cpuidle_idle_call(void) ret = clockevents_notify( CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu); + } if (!ret) { trace_cpu_idle_rcuidle(next_state, dev->cpu); @@ -175,8 +174,12 @@ static int cpuidle_idle_call(void) * We can't use the cpuidle framework, let's use the default * idle routine */ - if (ret) - arch_cpu_idle(); + if (ret) { + if (!current_clr_polling_and_test()) + arch_cpu_idle(); + else + local_irq_enable(); + } __current_set_polling(); -- cgit From 37352273ad48f2d177ed1b06ced32d5536b773fb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 Apr 2014 13:55:48 +0200 Subject: sched/idle: Reflow cpuidle_idle_call() Apply goto to reduce lines and nesting levels. 
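The fetch_or() loop introduced two patches back ("sched/idle: Avoid spurious wakeup IPIs") has the same semantics as C11's atomic_fetch_or(); a small userspace model of the resulting IPI-avoidance check (the TIF values here are made up, the real ones are per-arch):

        #include <assert.h>
        #include <stdatomic.h>

        #define TIF_NEED_RESCHED        0x1
        #define TIF_POLLING_NRFLAG      0x2

        int main(void)
        {
                atomic_uint flags = TIF_POLLING_NRFLAG; /* remote CPU polls in idle */
                unsigned int old;

                /* atomically set NEED_RESCHED and observe the old flag word */
                old = atomic_fetch_or(&flags, TIF_NEED_RESCHED);

                /* POLLING was set: the idle CPU notices the flag itself, no IPI */
                assert(old & TIF_POLLING_NRFLAG);
                assert(atomic_load(&flags) ==
                       (TIF_POLLING_NRFLAG | TIF_NEED_RESCHED));
                return 0;
        }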
Signed-off-by: Peter Zijlstra Acked-by: Nicolas Pitre Cc: Daniel Lezcano Cc: Linus Torvalds Link: http://lkml.kernel.org/n/tip-cc6vb0snt3sr7op6rlbfeqfh@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/idle.c | 131 +++++++++++++++++++++++----------------------------- 1 file changed, 58 insertions(+), 73 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index ed67f0cd2906..88a6bc43738b 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -73,7 +73,7 @@ static int cpuidle_idle_call(void) { struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); - int next_state, entered_state, ret; + int next_state, entered_state; bool broadcast; /* @@ -102,90 +102,75 @@ static int cpuidle_idle_call(void) * Check if the cpuidle framework is ready, otherwise fallback * to the default arch specific idle method */ - ret = cpuidle_enabled(drv, dev); - - if (!ret) { - /* - * Ask the governor to choose an idle state it thinks - * it is convenient to go to. There is *always* a - * convenient idle state - */ - next_state = cpuidle_select(drv, dev); - + if (cpuidle_enabled(drv, dev)) { +use_default: /* - * The idle task must be scheduled, it is pointless to - * go to idle, just update no idle residency and get - * out of this function + * We can't use the cpuidle framework, let's use the default + * idle routine. */ - if (current_clr_polling_and_test()) { - dev->last_residency = 0; - entered_state = next_state; + if (current_clr_polling_and_test()) local_irq_enable(); - } else { - broadcast = !!(drv->states[next_state].flags & - CPUIDLE_FLAG_TIMER_STOP); - - if (broadcast) { - /* - * Tell the time framework to switch - * to a broadcast timer because our - * local timer will be shutdown. If a - * local timer is used from another - * cpu as a broadcast timer, this call - * may fail if it is not available - */ - ret = clockevents_notify( - CLOCK_EVT_NOTIFY_BROADCAST_ENTER, - &dev->cpu); - } - - if (!ret) { - trace_cpu_idle_rcuidle(next_state, dev->cpu); - - /* - * Enter the idle state previously - * returned by the governor - * decision. This function will block - * until an interrupt occurs and will - * take care of re-enabling the local - * interrupts - */ - entered_state = cpuidle_enter(drv, dev, - next_state); - - trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, - dev->cpu); - - if (broadcast) - clockevents_notify( - CLOCK_EVT_NOTIFY_BROADCAST_EXIT, - &dev->cpu); - - /* - * Give the governor an opportunity to reflect on the - * outcome - */ - cpuidle_reflect(dev, entered_state); - } - } + else + arch_cpu_idle(); + + goto exit_idle; } /* - * We can't use the cpuidle framework, let's use the default - * idle routine + * Ask the governor to choose an idle state it thinks + * it is convenient to go to. There is *always* a + * convenient idle state */ - if (ret) { - if (!current_clr_polling_and_test()) - arch_cpu_idle(); - else - local_irq_enable(); + next_state = cpuidle_select(drv, dev); + + /* + * The idle task must be scheduled, it is pointless to + * go to idle, just update no idle residency and get + * out of this function + */ + if (current_clr_polling_and_test()) { + dev->last_residency = 0; + entered_state = next_state; + local_irq_enable(); + goto exit_idle; } + broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP); + + /* + * Tell the time framework to switch to a broadcast timer + * because our local timer will be shutdown. 
If a local timer + * is used from another cpu as a broadcast timer, this call may + * fail if it is not available + */ + if (broadcast && + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) + goto use_default; + + trace_cpu_idle_rcuidle(next_state, dev->cpu); + + /* + * Enter the idle state previously returned by the governor decision. + * This function will block until an interrupt occurs and will take + * care of re-enabling the local interrupts + */ + entered_state = cpuidle_enter(drv, dev, next_state); + + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); + + if (broadcast) + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); + + /* + * Give the governor an opportunity to reflect on the outcome + */ + cpuidle_reflect(dev, entered_state); + +exit_idle: __current_set_polling(); /* - * It is up to the idle functions to enable back the local - * interrupt + * It is up to the idle functions to reenable local interrupts */ if (WARN_ON_ONCE(irqs_disabled())) local_irq_enable(); -- cgit From 08c373e5123b4595588ae1a7aa7e00a046c61cc6 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 21 Apr 2014 01:26:58 +0200 Subject: sched/idle: Make cpuidle_idle_call() void The only value ever returned by cpuidle_idle_call() is 0 and its only caller ignores that value anyway, so make it void. Signed-off-by: Rafael J. Wysocki Cc: Daniel Lezcano Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/4717784.WmVEpDoliM@vostro.rjw.lan Signed-off-by: Ingo Molnar --- kernel/sched/idle.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 88a6bc43738b..34083c9ac976 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -67,9 +67,8 @@ void __weak arch_cpu_idle(void) * cpuidle_idle_call - the main idle function * * NOTE: no locks or semaphores should be used here - * return non-zero on failure */ -static int cpuidle_idle_call(void) +static void cpuidle_idle_call(void) { struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); @@ -82,7 +81,7 @@ static int cpuidle_idle_call(void) */ if (need_resched()) { local_irq_enable(); - return 0; + return; } /* @@ -177,8 +176,6 @@ exit_idle: rcu_idle_exit(); start_critical_timings(); - - return 0; } /* -- cgit From 317cf7e5e85e3ef9f23fc6dd8b2945ab4a258140 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Fri, 9 May 2014 23:32:08 +0200 Subject: PM / hibernate: convert simple_strtoul to kstrtoul Replace obsolete function. Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Rafael J. 
Wysocki --- kernel/power/hibernate.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 1f08ac7f55d8..2377ff72994c 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1115,7 +1115,10 @@ static int __init resumewait_setup(char *str) static int __init resumedelay_setup(char *str) { - resume_delay = simple_strtoul(str, NULL, 0); + int rc = kstrtoul(str, 0, (unsigned long *)&resume_delay); + + if (rc) + return rc; return 1; } -- cgit From cdafb93feb3d0ae3131f631a10f70954c96cbef8 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Fri, 9 May 2014 20:32:25 +0200 Subject: ntp: Convert simple_strtol to kstrtol Replace obsolete function simple_strtol w/ kstrtol Inspired-By: Andrew Morton Cc: John Stultz Cc: Andrew Morton Signed-off-by: Fabian Frederick [jstultz: Tweak commit message] Signed-off-by: John Stultz --- kernel/time/ntp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 419a52cecd20..82b7c9edba7e 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -923,7 +923,10 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) static int __init ntp_tick_adj_setup(char *str) { - ntp_tick_adj = simple_strtol(str, NULL, 0); + int rc = kstrtol(str, 0, (long *)&ntp_tick_adj); + + if (rc) + return rc; ntp_tick_adj <<= NTP_SCALE_SHIFT; return 1; -- cgit From 2d916033a318d7e763eb099c69600d5dcd1ccb6b Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 12 May 2014 13:59:35 -0400 Subject: kernel/workqueue.c: pr_warning/pr_warn & printk/pr_info tj: Refreshed on top of wq/for-3.16. Cc: Andrew Morton Signed-off-by: Fabian Frederick Signed-off-by: Tejun Heo --- kernel/workqueue.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c3f076f0f8fd..c8411085466f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4117,8 +4117,8 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, /* create a new pwq */ pwq = alloc_unbound_pwq(wq, target_attrs); if (!pwq) { - pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n", - wq->name); + pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n", + wq->name); mutex_lock(&wq->mutex); goto use_dfl_pwq; } @@ -4568,7 +4568,7 @@ void print_worker_info(const char *log_lvl, struct task_struct *task) probe_kernel_read(desc, worker->desc, sizeof(desc) - 1); if (fn || name[0] || desc[0]) { - printk("%sWorkqueue: %s %pf", log_lvl, name, fn); + pr_info("%sWorkqueue: %s %pf", log_lvl, name, fn); if (desc[0]) pr_cont(" (%s)", desc); pr_cont("\n"); -- cgit From ea54bca3aab3468daf1c37d15047ee66bca8760d Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Mon, 12 May 2014 09:35:48 -0400 Subject: ntp: Make is_error_status() use its argument is_error_status() is an inline function always called with the global time_status as an argument, so there's zero functional difference with this change, but the non-CONFIG_NTP_PPS version uses the passed-in argument, while the CONFIG_NTP_PPS one ignores its argument and uses the global. Looks like is_error_status was refactored out, but someone forgot to change the logic to check the local argument value. Thus this patch makes it use the argument always; shorter variable names are good. 
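The underlying bug class, a helper that gained a parameter but kept testing the global it was refactored from, is easy to model in userspace (names and bit values are illustrative):

        #include <stdio.h>

        static int time_status = 0x1;   /* global state */

        static int is_error_buggy(int status)
        {
                return time_status & 0x1;   /* ignores its argument */
        }

        static int is_error_fixed(int status)
        {
                return status & 0x1;        /* tests what was passed in */
        }

        int main(void)
        {
                /* the caller passes a snapshot of 0; only the fixed helper honours it */
                printf("buggy=%d fixed=%d\n", is_error_buggy(0), is_error_fixed(0));
                return 0;
        }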
Signed-off-by: George Spelvin [jstultz: Tweaked commit message] Signed-off-by: John Stultz --- kernel/time/ntp.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 82b7c9edba7e..c8780cdaf852 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -165,21 +165,21 @@ static inline void pps_set_freq(s64 freq) static inline int is_error_status(int status) { - return (time_status & (STA_UNSYNC|STA_CLOCKERR)) + return (status & (STA_UNSYNC|STA_CLOCKERR)) /* PPS signal lost when either PPS time or * PPS frequency synchronization requested */ - || ((time_status & (STA_PPSFREQ|STA_PPSTIME)) - && !(time_status & STA_PPSSIGNAL)) + || ((status & (STA_PPSFREQ|STA_PPSTIME)) + && !(status & STA_PPSSIGNAL)) /* PPS jitter exceeded when * PPS time synchronization requested */ - || ((time_status & (STA_PPSTIME|STA_PPSJITTER)) + || ((status & (STA_PPSTIME|STA_PPSJITTER)) == (STA_PPSTIME|STA_PPSJITTER)) /* PPS wander exceeded or calibration error when * PPS frequency synchronization requested */ - || ((time_status & STA_PPSFREQ) - && (time_status & (STA_PPSWANDER|STA_PPSERROR))); + || ((status & STA_PPSFREQ) + && (status & (STA_PPSWANDER|STA_PPSERROR))); } static inline void pps_fill_timex(struct timex *txc) -- cgit From 0cee8b7786467907e12d1d8f872e6dc73bc95204 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:10:59 -0400 Subject: cgroup: fix offlining child waiting in cgroup_subtree_control_write() cgroup_subtree_control_write() waits for offline to complete child-by-child before enabling a controller; however, it has a couple bugs. * It doesn't initialize the wait_queue_t. This can lead to infinite hang on the following schedule() among other things. * It forgets to pin the child before releasing cgroup_tree_mutex and performing schedule(). The child may already be gone by the time it wakes up and invokes finish_wait(). Pin the child being waited on. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9db1a9629a5c..95fc66b16091 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2594,16 +2594,18 @@ retry: * cases, wait till it's gone using offline_waitq. */ cgroup_for_each_live_child(child, cgrp) { - wait_queue_t wait; + DEFINE_WAIT(wait); if (!cgroup_css(child, ss)) continue; + cgroup_get(child); prepare_to_wait(&child->offline_waitq, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&cgroup_tree_mutex); schedule(); finish_wait(&child->offline_waitq, &wait); + cgroup_put(child); goto retry; } -- cgit From 54504e977ceee0bea6fbe8b632eceea771b18c6c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:10:59 -0400 Subject: cgroup: cgroup_idr_lock should be bh cgroup_idr_remove() can be invoked from bh leading to lockdep detecting possible AA deadlock (IN_BH/ON_BH). Make the lock bh-safe. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 95fc66b16091..e2ff925e6ee8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -203,9 +203,9 @@ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, int ret; idr_preload(gfp_mask); - spin_lock(&cgroup_idr_lock); + spin_lock_bh(&cgroup_idr_lock); ret = idr_alloc(idr, ptr, start, end, gfp_mask); - spin_unlock(&cgroup_idr_lock); + spin_unlock_bh(&cgroup_idr_lock); idr_preload_end(); return ret; } @@ -214,17 +214,17 @@ static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id) { void *ret; - spin_lock(&cgroup_idr_lock); + spin_lock_bh(&cgroup_idr_lock); ret = idr_replace(idr, ptr, id); - spin_unlock(&cgroup_idr_lock); + spin_unlock_bh(&cgroup_idr_lock); return ret; } static void cgroup_idr_remove(struct idr *idr, int id) { - spin_lock(&cgroup_idr_lock); + spin_lock_bh(&cgroup_idr_lock); idr_remove(idr, id); - spin_unlock(&cgroup_idr_lock); + spin_unlock_bh(&cgroup_idr_lock); } /** -- cgit From 0ab7a60dea71c285dfbb65e344d895b9c4f7bcb9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:10:59 -0400 Subject: cgroup: css_release() shouldn't clear cgroup->subsys[] c1a71504e971 ("cgroup: don't recycle cgroup id until all csses' have been destroyed") made cgroup ID persist until a cgroup is released and added cgroup->subsys[] clearing to css_release() so that css_from_id() doesn't return a css which has already been released, which happens before cgroup release; however, the right change here was updating offline_css() to clear cgroup->subsys[], which was done by e32978031016 ("cgroup: cgroup->subsys[] should be cleared after the css is offlined") instead of clearing it from css_release(). We're now clearing cgroup->subsys[] twice. This is okay for traditional hierarchies as a css's lifetime is the same as its cgroup's; however, this confuses unified hierarchy, and turning on and off a controller repeatedly using "cgroup.subtree_control" can lead to an oops like the following, which happens because cgroup->subsys[] is incorrectly cleared asynchronously by css_release().
BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 IP: [] kill_css+0x21/0x1c0 PGD 1170d067 PUD f0ab067 PMD 0 Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC Modules linked in: CPU: 2 PID: 459 Comm: bash Not tainted 3.15.0-rc2-work+ #5 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 task: ffff880009296710 ti: ffff88000e198000 task.ti: ffff88000e198000 RIP: 0010:[] [] kill_css+0x21/0x1c0 RSP: 0018:ffff88000e199dc8 EFLAGS: 00010202 RAX: 0000000000000001 RBX: 0000000000000000 RCX: 0000000000000001 RDX: 0000000000000001 RSI: ffffffff8238a968 RDI: ffff880009296f98 RBP: ffff88000e199de0 R08: 0000000000000001 R09: 02b0000000000000 R10: 0000000000000000 R11: ffff880009296fc0 R12: 0000000000000001 R13: ffff88000db6fc58 R14: 0000000000000001 R15: ffff8800139dcc00 FS: 00007ff9160c5740(0000) GS:ffff88001fb00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000008 CR3: 0000000013947000 CR4: 00000000000006e0 Stack: ffff88000e199de0 ffffffff82389160 0000000000000001 ffff88000e199e80 ffffffff8113537f 0000000000000007 ffff88000e74af00 ffff88000e199e48 ffff880009296710 ffff88000db6fc00 ffffffff8239c100 0000000000000002 Call Trace: [] cgroup_subtree_control_write+0x85f/0xa00 [] cgroup_file_write+0x38/0x1d0 [] kernfs_fop_write+0xe7/0x170 [] vfs_write+0xb6/0x1c0 [] SyS_write+0x4d/0xc0 [] system_call_fastpath+0x16/0x1b Code: 5c 41 5d 41 5e 41 5f 5d c3 90 0f 1f 44 00 00 55 48 89 e5 41 54 53 48 89 fb 48 83 ec 08 8b 05 37 ad 29 01 85 c0 0f 85 df 00 00 00 <48> 8b 43 08 48 8b 3b be 01 00 00 00 8b 48 5c d3 e6 e8 49 ff ff RIP [] kill_css+0x21/0x1c0 RSP CR2: 0000000000000008 ---[ end trace e7aae1f877c4e1b4 ]--- Remove the unnecessary cgroup->subsys[] clearing from css_release(). Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e2ff925e6ee8..35daf892b6e6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4102,7 +4102,6 @@ static void css_release(struct percpu_ref *ref) container_of(ref, struct cgroup_subsys_state, refcnt); struct cgroup_subsys *ss = css->ss; - RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); cgroup_idr_remove(&ss->css_idr, css->id); call_rcu(&css->rcu_head, css_free_rcu_fn); -- cgit From d37167ab7b3d67d53519585a44c47416e6758ed2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:10:59 -0400 Subject: cgroup: update and fix parsing of "cgroup.subtree_control" I was confused that strsep() was equivalent to strtok_r() in skipping over consecutive delimiters. strsep() just splits at the first occurrence of one of the delimiters, which makes the parsing very inflexible and makes allowing multiple whitespace chars as delimiters kinda moot. Let's just be consistently strict and require a list of tokens separated by spaces. This is what Documentation/cgroups/unified-hierarchy.txt describes too. Also, parsing may access beyond the end of the string if the string ends with spaces or is zero-length. Make sure it skips zero-length tokens. Note that this also ensures that the parser doesn't puke on multiple consecutive spaces. v2: Add zero-length token skipping. v3: Added missing space after "==". Spotted by Li.
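The strsep() pitfall fixed above is easy to reproduce in userspace, where glibc's strsep() behaves the same way as the kernel's: unlike strtok_r(), it returns an empty token for every run of consecutive delimiters (and for a trailing one), so a parser that dereferences tok + 1 on such a token can read past the end of the buffer. A small runnable demonstration mirroring the fix (the input string is made up):

#define _DEFAULT_SOURCE		/* for strsep() in glibc */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[] = "+memory  -io ";	/* double space plus a trailing space */
	char *p = buf;
	char *tok;

	while ((tok = strsep(&p, " "))) {
		if (tok[0] == '\0')	/* empty tokens strtok_r() would never return */
			continue;
		printf("token: '%s'\n", tok);
	}
	return 0;
}

Without the zero-length check this loop also sees two empty tokens; with it, the output is just '+memory' and '-io'.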
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 35daf892b6e6..250def0694b4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2542,11 +2542,13 @@ static int cgroup_subtree_control_write(struct cgroup_subsys_state *dummy_css, int ssid, ret; /* - * Parse input - white space separated list of subsystem names - * prefixed with either + or -. + * Parse input - space separated list of subsystem names prefixed + * with either + or -. */ p = buffer; - while ((tok = strsep(&p, " \t\n"))) { + while ((tok = strsep(&p, " "))) { + if (tok[0] == '\0') + continue; for_each_subsys(ss, ssid) { if (ss->disabled || strcmp(tok + 1, ss->name)) continue; -- cgit From 7d331fa985d3a39d5b8cb60caf016d3e53e57c91 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:11:00 -0400 Subject: cgroup: use restart_syscall() for retries after offline waits in cgroup_subtree_control_write() After waiting for a child to finish offline, cgroup_subtree_control_write() jumps up to retry from after the input parsing and active protection breaking. This retry makes the scheduled locking update - removal of cgroup_tree_mutex - more difficult. Let's simplify it by returning with restart_syscall() for retries. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 250def0694b4..3251cc9070fa 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2535,7 +2535,7 @@ out_finish: static int cgroup_subtree_control_write(struct cgroup_subsys_state *dummy_css, struct cftype *cft, char *buffer) { - unsigned int enable_req = 0, disable_req = 0, enable, disable; + unsigned int enable = 0, disable = 0; struct cgroup *cgrp = dummy_css->cgroup, *child; struct cgroup_subsys *ss; char *tok, *p; int ssid, ret; @@ -2554,11 +2554,11 @@ static int cgroup_subtree_control_write(struct cgroup_subsys_state *dummy_css, continue; if (*tok == '+') { - enable_req |= 1 << ssid; - disable_req &= ~(1 << ssid); + enable |= 1 << ssid; + disable &= ~(1 << ssid); } else if (*tok == '-') { - disable_req |= 1 << ssid; - enable_req &= ~(1 << ssid); + disable |= 1 << ssid; + enable &= ~(1 << ssid); } else { return -EINVAL; } @@ -2576,9 +2576,6 @@ static int cgroup_subtree_control_write(struct cgroup_subsys_state *dummy_css, */ cgroup_get(cgrp); kernfs_break_active_protection(cgrp->control_kn); -retry: - enable = enable_req; - disable = disable_req; mutex_lock(&cgroup_tree_mutex); @@ -2608,7 +2605,9 @@ retry: schedule(); finish_wait(&child->offline_waitq, &wait); cgroup_put(child); - goto retry; + + ret = restart_syscall(); + goto out_unbreak; } /* unavailable or not enabled on the parent? */ @@ -2692,6 +2691,7 @@ out_unlock: mutex_unlock(&cgroup_mutex); out_unlock_tree: mutex_unlock(&cgroup_tree_mutex); +out_unbreak: kernfs_unbreak_active_protection(cgrp->control_kn); cgroup_put(cgrp); return ret; -- cgit From 46cfeb043b04f5878154bea36714709d46028495 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:11:00 -0400 Subject: cgroup: use release_agent_path_lock in cgroup_release_agent_show() release_agent_path is now protected by release_agent_path_lock to allow accessing it without grabbing cgroup_mutex; however, cgroup_release_agent_show() was still grabbing cgroup_mutex.
Let's convert it to release_agent_path_lock so that we don't have to worry about this one for the planned locking updates. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3251cc9070fa..7633703e9baf 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2373,11 +2373,10 @@ static int cgroup_release_agent_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; + spin_lock(&release_agent_path_lock); seq_puts(seq, cgrp->root->release_agent_path); + spin_unlock(&release_agent_path_lock); seq_putc(seq, '\n'); - mutex_unlock(&cgroup_mutex); return 0; } -- cgit From ec903c0c858e4963a9e0724bdcadfa837253341c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:11:01 -0400 Subject: cgroup: rename css_tryget*() to css_tryget_online*() Unlike the more usual refcnting, what css_tryget() provides is the distinction between online and offline csses instead of protection against upping a refcnt which already reached zero. cgroup is planning to provide an actual tryget which fails if the refcnt already reached zero. Let's rename the existing trygets so that they clearly indicate that they're about onliness. I thought about keeping the existing names as-are and introducing new names for the planned actual tryget; however, given that each controller participates in the synchronization of the online state, it seems worthwhile to make it explicit that these functions are about on/offline state. Rename css_tryget() to css_tryget_online() and css_tryget_from_dir() to css_tryget_online_from_dir(). This is a pure rename. v2: cgroup_freezer grew new usages of css_tryget(). Update accordingly.
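The renamed helper keeps the usage pattern seen at the call sites converted below: look the css up inside an RCU read section, then pin it with css_tryget_online(), which fails once the css has begun going offline. A sketch of that idiom, with lookup_example_css() standing in for whatever RCU-protected lookup produced the pointer:

/* Hypothetical sketch of the lookup-then-pin idiom. */
struct cgroup_subsys_state *lookup_example_css(void);	/* assumed RCU-safe lookup */

static struct cgroup_subsys_state *pin_css_if_online(void)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = lookup_example_css();
	if (css && !css_tryget_online(css))
		css = NULL;		/* already offline, don't hand it out */
	rcu_read_unlock();

	return css;			/* caller drops the ref with css_put() */
}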
Signed-off-by: Tejun Heo Acked-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: Li Zefan Cc: Vivek Goyal Cc: Jens Axboe Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo --- block/blk-cgroup.c | 2 +- fs/bio.c | 2 +- include/linux/cgroup.h | 14 +++++++------- kernel/cgroup.c | 40 ++++++++++++++++++++-------------------- kernel/cgroup_freezer.c | 4 ++-- kernel/cpuset.c | 6 +++--- kernel/events/core.c | 3 ++- mm/hugetlb_cgroup.c | 2 +- mm/memcontrol.c | 46 ++++++++++++++++++++++++---------------------- 9 files changed, 61 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 1039fb9ff5f5..9f5bce33e6fe 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -185,7 +185,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, lockdep_assert_held(q->queue_lock); /* blkg holds a reference to blkcg */ - if (!css_tryget(&blkcg->css)) { + if (!css_tryget_online(&blkcg->css)) { ret = -EINVAL; goto err_free_blkg; } diff --git a/fs/bio.c b/fs/bio.c index 6f0362b77806..0608ef9422bd 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1970,7 +1970,7 @@ int bio_associate_current(struct bio *bio) /* associate blkcg if exists */ rcu_read_lock(); css = task_css(current, blkio_cgrp_id); - if (css && css_tryget(css)) + if (css && css_tryget_online(css)) bio->bi_css = css; rcu_read_unlock(); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index bde44618d8c2..c5f3684ef557 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -95,16 +95,16 @@ static inline void css_get(struct cgroup_subsys_state *css) } /** - * css_tryget - try to obtain a reference on the specified css + * css_tryget_online - try to obtain a reference on the specified css if online * @css: target css * - * Obtain a reference on @css if it's alive. The caller naturally needs to - * ensure that @css is accessible but doesn't have to be holding a + * Obtain a reference on @css if it's online. The caller naturally needs + * to ensure that @css is accessible but doesn't have to be holding a * reference on it - IOW, RCU protected access is good enough for this * function. Returns %true if a reference count was successfully obtained; * %false otherwise. */ -static inline bool css_tryget(struct cgroup_subsys_state *css) +static inline bool css_tryget_online(struct cgroup_subsys_state *css) { if (css->flags & CSS_ROOT) return true; @@ -115,7 +115,7 @@ static inline bool css_tryget(struct cgroup_subsys_state *css) * css_put - put a css reference * @css: target css * - * Put a reference obtained via css_get() and css_tryget(). + * Put a reference obtained via css_get() and css_tryget_online(). */ static inline void css_put(struct cgroup_subsys_state *css) { @@ -905,8 +905,8 @@ void css_task_iter_end(struct css_task_iter *it); int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); -struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, - struct cgroup_subsys *ss); +struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, + struct cgroup_subsys *ss); #else /* !CONFIG_CGROUPS */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7633703e9baf..671d8a6dae37 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3771,7 +3771,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) /* * We aren't being called from kernfs and there's no guarantee on - * @kn->priv's validity. 
For this and css_tryget_from_dir(), + * @kn->priv's validity. For this and css_tryget_online_from_dir(), * @kn->priv is RCU safe. Let's do the RCU dancing. */ rcu_read_lock(); @@ -4060,9 +4060,9 @@ err: * Implemented in kill_css(). * * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs - * and thus css_tryget() is guaranteed to fail, the css can be offlined - * by invoking offline_css(). After offlining, the base ref is put. - * Implemented in css_killed_work_fn(). + * and thus css_tryget_online() is guaranteed to fail, the css can be + * offlined by invoking offline_css(). After offlining, the base ref is + * put. Implemented in css_killed_work_fn(). * * 3. When the percpu_ref reaches zero, the only possible remaining * accessors are inside RCU read sections. css_release() schedules the @@ -4386,7 +4386,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, /* * This is called when the refcnt of a css is confirmed to be killed. - * css_tryget() is now guaranteed to fail. + * css_tryget_online() is now guaranteed to fail. */ static void css_killed_work_fn(struct work_struct *work) { @@ -4398,8 +4398,8 @@ static void css_killed_work_fn(struct work_struct *work) mutex_lock(&cgroup_mutex); /* - * css_tryget() is guaranteed to fail now. Tell subsystems to - * initate destruction. + * css_tryget_online() is guaranteed to fail now. Tell subsystems + * to initate destruction. */ offline_css(css); @@ -4440,8 +4440,8 @@ static void css_killed_ref_fn(struct percpu_ref *ref) * * This function initiates destruction of @css by removing cgroup interface * files and putting its base reference. ->css_offline() will be invoked - * asynchronously once css_tryget() is guaranteed to fail and when the - * reference count reaches zero, @css will be released. + * asynchronously once css_tryget_online() is guaranteed to fail and when + * the reference count reaches zero, @css will be released. */ static void kill_css(struct cgroup_subsys_state *css) { @@ -4462,7 +4462,7 @@ static void kill_css(struct cgroup_subsys_state *css) /* * cgroup core guarantees that, by the time ->css_offline() is * invoked, no new css reference will be given out via - * css_tryget(). We can't simply call percpu_ref_kill() and + * css_tryget_online(). We can't simply call percpu_ref_kill() and * proceed to offlining css's because percpu_ref_kill() doesn't * guarantee that the ref is seen as killed on all CPUs on return. * @@ -4478,9 +4478,9 @@ static void kill_css(struct cgroup_subsys_state *css) * * css's make use of percpu refcnts whose killing latency shouldn't be * exposed to userland and are RCU protected. Also, cgroup core needs to - * guarantee that css_tryget() won't succeed by the time ->css_offline() is - * invoked. To satisfy all the requirements, destruction is implemented in - * the following two steps. + * guarantee that css_tryget_online() won't succeed by the time + * ->css_offline() is invoked. To satisfy all the requirements, + * destruction is implemented in the following two steps. * * s1. Verify @cgrp can be destroyed and mark it dying. Remove all * userland visible parts and start killing the percpu refcnts of @@ -4574,9 +4574,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* * There are two control paths which try to determine cgroup from * dentry without going through kernfs - cgroupstats_build() and - * css_tryget_from_dir(). 
Those are supported by RCU protecting - * clearing of cgrp->kn->priv backpointer, which should happen - * after all files under it have been removed. + * css_tryget_online_from_dir(). Those are supported by RCU + * protecting clearing of cgrp->kn->priv backpointer, which should + * happen after all files under it have been removed. */ kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */ RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); @@ -5173,7 +5173,7 @@ static int __init cgroup_disable(char *str) __setup("cgroup_disable=", cgroup_disable); /** - * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir + * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest * @ss: subsystem of interest * @@ -5181,8 +5181,8 @@ __setup("cgroup_disable=", cgroup_disable); * to get the corresponding css and return it. If such css doesn't exist * or can't be pinned, an ERR_PTR value is returned. */ -struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, - struct cgroup_subsys *ss) +struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, + struct cgroup_subsys *ss) { struct kernfs_node *kn = kernfs_node_from_dentry(dentry); struct cgroup_subsys_state *css = NULL; @@ -5204,7 +5204,7 @@ struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, if (cgrp) css = cgroup_css(cgrp, ss); - if (!css || !css_tryget(css)) + if (!css || !css_tryget_online(css)) css = ERR_PTR(-ENOENT); rcu_read_unlock(); diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 345628c78b5b..0398f7e9ac81 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -304,7 +304,7 @@ static int freezer_read(struct seq_file *m, void *v) /* update states bottom-up */ css_for_each_descendant_post(pos, css) { - if (!css_tryget(pos)) + if (!css_tryget_online(pos)) continue; rcu_read_unlock(); @@ -404,7 +404,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) struct freezer *pos_f = css_freezer(pos); struct freezer *parent = parent_freezer(pos_f); - if (!css_tryget(pos)) + if (!css_tryget_online(pos)) continue; rcu_read_unlock(); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 7c0e8da59e26..37ca0a5c226d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -872,7 +872,7 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root) continue; } } - if (!css_tryget(&cp->css)) + if (!css_tryget_online(&cp->css)) continue; rcu_read_unlock(); @@ -1108,7 +1108,7 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root) continue; } } - if (!css_tryget(&cp->css)) + if (!css_tryget_online(&cp->css)) continue; rcu_read_unlock(); @@ -2153,7 +2153,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) rcu_read_lock(); cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { - if (cs == &top_cpuset || !css_tryget(&cs->css)) + if (cs == &top_cpuset || !css_tryget_online(&cs->css)) continue; rcu_read_unlock(); diff --git a/kernel/events/core.c b/kernel/events/core.c index f83a71a3e46d..077968d19b8a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -607,7 +607,8 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, if (!f.file) return -EBADF; - css = css_tryget_from_dir(f.file->f_dentry, &perf_event_cgrp_subsys); + css = css_tryget_online_from_dir(f.file->f_dentry, + &perf_event_cgrp_subsys); if (IS_ERR(css)) { ret = PTR_ERR(css); goto out; diff --git 
a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 595d7fd795e1..372f1adca491 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -181,7 +181,7 @@ int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, again: rcu_read_lock(); h_cg = hugetlb_cgroup_from_task(current); - if (!css_tryget(&h_cg->css)) { + if (!css_tryget_online(&h_cg->css)) { rcu_read_unlock(); goto again; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c3f82f69ef58..5cf3246314a2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -567,7 +567,8 @@ void sock_update_memcg(struct sock *sk) memcg = mem_cgroup_from_task(current); cg_proto = sk->sk_prot->proto_cgroup(memcg); if (!mem_cgroup_is_root(memcg) && - memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) { + memcg_proto_active(cg_proto) && + css_tryget_online(&memcg->css)) { sk->sk_cgrp = cg_proto; } rcu_read_unlock(); @@ -834,7 +835,7 @@ retry: */ __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); if (!res_counter_soft_limit_excess(&mz->memcg->res) || - !css_tryget(&mz->memcg->css)) + !css_tryget_online(&mz->memcg->css)) goto retry; done: return mz; @@ -1076,7 +1077,7 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (unlikely(!memcg)) memcg = root_mem_cgroup; - } while (!css_tryget(&memcg->css)); + } while (!css_tryget_online(&memcg->css)); rcu_read_unlock(); return memcg; } @@ -1113,7 +1114,8 @@ skip_node: */ if (next_css) { if ((next_css == &root->css) || - ((next_css->flags & CSS_ONLINE) && css_tryget(next_css))) + ((next_css->flags & CSS_ONLINE) && + css_tryget_online(next_css))) return mem_cgroup_from_css(next_css); prev_css = next_css; @@ -1159,7 +1161,7 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter, * would be returned all the time. */ if (position && position != root && - !css_tryget(&position->css)) + !css_tryget_online(&position->css)) position = NULL; } return position; @@ -2785,9 +2787,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, /* * A helper function to get mem_cgroup from ID. must be called under - * rcu_read_lock(). The caller is responsible for calling css_tryget if - * the mem_cgroup is used for charging. (dropping refcnt from swap can be - * called against removed memcg.) + * rcu_read_lock(). The caller is responsible for calling + * css_tryget_online() if the mem_cgroup is used for charging. (dropping + * refcnt from swap can be called against removed memcg.) */ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) { @@ -2810,14 +2812,14 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) lock_page_cgroup(pc); if (PageCgroupUsed(pc)) { memcg = pc->mem_cgroup; - if (memcg && !css_tryget(&memcg->css)) + if (memcg && !css_tryget_online(&memcg->css)) memcg = NULL; } else if (PageSwapCache(page)) { ent.val = page_private(page); id = lookup_swap_cgroup_id(ent); rcu_read_lock(); memcg = mem_cgroup_lookup(id); - if (memcg && !css_tryget(&memcg->css)) + if (memcg && !css_tryget_online(&memcg->css)) memcg = NULL; rcu_read_unlock(); } @@ -3473,7 +3475,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, } /* The corresponding put will be done in the workqueue. */ - if (!css_tryget(&memcg->css)) + if (!css_tryget_online(&memcg->css)) goto out; rcu_read_unlock(); @@ -4246,8 +4248,8 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) memcg = mem_cgroup_lookup(id); if (memcg) { /* - * We uncharge this because swap is freed. - * This memcg can be obsolete one. 
We avoid calling css_tryget + * We uncharge this because swap is freed. This memcg can + * be obsolete one. We avoid calling css_tryget_online(). */ if (!mem_cgroup_is_root(memcg)) res_counter_uncharge(&memcg->memsw, PAGE_SIZE); @@ -5840,10 +5842,10 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) * which is then paired with css_put during uncharge resp. here. * * Although this might sound strange as this path is called from - * css_offline() when the referencemight have dropped down to 0 - * and shouldn't be incremented anymore (css_tryget would fail) - * we do not have other options because of the kmem allocations - * lifetime. + * css_offline() when the referencemight have dropped down to 0 and + * shouldn't be incremented anymore (css_tryget_online() would + * fail) we do not have other options because of the kmem + * allocations lifetime. */ css_get(&memcg->css); @@ -6051,8 +6053,8 @@ static int memcg_write_event_control(struct cgroup_subsys_state *css, * automatically removed on cgroup destruction but the removal is * asynchronous, so take an extra ref on @css. */ - cfile_css = css_tryget_from_dir(cfile.file->f_dentry->d_parent, - &memory_cgrp_subsys); + cfile_css = css_tryget_online_from_dir(cfile.file->f_dentry->d_parent, + &memory_cgrp_subsys); ret = -EINVAL; if (IS_ERR(cfile_css)) goto out_put_cfile; @@ -6496,7 +6498,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) /* * XXX: css_offline() would be where we should reparent all * memory to prepare the cgroup for destruction. However, - * memcg does not do css_tryget() and res_counter charging + * memcg does not do css_tryget_online() and res_counter charging * under the same RCU lock region, which means that charging * could race with offlining. Offlining only happens to * cgroups with no tasks in them but charges can show up @@ -6510,9 +6512,9 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) * lookup_swap_cgroup_id() * rcu_read_lock() * mem_cgroup_lookup() - * css_tryget() + * css_tryget_online() * rcu_read_unlock() - * disable css_tryget() + * disable css_tryget_online() * call_rcu() * offline_css() * reparent_charges() -- cgit From b41686401e501430ffe93b575ef7959d2ecc6f2e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:16:21 -0400 Subject: cgroup: implement cftype->write() During the recent conversion to kernfs, cftype's seq_file operations are updated so that they are directly mapped to kernfs operations and thus can fully access the associated kernfs and cgroup contexts; however, write path hasn't seen similar updates and none of the existing write operations has access to, for example, the associated kernfs_open_file. Let's introduce a new operation cftype->write() which maps directly to the kernfs write operation and has access to all the arguments and contexts. This will replace ->write_string() and ->trigger() and ease manipulation of kernfs active protection from cgroup file operations. Two accessors - of_cft() and of_css() - are introduced to enable accessing the associated cgroup context from cftype->write() which only takes kernfs_open_file for the context information. The accessors for seq_file operations - seq_cft() and seq_css() - are rewritten to wrap the of_ accessors. 
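Put together, a controller file on the new interface looks roughly like the sketch below; example_write(), example_show() and example_apply() are hypothetical, but the shape, accessing context via of_css()/of_cft(), trimming the buffer manually, and returning @nbytes on success, follows the conversion patches that come next:

int example_apply(struct cgroup_subsys_state *css, int what, char *val);	/* hypothetical */
int example_show(struct seq_file *sf, void *v);					/* hypothetical */

static ssize_t example_write(struct kernfs_open_file *of,
			     char *buf, size_t nbytes, loff_t off)
{
	struct cgroup_subsys_state *css = of_css(of);
	int ret;

	buf = strstrip(buf);	/* ->write() does not trim the buffer for us */
	ret = example_apply(css, of_cft(of)->private, buf);
	return ret ?: nbytes;	/* @nbytes on success, -errno on failure */
}

static struct cftype example_files[] = {
	{
		.name = "example.control",
		.seq_show = example_show,
		.write = example_write,
	},
	{ }	/* terminate */
};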
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 27 +++++++++++++++++++++++---- kernel/cgroup.c | 14 ++++++++------ 2 files changed, 31 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c5f3684ef557..c5a170ca4a48 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -515,6 +515,15 @@ struct cftype { */ int (*trigger)(struct cgroup_subsys_state *css, unsigned int event); + /* + * write() is the generic write callback which maps directly to + * kernfs write operation and overrides all other operations. + * Maximum write size is determined by ->max_write_len. Use + * of_css/cft() to access the associated css and cft. + */ + ssize_t (*write)(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); + #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lock_class_key lockdep_key; #endif @@ -552,14 +561,24 @@ static inline ino_t cgroup_ino(struct cgroup *cgrp) return 0; } -static inline struct cftype *seq_cft(struct seq_file *seq) +/* cft/css accessors for cftype->write() operation */ +static inline struct cftype *of_cft(struct kernfs_open_file *of) { - struct kernfs_open_file *of = seq->private; - return of->kn->priv; } -struct cgroup_subsys_state *seq_css(struct seq_file *seq); +struct cgroup_subsys_state *of_css(struct kernfs_open_file *of); + +/* cft/css accessors for cftype->seq_*() operations */ +static inline struct cftype *seq_cft(struct seq_file *seq) +{ + return of_cft(seq->private); +} + +static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq) +{ + return of_css(seq->private); +} /* * Name / path handling functions. All are thin wrappers around the kernfs diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 671d8a6dae37..a16f91d12f4e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -283,11 +283,10 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp) return test_bit(CGRP_DEAD, &cgrp->flags); } -struct cgroup_subsys_state *seq_css(struct seq_file *seq) +struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { - struct kernfs_open_file *of = seq->private; struct cgroup *cgrp = of->kn->parent->priv; - struct cftype *cft = seq_cft(seq); + struct cftype *cft = of_cft(of); /* * This is open and unprotected implementation of cgroup_css(). 
@@ -302,7 +301,7 @@ struct cgroup_subsys_state *seq_css(struct seq_file *seq) else return &cgrp->dummy_css; } -EXPORT_SYMBOL_GPL(seq_css); +EXPORT_SYMBOL_GPL(of_css); /** * cgroup_is_descendant - test ancestry @@ -1035,8 +1034,8 @@ static umode_t cgroup_file_mode(const struct cftype *cft) if (cft->read_u64 || cft->read_s64 || cft->seq_show) mode |= S_IRUGO; - if (cft->write_u64 || cft->write_s64 || cft->write_string || - cft->trigger) + if (cft->write_u64 || cft->write_s64 || cft->write || + cft->write_string || cft->trigger) mode |= S_IWUSR; return mode; @@ -2726,6 +2725,9 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, struct cgroup_subsys_state *css; int ret; + if (cft->write) + return cft->write(of, buf, nbytes, off); + /* * kernfs guarantees that a file isn't deleted with operations in * flight, which means that the matching css is and stays alive and -- cgit From 451af504df0c62f695a69b83c250486e77c66378 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:16:21 -0400 Subject: cgroup: replace cftype->write_string() with cftype->write() Convert all cftype->write_string() users to the new cftype->write() which maps directly to kernfs write operation and has full access to kernfs and cgroup contexts. The conversions are mostly mechanical. * @css and @cft are accessed using of_css() and of_cft() accessors respectively instead of being specified as arguments. * Should return @nbytes on success instead of 0. * @buf is not trimmed automatically. Trim if necessary. Note that blkcg and netprio don't need this as the parsers already handle whitespaces. cftype->write_string() has no user left after the conversions and is removed. While at it, remove unnecessary local variable @p in cgroup_subtree_control_write() and stale comment about CGROUP_LOCAL_BUFFER_SIZE in cgroup_freezer.c. This patch doesn't introduce any visible behavior changes. v2: netprio was missing from conversion. Converted. Signed-off-by: Tejun Heo Acked-by: Aristeu Rozanski Acked-by: Vivek Goyal Acked-by: Li Zefan Cc: Jens Axboe Cc: Johannes Weiner Cc: Michal Hocko Cc: Neil Horman Cc: "David S.
Miller" --- block/blk-throttle.c | 32 ++++++++++++++++---------------- block/cfq-iosched.c | 28 ++++++++++++++-------------- include/linux/cgroup.h | 10 +--------- kernel/cgroup.c | 38 +++++++++++++++++++------------------- kernel/cgroup_freezer.c | 20 +++++++++----------- kernel/cpuset.c | 16 +++++++++------- mm/hugetlb_cgroup.c | 17 +++++++++-------- mm/memcontrol.c | 46 +++++++++++++++++++++++++--------------------- net/core/netprio_cgroup.c | 12 ++++++------ net/ipv4/tcp_memcontrol.c | 16 +++++++++------- security/device_cgroup.c | 14 +++++++------- 11 files changed, 124 insertions(+), 125 deletions(-) (limited to 'kernel') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 033745cd7fba..5e8fd1bace98 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1346,10 +1346,10 @@ static int tg_print_conf_uint(struct seq_file *sf, void *v) return 0; } -static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft, - const char *buf, bool is_u64) +static ssize_t tg_set_conf(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off, bool is_u64) { - struct blkcg *blkcg = css_to_blkcg(css); + struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct blkg_conf_ctx ctx; struct throtl_grp *tg; struct throtl_service_queue *sq; @@ -1368,9 +1368,9 @@ static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft, ctx.v = -1; if (is_u64) - *(u64 *)((void *)tg + cft->private) = ctx.v; + *(u64 *)((void *)tg + of_cft(of)->private) = ctx.v; else - *(unsigned int *)((void *)tg + cft->private) = ctx.v; + *(unsigned int *)((void *)tg + of_cft(of)->private) = ctx.v; throtl_log(&tg->service_queue, "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", @@ -1404,19 +1404,19 @@ static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft, } blkg_conf_finish(&ctx); - return 0; + return nbytes; } -static int tg_set_conf_u64(struct cgroup_subsys_state *css, struct cftype *cft, - char *buf) +static ssize_t tg_set_conf_u64(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - return tg_set_conf(css, cft, buf, true); + return tg_set_conf(of, buf, nbytes, off, true); } -static int tg_set_conf_uint(struct cgroup_subsys_state *css, struct cftype *cft, - char *buf) +static ssize_t tg_set_conf_uint(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - return tg_set_conf(css, cft, buf, false); + return tg_set_conf(of, buf, nbytes, off, false); } static struct cftype throtl_files[] = { @@ -1424,25 +1424,25 @@ static struct cftype throtl_files[] = { .name = "throttle.read_bps_device", .private = offsetof(struct throtl_grp, bps[READ]), .seq_show = tg_print_conf_u64, - .write_string = tg_set_conf_u64, + .write = tg_set_conf_u64, }, { .name = "throttle.write_bps_device", .private = offsetof(struct throtl_grp, bps[WRITE]), .seq_show = tg_print_conf_u64, - .write_string = tg_set_conf_u64, + .write = tg_set_conf_u64, }, { .name = "throttle.read_iops_device", .private = offsetof(struct throtl_grp, iops[READ]), .seq_show = tg_print_conf_uint, - .write_string = tg_set_conf_uint, + .write = tg_set_conf_uint, }, { .name = "throttle.write_iops_device", .private = offsetof(struct throtl_grp, iops[WRITE]), .seq_show = tg_print_conf_uint, - .write_string = tg_set_conf_uint, + .write = tg_set_conf_uint, }, { .name = "throttle.io_service_bytes", diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index e0985f1955e7..a73020b8c9af 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1670,11 +1670,11 @@ static int 
cfq_print_leaf_weight(struct seq_file *sf, void *v) return 0; } -static int __cfqg_set_weight_device(struct cgroup_subsys_state *css, - struct cftype *cft, const char *buf, - bool is_leaf_weight) +static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off, + bool is_leaf_weight) { - struct blkcg *blkcg = css_to_blkcg(css); + struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct blkg_conf_ctx ctx; struct cfq_group *cfqg; int ret; @@ -1697,19 +1697,19 @@ static int __cfqg_set_weight_device(struct cgroup_subsys_state *css, } blkg_conf_finish(&ctx); - return ret; + return ret ?: nbytes; } -static int cfqg_set_weight_device(struct cgroup_subsys_state *css, - struct cftype *cft, char *buf) +static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - return __cfqg_set_weight_device(css, cft, buf, false); + return __cfqg_set_weight_device(of, buf, nbytes, off, false); } -static int cfqg_set_leaf_weight_device(struct cgroup_subsys_state *css, - struct cftype *cft, char *buf) +static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - return __cfqg_set_weight_device(css, cft, buf, true); + return __cfqg_set_weight_device(of, buf, nbytes, off, true); } static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, @@ -1837,7 +1837,7 @@ static struct cftype cfq_blkcg_files[] = { .name = "weight_device", .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = cfqg_print_leaf_weight_device, - .write_string = cfqg_set_leaf_weight_device, + .write = cfqg_set_leaf_weight_device, }, { .name = "weight", @@ -1851,7 +1851,7 @@ static struct cftype cfq_blkcg_files[] = { .name = "weight_device", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cfqg_print_weight_device, - .write_string = cfqg_set_weight_device, + .write = cfqg_set_weight_device, }, { .name = "weight", @@ -1863,7 +1863,7 @@ static struct cftype cfq_blkcg_files[] = { { .name = "leaf_weight_device", .seq_show = cfqg_print_leaf_weight_device, - .write_string = cfqg_set_leaf_weight_device, + .write = cfqg_set_leaf_weight_device, }, { .name = "leaf_weight", diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c5a170ca4a48..aecdc84fe128 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -453,8 +453,7 @@ struct cftype { /* * The maximum length of string, excluding trailing nul, that can - * be passed to write_string. If < PAGE_SIZE-1, PAGE_SIZE-1 is - * assumed. + * be passed to write. If < PAGE_SIZE-1, PAGE_SIZE-1 is assumed. */ size_t max_write_len; @@ -500,13 +499,6 @@ struct cftype { int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft, s64 val); - /* - * write_string() is passed a nul-terminated kernelspace - * buffer of maximum length determined by max_write_len. - * Returns 0 or -ve error code. 
- */ - int (*write_string)(struct cgroup_subsys_state *css, struct cftype *cft, - char *buffer); /* * trigger() callback can be used to get some kick from the * userspace, when the actual string written is not important diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a16f91d12f4e..2a88ce7b24b6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1035,7 +1035,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft) mode |= S_IRUGO; if (cft->write_u64 || cft->write_s64 || cft->write || - cft->write_string || cft->trigger) + cft->trigger) mode |= S_IWUSR; return mode; @@ -2352,20 +2352,21 @@ static int cgroup_procs_write(struct cgroup_subsys_state *css, return attach_task_by_pid(css->cgroup, tgid, true); } -static int cgroup_release_agent_write(struct cgroup_subsys_state *css, - struct cftype *cft, char *buffer) +static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - struct cgroup_root *root = css->cgroup->root; + struct cgroup *cgrp = of_css(of)->cgroup; + struct cgroup_root *root = cgrp->root; BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX); - if (!cgroup_lock_live_group(css->cgroup)) + if (!cgroup_lock_live_group(cgrp)) return -ENODEV; spin_lock(&release_agent_path_lock); - strlcpy(root->release_agent_path, buffer, + strlcpy(root->release_agent_path, strstrip(buf), sizeof(root->release_agent_path)); spin_unlock(&release_agent_path_lock); mutex_unlock(&cgroup_mutex); - return 0; + return nbytes; } static int cgroup_release_agent_show(struct seq_file *seq, void *v) @@ -2530,21 +2531,22 @@ out_finish: } /* change the enabled child controllers for a cgroup in the default hierarchy */ -static int cgroup_subtree_control_write(struct cgroup_subsys_state *dummy_css, - struct cftype *cft, char *buffer) +static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) { unsigned int enable = 0, disable = 0; - struct cgroup *cgrp = dummy_css->cgroup, *child; + struct cgroup *cgrp = of_css(of)->cgroup, *child; struct cgroup_subsys *ss; - char *tok, *p; + char *tok; int ssid, ret; /* * Parse input - space separated list of subsystem names prefixed * with either + or -. 
*/ - p = buffer; - while ((tok = strsep(&p, " "))) { + buf = strstrip(buf); + while ((tok = strsep(&buf, " "))) { if (tok[0] == '\0') continue; for_each_subsys(ss, ssid) { @@ -2692,7 +2694,7 @@ out_unlock_tree: out_unbreak: kernfs_unbreak_active_protection(cgrp->control_kn); cgroup_put(cgrp); - return ret; + return ret ?: nbytes; err_undo_css: cgrp->child_subsys_mask &= ~enable; @@ -2738,9 +2740,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, css = cgroup_css(cgrp, cft->ss); rcu_read_unlock(); - if (cft->write_string) { - ret = cft->write_string(css, cft, strstrip(buf)); - } else if (cft->write_u64) { + if (cft->write_u64) { unsigned long long v; ret = kstrtoull(buf, 0, &v); if (!ret) @@ -3984,7 +3984,7 @@ static struct cftype cgroup_base_files[] = { .name = "cgroup.subtree_control", .flags = CFTYPE_ONLY_ON_DFL, .seq_show = cgroup_subtree_control_show, - .write_string = cgroup_subtree_control_write, + .write = cgroup_subtree_control_write, }, { .name = "cgroup.populated", @@ -4018,7 +4018,7 @@ static struct cftype cgroup_base_files[] = { .name = "release_agent", .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_release_agent_show, - .write_string = cgroup_release_agent_write, + .write = cgroup_release_agent_write, .max_write_len = PATH_MAX - 1, }, { } /* terminate */ diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 0398f7e9ac81..6b4e60e33a9a 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -73,10 +73,6 @@ bool cgroup_freezing(struct task_struct *task) return ret; } -/* - * cgroups_write_string() limits the size of freezer state strings to - * CGROUP_LOCAL_BUFFER_SIZE - */ static const char *freezer_state_strs(unsigned int state) { if (state & CGROUP_FROZEN) @@ -423,20 +419,22 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) mutex_unlock(&freezer_mutex); } -static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, - char *buffer) +static ssize_t freezer_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { bool freeze; - if (strcmp(buffer, freezer_state_strs(0)) == 0) + buf = strstrip(buf); + + if (strcmp(buf, freezer_state_strs(0)) == 0) freeze = false; - else if (strcmp(buffer, freezer_state_strs(CGROUP_FROZEN)) == 0) + else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) freeze = true; else return -EINVAL; - freezer_change_state(css_freezer(css), freeze); - return 0; + freezer_change_state(css_freezer(of_css(of)), freeze); + return nbytes; } static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, @@ -460,7 +458,7 @@ static struct cftype files[] = { .name = "state", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = freezer_read, - .write_string = freezer_write, + .write = freezer_write, }, { .name = "self_freezing", diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 37ca0a5c226d..2f4b08b8db24 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1603,13 +1603,15 @@ out_unlock: /* * Common handling for a write to a "cpus" or "mems" file. 
*/ -static int cpuset_write_resmask(struct cgroup_subsys_state *css, - struct cftype *cft, char *buf) +static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - struct cpuset *cs = css_cs(css); + struct cpuset *cs = css_cs(of_css(of)); struct cpuset *trialcs; int retval = -ENODEV; + buf = strstrip(buf); + /* * CPU or memory hotunplug may leave @cs w/o any execution * resources, in which case the hotplug code asynchronously updates @@ -1633,7 +1635,7 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css, goto out_unlock; } - switch (cft->private) { + switch (of_cft(of)->private) { case FILE_CPULIST: retval = update_cpumask(cs, trialcs, buf); break; @@ -1648,7 +1650,7 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css, free_trial_cpuset(trialcs); out_unlock: mutex_unlock(&cpuset_mutex); - return retval; + return retval ?: nbytes; } /* @@ -1750,7 +1752,7 @@ static struct cftype files[] = { { .name = "cpus", .seq_show = cpuset_common_seq_show, - .write_string = cpuset_write_resmask, + .write = cpuset_write_resmask, .max_write_len = (100U + 6 * NR_CPUS), .private = FILE_CPULIST, }, @@ -1758,7 +1760,7 @@ static struct cftype files[] = { { .name = "mems", .seq_show = cpuset_common_seq_show, - .write_string = cpuset_write_resmask, + .write = cpuset_write_resmask, .max_write_len = (100U + 6 * MAX_NUMNODES), .private = FILE_MEMLIST, }, diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 372f1adca491..191de26b0148 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -253,15 +253,16 @@ static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, return res_counter_read_u64(&h_cg->hugepage[idx], name); } -static int hugetlb_cgroup_write(struct cgroup_subsys_state *css, - struct cftype *cft, char *buffer) +static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { int idx, name, ret; unsigned long long val; - struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); - idx = MEMFILE_IDX(cft->private); - name = MEMFILE_ATTR(cft->private); + buf = strstrip(buf); + idx = MEMFILE_IDX(of_cft(of)->private); + name = MEMFILE_ATTR(of_cft(of)->private); switch (name) { case RES_LIMIT: @@ -271,7 +272,7 @@ static int hugetlb_cgroup_write(struct cgroup_subsys_state *css, break; } /* This function does all necessary parse...reuse it */ - ret = res_counter_memparse_write_strategy(buffer, &val); + ret = res_counter_memparse_write_strategy(buf, &val); if (ret) break; ret = res_counter_set_limit(&h_cg->hugepage[idx], val); @@ -280,7 +281,7 @@ static int hugetlb_cgroup_write(struct cgroup_subsys_state *css, ret = -EINVAL; break; } - return ret; + return ret ?: nbytes; } static int hugetlb_cgroup_reset(struct cgroup_subsys_state *css, @@ -331,7 +332,7 @@ static void __init __hugetlb_cgroup_file_init(int idx) snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); cft->read_u64 = hugetlb_cgroup_read_u64; - cft->write_string = hugetlb_cgroup_write; + cft->write = hugetlb_cgroup_write; /* Add the usage file */ cft = &h->cgroup_files[1]; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5cf3246314a2..7098a43f7447 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5143,17 +5143,18 @@ static int memcg_update_kmem_limit(struct mem_cgroup *memcg, * The user of this function is... * RES_LIMIT. 
*/ -static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, - char *buffer) +static ssize_t mem_cgroup_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); enum res_type type; int name; unsigned long long val; int ret; - type = MEMFILE_TYPE(cft->private); - name = MEMFILE_ATTR(cft->private); + buf = strstrip(buf); + type = MEMFILE_TYPE(of_cft(of)->private); + name = MEMFILE_ATTR(of_cft(of)->private); switch (name) { case RES_LIMIT: @@ -5162,7 +5163,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, break; } /* This function does all necessary parse...reuse it */ - ret = res_counter_memparse_write_strategy(buffer, &val); + ret = res_counter_memparse_write_strategy(buf, &val); if (ret) break; if (type == _MEM) @@ -5175,7 +5176,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, return -EINVAL; break; case RES_SOFT_LIMIT: - ret = res_counter_memparse_write_strategy(buffer, &val); + ret = res_counter_memparse_write_strategy(buf, &val); if (ret) break; /* @@ -5192,7 +5193,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, ret = -EINVAL; /* should be BUG() ? */ break; } - return ret; + return ret ?: nbytes; } static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, @@ -5964,9 +5965,10 @@ static void memcg_event_ptable_queue_proc(struct file *file, * Input must be in format ' '. * Interpretation of args is defined by control file implementation. */ -static int memcg_write_event_control(struct cgroup_subsys_state *css, - struct cftype *cft, char *buffer) +static ssize_t memcg_write_event_control(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { + struct cgroup_subsys_state *css = of_css(of); struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_event *event; struct cgroup_subsys_state *cfile_css; @@ -5977,15 +5979,17 @@ static int memcg_write_event_control(struct cgroup_subsys_state *css, char *endp; int ret; - efd = simple_strtoul(buffer, &endp, 10); + buf = strstrip(buf); + + efd = simple_strtoul(buf, &endp, 10); if (*endp != ' ') return -EINVAL; - buffer = endp + 1; + buf = endp + 1; - cfd = simple_strtoul(buffer, &endp, 10); + cfd = simple_strtoul(buf, &endp, 10); if ((*endp != ' ') && (*endp != '\0')) return -EINVAL; - buffer = endp + 1; + buf = endp + 1; event = kzalloc(sizeof(*event), GFP_KERNEL); if (!event) @@ -6063,7 +6067,7 @@ static int memcg_write_event_control(struct cgroup_subsys_state *css, goto out_put_cfile; } - ret = event->register_event(memcg, event->eventfd, buffer); + ret = event->register_event(memcg, event->eventfd, buf); if (ret) goto out_put_css; @@ -6076,7 +6080,7 @@ static int memcg_write_event_control(struct cgroup_subsys_state *css, fdput(cfile); fdput(efile); - return 0; + return nbytes; out_put_css: css_put(css); @@ -6107,13 +6111,13 @@ static struct cftype mem_cgroup_files[] = { { .name = "limit_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), - .write_string = mem_cgroup_write, + .write = mem_cgroup_write, .read_u64 = mem_cgroup_read_u64, }, { .name = "soft_limit_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), - .write_string = mem_cgroup_write, + .write = mem_cgroup_write, .read_u64 = mem_cgroup_read_u64, }, { @@ -6138,7 +6142,7 @@ static struct cftype mem_cgroup_files[] = { }, { .name = "cgroup.event_control", /* XXX: for compat 
*/ - .write_string = memcg_write_event_control, + .write = memcg_write_event_control, .flags = CFTYPE_NO_PREFIX, .mode = S_IWUGO, }, @@ -6171,7 +6175,7 @@ static struct cftype mem_cgroup_files[] = { { .name = "kmem.limit_in_bytes", .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), - .write_string = mem_cgroup_write, + .write = mem_cgroup_write, .read_u64 = mem_cgroup_read_u64, }, { @@ -6217,7 +6221,7 @@ static struct cftype memsw_cgroup_files[] = { { .name = "memsw.limit_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), - .write_string = mem_cgroup_write, + .write = mem_cgroup_write, .read_u64 = mem_cgroup_read_u64, }, { diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 3825f669147b..b990cefd906b 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -185,15 +185,15 @@ static int read_priomap(struct seq_file *sf, void *v) return 0; } -static int write_priomap(struct cgroup_subsys_state *css, struct cftype *cft, - char *buffer) +static ssize_t write_priomap(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { char devname[IFNAMSIZ + 1]; struct net_device *dev; u32 prio; int ret; - if (sscanf(buffer, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2) + if (sscanf(buf, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2) return -EINVAL; dev = dev_get_by_name(&init_net, devname); @@ -202,11 +202,11 @@ static int write_priomap(struct cgroup_subsys_state *css, struct cftype *cft, rtnl_lock(); - ret = netprio_set_prio(css, dev, prio); + ret = netprio_set_prio(of_css(of), dev, prio); rtnl_unlock(); dev_put(dev); - return ret; + return ret ?: nbytes; } static int update_netprio(const void *v, struct file *file, unsigned n) @@ -239,7 +239,7 @@ static struct cftype ss_files[] = { { .name = "ifpriomap", .seq_show = read_priomap, - .write_string = write_priomap, + .write = write_priomap, }, { } /* terminate */ }; diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index d4f015ad6c84..841fd3fa937a 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -102,17 +102,19 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) return 0; } -static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, - char *buffer) +static ssize_t tcp_cgroup_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); unsigned long long val; int ret = 0; - switch (cft->private) { + buf = strstrip(buf); + + switch (of_cft(of)->private) { case RES_LIMIT: /* see memcontrol.c */ - ret = res_counter_memparse_write_strategy(buffer, &val); + ret = res_counter_memparse_write_strategy(buf, &val); if (ret) break; ret = tcp_update_limit(memcg, val); @@ -121,7 +123,7 @@ static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, ret = -EINVAL; break; } - return ret; + return ret ?: nbytes; } static u64 tcp_read_stat(struct mem_cgroup *memcg, int type, u64 default_val) @@ -193,7 +195,7 @@ static int tcp_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) static struct cftype tcp_files[] = { { .name = "kmem.tcp.limit_in_bytes", - .write_string = tcp_cgroup_write, + .write = tcp_cgroup_write, .read_u64 = tcp_cgroup_read, .private = RES_LIMIT, }, diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 9134dbf70d3e..7dbac4061b1c 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -767,27 +767,27 @@ static int 
devcgroup_update_access(struct dev_cgroup *devcgroup, return rc; } -static int devcgroup_access_write(struct cgroup_subsys_state *css, - struct cftype *cft, char *buffer) +static ssize_t devcgroup_access_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { int retval; mutex_lock(&devcgroup_mutex); - retval = devcgroup_update_access(css_to_devcgroup(css), - cft->private, buffer); + retval = devcgroup_update_access(css_to_devcgroup(of_css(of)), + of_cft(of)->private, strstrip(buf)); mutex_unlock(&devcgroup_mutex); - return retval; + return retval ?: nbytes; } static struct cftype dev_cgroup_files[] = { { .name = "allow", - .write_string = devcgroup_access_write, + .write = devcgroup_access_write, .private = DEVCG_ALLOW, }, { .name = "deny", - .write_string = devcgroup_access_write, + .write = devcgroup_access_write, .private = DEVCG_DENY, }, { -- cgit From 6770c64e5c8da4705d1f0973bdeb5c2bf4f3a404 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:16:21 -0400 Subject: cgroup: replace cftype->trigger() with cftype->write() cftype->trigger() is pointless. It's trivial to ignore the input buffer from a regular ->write() operation. Convert all ->trigger() users to ->write() and remove ->trigger(). This patch doesn't introduce any visible behavior changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Johannes Weiner Cc: Michal Hocko --- include/linux/cgroup.h | 8 -------- kernel/cgroup.c | 5 +---- mm/hugetlb_cgroup.c | 16 ++++++++-------- mm/memcontrol.c | 34 ++++++++++++++++++---------------- net/ipv4/tcp_memcontrol.c | 15 ++++++++------- 5 files changed, 35 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index aecdc84fe128..08eb71ee600b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -499,14 +499,6 @@ struct cftype { int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft, s64 val); - /* - * trigger() callback can be used to get some kick from the - * userspace, when the actual string written is not important - * at all. The private field can be used to determine the - * kick type for multiplexing. - */ - int (*trigger)(struct cgroup_subsys_state *css, unsigned int event); - /* * write() is the generic write callback which maps directly to * kernfs write operation and overrides all other operations. 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2a88ce7b24b6..2f16aab03493 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1034,8 +1034,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft) if (cft->read_u64 || cft->read_s64 || cft->seq_show) mode |= S_IRUGO; - if (cft->write_u64 || cft->write_s64 || cft->write || - cft->trigger) + if (cft->write_u64 || cft->write_s64 || cft->write) mode |= S_IWUSR; return mode; @@ -2750,8 +2749,6 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, ret = kstrtoll(buf, 0, &v); if (!ret) ret = cft->write_s64(css, cft, v); - } else if (cft->trigger) { - ret = cft->trigger(css, (unsigned int)cft->private); } else { ret = -EINVAL; } diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 191de26b0148..a380681ab3cf 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -284,14 +284,14 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, return ret ?: nbytes; } -static int hugetlb_cgroup_reset(struct cgroup_subsys_state *css, - unsigned int event) +static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { int idx, name, ret = 0; - struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); - idx = MEMFILE_IDX(event); - name = MEMFILE_ATTR(event); + idx = MEMFILE_IDX(of_cft(of)->private); + name = MEMFILE_ATTR(of_cft(of)->private); switch (name) { case RES_MAX_USAGE: @@ -304,7 +304,7 @@ static int hugetlb_cgroup_reset(struct cgroup_subsys_state *css, ret = -EINVAL; break; } - return ret; + return ret ?: nbytes; } static char *mem_fmt(char *buf, int size, unsigned long hsize) @@ -344,14 +344,14 @@ static void __init __hugetlb_cgroup_file_init(int idx) cft = &h->cgroup_files[2]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE); - cft->trigger = hugetlb_cgroup_reset; + cft->write = hugetlb_cgroup_reset; cft->read_u64 = hugetlb_cgroup_read_u64; /* Add the failcntfile */ cft = &h->cgroup_files[3]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf); cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT); - cft->trigger = hugetlb_cgroup_reset; + cft->write = hugetlb_cgroup_reset; cft->read_u64 = hugetlb_cgroup_read_u64; /* NULL terminate the last cft */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7098a43f7447..b638a79209ee 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4887,14 +4887,15 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) return 0; } -static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css, - unsigned int event) +static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); if (mem_cgroup_is_root(memcg)) return -EINVAL; - return mem_cgroup_force_empty(memcg); + return mem_cgroup_force_empty(memcg) ?: nbytes; } static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, @@ -5220,14 +5221,15 @@ out: *memsw_limit = min_memsw_limit; } -static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) +static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); int name; enum res_type type; - type = 
MEMFILE_TYPE(event); - name = MEMFILE_ATTR(event); + type = MEMFILE_TYPE(of_cft(of)->private); + name = MEMFILE_ATTR(of_cft(of)->private); switch (name) { case RES_MAX_USAGE: @@ -5252,7 +5254,7 @@ static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) break; } - return 0; + return nbytes; } static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, @@ -6105,7 +6107,7 @@ static struct cftype mem_cgroup_files[] = { { .name = "max_usage_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), - .trigger = mem_cgroup_reset, + .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, { @@ -6123,7 +6125,7 @@ static struct cftype mem_cgroup_files[] = { { .name = "failcnt", .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), - .trigger = mem_cgroup_reset, + .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, { @@ -6132,7 +6134,7 @@ static struct cftype mem_cgroup_files[] = { }, { .name = "force_empty", - .trigger = mem_cgroup_force_empty_write, + .write = mem_cgroup_force_empty_write, }, { .name = "use_hierarchy", @@ -6186,13 +6188,13 @@ static struct cftype mem_cgroup_files[] = { { .name = "kmem.failcnt", .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), - .trigger = mem_cgroup_reset, + .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, { .name = "kmem.max_usage_in_bytes", .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), - .trigger = mem_cgroup_reset, + .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, #ifdef CONFIG_SLABINFO @@ -6215,7 +6217,7 @@ static struct cftype memsw_cgroup_files[] = { { .name = "memsw.max_usage_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), - .trigger = mem_cgroup_reset, + .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, { @@ -6227,7 +6229,7 @@ static struct cftype memsw_cgroup_files[] = { { .name = "memsw.failcnt", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), - .trigger = mem_cgroup_reset, + .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, { }, /* terminate */ diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 841fd3fa937a..f7a2ec3ac584 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -170,17 +170,18 @@ static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft) return val; } -static int tcp_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) +static ssize_t tcp_cgroup_reset(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg; struct cg_proto *cg_proto; - memcg = mem_cgroup_from_css(css); + memcg = mem_cgroup_from_css(of_css(of)); cg_proto = tcp_prot.proto_cgroup(memcg); if (!cg_proto) - return 0; + return nbytes; - switch (event) { + switch (of_cft(of)->private) { case RES_MAX_USAGE: res_counter_reset_max(&cg_proto->memory_allocated); break; @@ -189,7 +190,7 @@ static int tcp_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) break; } - return 0; + return nbytes; } static struct cftype tcp_files[] = { @@ -207,13 +208,13 @@ static struct cftype tcp_files[] = { { .name = "kmem.tcp.failcnt", .private = RES_FAILCNT, - .trigger = tcp_cgroup_reset, + .write = tcp_cgroup_reset, .read_u64 = tcp_cgroup_read, }, { .name = "kmem.tcp.max_usage_in_bytes", .private = RES_MAX_USAGE, - .trigger = tcp_cgroup_reset, + .write = tcp_cgroup_reset, .read_u64 = tcp_cgroup_read, }, { } /* terminate */ -- cgit From acbef755f40e204b8a6503fa79958d51a898762a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:16:22 -0400 
Subject: cgroup: convert "tasks" and "cgroup.procs" handlers to use cftype->write() cgroup_tasks_write() and cgroup_procs_write() are currently using cftype->write_u64(). This patch converts them to use cftype->write() instead. This allows access to the associated kernfs_open_file which will be necessary to implement the planned kernfs active protection manipulation for these files. This shifts buffer parsing to attach_task_by_pid() and makes it return @nbytes on success. Let's rename it to __cgroup_procs_write() to clearly indicate that this is a write handler implementation. This patch doesn't introduce any visible behavior changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2f16aab03493..9a48c117ebf1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2232,12 +2232,18 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, * function to attach either it or all tasks in its threadgroup. Will lock * cgroup_mutex and threadgroup. */ -static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) +static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off, bool threadgroup) { struct task_struct *tsk; const struct cred *cred = current_cred(), *tcred; + struct cgroup *cgrp = of_css(of)->cgroup; + pid_t pid; int ret; + if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) + return -EINVAL; + if (!cgroup_lock_live_group(cgrp)) return -ENODEV; @@ -2305,7 +2311,7 @@ retry_find_task: put_task_struct(tsk); out_unlock_cgroup: mutex_unlock(&cgroup_mutex); - return ret; + return ret ?: nbytes; } /** @@ -2339,16 +2345,16 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) } EXPORT_SYMBOL_GPL(cgroup_attach_task_all); -static int cgroup_tasks_write(struct cgroup_subsys_state *css, - struct cftype *cft, u64 pid) +static ssize_t cgroup_tasks_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - return attach_task_by_pid(css->cgroup, pid, false); + return __cgroup_procs_write(of, buf, nbytes, off, false); } -static int cgroup_procs_write(struct cgroup_subsys_state *css, - struct cftype *cft, u64 tgid) +static ssize_t cgroup_procs_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - return attach_task_by_pid(css->cgroup, tgid, true); + return __cgroup_procs_write(of, buf, nbytes, off, true); } static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, @@ -3953,7 +3959,7 @@ static struct cftype cgroup_base_files[] = { .seq_stop = cgroup_pidlist_stop, .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_PROCS, - .write_u64 = cgroup_procs_write, + .write = cgroup_procs_write, .mode = S_IRUGO | S_IWUSR, }, { @@ -4002,7 +4008,7 @@ static struct cftype cgroup_base_files[] = { .seq_stop = cgroup_pidlist_stop, .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_TASKS, - .write_u64 = cgroup_tasks_write, + .write = cgroup_tasks_write, .mode = S_IRUGO | S_IWUSR, }, { -- cgit From b7fc5ad235936379fae67a9f7b50bb53487a1a3a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:16:22 -0400 Subject: cgroup: remove cgroup->control_kn Now that cgroup_subtree_control_write() has access to the associated kernfs_open_file and thus the kernfs_node, there's no need to cache it in cgroup->control_kn on creation. Remove cgroup->control_kn and use @of->kn directly.
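The handler sketch below is illustrative only (example_ctrl_write() is a made-up name; the real user is cgroup_subtree_control_write() in the diff that follows): any kernfs write handler can reach the node it is servicing as @of->kn, so nothing needs to be cached at file-creation time.

static ssize_t example_ctrl_write(struct kernfs_open_file *of,
				  char *buf, size_t nbytes, loff_t off)
{
	struct cgroup *cgrp = of_css(of)->cgroup;

	cgroup_get(cgrp);
	kernfs_break_active_protection(of->kn);	/* was cgrp->control_kn */

	/* ... work that may remove @of->kn itself ... */

	kernfs_unbreak_active_protection(of->kn);
	cgroup_put(cgrp);
	return nbytes;
}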
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 1 - kernel/cgroup.c | 8 +++----- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 08eb71ee600b..aa7353deaaf3 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -177,7 +177,6 @@ struct cgroup { struct cgroup *parent; /* my parent */ struct kernfs_node *kn; /* cgroup kernfs entry */ - struct kernfs_node *control_kn; /* kn for "cgroup.subtree_control" */ struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */ /* diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9a48c117ebf1..94d259bcd2b9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2580,7 +2580,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, * active_ref protection. */ cgroup_get(cgrp); - kernfs_break_active_protection(cgrp->control_kn); + kernfs_break_active_protection(of->kn); mutex_lock(&cgroup_tree_mutex); @@ -2697,7 +2697,7 @@ out_unlock: out_unlock_tree: mutex_unlock(&cgroup_tree_mutex); out_unbreak: - kernfs_unbreak_active_protection(cgrp->control_kn); + kernfs_unbreak_active_protection(of->kn); cgroup_put(cgrp); return ret ?: nbytes; @@ -2887,9 +2887,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) return ret; } - if (cft->seq_show == cgroup_subtree_control_show) - cgrp->control_kn = kn; - else if (cft->seq_show == cgroup_populated_show) + if (cft->seq_show == cgroup_populated_show) cgrp->populated_kn = kn; return 0; } -- cgit From ba0f4d761503bd9ce4c7458b56bfd7c3fdb51e86 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:19:22 -0400 Subject: cgroup: reorganize cgroup_create() Reorganize cgroup_create() so that all paths share unlock out path. * All err_* labels are renamed to out_* as they're now shared by both success and failure paths. * @err renamed to @ret for the similar reason as above and so that it's more consistent with other functions. * cgroup memory allocation moved after locking so that freeing failed cgroup happens before unlocking. While this moves more code inside critical section, memory allocations inside cgroup locking are already pretty common and this is unlikely to make any noticeable difference. * While at it, replace a stray @parent->root dereference with @root. This reorganization will help simplifying locking. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 69 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 34 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 94d259bcd2b9..1d6106c3fb4e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4246,15 +4246,10 @@ static long cgroup_create(struct cgroup *parent, const char *name, { struct cgroup *cgrp; struct cgroup_root *root = parent->root; - int ssid, err; + int ssid, ret; struct cgroup_subsys *ss; struct kernfs_node *kn; - /* allocate the cgroup and its ID, 0 is reserved for the root */ - cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); - if (!cgrp) - return -ENOMEM; - mutex_lock(&cgroup_tree_mutex); /* @@ -4265,8 +4260,15 @@ static long cgroup_create(struct cgroup *parent, const char *name, * don't get nasty surprises if we ever grow another caller. 
*/ if (!cgroup_lock_live_group(parent)) { - err = -ENODEV; - goto err_unlock_tree; + ret = -ENODEV; + goto out_unlock_tree; + } + + /* allocate the cgroup and its ID, 0 is reserved for the root */ + cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); + if (!cgrp) { + ret = -ENOMEM; + goto out_unlock; } /* @@ -4275,15 +4277,15 @@ static long cgroup_create(struct cgroup *parent, const char *name, */ cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT); if (cgrp->id < 0) { - err = -ENOMEM; - goto err_unlock; + ret = -ENOMEM; + goto out_free_cgrp; } init_cgroup_housekeeping(cgrp); cgrp->parent = parent; cgrp->dummy_css.parent = &parent->dummy_css; - cgrp->root = parent->root; + cgrp->root = root; if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -4294,8 +4296,8 @@ static long cgroup_create(struct cgroup *parent, const char *name, /* create the directory */ kn = kernfs_create_dir(parent->kn, name, mode, cgrp); if (IS_ERR(kn)) { - err = PTR_ERR(kn); - goto err_free_id; + ret = PTR_ERR(kn); + goto out_free_id; } cgrp->kn = kn; @@ -4318,20 +4320,20 @@ static long cgroup_create(struct cgroup *parent, const char *name, */ cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); - err = cgroup_kn_set_ugid(kn); - if (err) - goto err_destroy; + ret = cgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; - err = cgroup_addrm_files(cgrp, cgroup_base_files, true); - if (err) - goto err_destroy; + ret = cgroup_addrm_files(cgrp, cgroup_base_files, true); + if (ret) + goto out_destroy; /* let's create and online css's */ for_each_subsys(ss, ssid) { if (parent->child_subsys_mask & (1 << ssid)) { - err = create_css(cgrp, ss); - if (err) - goto err_destroy; + ret = create_css(cgrp, ss); + if (ret) + goto out_destroy; } } @@ -4344,25 +4346,22 @@ static long cgroup_create(struct cgroup *parent, const char *name, kernfs_activate(kn); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); - - return 0; + ret = 0; + goto out_unlock; -err_free_id: +out_free_id: cgroup_idr_remove(&root->cgroup_idr, cgrp->id); -err_unlock: +out_free_cgrp: + kfree(cgrp); +out_unlock: mutex_unlock(&cgroup_mutex); -err_unlock_tree: +out_unlock_tree: mutex_unlock(&cgroup_tree_mutex); - kfree(cgrp); - return err; + return ret; -err_destroy: +out_destroy: cgroup_destroy_locked(cgrp); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); - return err; + goto out_unlock; } static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, -- cgit From b3bfd983ca94cf1393accc11e90123c83909babb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:19:22 -0400 Subject: cgroup: collapse cgroup_create() into cgroup_mkdir() cgroup_mkdir() is the sole user of cgroup_create(). Let's collapse the latter into the former. This will help simplify locking. While at it, remove the now-stale comment about inode locking. This patch doesn't introduce any functional changes.
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 52 +++++++++++++--------------------------------------- 1 file changed, 13 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1d6106c3fb4e..580d3484f97a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4235,30 +4235,24 @@ err_free_css: return err; } -/** - * cgroup_create - create a cgroup - * @parent: cgroup that will be parent of the new cgroup - * @name: name of the new cgroup - * @mode: mode to set on new cgroup - */ -static long cgroup_create(struct cgroup *parent, const char *name, - umode_t mode) +static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) { - struct cgroup *cgrp; + struct cgroup *parent = parent_kn->priv, *cgrp; struct cgroup_root *root = parent->root; - int ssid, ret; struct cgroup_subsys *ss; struct kernfs_node *kn; - - mutex_lock(&cgroup_tree_mutex); + int ssid, ret; /* - * Only live parents can have children. Note that the liveliness - * check isn't strictly necessary because cgroup_mkdir() and - * cgroup_rmdir() are fully synchronized by i_mutex; however, do it - * anyway so that locking is contained inside cgroup proper and we - * don't get nasty surprises if we ever grow another caller. + * cgroup_mkdir() grabs cgroup_tree_mutex which nests outside + * kernfs active_ref and cgroup_create() already synchronizes + * properly against removal through cgroup_lock_live_group(). + * Break it before calling cgroup_create(). */ + cgroup_get(parent); + kernfs_break_active_protection(parent_kn); + mutex_lock(&cgroup_tree_mutex); if (!cgroup_lock_live_group(parent)) { ret = -ENODEV; goto out_unlock_tree; @@ -4357,6 +4351,8 @@ out_unlock: mutex_unlock(&cgroup_mutex); out_unlock_tree: mutex_unlock(&cgroup_tree_mutex); + kernfs_unbreak_active_protection(parent_kn); + cgroup_put(parent); return ret; out_destroy: @@ -4364,28 +4360,6 @@ out_destroy: goto out_unlock; } -static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, - umode_t mode) -{ - struct cgroup *parent = parent_kn->priv; - int ret; - - /* - * cgroup_create() grabs cgroup_tree_mutex which nests outside - * kernfs active_ref and cgroup_create() already synchronizes - * properly against removal through cgroup_lock_live_group(). - * Break it before calling cgroup_create(). - */ - cgroup_get(parent); - kernfs_break_active_protection(parent_kn); - - ret = cgroup_create(parent, name, mode); - - kernfs_unbreak_active_protection(parent_kn); - cgroup_put(parent); - return ret; -} - /* * This is called when the refcnt of a css is confirmed to be killed. * css_tryget_online() is now guaranteed to fail. -- cgit From ddab2b6e0e516098efe6d3b69585bb25b1408d61 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:19:22 -0400 Subject: cgroup: grab cgroup_mutex earlier in cgroup_subtree_control_write() Move cgroup_lock_live_group() invocation upwards to right below cgroup_tree_mutex in cgroup_subtree_control_write(). This is to help the planned locking simplification. This doesn't make any userland-visible behavioral changes. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 580d3484f97a..8afddb1a1c6c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2583,6 +2583,10 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, kernfs_break_active_protection(of->kn); mutex_lock(&cgroup_tree_mutex); + if (!cgroup_lock_live_group(cgrp)) { + ret = -ENODEV; + goto out_unlock_tree; + } for_each_subsys(ss, ssid) { if (enable & (1 << ssid)) { @@ -2606,6 +2610,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, cgroup_get(child); prepare_to_wait(&child->offline_waitq, &wait, TASK_UNINTERRUPTIBLE); + mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); schedule(); finish_wait(&child->offline_waitq, &wait); @@ -2620,7 +2625,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, (cgrp->parent && !(cgrp->parent->child_subsys_mask & (1 << ssid)))) { ret = -ENOENT; - goto out_unlock_tree; + goto out_unlock; } } else if (disable & (1 << ssid)) { if (!(cgrp->child_subsys_mask & (1 << ssid))) { @@ -2632,7 +2637,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, cgroup_for_each_live_child(child, cgrp) { if (child->child_subsys_mask & (1 << ssid)) { ret = -EBUSY; - goto out_unlock_tree; + goto out_unlock; } } } @@ -2640,12 +2645,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, if (!enable && !disable) { ret = 0; - goto out_unlock_tree; - } - - if (!cgroup_lock_live_group(cgrp)) { - ret = -ENODEV; - goto out_unlock_tree; + goto out_unlock; } /* -- cgit From cfc79d5bec04cdf26cd207d3e73d8bd59fd780a8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:19:22 -0400 Subject: cgroup: move cgroup->kn->priv clearing to cgroup_rmdir() The ->priv field of a cgroup directory kernfs_node points back to the cgroup. This field is RCU cleared in cgroup_destroy_locked() for non-kernfs accesses from css_tryget_from_dir() and cgroupstats_build(). As these are only applicable to cgroups which finished creation successfully and fully initialized cgroups are always removed by cgroup_rmdir(), this can be safely moved to the end of cgroup_rmdir(). This will help simplifying cgroup locking and shouldn't introduce any behavior difference. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8afddb1a1c6c..b49e63d5386b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4546,17 +4546,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* remove @cgrp directory along with the base files */ mutex_unlock(&cgroup_mutex); - - /* - * There are two control paths which try to determine cgroup from - * dentry without going through kernfs - cgroupstats_build() and - * css_tryget_online_from_dir(). Those are supported by RCU - * protecting clearing of cgrp->kn->priv backpointer, which should - * happen after all files under it have been removed. 
- */ kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */ - RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); - mutex_lock(&cgroup_mutex); return 0; @@ -4615,6 +4605,17 @@ static int cgroup_rmdir(struct kernfs_node *kn) mutex_unlock(&cgroup_tree_mutex); kernfs_unbreak_active_protection(kn); + + /* + * There are two control paths which try to determine cgroup from + * dentry without going through kernfs - cgroupstats_build() and + * css_tryget_online_from_dir(). Those are supported by RCU + * protecting clearing of cgrp->kn->priv backpointer, which should + * happen after all files under it have been removed. + */ + if (!ret) + RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL); + cgroup_put(cgrp); return ret; } @@ -5174,7 +5175,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, /* * This path doesn't originate from kernfs and @kn could already * have been or be removed at any point. @kn->priv is RCU - * protected for this access. See destroy_locked() for details. + * protected for this access. See cgroup_rmdir() for details. */ cgrp = rcu_dereference(kn->priv); if (cgrp) -- cgit From a9746d8da786bc79b3b4ae1baa0fbbc4b795c1b7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:19:22 -0400 Subject: cgroup: factor out cgroup_kn_lock_live() and cgroup_kn_unlock() cgroup_mkdir(), cgroup_rmdir() and cgroup_subtree_control_write() share the logic to break active protection so that they can grab cgroup_tree_mutex which nests above active protection and/or remove self. Factor out this logic into cgroup_kn_lock_live() and cgroup_kn_unlock(). This patch doesn't introduce any functional changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 157 ++++++++++++++++++++++++++++++++------------------------ 1 file changed, 90 insertions(+), 67 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b49e63d5386b..21739e481006 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1093,6 +1093,75 @@ static void cgroup_put(struct cgroup *cgrp) call_rcu(&cgrp->rcu_head, cgroup_free_rcu); } +/** + * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods + * @kn: the kernfs_node being serviced + * + * This helper undoes cgroup_kn_lock_live() and should be invoked before + * the method finishes if locking succeeded. Note that once this function + * returns the cgroup returned by cgroup_kn_lock_live() may become + * inaccessible any time. If the caller intends to continue to access the + * cgroup, it should pin it before invoking this function. + */ +static void cgroup_kn_unlock(struct kernfs_node *kn) +{ + struct cgroup *cgrp; + + if (kernfs_type(kn) == KERNFS_DIR) + cgrp = kn->priv; + else + cgrp = kn->parent->priv; + + mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); + + kernfs_unbreak_active_protection(kn); + cgroup_put(cgrp); +} + +/** + * cgroup_kn_lock_live - locking helper for cgroup kernfs methods + * @kn: the kernfs_node being serviced + * + * This helper is to be used by a cgroup kernfs method currently servicing + * @kn. It breaks the active protection, performs cgroup locking and + * verifies that the associated cgroup is alive. Returns the cgroup if + * alive; otherwise, %NULL. A successful return should be undone by a + * matching cgroup_kn_unlock() invocation. + * + * Any cgroup kernfs method implementation which requires locking the + * associated cgroup should use this helper. 
It avoids nesting cgroup + * locking under kernfs active protection and allows all kernfs operations + * including self-removal. + */ +static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn) +{ + struct cgroup *cgrp; + + if (kernfs_type(kn) == KERNFS_DIR) + cgrp = kn->priv; + else + cgrp = kn->parent->priv; + + /* + * We're gonna grab cgroup_tree_mutex which nests outside kernfs + * active_ref. cgroup liveliness check alone provides enough + * protection against removal. Ensure @cgrp stays accessible and + * break the active_ref protection. + */ + cgroup_get(cgrp); + kernfs_break_active_protection(kn); + + mutex_lock(&cgroup_tree_mutex); + mutex_lock(&cgroup_mutex); + + if (!cgroup_is_dead(cgrp)) + return cgrp; + + cgroup_kn_unlock(kn); + return NULL; +} + static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) { char name[CGROUP_FILE_NAME_MAX]; @@ -2541,7 +2610,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, loff_t off) { unsigned int enable = 0, disable = 0; - struct cgroup *cgrp = of_css(of)->cgroup, *child; + struct cgroup *cgrp, *child; struct cgroup_subsys *ss; char *tok; int ssid, ret; @@ -2573,20 +2642,9 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, return -EINVAL; } - /* - * We're gonna grab cgroup_tree_mutex which nests outside kernfs - * active_ref. cgroup_lock_live_group() already provides enough - * protection. Ensure @cgrp stays accessible and break the - * active_ref protection. - */ - cgroup_get(cgrp); - kernfs_break_active_protection(of->kn); - - mutex_lock(&cgroup_tree_mutex); - if (!cgroup_lock_live_group(cgrp)) { - ret = -ENODEV; - goto out_unlock_tree; - } + cgrp = cgroup_kn_lock_live(of->kn); + if (!cgrp) + return -ENODEV; for_each_subsys(ss, ssid) { if (enable & (1 << ssid)) { @@ -2610,14 +2668,12 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, cgroup_get(child); prepare_to_wait(&child->offline_waitq, &wait, TASK_UNINTERRUPTIBLE); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); + cgroup_kn_unlock(of->kn); schedule(); finish_wait(&child->offline_waitq, &wait); cgroup_put(child); - ret = restart_syscall(); - goto out_unbreak; + return restart_syscall(); } /* unavailable or not enabled on the parent? */ @@ -2693,12 +2749,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, kernfs_activate(cgrp->kn); ret = 0; out_unlock: - mutex_unlock(&cgroup_mutex); -out_unlock_tree: - mutex_unlock(&cgroup_tree_mutex); -out_unbreak: - kernfs_unbreak_active_protection(of->kn); - cgroup_put(cgrp); + cgroup_kn_unlock(of->kn); return ret ?: nbytes; err_undo_css: @@ -4238,25 +4289,16 @@ err_free_css: static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { - struct cgroup *parent = parent_kn->priv, *cgrp; - struct cgroup_root *root = parent->root; + struct cgroup *parent, *cgrp; + struct cgroup_root *root; struct cgroup_subsys *ss; struct kernfs_node *kn; int ssid, ret; - /* - * cgroup_mkdir() grabs cgroup_tree_mutex which nests outside - * kernfs active_ref and cgroup_create() already synchronizes - * properly against removal through cgroup_lock_live_group(). - * Break it before calling cgroup_create(). 
- */ - cgroup_get(parent); - kernfs_break_active_protection(parent_kn); - mutex_lock(&cgroup_tree_mutex); - if (!cgroup_lock_live_group(parent)) { - ret = -ENODEV; - goto out_unlock_tree; - } + parent = cgroup_kn_lock_live(parent_kn); + if (!parent) + return -ENODEV; + root = parent->root; /* allocate the cgroup and its ID, 0 is reserved for the root */ cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); @@ -4348,11 +4390,7 @@ out_free_id: out_free_cgrp: kfree(cgrp); out_unlock: - mutex_unlock(&cgroup_mutex); -out_unlock_tree: - mutex_unlock(&cgroup_tree_mutex); - kernfs_unbreak_active_protection(parent_kn); - cgroup_put(parent); + cgroup_kn_unlock(parent_kn); return ret; out_destroy: @@ -4579,32 +4617,17 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp) static int cgroup_rmdir(struct kernfs_node *kn) { - struct cgroup *cgrp = kn->priv; + struct cgroup *cgrp; int ret = 0; - /* - * This is self-destruction but @kn can't be removed while this - * callback is in progress. Let's break active protection. Once - * the protection is broken, @cgrp can be destroyed at any point. - * Pin it so that it stays accessible. - */ - cgroup_get(cgrp); - kernfs_break_active_protection(kn); - - mutex_lock(&cgroup_tree_mutex); - mutex_lock(&cgroup_mutex); - - /* - * @cgrp might already have been destroyed while we're trying to - * grab the mutexes. - */ - if (!cgroup_is_dead(cgrp)) - ret = cgroup_destroy_locked(cgrp); + cgrp = cgroup_kn_lock_live(kn); + if (!cgrp) + return 0; + cgroup_get(cgrp); /* for @kn->priv clearing */ - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); + ret = cgroup_destroy_locked(cgrp); - kernfs_unbreak_active_protection(kn); + cgroup_kn_unlock(kn); /* * There are two control paths which try to determine cgroup from -- cgit From e76ecaeef65c497153ceacf59c2e21c070d43f64 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:19:23 -0400 Subject: cgroup: use cgroup_kn_lock_live() in other cgroup kernfs methods Make __cgroup_procs_write() and cgroup_release_agent_write() use cgroup_kn_lock_live() and cgroup_kn_unlock() instead of cgroup_lock_live_group(). This puts the operations under both cgroup_tree_mutex and cgroup_mutex protection without circular dependency from kernfs active protection. Also, this means that cgroup_mutex is no longer nested below kernfs active protection. There is no longer any place where the two locks interact. This leaves cgroup_lock_live_group() without any user. Removed. This will help simplifying cgroup locking. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 39 ++++++++++++--------------------------- 1 file changed, 12 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 21739e481006..b7cd80845f6a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -386,23 +386,6 @@ static int notify_on_release(const struct cgroup *cgrp) ; \ else -/** - * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. - * @cgrp: the cgroup to be checked for liveness - * - * On success, returns true; the mutex should be later unlocked. On - * failure returns false with no lock held. - */ -static bool cgroup_lock_live_group(struct cgroup *cgrp) -{ - mutex_lock(&cgroup_mutex); - if (cgroup_is_dead(cgrp)) { - mutex_unlock(&cgroup_mutex); - return false; - } - return true; -} - /* the list of cgroups eligible for automatic release. 
Protected by * release_list_lock */ static LIST_HEAD(release_list); @@ -2306,14 +2289,15 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, { struct task_struct *tsk; const struct cred *cred = current_cred(), *tcred; - struct cgroup *cgrp = of_css(of)->cgroup; + struct cgroup *cgrp; pid_t pid; int ret; if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return -EINVAL; - if (!cgroup_lock_live_group(cgrp)) + cgrp = cgroup_kn_lock_live(of->kn); + if (!cgrp) return -ENODEV; retry_find_task: @@ -2379,7 +2363,7 @@ retry_find_task: put_task_struct(tsk); out_unlock_cgroup: - mutex_unlock(&cgroup_mutex); + cgroup_kn_unlock(of->kn); return ret ?: nbytes; } @@ -2429,17 +2413,18 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of, static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - struct cgroup *cgrp = of_css(of)->cgroup; - struct cgroup_root *root = cgrp->root; + struct cgroup *cgrp; - BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX); - if (!cgroup_lock_live_group(cgrp)) + BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + + cgrp = cgroup_kn_lock_live(of->kn); + if (!cgrp) return -ENODEV; spin_lock(&release_agent_path_lock); - strlcpy(root->release_agent_path, strstrip(buf), - sizeof(root->release_agent_path)); + strlcpy(cgrp->root->release_agent_path, strstrip(buf), + sizeof(cgrp->root->release_agent_path)); spin_unlock(&release_agent_path_lock); - mutex_unlock(&cgroup_mutex); + cgroup_kn_unlock(of->kn); return nbytes; } -- cgit From 01f6474ce04fffd6282b569ac0a31f4b98d4c82a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:19:23 -0400 Subject: cgroup: nest kernfs active protection under cgroup_mutex After the recent cgroup_kn_lock_live() changes, cgroup_mutex is no longer nested below kernfs active protection. The two don't have any relationship now. This patch nests kernfs active protection under cgroup_mutex. All cftype operations now require both cgroup_tree_mutex and cgroup_mutex, temporary cgroup_mutex releases over kernfs operations are removed, and cgroup_add/rm_cftypes() grab both mutexes. This makes cgroup_tree_mutex redundant, which will be removed by the next patch. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b7cd80845f6a..bf1d7ce250ac 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1127,7 +1127,7 @@ static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn) cgrp = kn->parent->priv; /* - * We're gonna grab cgroup_tree_mutex which nests outside kernfs + * We're gonna grab cgroup_mutex which nests outside kernfs * active_ref. cgroup liveliness check alone provides enough * protection against removal. Ensure @cgrp stays accessible and * break the active_ref protection. @@ -1150,6 +1150,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) char name[CGROUP_FILE_NAME_MAX]; lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); } @@ -1216,11 +1217,9 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) * Nothing can fail from this point on. Remove files for the * removed subsystems and rebind each subsystem. 
*/ - mutex_unlock(&cgroup_mutex); for_each_subsys(ss, ssid) if (ss_mask & (1 << ssid)) cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); - mutex_lock(&cgroup_mutex); for_each_subsys(ss, ssid) { struct cgroup_root *src_root; @@ -2946,6 +2945,7 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], int ret; lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); for (cft = cfts; cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? */ @@ -2981,6 +2981,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) int ret = 0; lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); /* add/rm files for all cgroups created before */ css_for_each_descendant_pre(css, cgroup_css(root, ss)) { @@ -3049,6 +3050,7 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) static int cgroup_rm_cftypes_locked(struct cftype *cfts) { lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); if (!cfts || !cfts[0].ss) return -ENOENT; @@ -3075,7 +3077,9 @@ int cgroup_rm_cftypes(struct cftype *cfts) int ret; mutex_lock(&cgroup_tree_mutex); + mutex_lock(&cgroup_mutex); ret = cgroup_rm_cftypes_locked(cfts); + mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); return ret; } @@ -3106,12 +3110,14 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) return ret; mutex_lock(&cgroup_tree_mutex); + mutex_lock(&cgroup_mutex); list_add_tail(&cfts->node, &ss->cfts); ret = cgroup_apply_cftypes(cfts, true); if (ret) cgroup_rm_cftypes_locked(cfts); + mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); return ret; } @@ -4445,6 +4451,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref) static void kill_css(struct cgroup_subsys_state *css) { lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); /* * This must happen before css is disassociated with its cgroup. @@ -4544,13 +4551,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* * Initiate massacre of all css's. cgroup_destroy_css_killed() * will be invoked to perform the rest of destruction once the - * percpu refs of all css's are confirmed to be killed. This - * involves removing the subsystem's files, drop cgroup_mutex. + * percpu refs of all css's are confirmed to be killed. */ - mutex_unlock(&cgroup_mutex); for_each_css(css, ssid, cgrp) kill_css(css); - mutex_lock(&cgroup_mutex); /* CGRP_DEAD is set, remove from ->release_list for the last time */ raw_spin_lock(&release_list_lock); @@ -4567,10 +4571,11 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) if (!cgrp->nr_css) cgroup_destroy_css_killed(cgrp); - /* remove @cgrp directory along with the base files */ - mutex_unlock(&cgroup_mutex); - kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */ - mutex_lock(&cgroup_mutex); + /* + * Remove @cgrp directory along with the base files. @cgrp has an + * extra ref on its kn. 
+ */ + kernfs_remove(cgrp->kn); return 0; }; -- cgit From 8353da1f91f12a3079ecc849226f371242d2807c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:19:23 -0400 Subject: cgroup: remove cgroup_tree_mutex cgroup_tree_mutex was introduced to work around the circular dependency between cgroup_mutex and kernfs active protection - some kernfs file and directory operations needed cgroup_mutex putting cgroup_mutex under active protection but cgroup also needs to be able to access cgroup hierarchies and cftypes to determine which kernfs_nodes need to be removed. cgroup_tree_mutex nested above both cgroup_mutex and kernfs active protection and used to protect the hierarchy and cftypes. While this worked, it added a lot of double lockings and was generally cumbersome. kernfs provides a mechanism to opt out of active protection and cgroup was already using it for removal and subtree_control. There's no reason to mix both methods of avoiding circular locking dependency and the preceding cgroup_kn_lock_live() changes applied it to all relevant cgroup kernfs operations making it unnecessary to nest cgroup_mutex under kernfs active protection. The previous patch reversed the original lock ordering and put cgroup_mutex above kernfs active protection. After these changes, all cgroup_tree_mutex usages are now accompanied by cgroup_mutex making the former completely redundant. This patch removes cgroup_tree_mutex and all its usages. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 64 ++++++++------------------------------------------------- 1 file changed, 9 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index bf1d7ce250ac..457e52705f56 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -70,15 +70,6 @@ #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ MAX_CFTYPE_NAME + 2) -/* - * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file - * creation/removal and hierarchy changing operations including cgroup - * creation, removal, css association and controller rebinding. This outer - * lock is needed mainly to resolve the circular dependency between kernfs - * active ref and cgroup_mutex. cgroup_tree_mutex nests above both. - */ -static DEFINE_MUTEX(cgroup_tree_mutex); - /* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. 
@@ -111,11 +102,10 @@ static DEFINE_SPINLOCK(cgroup_idr_lock); */ static DEFINE_SPINLOCK(release_agent_path_lock); -#define cgroup_assert_mutexes_or_rcu_locked() \ +#define cgroup_assert_mutex_or_rcu_locked() \ rcu_lockdep_assert(rcu_read_lock_held() || \ - lockdep_is_held(&cgroup_tree_mutex) || \ lockdep_is_held(&cgroup_mutex), \ - "cgroup_[tree_]mutex or RCU read lock required"); + "cgroup_mutex or RCU read lock required"); /* * cgroup destruction makes heavy use of work items and there can be a lot @@ -243,7 +233,6 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, { if (ss) return rcu_dereference_check(cgrp->subsys[ss->id], - lockdep_is_held(&cgroup_tree_mutex) || lockdep_is_held(&cgroup_mutex)); else return &cgrp->dummy_css; @@ -347,7 +336,6 @@ static int notify_on_release(const struct cgroup *cgrp) for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ if (!((css) = rcu_dereference_check( \ (cgrp)->subsys[(ssid)], \ - lockdep_is_held(&cgroup_tree_mutex) || \ lockdep_is_held(&cgroup_mutex)))) { } \ else @@ -381,7 +369,7 @@ static int notify_on_release(const struct cgroup *cgrp) /* iterate over child cgrps, lock should be held throughout iteration */ #define cgroup_for_each_live_child(child, cgrp) \ list_for_each_entry((child), &(cgrp)->children, sibling) \ - if (({ lockdep_assert_held(&cgroup_tree_mutex); \ + if (({ lockdep_assert_held(&cgroup_mutex); \ cgroup_is_dead(child); })) \ ; \ else @@ -869,7 +857,6 @@ static void cgroup_destroy_root(struct cgroup_root *root) struct cgroup *cgrp = &root->cgrp; struct cgrp_cset_link *link, *tmp_link; - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); BUG_ON(atomic_read(&root->nr_cgrps)); @@ -899,7 +886,6 @@ static void cgroup_destroy_root(struct cgroup_root *root) cgroup_exit_root_id(root); mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); kernfs_destroy_root(root->kf_root); cgroup_free_root(root); @@ -1096,7 +1082,6 @@ static void cgroup_kn_unlock(struct kernfs_node *kn) cgrp = kn->parent->priv; mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); kernfs_unbreak_active_protection(kn); cgroup_put(cgrp); @@ -1135,7 +1120,6 @@ static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn) cgroup_get(cgrp); kernfs_break_active_protection(kn); - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); if (!cgroup_is_dead(cgrp)) @@ -1149,7 +1133,6 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) { char name[CGROUP_FILE_NAME_MAX]; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); } @@ -1179,7 +1162,6 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) struct cgroup_subsys *ss; int ssid, i, ret; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); for_each_subsys(ss, ssid) { @@ -1457,7 +1439,6 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) return -EINVAL; } - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); /* See what subsystems are wanted */ @@ -1503,7 +1484,6 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) kfree(opts.release_agent); kfree(opts.name); mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); return ret; } @@ -1606,7 +1586,6 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) struct css_set *cset; int i, ret; - lockdep_assert_held(&cgroup_tree_mutex); 
lockdep_assert_held(&cgroup_mutex); ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT); @@ -1696,7 +1675,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (!use_task_css_set_links) cgroup_enable_task_cg_lists(); - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); /* First find the desired set of subsystems */ @@ -1761,9 +1739,7 @@ retry: */ if (!atomic_inc_not_zero(&root->cgrp.refcnt)) { mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); msleep(10); - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); goto retry; } @@ -1796,7 +1772,6 @@ retry: out_unlock: mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); kfree(opts.release_agent); kfree(opts.name); @@ -2507,7 +2482,6 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) struct css_set *src_cset; int ret; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); /* look up all csses currently attached to @cgrp's subtree */ @@ -2866,20 +2840,18 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, return -EPERM; /* - * We're gonna grab cgroup_tree_mutex which nests outside kernfs + * We're gonna grab cgroup_mutex which nests outside kernfs * active_ref. kernfs_rename() doesn't require active_ref - * protection. Break them before grabbing cgroup_tree_mutex. + * protection. Break them before grabbing cgroup_mutex. */ kernfs_break_active_protection(new_parent); kernfs_break_active_protection(kn); - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); ret = kernfs_rename(kn, new_parent, new_name_str); mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); kernfs_unbreak_active_protection(kn); kernfs_unbreak_active_protection(new_parent); @@ -2944,7 +2916,6 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], struct cftype *cft; int ret; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); for (cft = cfts; cft->name[0] != '\0'; cft++) { @@ -2980,7 +2951,6 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) struct cgroup_subsys_state *css; int ret = 0; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); /* add/rm files for all cgroups created before */ @@ -3049,7 +3019,6 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) static int cgroup_rm_cftypes_locked(struct cftype *cfts) { - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); if (!cfts || !cfts[0].ss) @@ -3076,11 +3045,9 @@ int cgroup_rm_cftypes(struct cftype *cfts) { int ret; - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); ret = cgroup_rm_cftypes_locked(cfts); mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); return ret; } @@ -3109,7 +3076,6 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) if (ret) return ret; - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); list_add_tail(&cfts->node, &ss->cfts); @@ -3118,7 +3084,6 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) cgroup_rm_cftypes_locked(cfts); mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); return ret; } @@ -3158,7 +3123,7 @@ css_next_child(struct cgroup_subsys_state *pos_css, struct cgroup *cgrp = parent_css->cgroup; struct cgroup *next; - cgroup_assert_mutexes_or_rcu_locked(); + cgroup_assert_mutex_or_rcu_locked(); /* * @pos could already have been removed. 
Once a cgroup is removed, @@ -3224,7 +3189,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, { struct cgroup_subsys_state *next; - cgroup_assert_mutexes_or_rcu_locked(); + cgroup_assert_mutex_or_rcu_locked(); /* if first iteration, visit @root */ if (!pos) @@ -3264,7 +3229,7 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos) { struct cgroup_subsys_state *last, *tmp; - cgroup_assert_mutexes_or_rcu_locked(); + cgroup_assert_mutex_or_rcu_locked(); do { last = pos; @@ -3311,7 +3276,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, { struct cgroup_subsys_state *next; - cgroup_assert_mutexes_or_rcu_locked(); + cgroup_assert_mutex_or_rcu_locked(); /* if first iteration, visit leftmost descendant which may be @root */ if (!pos) @@ -4178,7 +4143,6 @@ static int online_css(struct cgroup_subsys_state *css) struct cgroup_subsys *ss = css->ss; int ret = 0; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); if (ss->css_online) @@ -4196,7 +4160,6 @@ static void offline_css(struct cgroup_subsys_state *css) { struct cgroup_subsys *ss = css->ss; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); if (!(css->flags & CSS_ONLINE)) @@ -4399,7 +4362,6 @@ static void css_killed_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup *cgrp = css->cgroup; - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); /* @@ -4417,7 +4379,6 @@ static void css_killed_work_fn(struct work_struct *work) cgroup_destroy_css_killed(cgrp); mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); /* * Put the css refs from kill_css(). Each css holds an extra @@ -4450,7 +4411,6 @@ static void css_killed_ref_fn(struct percpu_ref *ref) */ static void kill_css(struct cgroup_subsys_state *css) { - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); /* @@ -4510,7 +4470,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) bool empty; int ssid; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); /* @@ -4593,7 +4552,6 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp) { struct cgroup *parent = cgrp->parent; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); /* delete this cgroup from parent->children */ @@ -4647,7 +4605,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); idr_init(&ss->css_idr); @@ -4685,7 +4642,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) cgrp_dfl_root.subsys_mask |= 1 << ss->id; mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); } /** @@ -4735,7 +4691,6 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); /* Add init_css_set to the hash table */ @@ -4745,7 +4700,6 @@ int __init cgroup_init(void) BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); for_each_subsys(ss, ssid) { if (ss->early_init) { -- cgit From ad0dc7f94dbf417b1c7d42e1f0b250f045b27f8f Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 19 Feb 2014 10:51:42 -0800 Subject: rcutorture: Add forward-progress checking for writer The rcutorture output currently does not distinguish between stalls in the RCU implementation and stalls in the rcu_torture_writer() kthreads. This commit therefore adds some diagnostics to help distinguish between these two conditions, at least for the non-SRCU implementations. (SRCU does not provide evidence of update-side forward progress by design.) Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 19 +++++++++++++++++++ kernel/rcu/rcutorture.c | 37 +++++++++++++++++++++++++++++++++++++ kernel/rcu/tree.c | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 89 insertions(+) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 00a7fd61b3c6..82973738125b 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -51,7 +51,17 @@ extern int rcu_expedited; /* for sysctl */ extern int rcutorture_runnable; /* for sysctl */ #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ +enum rcutorture_type { + RCU_FLAVOR, + RCU_BH_FLAVOR, + RCU_SCHED_FLAVOR, + SRCU_FLAVOR, + INVALID_RCU_FLAVOR +}; + #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) +void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, + unsigned long *gpnum, unsigned long *completed); void rcutorture_record_test_transition(void); void rcutorture_record_progress(unsigned long vernum); void do_trace_rcu_torture_read(const char *rcutorturename, @@ -60,6 +70,15 @@ void do_trace_rcu_torture_read(const char *rcutorturename, unsigned long c_old, unsigned long c); #else +static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, + int *flags, + unsigned long *gpnum, + unsigned long *completed) +{ + *flags = 0; + *gpnum = 0; + *completed = 0; +} static inline void rcutorture_record_test_transition(void) { } diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index bd30bc61bc05..0d739e3797e3 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -138,6 +138,15 @@ static long n_barrier_attempts; static long n_barrier_successes; static struct list_head rcu_torture_removed; +static int rcu_torture_writer_state; +#define RTWS_FIXED_DELAY 0 +#define RTWS_DELAY 1 +#define RTWS_REPLACE 2 +#define RTWS_DEF_FREE 3 +#define RTWS_EXP_SYNC 4 +#define RTWS_STUTTER 5 +#define RTWS_STOPPING 6 + #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) #define RCUTORTURE_RUNNABLE_INIT 1 #else @@ -214,6 +223,7 @@ rcu_torture_free(struct rcu_torture *p) */ struct rcu_torture_ops { + int ttype; void (*init)(void); int (*readlock)(void); void (*read_delay)(struct torture_random_state *rrsp); @@ -312,6 +322,7 @@ static void rcu_sync_torture_init(void) } static struct rcu_torture_ops rcu_ops = { + .ttype = RCU_FLAVOR, .init = rcu_sync_torture_init, .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, @@ -355,6 +366,7 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p) } static struct rcu_torture_ops rcu_bh_ops = { + .ttype = RCU_BH_FLAVOR, .init = rcu_sync_torture_init, .readlock = rcu_bh_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ @@ -397,6 +409,7 @@ call_rcu_busted(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } static struct rcu_torture_ops rcu_busted_ops = { + .ttype = INVALID_RCU_FLAVOR, .init = rcu_sync_torture_init, .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. 
*/ @@ -492,6 +505,7 @@ static void srcu_torture_synchronize_expedited(void) } static struct rcu_torture_ops srcu_ops = { + .ttype = SRCU_FLAVOR, .init = rcu_sync_torture_init, .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, @@ -527,6 +541,7 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p) } static struct rcu_torture_ops sched_ops = { + .ttype = RCU_SCHED_FLAVOR, .init = rcu_sync_torture_init, .readlock = sched_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ @@ -699,12 +714,15 @@ rcu_torture_writer(void *arg) set_user_nice(current, MAX_NICE); do { + rcu_torture_writer_state = RTWS_FIXED_DELAY; schedule_timeout_uninterruptible(1); rp = rcu_torture_alloc(); if (rp == NULL) continue; rp->rtort_pipe_count = 0; + rcu_torture_writer_state = RTWS_DELAY; udelay(torture_random(&rand) & 0x3ff); + rcu_torture_writer_state = RTWS_REPLACE; old_rp = rcu_dereference_check(rcu_torture_current, current == writer_task); rp->rtort_mbtest = 1; @@ -721,8 +739,10 @@ rcu_torture_writer(void *arg) else exp = gp_exp; if (!exp) { + rcu_torture_writer_state = RTWS_DEF_FREE; cur_ops->deferred_free(old_rp); } else { + rcu_torture_writer_state = RTWS_EXP_SYNC; cur_ops->exp_sync(); list_add(&old_rp->rtort_free, &rcu_torture_removed); @@ -743,8 +763,10 @@ rcu_torture_writer(void *arg) } } rcutorture_record_progress(++rcu_torture_current_version); + rcu_torture_writer_state = RTWS_STUTTER; stutter_wait("rcu_torture_writer"); } while (!torture_must_stop()); + rcu_torture_writer_state = RTWS_STOPPING; torture_kthread_stopping("rcu_torture_writer"); return 0; } @@ -937,6 +959,7 @@ rcu_torture_printk(char *page) int i; long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; + static unsigned long rtcv_snap = ULONG_MAX; for_each_possible_cpu(cpu) { for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { @@ -997,6 +1020,20 @@ rcu_torture_printk(char *page) page += sprintf(page, "\n"); if (cur_ops->stats) cur_ops->stats(page); + if (rtcv_snap == rcu_torture_current_version && + rcu_torture_current != NULL) { + int __maybe_unused flags; + unsigned long __maybe_unused gpnum; + unsigned long __maybe_unused completed; + + rcutorture_get_gp_data(cur_ops->ttype, + &flags, &gpnum, &completed); + page += sprintf(page, + "??? Writer stall state %d g%lu c%lu f%#x\n", + rcu_torture_writer_state, + gpnum, completed, flags); + } + rtcv_snap = rcu_torture_current_version; } /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0c47e300210a..3d15b5a82ae8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -293,6 +293,39 @@ void rcutorture_record_test_transition(void) } EXPORT_SYMBOL_GPL(rcutorture_record_test_transition); +/* + * Send along grace-period-related data for rcutorture diagnostics. + */ +void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, + unsigned long *gpnum, unsigned long *completed) +{ + struct rcu_state *rsp = NULL; + + switch (test_type) { + case RCU_FLAVOR: + rsp = rcu_state; + break; + case RCU_BH_FLAVOR: + rsp = &rcu_bh_state; + break; + case RCU_SCHED_FLAVOR: + rsp = &rcu_sched_state; + break; + default: + break; + } + if (rsp != NULL) { + *flags = ACCESS_ONCE(rsp->gp_flags); + *gpnum = ACCESS_ONCE(rsp->gpnum); + *completed = ACCESS_ONCE(rsp->completed); + return; + } + *flags = 0; + *gpnum = 0; + *completed = 0; +} +EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); + /* * Record the number of writer passes through the current rcutorture test. 
* This is also used to correlate debugfs tracing stats with the rcutorture -- cgit From da601c63fd3a3e6c30f85eefd5ee46397b5b439d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 26 Feb 2014 12:14:51 -0800 Subject: torture: Intensify locking test The current lock_torture_writer() spends too much time sleeping and not enough time hammering locks, so that an eight-CPU test will often utilize only a CPU or two. This commit therefore makes lock_torture_writer() sleep less and hammer more: with the 0xfffff mask below, the writer sleeps only about once per 2^20 (roughly one in a million) lock acquisitions. Signed-off-by: Paul E. McKenney --- kernel/locking/locktorture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index f26b1a18e34e..b0d3e3c50672 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -219,7 +219,8 @@ static int lock_torture_writer(void *arg) set_user_nice(current, 19); do { - schedule_timeout_uninterruptible(1); + if ((torture_random(&rand) & 0xfffff) == 0) + schedule_timeout_uninterruptible(1); cur_ops->writelock(); if (WARN_ON_ONCE(lock_is_write_held)) lwsp->n_write_lock_fail++; -- cgit From 4982223e51e8ea9d09bb33c8323b5ec1877b2b51 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 14 May 2014 10:54:19 +0930 Subject: module: set nx before marking module MODULE_STATE_COMING. We currently set RO & NX on modules very late: after we move them from MODULE_STATE_UNFORMED to MODULE_STATE_COMING, and after we call parse_args() (which can exec code in the module). Much better is to do it in complete_formation() and then call the notifier. This means that the notifiers will be called on a module which is already RO & NX, so that may cause problems (ftrace already changed so they're unaffected). Signed-off-by: Rusty Russell --- kernel/module.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 66e4e0d260a9..f944c72d3c8a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3023,21 +3023,6 @@ static int do_init_module(struct module *mod) */ current->flags &= ~PF_USED_ASYNC; - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_COMING, mod); - - /* Set RO and NX regions for core */ - set_section_ro_nx(mod->module_core, - mod->core_text_size, - mod->core_ro_size, - mod->core_size); - - /* Set RO and NX regions for init */ - set_section_ro_nx(mod->module_init, - mod->init_text_size, - mod->init_ro_size, - mod->init_size); - do_mod_ctors(mod); /* Start the module */ if (mod->init != NULL) @@ -3168,9 +3153,26 @@ static int complete_formation(struct module *mod, struct load_info *info) /* This relies on module_mutex for list integrity. */ module_bug_finalize(info->hdr, info->sechdrs, mod); + /* Set RO and NX regions for core */ + set_section_ro_nx(mod->module_core, + mod->core_text_size, + mod->core_ro_size, + mod->core_size); + + /* Set RO and NX regions for init */ + set_section_ro_nx(mod->module_init, + mod->init_text_size, + mod->init_ro_size, + mod->init_size); + /* Mark state as coming so strong_try_module_get() ignores us, * but kallsyms etc. can see us.
*/ mod->state = MODULE_STATE_COMING; + mutex_unlock(&module_mutex); + + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_COMING, mod); + return 0; out: mutex_unlock(&module_mutex); -- cgit From 29dedee0e693aa113164c820395ce51446a71ace Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 5 May 2014 16:38:18 +0200 Subject: uprobes: Add mem_cgroup_charge_anon() into uprobe_write_opcode() Hugh says: The one I noticed was that it forgets all about memcg (because it was copied from KSM, and there the replacement page has already been charged to a memcg). See how mm/memory.c do_anonymous_page() does a mem_cgroup_charge_anon(). Hopefully not a big problem, uprobes is a system-wide thing and only root can insert the probes. But I agree, should be fixed anyway. Add mem_cgroup_{un,}charge_anon() into uprobe_write_opcode(). To simplify the error handling (and avoid the new "uncharge" label) the patch also moves anon_vma_prepare() up before we alloc/charge the new page. While at it fix the comment about ->mmap_sem, it is held for write. Suggested-by: Hugh Dickins Signed-off-by: Oleg Nesterov --- kernel/events/uprobes.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7716c40f2c50..a13251e8bfa4 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -279,18 +279,13 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t * supported by that architecture then we need to modify is_trap_at_addr and * uprobe_write_opcode accordingly. This would never be a problem for archs * that have fixed length instructions. - */ - -/* + * * uprobe_write_opcode - write the opcode at a given virtual address. * @mm: the probed process address space. * @vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @vaddr. * - * Called with mm->mmap_sem held (for read and with a reference to - * mm). - * - * For mm @mm, write the opcode at @vaddr. + * Called with mm->mmap_sem held for write. * Return 0 (success) or a negative errno. */ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, @@ -310,21 +305,25 @@ retry: if (ret <= 0) goto put_old; + ret = anon_vma_prepare(vma); + if (ret) + goto put_old; + ret = -ENOMEM; new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); if (!new_page) goto put_old; - __SetPageUptodate(new_page); + if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) + goto put_new; + __SetPageUptodate(new_page); copy_highpage(new_page, old_page); copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); - ret = anon_vma_prepare(vma); - if (ret) - goto put_new; - ret = __replace_page(vma, vaddr, old_page, new_page); + if (ret) + mem_cgroup_uncharge_page(new_page); put_new: page_cache_release(new_page); -- cgit From b02ef20a9fba08948e643d3eec0efadf1da01a44 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 12 May 2014 18:24:45 +0200 Subject: uprobes/x86: Fix the wrong ->si_addr when xol triggers a trap If the probed insn triggers a trap, ->si_addr = regs->ip is technically correct, but this is not what the signal handler wants; we need to pass the address of the probed insn, not the address of the xol slot. Add the new arch-agnostic helper, uprobe_get_trap_addr(), and change fill_trap_info() and math_error() to use it. The !CONFIG_UPROBES case in uprobes.h uses a macro to avoid include hell and ensure that it can be compiled even if an architecture doesn't define instruction_pointer().
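A minimal illustration of that include-hell point (a sketch, not taken from the patch itself):

	/* A static inline stub would need instruction_pointer() declared
	 * right here in uprobes.h, dragging in the ptrace headers: */
	static inline unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
	{
		return instruction_pointer(regs);	/* must be visible here */
	}

	/* The macro is only expanded at its call sites, so uprobes.h itself
	 * compiles even before instruction_pointer() has been defined: */
	#define uprobe_get_trap_addr(regs)	instruction_pointer(regs)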
Test-case: #include <stdio.h> #include <signal.h> #include <unistd.h> extern void probe_div(void); void sigh(int sig, siginfo_t *info, void *c) { int passed = (info->si_addr == probe_div); printf(passed ? "PASS\n" : "FAIL\n"); _exit(!passed); } int main(void) { struct sigaction sa = { .sa_sigaction = sigh, .sa_flags = SA_SIGINFO, }; sigaction(SIGFPE, &sa, NULL); asm ( "xor %ecx,%ecx\n" ".globl probe_div; probe_div:\n" "idiv %ecx\n" ); return 0; } it fails if probe_div() is probed. Note: show_unhandled_signals users should probably use this helper too, but we need to clean them up first. Signed-off-by: Oleg Nesterov Reviewed-by: Masami Hiramatsu --- arch/x86/kernel/traps.c | 7 ++++--- include/linux/uprobes.h | 4 ++++ kernel/events/uprobes.c | 10 ++++++++++ 3 files changed, 18 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 73b3ea32245a..3fdb20548c4b 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -23,6 +23,7 @@ #include #include #include +#include <linux/uprobes.h> #include #include #include @@ -148,11 +149,11 @@ static siginfo_t *fill_trap_info(struct pt_regs *regs, int signr, int trapnr, case X86_TRAP_DE: sicode = FPE_INTDIV; - siaddr = regs->ip; + siaddr = uprobe_get_trap_addr(regs); break; case X86_TRAP_UD: sicode = ILL_ILLOPN; - siaddr = regs->ip; + siaddr = uprobe_get_trap_addr(regs); break; case X86_TRAP_AC: sicode = BUS_ADRALN; @@ -531,7 +532,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) task->thread.error_code = error_code; info.si_signo = SIGFPE; info.si_errno = 0; - info.si_addr = (void __user *)regs->ip; + info.si_addr = (void __user *)uprobe_get_trap_addr(regs); if (trapnr == X86_TRAP_MF) { unsigned short cwd, swd; /* diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index edff2b97b864..88c3b7e8b384 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -102,6 +102,7 @@ extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, u extern bool __weak is_swbp_insn(uprobe_opcode_t *insn); extern bool __weak is_trap_insn(uprobe_opcode_t *insn); extern unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs); +extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool); @@ -130,6 +131,9 @@ extern bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *r #else /* !CONFIG_UPROBES */ struct uprobes_state { }; + +#define uprobe_get_trap_addr(regs) instruction_pointer(regs) + static inline int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index a13251e8bfa4..3b02c72938a8 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1351,6 +1351,16 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs) return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE; } +unsigned long uprobe_get_trap_addr(struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + + if (unlikely(utask && utask->active_uprobe)) + return utask->vaddr; + + return instruction_pointer(regs); + } + /* * Called with no locks held. * Called in context of a exiting or a exec-ing thread.
-- cgit From a015edd26e28afe225cdd04f25794bd2b3bbe2da Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 09:15:00 -0400 Subject: cgroup: use restart_syscall() for mount retries cgroup_mount() uses dumb delay-and-retry logic to wait for a cgroup_root which is being destroyed. The retry currently loops inside cgroup_mount() proper. This patch makes it return with restart_syscall() instead so that the retry travels out to the userland boundary. This slightly simplifies the logic and more importantly makes the retry logic behave better when the wait for some reason becomes lengthy or infinite by allowing the operation to be suspended or terminated from userland. v2: The original patch forgot to free memory allocated for @opts. Fixed. Caught by Li Zefan. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 457e52705f56..f36fd9c15b3a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1681,7 +1681,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, ret = parse_cgroupfs_options(data, &opts); if (ret) goto out_unlock; -retry: + /* look for a matching existing root */ if (!opts.subsys_mask && !opts.none && !opts.name) { cgrp_dfl_root_visible = true; @@ -1740,8 +1740,8 @@ retry: if (!atomic_inc_not_zero(&root->cgrp.refcnt)) { mutex_unlock(&cgroup_mutex); msleep(10); - mutex_lock(&cgroup_mutex); - goto retry; + ret = restart_syscall(); + goto out_free; } ret = 0; @@ -1772,7 +1772,7 @@ retry: out_unlock: mutex_unlock(&cgroup_mutex); - +out_free: kfree(opts.release_agent); kfree(opts.name); -- cgit From 9d800df12d31734a6853915e9d2deb5d6747985f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 09:15:00 -0400 Subject: cgroup: rename cgroup->dummy_css to ->self and move it to the top cgroup->dummy_css is used as the placeholder css when performing css-oriented operations on the cgroup. We're gonna shift more cgroup management to this css. Let's rename it to ->self and move it to the top. This is a pure rename and field relocation. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 6 +++--- kernel/cgroup.c | 20 ++++++++++---------- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index aa7353deaaf3..164851e388e7 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -143,6 +143,9 @@ enum { }; struct cgroup { + /* self css with NULL ->ss, points back to this cgroup */ + struct cgroup_subsys_state self; + unsigned long flags; /* "unsigned long" so bitops work */ /* @@ -224,9 +227,6 @@ struct cgroup { struct list_head pidlists; struct mutex pidlist_mutex; - /* dummy css with NULL ->ss, points back to this cgroup */ - struct cgroup_subsys_state dummy_css; - /* For css percpu_ref killing and RCU-protected deletion */ struct rcu_head rcu_head; struct work_struct destroy_work; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f36fd9c15b3a..b57a949ae4bc 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -220,7 +220,7 @@ static void cgroup_idr_remove(struct idr *idr, int id) /** * cgroup_css - obtain a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest - * @ss: the subsystem of interest (%NULL returns the dummy_css) + * @ss: the subsystem of interest (%NULL returns @cgrp->self) * * Return @cgrp's css (cgroup_subsys_state) associated with @ss.
This * function must be called either under cgroup_mutex or rcu_read_lock() and @@ -235,13 +235,13 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, return rcu_dereference_check(cgrp->subsys[ss->id], lockdep_is_held(&cgroup_mutex)); else - return &cgrp->dummy_css; + return &cgrp->self; } /** * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest - * @ss: the subsystem of interest (%NULL returns the dummy_css) + * @ss: the subsystem of interest (%NULL returns @cgrp->self) * * Similar to cgroup_css() but returns the effctive css, which is defined * as the matching css of the nearest ancestor including self which has @ss @@ -254,7 +254,7 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, lockdep_assert_held(&cgroup_mutex); if (!ss) - return &cgrp->dummy_css; + return &cgrp->self; if (!(cgrp->root->subsys_mask & (1 << ss->id))) return NULL; @@ -288,7 +288,7 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) if (cft->ss) return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); else - return &cgrp->dummy_css; + return &cgrp->self; } EXPORT_SYMBOL_GPL(of_css); @@ -1551,7 +1551,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); - cgrp->dummy_css.cgroup = cgrp; + cgrp->self.cgroup = cgrp; for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); @@ -3454,7 +3454,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) * ->can_attach() fails. */ do { - css_task_iter_start(&from->dummy_css, &it); + css_task_iter_start(&from->self, &it); task = css_task_iter_next(&it); if (task) get_task_struct(task); @@ -3719,7 +3719,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (!array) return -ENOMEM; /* now, populate the array */ - css_task_iter_start(&cgrp->dummy_css, &it); + css_task_iter_start(&cgrp->self, &it); while ((tsk = css_task_iter_next(&it))) { if (unlikely(n == length)) break; @@ -3793,7 +3793,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) } rcu_read_unlock(); - css_task_iter_start(&cgrp->dummy_css, &it); + css_task_iter_start(&cgrp->self, &it); while ((tsk = css_task_iter_next(&it))) { switch (tsk->state) { case TASK_RUNNING: @@ -4274,7 +4274,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, init_cgroup_housekeeping(cgrp); cgrp->parent = parent; - cgrp->dummy_css.parent = &parent->dummy_css; + cgrp->self.parent = &parent->self; cgrp->root = root; if (notify_on_release(parent)) -- cgit From cbc125efada6d8c2555dd35e938694eb9b7cd791 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 09:15:01 -0400 Subject: cgroup: separate out cgroup_has_live_children() from cgroup_destroy_locked() We're expecting another user. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b57a949ae4bc..3f5f48130b6b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3295,6 +3295,21 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, return css_parent(pos); } +static bool cgroup_has_live_children(struct cgroup *cgrp) +{ + struct cgroup *child; + + rcu_read_lock(); + list_for_each_entry_rcu(child, &cgrp->children, sibling) { + if (!cgroup_is_dead(child)) { + rcu_read_unlock(); + return true; + } + } + rcu_read_unlock(); + return false; +} + /** * css_advance_task_iter - advance a task itererator to the next css_set * @it: the iterator to advance @@ -4465,7 +4480,6 @@ static void kill_css(struct cgroup_subsys_state *css) static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { - struct cgroup *child; struct cgroup_subsys_state *css; bool empty; int ssid; @@ -4487,15 +4501,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * emptiness as dead children linger on it while being destroyed; * otherwise, "rmdir parent/child parent" may fail with -EBUSY. */ - empty = true; - rcu_read_lock(); - list_for_each_entry_rcu(child, &cgrp->children, sibling) { - empty = cgroup_is_dead(child); - if (!empty) - break; - } - rcu_read_unlock(); - if (!empty) + if (cgroup_has_live_children(cgrp)) return -EBUSY; /* -- cgit From 9e4173e1f24fa3bd562f13b92ee34c7dfb1db7c9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 09:15:01 -0400 Subject: cgroup: move check_for_release(parent) call to the end of cgroup_destroy_locked() Currently, check_for_release() on the parent of a destroyed cgroup is invoked from cgroup_destroy_css_killed(). This is because this is where the destroyed cgroup can be removed from the parent's children list. check_for_release() tests the emptiness of the list directly, so invoking it before removing the cgroup from the list makes it think that the parent still has children even when it no longer does. This patch updates check_for_release() to use cgroup_has_live_children() instead of directly testing ->children emptiness and moves check_for_release(parent) earlier, to the end of cgroup_destroy_locked(). As cgroup_has_live_children() ignores cgroups marked DEAD, check_for_release() functions correctly as long as it's called after asserting DEAD. This makes release notification slightly more timely and, more importantly, enables further simplification of the cgroup destruction path.
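For orientation, the css teardown sequence that this series converges on, assembled from the diffs above and below (indentation shows who triggers whom; not part of any single patch):

	cgroup_destroy_locked()
	    kill_css() on each css		/* percpu ref killed */
	        css_killed_work_fn()		/* ref confirmed dead: offline_css(), css_put() */
	    last css_put()
	        css_release()			/* bounced to css_release_work_fn() */
	            css_free_rcu_fn()		/* after an RCU grace period */
	                css_free_work_fn()	/* ->css_free(), cgroup_put() */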
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3f5f48130b6b..061569fba245 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4542,6 +4542,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ kernfs_remove(cgrp->kn); + set_bit(CGRP_RELEASABLE, &cgrp->parent->flags); + check_for_release(cgrp->parent); + return 0; }; @@ -4556,17 +4559,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ static void cgroup_destroy_css_killed(struct cgroup *cgrp) { - struct cgroup *parent = cgrp->parent; - lockdep_assert_held(&cgroup_mutex); /* delete this cgroup from parent->children */ list_del_rcu(&cgrp->sibling); cgroup_put(cgrp); - - set_bit(CGRP_RELEASABLE, &parent->flags); - check_for_release(parent); } static int cgroup_rmdir(struct kernfs_node *kn) @@ -5006,7 +5004,7 @@ void cgroup_exit(struct task_struct *tsk) static void check_for_release(struct cgroup *cgrp) { if (cgroup_is_releasable(cgrp) && - list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) { + list_empty(&cgrp->cset_links) && !cgroup_has_live_children(cgrp)) { /* * Control Group is currently removeable. If it's not * already queued for a userspace notification, queue -- cgit From 4e4e28472365f8c7a7c55f6b5706f68bc40c5b13 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 09:15:01 -0400 Subject: cgroup: move cgroup->sibling unlinking to cgroup_put() Move cgroup->sibling unlinking from cgroup_destroy_css_killed() to cgroup_put(). This is later but still before the RCU grace period, so it doesn't break css_next_child() although there now is a larger window in which a dead cgroup is visible during css iteration. As css iteration always could have included offline csses, this doesn't affect correctness; however, it does make css_next_child() fall back to reiterating mode more often. This also makes cgroup_put() directly take cgroup_mutex, which limits where it can be called from. These are not immediately problematic and will be dealt with later. This change enables simplification of the cgroup destruction path. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 061569fba245..e9aa2a51ca68 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1056,6 +1056,11 @@ static void cgroup_put(struct cgroup *cgrp) if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp))) return; + /* delete this cgroup from parent->children */ + mutex_lock(&cgroup_mutex); + list_del_rcu(&cgrp->sibling); + mutex_unlock(&cgroup_mutex); + cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; @@ -4561,9 +4566,6 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp) { lockdep_assert_held(&cgroup_mutex); - /* delete this cgroup from parent->children */ - list_del_rcu(&cgrp->sibling); - cgroup_put(cgrp); } -- cgit From 249f3468a282dcbad53484c821bebb447f14ee03 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 09:15:01 -0400 Subject: cgroup: remove cgroup_destroy_css_killed() cgroup_destroy_css_killed() is the cgroup destruction stage which happens after all csses are offlined. After the recent updates, it no longer does anything other than putting the base reference. This patch removes the function and makes cgroup_destroy_locked() put the base ref at the end instead. This also makes cgroup->nr_css unnecessary.
Removed. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 3 --- kernel/cgroup.c | 62 +++++--------------------------------------------- 2 files changed, 6 insertions(+), 59 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 164851e388e7..160fcc69149e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -158,9 +158,6 @@ struct cgroup { */ int id; - /* the number of attached css's */ - int nr_css; - /* * If this cgroup contains any tasks, it contributes one to * populated_cnt. All children with non-zero popuplated_cnt of diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e9aa2a51ca68..4a94b0be598d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -178,7 +178,6 @@ static struct cftype cgroup_base_files[]; static void cgroup_put(struct cgroup *cgrp); static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask); -static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss); static void kill_css(struct cgroup_subsys_state *css); @@ -4169,7 +4168,6 @@ static int online_css(struct cgroup_subsys_state *css) ret = ss->css_online(css); if (!ret) { css->flags |= CSS_ONLINE; - css->cgroup->nr_css++; rcu_assign_pointer(css->cgroup->subsys[ss->id], css); } return ret; @@ -4189,7 +4187,6 @@ static void offline_css(struct cgroup_subsys_state *css) ss->css_offline(css); css->flags &= ~CSS_ONLINE; - css->cgroup->nr_css--; RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); wake_up_all(&css->cgroup->offline_waitq); @@ -4374,39 +4371,18 @@ out_destroy: /* * This is called when the refcnt of a css is confirmed to be killed. - * css_tryget_online() is now guaranteed to fail. + * css_tryget_online() is now guaranteed to fail. Tell the subsystem to + * initate destruction and put the css ref from kill_css(). */ static void css_killed_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, destroy_work); - struct cgroup *cgrp = css->cgroup; mutex_lock(&cgroup_mutex); - - /* - * css_tryget_online() is guaranteed to fail now. Tell subsystems - * to initate destruction. - */ offline_css(css); - - /* - * If @cgrp is marked dead, it's waiting for refs of all css's to - * be disabled before proceeding to the second phase of cgroup - * destruction. If we are the last one, kick it off. - */ - if (!cgrp->nr_css && cgroup_is_dead(cgrp)) - cgroup_destroy_css_killed(cgrp); - mutex_unlock(&cgroup_mutex); - /* - * Put the css refs from kill_css(). Each css holds an extra - * reference to the cgroup's dentry and cgroup removal proceeds - * regardless of css refs. On the last put of each css, whenever - * that may be, the extra dentry ref is put so that dentry - * destruction happens only after all css's are released. - */ css_put(css); } @@ -4518,11 +4494,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ set_bit(CGRP_DEAD, &cgrp->flags); - /* - * Initiate massacre of all css's. cgroup_destroy_css_killed() - * will be invoked to perform the rest of destruction once the - * percpu refs of all css's are confirmed to be killed. 
- */ + /* initiate massacre of all css's */ for_each_css(css, ssid, cgrp) kill_css(css); @@ -4532,15 +4504,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) list_del_init(&cgrp->release_list); raw_spin_unlock(&release_list_lock); - /* - * If @cgrp has css's attached, the second stage of cgroup - * destruction is kicked off from css_killed_work_fn() after the - * refs of all attached css's are killed. If @cgrp doesn't have - * any css, we kick it off here. - */ - if (!cgrp->nr_css) - cgroup_destroy_css_killed(cgrp); - /* * Remove @cgrp directory along with the base files. @cgrp has an * extra ref on its kn. @@ -4550,25 +4513,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) set_bit(CGRP_RELEASABLE, &cgrp->parent->flags); check_for_release(cgrp->parent); + /* put the base reference */ + cgroup_put(cgrp); + return 0; }; -/** - * cgroup_destroy_css_killed - the second step of cgroup destruction - * @cgrp: the cgroup whose csses have just finished offlining - * - * This function is invoked from a work item for a cgroup which is being - * destroyed after all css's are offlined and performs the rest of - * destruction. This is the second step of destruction described in the - * comment above cgroup_destroy_locked(). - */ -static void cgroup_destroy_css_killed(struct cgroup *cgrp) -{ - lockdep_assert_held(&cgroup_mutex); - - cgroup_put(cgrp); -} - static int cgroup_rmdir(struct kernfs_node *kn) { struct cgroup *cgrp; -- cgit From 25e15d835036a70a53dcc993beaa036f8919a373 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 09:15:02 -0400 Subject: cgroup: bounce css release through css->destroy_work css release is planned to do more and would require process context. Bounce it through css->destroy_work. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4a94b0be598d..e694f4153edb 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4126,10 +4126,10 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head) queue_work(cgroup_destroy_wq, &css->destroy_work); } -static void css_release(struct percpu_ref *ref) +static void css_release_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = - container_of(ref, struct cgroup_subsys_state, refcnt); + container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup_subsys *ss = css->ss; cgroup_idr_remove(&ss->css_idr, css->id); @@ -4137,6 +4137,15 @@ static void css_release(struct percpu_ref *ref) call_rcu(&css->rcu_head, css_free_rcu_fn); } +static void css_release(struct percpu_ref *ref) +{ + struct cgroup_subsys_state *css = + container_of(ref, struct cgroup_subsys_state, refcnt); + + INIT_WORK(&css->destroy_work, css_release_work_fn); + queue_work(cgroup_destroy_wq, &css->destroy_work); +} + static void init_and_link_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, struct cgroup *cgrp) { -- cgit From 9395a4500404e05173eda9a2d198b6fa500e90c5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 09:15:02 -0400 Subject: cgroup: enable refcnting for root csses Currently, css_get(), css_tryget() and css_tryget_online() are noops for root csses as an optimization; however, we're planning to use css refcnts to track cgroup lifetime too and root cgroups also need to be reference counted. Since css has been converted to percpu_refcnt, the overhead of refcnting is minuscule and this optimization isn't too meaningful anymore.
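For reference, the percpu-ref lifecycle this conversion leans on, as a minimal sketch against the 3.15-era two-argument percpu_ref_init() used in these diffs (error handling trimmed; illustrative, not from the patch):

	#include <linux/percpu-refcount.h>

	static void my_release(struct percpu_ref *ref)
	{
		/* runs once, after percpu_ref_kill() and the final put */
	}

	struct percpu_ref ref;

	percpu_ref_init(&ref, my_release);	/* count starts at 1, percpu mode */
	percpu_ref_get(&ref);			/* cheap this-CPU increment */
	if (percpu_ref_tryget_live(&ref))	/* fails once the ref is killed */
		percpu_ref_put(&ref);
	percpu_ref_put(&ref);			/* balances the plain get */
	percpu_ref_kill(&ref);			/* switches to atomic mode and drops
						   the initial reference; my_release()
						   fires when the count hits zero */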
Furthermore, controllers which optimize the root cgroup often never even invoke these functions in their hot paths. This patch enables refcnting for root csses too. This makes the CSS_ROOT flag unused and removes it. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 10 ++-------- kernel/cgroup.c | 6 +++--- 2 files changed, 5 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 160fcc69149e..286e39e4e9bf 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -77,7 +77,6 @@ struct cgroup_subsys_state { /* bits in struct cgroup_subsys_state flags field */ enum { - CSS_ROOT = (1 << 0), /* this CSS is the root of the subsystem */ CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */ }; @@ -89,9 +88,7 @@ enum { */ static inline void css_get(struct cgroup_subsys_state *css) { - /* We don't need to reference count the root state */ - if (!(css->flags & CSS_ROOT)) - percpu_ref_get(&css->refcnt); + percpu_ref_get(&css->refcnt); } /** @@ -106,8 +103,6 @@ static inline void css_get(struct cgroup_subsys_state *css) */ static inline bool css_tryget_online(struct cgroup_subsys_state *css) { - if (css->flags & CSS_ROOT) - return true; return percpu_ref_tryget_live(&css->refcnt); } @@ -119,8 +114,7 @@ static inline bool css_tryget_online(struct cgroup_subsys_state *css) */ static inline void css_put(struct cgroup_subsys_state *css) { - if (!(css->flags & CSS_ROOT)) - percpu_ref_put(&css->refcnt); + percpu_ref_put(&css->refcnt); } /* bits in struct cgroup flags field */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e694f4153edb..cb5864e36f99 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4158,8 +4158,6 @@ static void init_and_link_css(struct cgroup_subsys_state *css, if (cgrp->parent) { css->parent = cgroup_css(cgrp->parent, ss); css_get(css->parent); - } else { - css->flags |= CSS_ROOT; } BUG_ON(cgroup_css(cgrp, ss)); @@ -4582,9 +4580,10 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) BUG_ON(IS_ERR(css)); init_and_link_css(css, ss, &cgrp_dfl_root.cgrp); if (early) { /* allocation can't be done safely during early init */ css->id = 1; } else { + BUG_ON(percpu_ref_init(&css->refcnt, css_release)); css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); BUG_ON(css->id < 0); } @@ -4671,6 +4670,7 @@ int __init cgroup_init(void) struct cgroup_subsys_state *css = init_css_set.subsys[ss->id]; + BUG_ON(percpu_ref_init(&css->refcnt, css_release)); css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); BUG_ON(css->id < 0); -- cgit From 9d755d33f0db8c9b49438f71b38a56e375b34360 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 09:15:02 -0400 Subject: cgroup: use cgroup->self.refcnt for cgroup refcnting Currently cgroup implements refcnting separately using atomic_t cgroup->refcnt. The destruction paths of cgroup and css are rather complex and bear a lot of similarities including the use of RCU and bouncing to a work item. This patch makes cgroup use the refcnt of the self css for refcnting instead of using its own. This makes cgroup refcnting use css's percpu refcnt and share the destruction mechanism. * css_release_work_fn() and css_free_work_fn() are updated to handle both csses and cgroups. This is a bit messy but should do until we can make cgroup->self a full css, which currently can't be done thanks to multiple hierarchies.
* cgroup_destroy_locked() now performs percpu_ref_kill(&cgrp->self.refcnt) instead of cgroup_put(cgrp). * Negative refcnt sanity check in cgroup_get() is no longer necessary as percpu_ref already handles it. * Similarly, as a cgroup which hasn't been killed will never be released regardless of its refcnt value and percpu_ref has a sanity check on kill, cgroup_is_dead() sanity check in cgroup_put() is no longer necessary. * As whether a refcnt reached zero or not can only be decided after the reference count is killed, cgroup_root->cgrp's refcnting can no longer be used to decide whether to kill the root or not. Let's make cgroup_kill_sb() explicitly initiate destruction if the root doesn't have any children. This makes sense anyway as an unmounted cgroup hierarchy without any children should be destroyed. While this is a bit messy, this will allow pushing more bookkeeping towards cgroup->self and thus handling cgroups and csses in a more uniform way. In the very long term, it should be possible to introduce a base subsystem and convert the self css to a proper one making things a whole lot simpler and unified. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 6 -- kernel/cgroup.c | 146 +++++++++++++++++++++++++++---------------------- 2 files changed, 80 insertions(+), 72 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 286e39e4e9bf..76dadd77a120 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -160,8 +160,6 @@ struct cgroup { */ int populated_cnt; - atomic_t refcnt; - /* * We link our 'sibling' struct into our parent's 'children'. * Our children link their 'sibling' into our 'children'. @@ -218,10 +216,6 @@ struct cgroup { struct list_head pidlists; struct mutex pidlist_mutex; - /* For css percpu_ref killing and RCU-protected deletion */ - struct rcu_head rcu_head; - struct work_struct destroy_work; - /* used to wait for offlining of csses */ wait_queue_head_t offline_waitq; }; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index cb5864e36f99..c01e8e8dfad0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -176,10 +176,12 @@ static int need_forkexit_callback __read_mostly; static struct cftype cgroup_base_files[]; static void cgroup_put(struct cgroup *cgrp); +static bool cgroup_has_live_children(struct cgroup *cgrp); static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask); static int cgroup_destroy_locked(struct cgroup *cgrp); static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss); +static void css_release(struct percpu_ref *ref); static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); @@ -1008,62 +1010,15 @@ static umode_t cgroup_file_mode(const struct cftype *cft) return mode; } -static void cgroup_free_fn(struct work_struct *work) -{ - struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); - - atomic_dec(&cgrp->root->nr_cgrps); - cgroup_pidlist_destroy_all(cgrp); - - if (cgrp->parent) { - /* - * We get a ref to the parent, and put the ref when this - * cgroup is being freed, so it's guaranteed that the - * parent won't be destroyed before its children. - */ - cgroup_put(cgrp->parent); - kernfs_put(cgrp->kn); - kfree(cgrp); - } else { - /* - * This is root cgroup's refcnt reaching zero, which - * indicates that the root should be released.
- */ - cgroup_destroy_root(cgrp->root); - } -} - -static void cgroup_free_rcu(struct rcu_head *head) -{ - struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); - - INIT_WORK(&cgrp->destroy_work, cgroup_free_fn); - queue_work(cgroup_destroy_wq, &cgrp->destroy_work); -} - static void cgroup_get(struct cgroup *cgrp) { WARN_ON_ONCE(cgroup_is_dead(cgrp)); - WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0); - atomic_inc(&cgrp->refcnt); + css_get(&cgrp->self); } static void cgroup_put(struct cgroup *cgrp) { - if (!atomic_dec_and_test(&cgrp->refcnt)) - return; - if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp))) - return; - - /* delete this cgroup from parent->children */ - mutex_lock(&cgroup_mutex); - list_del_rcu(&cgrp->sibling); - mutex_unlock(&cgroup_mutex); - - cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); - cgrp->id = -1; - - call_rcu(&cgrp->rcu_head, cgroup_free_rcu); + css_put(&cgrp->self); } /** @@ -1548,7 +1503,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) struct cgroup_subsys *ss; int ssid; - atomic_set(&cgrp->refcnt, 1); INIT_LIST_HEAD(&cgrp->sibling); INIT_LIST_HEAD(&cgrp->children); INIT_LIST_HEAD(&cgrp->cset_links); @@ -1597,6 +1551,10 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) goto out; root_cgrp->id = ret; + ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release); + if (ret) + goto out; + /* * We're accessing css_set_count without locking css_set_rwsem here, * but that's OK - it can only be increased by someone holding @@ -1605,11 +1563,11 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) */ ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); if (ret) - goto out; + goto cancel_ref; ret = cgroup_init_root_id(root); if (ret) - goto out; + goto cancel_ref; root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops, KERNFS_ROOT_CREATE_DEACTIVATED, @@ -1657,6 +1615,8 @@ destroy_root: root->kf_root = NULL; exit_root_id: cgroup_exit_root_id(root); +cancel_ref: + percpu_ref_cancel_init(&root_cgrp->self.refcnt); out: free_cgrp_cset_links(&tmp_links); return ret; @@ -1735,13 +1695,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, } /* - * A root's lifetime is governed by its root cgroup. Zero - * ref indicate that the root is being destroyed. Wait for - * destruction to complete so that the subsystems are free. - * We can use wait_queue for the wait but this path is - * super cold. Let's just sleep for a bit and retry. + * A root's lifetime is governed by its root cgroup. + * tryget_live failure indicate that the root is being + * destroyed. Wait for destruction to complete so that the + * subsystems are free. We can use wait_queue for the wait + * but this path is super cold. Let's just sleep for a bit + * and retry. */ - if (!atomic_inc_not_zero(&root->cgrp.refcnt)) { + if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { mutex_unlock(&cgroup_mutex); msleep(10); ret = restart_syscall(); @@ -1794,7 +1755,16 @@ static void cgroup_kill_sb(struct super_block *sb) struct kernfs_root *kf_root = kernfs_root_from_sb(sb); struct cgroup_root *root = cgroup_root_from_kf(kf_root); - cgroup_put(&root->cgrp); + /* + * If @root doesn't have any mounts or children, start killing it. + * This prevents new mounts by disabling percpu_ref_tryget_live(). + * cgroup_mount() may wait for @root's release. 
+ */ + if (cgroup_has_live_children(&root->cgrp)) + cgroup_put(&root->cgrp); + else + percpu_ref_kill(&root->cgrp.self.refcnt); + kernfs_kill_sb(sb); } @@ -4110,11 +4080,37 @@ static void css_free_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup *cgrp = css->cgroup; - if (css->parent) - css_put(css->parent); + if (css->ss) { + /* css free path */ + if (css->parent) + css_put(css->parent); - css->ss->css_free(css); - cgroup_put(cgrp); + css->ss->css_free(css); + cgroup_put(cgrp); + } else { + /* cgroup free path */ + atomic_dec(&cgrp->root->nr_cgrps); + cgroup_pidlist_destroy_all(cgrp); + + if (cgrp->parent) { + /* + * We get a ref to the parent, and put the ref when + * this cgroup is being freed, so it's guaranteed + * that the parent won't be destroyed before its + * children. + */ + cgroup_put(cgrp->parent); + kernfs_put(cgrp->kn); + kfree(cgrp); + } else { + /* + * This is root cgroup's refcnt reaching zero, + * which indicates that the root should be + * released. + */ + cgroup_destroy_root(cgrp->root); + } + } } static void css_free_rcu_fn(struct rcu_head *rcu_head) @@ -4131,8 +4127,20 @@ static void css_release_work_fn(struct work_struct *work) struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup_subsys *ss = css->ss; + struct cgroup *cgrp = css->cgroup; - cgroup_idr_remove(&ss->css_idr, css->id); + if (ss) { + /* css release path */ + cgroup_idr_remove(&ss->css_idr, css->id); + } else { + /* cgroup release path */ + mutex_lock(&cgroup_mutex); + list_del_rcu(&cgrp->sibling); + mutex_unlock(&cgroup_mutex); + + cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); + cgrp->id = -1; + } call_rcu(&css->rcu_head, css_free_rcu_fn); } @@ -4285,6 +4293,10 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, goto out_unlock; } + ret = percpu_ref_init(&cgrp->self.refcnt, css_release); + if (ret) + goto out_free_cgrp; + /* * Temporarily set the pointer to NULL, so idr_find() won't return * a half-baked cgroup. @@ -4292,7 +4304,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT); if (cgrp->id < 0) { ret = -ENOMEM; - goto out_free_cgrp; + goto out_cancel_ref; } init_cgroup_housekeeping(cgrp); @@ -4365,6 +4377,8 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, out_free_id: cgroup_idr_remove(&root->cgroup_idr, cgrp->id); +out_cancel_ref: + percpu_ref_cancel_init(&cgrp->self.refcnt); out_free_cgrp: kfree(cgrp); out_unlock: @@ -4521,7 +4535,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) check_for_release(cgrp->parent); /* put the base reference */ - cgroup_put(cgrp); + percpu_ref_kill(&cgrp->self.refcnt); return 0; }; -- cgit From 66209a5bd4825e8890bfb65d48efa8a47c647fea Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 6 May 2014 21:57:49 -0400 Subject: ftrace: Remove boolean of hash_enable and hash_disable Commit 4104d326b670 "ftrace: Remove global function list and call function directly" cleaned up the global_ops filtering and made the code simpler, but it left a variable "hash_enable" that was used to know if the hash functions should be updated or not. It was updated if the global_ops did not override them. As the global_ops are now no different than any other ftrace_ops, the hash always gets updated and there's no reason to use the hash_enable boolean. 
The same goes for hash_disable used in ftrace_shutdown(). Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 38e5cf73b9ae..2c99d1f7caf1 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2042,7 +2042,6 @@ static void ftrace_startup_enable(int command) static int ftrace_startup(struct ftrace_ops *ops, int command) { - bool hash_enable = true; int ret; if (unlikely(ftrace_disabled)) @@ -2056,8 +2055,8 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) command |= FTRACE_UPDATE_CALLS; ops->flags |= FTRACE_OPS_FL_ENABLED; - if (hash_enable) - ftrace_hash_rec_enable(ops, 1); + + ftrace_hash_rec_enable(ops, 1); ftrace_startup_enable(command); @@ -2066,7 +2065,6 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) { - bool hash_disable = true; int ret; if (unlikely(ftrace_disabled)) @@ -2084,8 +2082,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) */ WARN_ON_ONCE(ftrace_start_up < 0); - if (hash_disable) - ftrace_hash_rec_disable(ops, 1); + ftrace_hash_rec_disable(ops, 1); if (!global_start_up) ops->flags &= ~FTRACE_OPS_FL_ENABLED; -- cgit From 19eab4a472cfe4a3ae51cff1711d795e3f9bb564 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 7 May 2014 15:06:14 -0400 Subject: ftrace: Write in missing comment from a very old commit Back in 2011, commit ed926f9b35cda "ftrace: Use counters to enable functions to trace" changed the way ftrace accounts for enabled and disabled traced functions. There was a comment that started as: /* * */ But never finished. Well, that's rather useless. I probably forgot to save the file before committing it. And it passed review all this time. Anyway, better late than never. I updated the comment to express what is happening in that somewhat complex code. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2c99d1f7caf1..61f39f8b62e1 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1552,7 +1552,14 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip); /* + * If filter_hash is set, we want to match all functions + * that are in the hash but not in the other hash. * + * If filter_hash is not set, then we are decrementing. + * That means we match anything that is in the hash + * and also in the other_hash. That is, we need to turn + * off functions in the other hash because they are disabled + * by this hash. */ if (filter_hash && in_hash && !in_other_hash) match = 1; -- cgit From 68f40969f0173c02ddc22a40df865c81c29070e4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 1 May 2014 12:44:50 -0400 Subject: ftrace: Always inline ftrace_hash_empty() helper function The ftrace_hash_empty() function is a simple test: return !hash || !hash->count; But gcc seems to want to make it a call. As this is in an extreme hot path of the function tracer, there's no reason it needs to be a call. I only wrote it to be a helper function anyway; otherwise, it would have been inlined manually. Force gcc to inline it, as it could have also been a macro.
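The macro form that last sentence alludes to would have been the one-liner below (illustrative only; the patch keeps the function form):

	#define ftrace_hash_empty(hash)	(!(hash) || !(hash)->count)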
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 61f39f8b62e1..98fa931b6864 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1105,7 +1105,7 @@ struct ftrace_page { static struct ftrace_page *ftrace_pages_start; static struct ftrace_page *ftrace_pages; -static bool ftrace_hash_empty(struct ftrace_hash *hash) +static bool __always_inline ftrace_hash_empty(struct ftrace_hash *hash) { return !hash || !hash->count; } -- cgit From 7413af1fb70e7efa6dbc7f27663e7a5126b3aa33 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 6 May 2014 21:34:14 -0400 Subject: ftrace: Make get_ftrace_addr() and get_ftrace_addr_old() global Move and rename get_ftrace_addr() and get_ftrace_addr_old() to ftrace_get_addr_new() and ftrace_get_addr_curr() respectively. This moves these two helper functions out of the arch-specific code and into the generic code, and renames them to have better generic names. This will allow other archs to use them, as well as making it a bit easier to work on getting separate trampolines for different functions. ftrace_get_addr_new() returns the trampoline address that the mcount call address will be converted to. ftrace_get_addr_curr() returns the trampoline address of what the mcount call address currently jumps to. Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 36 +++++------------------------------- include/linux/ftrace.h | 2 ++ kernel/trace/ftrace.c | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 43 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 4b3c195d4133..5ef43ce8492f 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -349,38 +349,12 @@ static int add_brk_on_nop(struct dyn_ftrace *rec) return add_break(rec->ip, old); } -/* - * If the record has the FTRACE_FL_REGS set, that means that it - * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS - * is not not set, then it wants to convert to the normal callback. - */ -static unsigned long get_ftrace_addr(struct dyn_ftrace *rec) -{ - if (rec->flags & FTRACE_FL_REGS) - return (unsigned long)FTRACE_REGS_ADDR; - else - return (unsigned long)FTRACE_ADDR; -} - -/* - * The FTRACE_FL_REGS_EN is set when the record already points to - * a function that saves all the regs. Basically the '_EN' version - * represents the current state of the function. - */ -static unsigned long get_ftrace_old_addr(struct dyn_ftrace *rec) -{ - if (rec->flags & FTRACE_FL_REGS_EN) - return (unsigned long)FTRACE_REGS_ADDR; - else - return (unsigned long)FTRACE_ADDR; -} - static int add_breakpoints(struct dyn_ftrace *rec, int enable) { unsigned long ftrace_addr; int ret; - ftrace_addr = get_ftrace_old_addr(rec); + ftrace_addr = ftrace_get_addr_curr(rec); ret = ftrace_test_record(rec, enable); @@ -438,14 +412,14 @@ static int remove_breakpoint(struct dyn_ftrace *rec) * If not, don't touch the breakpoint, we make just create * a disaster.
*/ - ftrace_addr = get_ftrace_addr(rec); + ftrace_addr = ftrace_get_addr_new(rec); nop = ftrace_call_replace(ip, ftrace_addr); if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) == 0) goto update; /* Check both ftrace_addr and ftrace_old_addr */ - ftrace_addr = get_ftrace_old_addr(rec); + ftrace_addr = ftrace_get_addr_curr(rec); nop = ftrace_call_replace(ip, ftrace_addr); if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) @@ -489,7 +463,7 @@ static int add_update(struct dyn_ftrace *rec, int enable) ret = ftrace_test_record(rec, enable); - ftrace_addr = get_ftrace_addr(rec); + ftrace_addr = ftrace_get_addr_new(rec); switch (ret) { case FTRACE_UPDATE_IGNORE: @@ -536,7 +510,7 @@ static int finish_update(struct dyn_ftrace *rec, int enable) ret = ftrace_update_record(rec, enable); - ftrace_addr = get_ftrace_addr(rec); + ftrace_addr = ftrace_get_addr_new(rec); switch (ret) { case FTRACE_UPDATE_IGNORE: diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index f0ff2c2453e7..2f8cbffecd3d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -400,6 +400,8 @@ int ftrace_update_record(struct dyn_ftrace *rec, int enable); int ftrace_test_record(struct dyn_ftrace *rec, int enable); void ftrace_run_stop_machine(int command); unsigned long ftrace_location(unsigned long ip); +unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec); +unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec); extern ftrace_func_t ftrace_trace_function; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 98fa931b6864..e825fded435d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1755,6 +1755,42 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable) return ftrace_check_record(rec, enable, 0); } +/** + * ftrace_get_addr_new - Get the call address to set to + * @rec: The ftrace record descriptor + * + * If the record has the FTRACE_FL_REGS set, that means that it + * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS + * is not not set, then it wants to convert to the normal callback. + * + * Returns the address of the trampoline to set to + */ +unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) +{ + if (rec->flags & FTRACE_FL_REGS) + return (unsigned long)FTRACE_REGS_ADDR; + else + return (unsigned long)FTRACE_ADDR; +} + +/** + * ftrace_get_addr_curr - Get the call address that is already there + * @rec: The ftrace record descriptor + * + * The FTRACE_FL_REGS_EN is set when the record already points to + * a function that saves all the regs. Basically the '_EN' version + * represents the current state of the function. + * + * Returns the address of the trampoline that is currently being called + */ +unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) +{ + if (rec->flags & FTRACE_FL_REGS_EN) + return (unsigned long)FTRACE_REGS_ADDR; + else + return (unsigned long)FTRACE_ADDR; +} + static int __ftrace_replace_code(struct dyn_ftrace *rec, int enable) { -- cgit From 7c0868e03b7a7c50fa10957d8dddaebb09c72044 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 8 May 2014 07:01:21 -0400 Subject: ftrace: Use the ftrace_addr helper functions to find the ftrace_addr With the moving of the functions that determine what the mcount call site should be replaced with into the generic code, there are a few places in the generic code that can use them instead of hard-coding the addresses.
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e825fded435d..52c2b53b7953 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1798,12 +1798,12 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) unsigned long ftrace_addr; int ret; - ret = ftrace_update_record(rec, enable); + ftrace_addr = ftrace_get_addr_new(rec); - if (rec->flags & FTRACE_FL_REGS) - ftrace_addr = (unsigned long)FTRACE_REGS_ADDR; - else - ftrace_addr = (unsigned long)FTRACE_ADDR; + /* This needs to be done before we call ftrace_update_record */ + ftrace_old_addr = ftrace_get_addr_curr(rec); + + ret = ftrace_update_record(rec, enable); switch (ret) { case FTRACE_UPDATE_IGNORE: @@ -1817,11 +1817,6 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) case FTRACE_UPDATE_MODIFY_CALL_REGS: case FTRACE_UPDATE_MODIFY_CALL: - if (rec->flags & FTRACE_FL_REGS) - ftrace_old_addr = (unsigned long)FTRACE_ADDR; - else - ftrace_old_addr = (unsigned long)FTRACE_REGS_ADDR; - return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); } -- cgit From f1b2f2bd5821c6ab7feed2e133343dd54b212ed9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 7 May 2014 16:09:49 -0400 Subject: ftrace: Remove FTRACE_UPDATE_MODIFY_CALL_REGS flag As the decision about what needs to be done (converting a call to the ftrace_caller to ftrace_caller_regs or to convert from ftrace_caller_regs to ftrace_caller) can easily be determined from the rec->flags of FTRACE_FL_REGS and FTRACE_FL_REGS_EN, there's no need to have the ftrace_check_record() return either a UPDATE_MODIFY_CALL_REGS or a UPDATE_MODIFY_CALL. Just the latter is enough. This added flag causes more complexity than is required. Remove it.
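Worked out from ftrace_get_addr_new()/ftrace_get_addr_curr() above (a derived table, not part of the patch), the four flag combinations already determine both addresses, so a single MODIFY_CALL return value suffices:

	FL_REGS  FL_REGS_EN   current address     wanted address      action
	   0         0        FTRACE_ADDR         FTRACE_ADDR         nothing to modify
	   1         0        FTRACE_ADDR         FTRACE_REGS_ADDR    MODIFY_CALL
	   0         1        FTRACE_REGS_ADDR    FTRACE_ADDR         MODIFY_CALL
	   1         1        FTRACE_REGS_ADDR    FTRACE_REGS_ADDR    nothing to modify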
Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 3 --- include/linux/ftrace.h | 2 -- kernel/trace/ftrace.c | 13 ++++--------- 3 files changed, 4 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 5ef43ce8492f..89de3eaf8772 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -366,7 +366,6 @@ static int add_breakpoints(struct dyn_ftrace *rec, int enable) /* converting nop to call */ return add_brk_on_nop(rec); - case FTRACE_UPDATE_MODIFY_CALL_REGS: case FTRACE_UPDATE_MODIFY_CALL: case FTRACE_UPDATE_MAKE_NOP: /* converting a call to a nop */ @@ -469,7 +468,6 @@ static int add_update(struct dyn_ftrace *rec, int enable) case FTRACE_UPDATE_IGNORE: return 0; - case FTRACE_UPDATE_MODIFY_CALL_REGS: case FTRACE_UPDATE_MODIFY_CALL: case FTRACE_UPDATE_MAKE_CALL: /* converting nop to call */ @@ -516,7 +514,6 @@ static int finish_update(struct dyn_ftrace *rec, int enable) case FTRACE_UPDATE_IGNORE: return 0; - case FTRACE_UPDATE_MODIFY_CALL_REGS: case FTRACE_UPDATE_MODIFY_CALL: case FTRACE_UPDATE_MAKE_CALL: /* converting nop to call */ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 2f8cbffecd3d..3e6dfb31f8e6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -362,14 +362,12 @@ enum { * IGNORE - The function is already what we want it to be * MAKE_CALL - Start tracing the function * MODIFY_CALL - Stop saving regs for the function - * MODIFY_CALL_REGS - Start saving regs for the function * MAKE_NOP - Stop tracing the function */ enum { FTRACE_UPDATE_IGNORE, FTRACE_UPDATE_MAKE_CALL, FTRACE_UPDATE_MODIFY_CALL, - FTRACE_UPDATE_MODIFY_CALL_REGS, FTRACE_UPDATE_MAKE_NOP, }; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 52c2b53b7953..cc07b7fc4372 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1701,19 +1701,15 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) /* * If this record is being updated from a nop, then * return UPDATE_MAKE_CALL. - * Otherwise, if the EN flag is set, then return - * UPDATE_MODIFY_CALL_REGS to tell the caller to convert - * from the non-save regs, to a save regs function. * Otherwise, * return UPDATE_MODIFY_CALL to tell the caller to convert - * from the save regs, to a non-save regs function. + * from the save regs, to a non-save regs function or + * vice versa. */ if (flag & FTRACE_FL_ENABLED) return FTRACE_UPDATE_MAKE_CALL; - else if (rec->flags & FTRACE_FL_REGS_EN) - return FTRACE_UPDATE_MODIFY_CALL_REGS; - else - return FTRACE_UPDATE_MODIFY_CALL; + + return FTRACE_UPDATE_MODIFY_CALL; } if (update) { @@ -1815,7 +1811,6 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) case FTRACE_UPDATE_MAKE_NOP: return ftrace_make_nop(NULL, rec, ftrace_addr); - case FTRACE_UPDATE_MODIFY_CALL_REGS: case FTRACE_UPDATE_MODIFY_CALL: return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); } -- cgit From b3b8a4d42bba8e1fb1b91cc6fd53829d28a503de Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Thu, 27 Feb 2014 17:16:57 +0530 Subject: rcutorture: Mark function as static in kernel/rcu/torture.c Mark functions as static in kernel/rcu/torture.c because they are not used outside this file.
This eliminates the following warning in kernel/rcu/torture.c: kernel/rcu/torture.c:902:6: warning: no previous prototype for ‘rcutorture_trace_dump’ [-Wmissing-prototypes] kernel/rcu/torture.c:1572:6: warning: no previous prototype for ‘rcu_torture_barrier_cbf’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 0d739e3797e3..0bc5f0a4c1ab 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -806,7 +806,7 @@ rcu_torture_fakewriter(void *arg) return 0; } -void rcutorture_trace_dump(void) +static void rcutorture_trace_dump(void) { static atomic_t beenhere = ATOMIC_INIT(0); @@ -1183,7 +1183,7 @@ static int __init rcu_torture_stall_init(void) } /* Callback function for RCU barrier testing. */ -void rcu_torture_barrier_cbf(struct rcu_head *rcu) +static void rcu_torture_barrier_cbf(struct rcu_head *rcu) { atomic_inc(&barrier_cbs_invoked); } -- cgit From 589a8f59509dc4c68f9c1e2522c5b0b556009221 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 3 Mar 2014 16:51:08 -0800 Subject: rcutorture: Print negatives for SRCU counter wraparound The srcu_torture_stats() function prints SRCU's per-CPU c[] array with an unsigned format, which means that the number one less than zero is a very large number. This commit therefore prints this array with a signed format in order to improve readability of the rcutorture output. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 0bc5f0a4c1ab..80d2d2440210 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -492,9 +492,11 @@ static void srcu_torture_stats(char *page) page += sprintf(page, "%s%s per-CPU(idx=%d):", torture_type, TORTURE_FLAG, idx); for_each_possible_cpu(cpu) { - page += sprintf(page, " %d(%lu,%lu)", cpu, - per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], - per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); + long c0, c1; + + c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx]; + c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]; + page += sprintf(page, " %d(%ld,%ld)", cpu, c0, c1); } sprintf(page, "\n"); } -- cgit From 0d6821d5f70b7137974575758962bae61ed0fc63 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 3 Mar 2014 16:58:03 -0800 Subject: torture: Include "Stopping" string to torture_kthread_stopping() Currently, torture_kthread_stopping() prints only the name of the kthread that is stopping, which can be unedifying. This commit therefore adds "Stopping" to make things more evident. Signed-off-by: Paul E. 
McKenney --- kernel/torture.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index acc9afc2f26e..e5af6be2594d 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -674,8 +674,10 @@ EXPORT_SYMBOL_GPL(torture_must_stop_irq); */ void torture_kthread_stopping(char *title) { - if (verbose) - VERBOSE_TOROUT_STRING(title); + char buf[128]; + + snprintf(buf, sizeof(buf), "Stopping %s", title); + VERBOSE_TOROUT_STRING(buf); while (!kthread_should_stop()) { torture_shutdown_absorb(title); schedule_timeout_uninterruptible(1); -- cgit From ab7d45053f99f44f81a221eb5c9fbe253ee94524 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 4 Mar 2014 11:03:21 -0800 Subject: torture: Increase stutter-end intensity Currently, all stuttered kthreads block a jiffy at a time, which can result in them starting at different times. (Note: This is not an energy-efficiency problem unless you run torture tests in production, in which case you have other problems!) This commit increases the intensity of the restart event by causing kthreads to spin through the last jiffy, restarting when they see the variable change. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/torture.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index e5af6be2594d..bc0ee382b3c8 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -533,7 +533,11 @@ void stutter_wait(const char *title) while (ACCESS_ONCE(stutter_pause_test) || (torture_runnable && !ACCESS_ONCE(*torture_runnable))) { if (stutter_pause_test) - schedule_timeout_interruptible(1); + if (ACCESS_ONCE(stutter_pause_test) == 1) + schedule_timeout_interruptible(1); + else + while (ACCESS_ONCE(stutter_pause_test)) + cond_resched(); else schedule_timeout_interruptible(round_jiffies_relative(HZ)); torture_shutdown_absorb(title); @@ -550,7 +554,11 @@ static int torture_stutter(void *arg) VERBOSE_TOROUT_STRING("torture_stutter task started"); do { if (!torture_must_stop()) { - schedule_timeout_interruptible(stutter); + if (stutter > 1) { + schedule_timeout_interruptible(stutter - 1); + ACCESS_ONCE(stutter_pause_test) = 2; + } + schedule_timeout_interruptible(1); ACCESS_ONCE(stutter_pause_test) = 1; } if (!torture_must_stop()) -- cgit From 945fa9c631b04febe295a3a2a00c7e4a3cfb97db Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 7 Mar 2014 14:15:28 -0800 Subject: torture: Dump ftrace buffer when the RCU grace period stalls This commit adds a call to rcutorture_trace_dump() to dump the ftrace buffer when the RCU grace period stalls in order to help debug the stall. Note that this is different than the RCU CPU stall warning, as it is rcutorture detecting the stall rather than the underlying RCU implementation. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 80d2d2440210..9decce0f110c 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1034,6 +1034,7 @@ rcu_torture_printk(char *page) "??? Writer stall state %d g%lu c%lu f%#x\n", rcu_torture_writer_state, gpnum, completed, flags); + rcutorture_trace_dump(); } rtcv_snap = rcu_torture_current_version; } -- cgit From afea227fd4acf4f097a9e77bbc2f07d4856ebd01 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 12 Mar 2014 07:10:41 -0700 Subject: rcutorture: Export RCU grace-period kthread wait state to rcutorture This commit allows rcutorture to print additional state for the RCU grace-period kthreads in cases where RCU seems reluctant to start a new grace period. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcutiny.h | 4 ++++ include/linux/rcutree.h | 1 + kernel/rcu/rcutorture.c | 1 + kernel/rcu/tree.c | 17 +++++++++++++++++ kernel/rcu/tree.h | 8 +++++++- 5 files changed, 30 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 425c659d54e5..d40a6a451330 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -119,6 +119,10 @@ static inline void rcu_sched_force_quiescent_state(void) { } +static inline void show_rcu_gp_kthreads(void) +{ +} + static inline void rcu_cpu_stall_reset(void) { } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index a59ca05fd4e3..3e2f5d432743 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -84,6 +84,7 @@ extern unsigned long rcutorture_vernum; long rcu_batches_completed(void); long rcu_batches_completed_bh(void); long rcu_batches_completed_sched(void); +void show_rcu_gp_kthreads(void); void rcu_force_quiescent_state(void); void rcu_bh_force_quiescent_state(void); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 9decce0f110c..37ae5e1d4a1d 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1034,6 +1034,7 @@ rcu_torture_printk(char *page) "??? Writer stall state %d g%lu c%lu f%#x\n", rcu_torture_writer_state, gpnum, completed, flags); + show_rcu_gp_kthreads(); rcutorture_trace_dump(); } rtcv_snap = rcu_torture_current_version; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3d15b5a82ae8..93e64381aa2a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -279,6 +279,21 @@ void rcu_bh_force_quiescent_state(void) } EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); +/* + * Show the state of the grace-period kthreads. + */ +void show_rcu_gp_kthreads(void) +{ + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) { + pr_info("%s: wait state: %d ->state: %#lx\n", + rsp->name, rsp->gp_state, rsp->gp_kthread->state); + /* sched_show_task(rsp->gp_kthread); */ + } +} +EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); + /* * Record the number of times rcutorture tests have been initiated and * terminated. This information allows the debugfs tracing stats to be @@ -1626,6 +1641,7 @@ static int __noreturn rcu_gp_kthread(void *arg) trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), TPS("reqwait")); + rsp->gp_state = RCU_GP_WAIT_GPS; wait_event_interruptible(rsp->gp_wq, ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_INIT); @@ -1653,6 +1669,7 @@ static int __noreturn rcu_gp_kthread(void *arg) trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), TPS("fqswait")); + rsp->gp_state = RCU_GP_WAIT_FQS; ret = wait_event_interruptible_timeout(rsp->gp_wq, ((gf = ACCESS_ONCE(rsp->gp_flags)) & RCU_GP_FLAG_FQS) || diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 75dc3c39a02a..c2fd1e722879 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -406,7 +406,8 @@ struct rcu_state { unsigned long completed; /* # of last completed gp. */ struct task_struct *gp_kthread; /* Task for grace periods. */ wait_queue_head_t gp_wq; /* Where GP task waits. */ - int gp_flags; /* Commands for GP task. */ + short gp_flags; /* Commands for GP task. 
*/ + short gp_state; /* GP kthread sleep state. */ /* End of fields guarded by root rcu_node's lock. */ @@ -469,6 +470,11 @@ struct rcu_state { #define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ #define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ +/* Values for rcu_state structure's gp_flags field. */ +#define RCU_GP_WAIT_INIT 0 /* Initial state. */ +#define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */ +#define RCU_GP_WAIT_FQS 2 /* Wait for force-quiescent-state time. */ + extern struct list_head rcu_struct_flavors; /* Sequence through rcu_state structures for each RCU flavor. */ -- cgit From ac1bea85781e9004da9b3e8a4b097c18492d857c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 16 Mar 2014 21:36:25 -0700 Subject: sched,rcu: Make cond_resched() report RCU quiescent states Given a CPU running a loop containing cond_resched(), with no other tasks runnable on that CPU, RCU will eventually report RCU CPU stall warnings due to lack of quiescent states. Fortunately, every call to cond_resched() is a perfectly good quiescent state. Unfortunately, invoking rcu_note_context_switch() is a bit heavyweight for cond_resched(), especially given the need to disable preemption, and, for RCU-preempt, interrupts as well. This commit therefore maintains a per-CPU counter that causes cond_resched(), cond_resched_lock(), and cond_resched_softirq() to call rcu_note_context_switch(), but only about once per 256 invocations. This ratio was chosen in keeping with the relative time constants of RCU grace periods. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 36 ++++++++++++++++++++++++++++++++++++ kernel/rcu/update.c | 18 ++++++++++++++++++ kernel/sched/core.c | 7 ++++++- 3 files changed, 60 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 82973738125b..97cc8d6679b4 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -44,6 +44,7 @@ #include #include #include +#include #include extern int rcu_expedited; /* for sysctl */ @@ -286,6 +287,41 @@ static inline void rcu_user_hooks_switch(struct task_struct *prev, bool __rcu_is_watching(void); #endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP) */ +/* + * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings. + */ + +#define RCU_COND_RESCHED_LIM 256 /* ms vs. 100s of ms. */ +DECLARE_PER_CPU(int, rcu_cond_resched_count); +void rcu_resched(void); + +/* + * Is it time to report RCU quiescent states? + * + * Note unsynchronized access to rcu_cond_resched_count. Yes, we might + * increment some random CPU's count, and possibly also load the result from + * yet another CPU's count. We might even clobber some other CPU's attempt + * to zero its counter. This is all OK because the goal is not precision, + * but rather reasonable amortization of rcu_note_context_switch() overhead + * and extremely high probability of avoiding RCU CPU stall warnings. + * Note that this function has to be preempted in just the wrong place, + * many thousands of times in a row, for anything bad to happen. + */ +static inline bool rcu_should_resched(void) +{ + return raw_cpu_inc_return(rcu_cond_resched_count) >= + RCU_COND_RESCHED_LIM; +} + +/* + * Report quiscent states to RCU if it is time to do so. 
+ */ +static inline void rcu_cond_resched(void) +{ + if (unlikely(rcu_should_resched())) + rcu_resched(); +} + /* * Infrastructure to implement the synchronize_() primitives in * TREE_RCU and rcu_barrier_() primitives in TINY_RCU. diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 4c0a9b0af469..ed7a0d72562c 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -338,3 +338,21 @@ static int __init check_cpu_stall_init(void) early_initcall(check_cpu_stall_init); #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ + +/* + * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings. + */ + +DEFINE_PER_CPU(int, rcu_cond_resched_count); + +/* + * Report a set of RCU quiescent states, for use by cond_resched() + * and friends. Out of line due to being called infrequently. + */ +void rcu_resched(void) +{ + preempt_disable(); + __this_cpu_write(rcu_cond_resched_count, 0); + rcu_note_context_switch(smp_processor_id()); + preempt_enable(); +} diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 268a45ea238c..9f530c9ed911 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4051,6 +4051,7 @@ static void __cond_resched(void) int __sched _cond_resched(void) { + rcu_cond_resched(); if (should_resched()) { __cond_resched(); return 1; @@ -4069,15 +4070,18 @@ EXPORT_SYMBOL(_cond_resched); */ int __cond_resched_lock(spinlock_t *lock) { + bool need_rcu_resched = rcu_should_resched(); int resched = should_resched(); int ret = 0; lockdep_assert_held(lock); - if (spin_needbreak(lock) || resched) { + if (spin_needbreak(lock) || resched || need_rcu_resched) { spin_unlock(lock); if (resched) __cond_resched(); + else if (unlikely(need_rcu_resched)) + rcu_resched(); else cpu_relax(); ret = 1; @@ -4091,6 +4095,7 @@ int __sched __cond_resched_softirq(void) { BUG_ON(!in_softirq()); + rcu_cond_resched(); /* BH disabled OK, just recording QSes. */ if (should_resched()) { local_bh_enable(); __cond_resched(); -- cgit From 64e4b43ae050146fcfafe696e61efc306f73d449 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 12 Mar 2014 10:26:35 -0700 Subject: rcutorture: Make rcu_torture_reader() use cond_resched() The rcu_torture_reader() function currently uses schedule(). This commit therefore speeds things up a bit by substituting cond_resched(). This change makes rcu_torture_reader() more CPU-bound, so this commit also adjusts the number of readers (the "nreaders" module parameter, which feeds into the "nrealreaders" variable) to allow one CPU to be free of readers on SMP systems. The point of this is to increase the probability that readers will be watching while an updater makes a change. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 37ae5e1d4a1d..693a90fcee83 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -942,7 +942,7 @@ rcu_torture_reader(void *arg) __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); cur_ops->readunlock(idx); - schedule(); + cond_resched(); stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); if (irqreader && cur_ops->irq_capable) @@ -1482,10 +1482,13 @@ rcu_torture_init(void) if (cur_ops->init) cur_ops->init(); /* no "goto unwind" prior to this point!!! 
*/ - if (nreaders >= 0) + if (nreaders >= 0) { nrealreaders = nreaders; - else - nrealreaders = 2 * num_online_cpus(); + } else { + nrealreaders = num_online_cpus() - 1; + if (nrealreaders <= 0) + nrealreaders = 1; + } rcu_torture_print_module_parms(cur_ops, "Start of test"); /* Set up the freelist. */ -- cgit From 5ed63b199c5b58ed150ce50f1ea68de54669b13f Mon Sep 17 00:00:00 2001 From: Iulia Manda Date: Mon, 17 Mar 2014 15:21:21 +0200 Subject: torture: Notice if an all-zero cpumask is passed inside a critical section In the torture_shuffle_tasks() function, the check for whether an all-zero mask might be passed to set_cpus_allowed_ptr() is redundant after the shuffle_idle_cpu bit has been cleared. If the mask had more than one bit set, it still has at least one bit set after clearing one. If the mask had only one bit set, the check at the beginning of the function returns early, as there is no point in shuffling a single CPU. Also, this code executes inside a critical section delimited by get_online_cpus() and put_online_cpus(), which prevents CPUs from going offline between the num_online_cpus() check and the calls to set_cpus_allowed_ptr(). Signed-off-by: Iulia Manda Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/torture.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index bc0ee382b3c8..ae1723a4c751 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -335,13 +335,8 @@ static void torture_shuffle_tasks(void) shuffle_idle_cpu = cpumask_next(shuffle_idle_cpu, shuffle_tmp_mask); if (shuffle_idle_cpu >= nr_cpu_ids) shuffle_idle_cpu = -1; - if (shuffle_idle_cpu != -1) { + else cpumask_clear_cpu(shuffle_idle_cpu, shuffle_tmp_mask); - if (cpumask_empty(shuffle_tmp_mask)) { - put_online_cpus(); - return; - } - } mutex_lock(&shuffle_task_mutex); list_for_each_entry(stp, &shuffle_task_list, st_l) -- cgit From d0d0606e2c13ad445a58b9d9547de617429cabf9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 17 Mar 2014 20:56:45 -0700 Subject: rcutorture: Check for rcu_torture_fqs creation errors The return value from torture_create_kthread() is currently ignored when creating the rcu_torture_fqs kthread. This commit therefore captures the return value so that it can be tested for errors. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 693a90fcee83..dfec2582899f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1577,7 +1577,8 @@ rcu_torture_init(void) fqs_duration = 0; if (fqs_duration) { /* Create the fqs thread */ - torture_create_kthread(rcu_torture_fqs, NULL, fqs_task); + firsterr = torture_create_kthread(rcu_torture_fqs, NULL, + fqs_task); if (firsterr) goto unwind; } -- cgit From a48f3fad4f97fe6a2522fe2f5b3054b4c48a8eac Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 18 Mar 2014 15:57:41 -0700 Subject: rcutorture: Add tests for get_state_synchronize_rcu() This commit adds rcutorture testing for get_state_synchronize_rcu() and cond_synchronize_rcu(). Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 130 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 95 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index dfec2582899f..0d27b9cc14e4 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -58,6 +58,7 @@ torture_param(int, fqs_duration, 0, "Duration of fqs bursts (us), 0 to disable"); torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)"); +torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives"); @@ -144,8 +145,10 @@ static int rcu_torture_writer_state; #define RTWS_REPLACE 2 #define RTWS_DEF_FREE 3 #define RTWS_EXP_SYNC 4 -#define RTWS_STUTTER 5 -#define RTWS_STOPPING 6 +#define RTWS_COND_GET 5 +#define RTWS_COND_SYNC 6 +#define RTWS_STUTTER 7 +#define RTWS_STOPPING 8 #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) #define RCUTORTURE_RUNNABLE_INIT 1 @@ -232,6 +235,8 @@ struct rcu_torture_ops { void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); void (*exp_sync)(void); + unsigned long (*get_state)(void); + void (*cond_sync)(unsigned long oldstate); void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); void (*cb_barrier)(void); void (*fqs)(void); @@ -283,10 +288,48 @@ static int rcu_torture_completed(void) return rcu_batches_completed(); } +/* + * Update callback in the pipe. This should be invoked after a grace period. + */ +static bool +rcu_torture_pipe_update_one(struct rcu_torture *rp) +{ + int i; + + i = rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { + rp->rtort_mbtest = 0; + return true; + } + return false; +} + +/* + * Update all callbacks in the pipe. Suitable for synchronous grace-period + * primitives. + */ +static void +rcu_torture_pipe_update(struct rcu_torture *old_rp) +{ + struct rcu_torture *rp; + struct rcu_torture *rp1; + + if (old_rp) + list_add(&old_rp->rtort_free, &rcu_torture_removed); + list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) { + if (rcu_torture_pipe_update_one(rp)) { + list_del(&rp->rtort_free); + rcu_torture_free(rp); + } + } +} + static void rcu_torture_cb(struct rcu_head *p) { - int i; struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); if (torture_must_stop_irq()) { @@ -294,16 +337,10 @@ rcu_torture_cb(struct rcu_head *p) /* The next initialization will pick up the pieces. 
*/ return; } - i = rp->rtort_pipe_count; - if (i > RCU_TORTURE_PIPE_LEN) - i = RCU_TORTURE_PIPE_LEN; - atomic_inc(&rcu_torture_wcount[i]); - if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { - rp->rtort_mbtest = 0; + if (rcu_torture_pipe_update_one(rp)) rcu_torture_free(rp); - } else { + else cur_ops->deferred_free(rp); - } } static int rcu_no_completed(void) @@ -331,6 +368,8 @@ static struct rcu_torture_ops rcu_ops = { .deferred_free = rcu_torture_deferred_free, .sync = synchronize_rcu, .exp_sync = synchronize_rcu_expedited, + .get_state = get_state_synchronize_rcu, + .cond_sync = cond_synchronize_rcu, .call = call_rcu, .cb_barrier = rcu_barrier, .fqs = rcu_force_quiescent_state, @@ -705,16 +744,39 @@ rcu_torture_fqs(void *arg) static int rcu_torture_writer(void *arg) { - bool exp; + unsigned long gp_snap; + bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; int i; struct rcu_torture *rp; - struct rcu_torture *rp1; struct rcu_torture *old_rp; static DEFINE_TORTURE_RANDOM(rand); + int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC, RTWS_COND_GET }; + int nsynctypes = 0; VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); set_user_nice(current, MAX_NICE); + /* Initialize synctype[] array. If none set, take default. */ + if (!gp_cond1 && !gp_exp1 && !gp_normal1) + gp_cond1 = gp_exp1 = gp_normal1 = true; + if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) + synctype[nsynctypes++] = RTWS_COND_GET; + else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync)) + pr_alert("rcu_torture_writer: gp_cond without primitives.\n"); + if (gp_exp1 && cur_ops->exp_sync) + synctype[nsynctypes++] = RTWS_EXP_SYNC; + else if (gp_exp && !cur_ops->exp_sync) + pr_alert("rcu_torture_writer: gp_exp without primitives.\n"); + if (gp_normal1 && cur_ops->deferred_free) + synctype[nsynctypes++] = RTWS_DEF_FREE; + else if (gp_normal && !cur_ops->deferred_free) + pr_alert("rcu_torture_writer: gp_normal without primitives.\n"); + if (WARN_ONCE(nsynctypes == 0, + "rcu_torture_writer: No update-side primitives.\n")) { + rcu_torture_writer_state = RTWS_STOPPING; + torture_kthread_stopping("rcu_torture_writer"); + } + do { rcu_torture_writer_state = RTWS_FIXED_DELAY; schedule_timeout_uninterruptible(1); @@ -736,32 +798,30 @@ rcu_torture_writer(void *arg) i = RCU_TORTURE_PIPE_LEN; atomic_inc(&rcu_torture_wcount[i]); old_rp->rtort_pipe_count++; - if (gp_normal == gp_exp) - exp = !!(torture_random(&rand) & 0x80); - else - exp = gp_exp; - if (!exp) { + switch (synctype[torture_random(&rand) % nsynctypes]) { + case RTWS_DEF_FREE: rcu_torture_writer_state = RTWS_DEF_FREE; cur_ops->deferred_free(old_rp); - } else { + break; + case RTWS_EXP_SYNC: rcu_torture_writer_state = RTWS_EXP_SYNC; cur_ops->exp_sync(); - list_add(&old_rp->rtort_free, - &rcu_torture_removed); - list_for_each_entry_safe(rp, rp1, - &rcu_torture_removed, - rtort_free) { - i = rp->rtort_pipe_count; - if (i > RCU_TORTURE_PIPE_LEN) - i = RCU_TORTURE_PIPE_LEN; - atomic_inc(&rcu_torture_wcount[i]); - if (++rp->rtort_pipe_count >= - RCU_TORTURE_PIPE_LEN) { - rp->rtort_mbtest = 0; - list_del(&rp->rtort_free); - rcu_torture_free(rp); - } - } + rcu_torture_pipe_update(old_rp); + break; + case RTWS_COND_GET: + rcu_torture_writer_state = RTWS_COND_GET; + gp_snap = cur_ops->get_state(); + i = torture_random(&rand) % 16; + if (i != 0) + schedule_timeout_interruptible(i); + udelay(torture_random(&rand) % 1000); + rcu_torture_writer_state = RTWS_COND_SYNC; + cur_ops->cond_sync(gp_snap); + rcu_torture_pipe_update(old_rp); + break; + default: + 
WARN_ON_ONCE(1); + break; } } rcutorture_record_progress(++rcu_torture_current_version); -- cgit From f0bf8fab4f311cffa869a462ffd182465c4caee6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Mar 2014 16:17:56 -0700 Subject: rcutorture: Explicitly test synchronous grace-period primitives The original rcu_torture_writer() avoided testing the synchronous grace-period primitives because they were simply wrappers around call_rcu() invocations. The testing of these synchronous primitives was delegated to the fake writers. However, there really is no excuse not to test them, especially in the case of SRCU, where the wrappering is somewhat more elaborate. This commit therefore makes the default rcutorture parameters cause rcu_torture_writer() to include synchronous grace-period primitives in its testing. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 0d27b9cc14e4..8b2748486c17 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -62,6 +62,7 @@ torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives"); +torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives"); torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); torture_param(int, n_barrier_cbs, 0, "# of callbacks/kthreads for barrier testing"); @@ -147,8 +148,9 @@ static int rcu_torture_writer_state; #define RTWS_EXP_SYNC 4 #define RTWS_COND_GET 5 #define RTWS_COND_SYNC 6 -#define RTWS_STUTTER 7 -#define RTWS_STOPPING 8 +#define RTWS_SYNC 7 +#define RTWS_STUTTER 8 +#define RTWS_STOPPING 9 #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) #define RCUTORTURE_RUNNABLE_INIT 1 @@ -746,19 +748,21 @@ rcu_torture_writer(void *arg) { unsigned long gp_snap; bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; + bool gp_sync1 = gp_sync; int i; struct rcu_torture *rp; struct rcu_torture *old_rp; static DEFINE_TORTURE_RANDOM(rand); - int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC, RTWS_COND_GET }; + int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC, + RTWS_COND_GET, RTWS_SYNC }; int nsynctypes = 0; VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); set_user_nice(current, MAX_NICE); /* Initialize synctype[] array. If none set, take default. */ - if (!gp_cond1 && !gp_exp1 && !gp_normal1) - gp_cond1 = gp_exp1 = gp_normal1 = true; + if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync) + gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true; if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) synctype[nsynctypes++] = RTWS_COND_GET; else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync)) @@ -771,8 +775,17 @@ rcu_torture_writer(void *arg) synctype[nsynctypes++] = RTWS_DEF_FREE; else if (gp_normal && !cur_ops->deferred_free) pr_alert("rcu_torture_writer: gp_normal without primitives.\n"); + if (gp_sync1 && cur_ops->sync) + synctype[nsynctypes++] = RTWS_SYNC; + else if (gp_sync && !cur_ops->sync) + pr_alert("rcu_torture_writer: gp_sync without primitives.\n"); if (WARN_ONCE(nsynctypes == 0, "rcu_torture_writer: No update-side primitives.\n")) { + /* + * No updates primitives, so don't try updating. 
+ * The resulting test won't be testing much, hence the + * above WARN_ONCE(). + */ rcu_torture_writer_state = RTWS_STOPPING; torture_kthread_stopping("rcu_torture_writer"); } @@ -819,6 +832,11 @@ rcu_torture_writer(void *arg) cur_ops->cond_sync(gp_snap); rcu_torture_pipe_update(old_rp); break; + case RTWS_SYNC: + rcu_torture_writer_state = RTWS_SYNC; + cur_ops->sync(); + rcu_torture_pipe_update(old_rp); + break; default: WARN_ON_ONCE(1); break; -- cgit From 424c1b682051c48e1da24e503b96a8a72e114ea4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 23 Mar 2014 08:58:27 -0700 Subject: rcutorture: Add missing destroy_timer_on_stack() The rcu_torture_reader() function uses an on-stack timer_list structure which it initializes with setup_timer_on_stack(). However, it fails to use destroy_timer_on_stack() before exiting, which results in leaking a tracking object if DEBUG_OBJECTS is enabled. This commit therefore invokes destroy_timer_on_stack() to avoid this leakage. Signed-off-by: Thomas Gleixner Reviewed-by: Josh Triplett Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 8b2748486c17..a7d18069a96d 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1023,8 +1023,10 @@ rcu_torture_reader(void *arg) cond_resched(); stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); - if (irqreader && cur_ops->irq_capable) + if (irqreader && cur_ops->irq_capable) { del_timer_sync(&t); + destroy_timer_on_stack(&t); + } torture_kthread_stopping("rcu_torture_reader"); return 0; } -- cgit From 48d684fdad83d7525a557e6ff9c37811b6a9947b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 Apr 2014 14:57:13 -0700 Subject: rcutorture: Run rcu_torture_writer at normal priority There are usually lots of readers and only one writer, so if there has to be a choice, we would want rcu_torture_writer to win. This commit therefore removes the set_user_nice() from rcu_torture_writer(). Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index a7d18069a96d..4b7b97ff1195 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -758,7 +758,6 @@ rcu_torture_writer(void *arg) int nsynctypes = 0; VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); - set_user_nice(current, MAX_NICE); /* Initialize synctype[] array. If none set, take default. */ if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync) -- cgit From d065eacfdb9d47010f120e9310d7fc8ef2eba272 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 4 Apr 2014 17:17:35 -0700 Subject: locktorture: Remove reference to nonexistent Kconfig parameter The locktorture module references CONFIG_LOCK_TORTURE_TEST_RUNNABLE, which does not exist. That is a good thing, because otherwise randconfig testing could enable both rcutorture and locktorture concurrently, which the torture tests are not set up for. This commit therefore removes the reference, so that the test is runnable immediately only when inserted as a module. Reported-by: Paul Bolle Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/locking/locktorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index b0d3e3c50672..1952466c7db5 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -82,14 +82,14 @@ struct lock_writer_stress_stats { }; static struct lock_writer_stress_stats *lwsa; -#if defined(MODULE) || defined(CONFIG_LOCK_TORTURE_TEST_RUNNABLE) +#if defined(MODULE) #define LOCKTORTURE_RUNNABLE_INIT 1 #else #define LOCKTORTURE_RUNNABLE_INIT 0 #endif int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT; module_param(locktorture_runnable, int, 0444); -MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at boot"); +MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at module init"); /* Forward reference. */ static void lock_torture_cleanup(void); -- cgit From 5228084eed8d54c426c7abde3be66daf8e1b0e57 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 7 Apr 2014 09:14:11 -0700 Subject: torture: Check for multiple concurrent torture tests The torture tests are designed to run in isolation, but do not enforce this isolation. This commit therefore checks for concurrent torture tests, and refuses to start new tests while old tests are running. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/torture.h | 2 +- kernel/locking/locktorture.c | 3 ++- kernel/rcu/rcutorture.c | 3 ++- kernel/torture.c | 13 +++++++++++-- 4 files changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/torture.h b/include/linux/torture.h index b2e2b468e511..f998574247fd 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -81,7 +81,7 @@ void stutter_wait(const char *title); int torture_stutter_init(int s); /* Initialization and cleanup. */ -void torture_init_begin(char *ttype, bool v, int *runnable); +bool torture_init_begin(char *ttype, bool v, int *runnable); void torture_init_end(void); bool torture_cleanup(void); bool torture_must_stop(void); diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 1952466c7db5..dbafeac18e4d 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -355,7 +355,8 @@ static int __init lock_torture_init(void) &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, }; - torture_init_begin(torture_type, verbose, &locktorture_runnable); + if (!torture_init_begin(torture_type, verbose, &locktorture_runnable)) + return -EBUSY; /* Process args and tell the world that the torturer is on the job. */ for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 4b7b97ff1195..7fa34f86e5ba 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1536,7 +1536,8 @@ rcu_torture_init(void) &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops, }; - torture_init_begin(torture_type, verbose, &rcutorture_runnable); + if (!torture_init_begin(torture_type, verbose, &rcutorture_runnable)) + return -EBUSY; /* Process args and tell the world that the torturer is on the job. */ for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { diff --git a/kernel/torture.c b/kernel/torture.c index ae1723a4c751..0ed0b49d2ce1 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -599,14 +599,20 @@ static void torture_stutter_cleanup(void) * The runnable parameter points to a flag that controls whether or not * the test is currently runnable. 
If there is no such flag, pass in NULL. */ -void __init torture_init_begin(char *ttype, bool v, int *runnable) +bool __init torture_init_begin(char *ttype, bool v, int *runnable) { mutex_lock(&fullstop_mutex); + if (torture_type != NULL) { + pr_alert("torture_init_begin: refusing %s init: %s running", + ttype, torture_type); + mutex_unlock(&fullstop_mutex); + return false; + } torture_type = ttype; verbose = v; torture_runnable = runnable; fullstop = FULLSTOP_DONTSTOP; - + return true; } EXPORT_SYMBOL_GPL(torture_init_begin); @@ -645,6 +651,9 @@ bool torture_cleanup(void) torture_shuffle_cleanup(); torture_stutter_cleanup(); torture_onoff_cleanup(); + mutex_lock(&fullstop_mutex); + torture_type = NULL; + mutex_unlock(&fullstop_mutex); return false; } EXPORT_SYMBOL_GPL(torture_cleanup); -- cgit From 2b3f8ffe46f843ac3ae589c25603d66da4a99620 Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Wed, 16 Apr 2014 13:42:09 -0700 Subject: torture: Remove __init from torture_init_begin/end Loading rcutorture as a module (as opposed to building it directly into the kernel) results in the following splat: [Wed Apr 16 15:29:33 2014] BUG: unable to handle kernel paging request at ffffffffa0003000 [Wed Apr 16 15:29:33 2014] IP: [] 0xffffffffa0003000 [Wed Apr 16 15:29:33 2014] PGD 1c0f067 PUD 1c10063 PMD 378a6067 PTE 0 [Wed Apr 16 15:29:33 2014] Oops: 0010 [#1] SMP [Wed Apr 16 15:29:33 2014] Modules linked in: rcutorture(+) torture [Wed Apr 16 15:29:33 2014] CPU: 0 PID: 4257 Comm: modprobe Not tainted 3.15.0-rc1 #10 [Wed Apr 16 15:29:33 2014] Hardware name: innotek GmbH VirtualBox, BIOS VirtualBox 12/01/2006 [Wed Apr 16 15:29:33 2014] task: ffff8800db1e88d0 ti: ffff8800db25c000 task.ti: ffff8800db25c000 [Wed Apr 16 15:29:33 2014] RIP: 0010:[] [] 0xffffffffa0003000 [Wed Apr 16 15:29:33 2014] RSP: 0018:ffff8800db25dca0 EFLAGS: 00010282 [Wed Apr 16 15:29:33 2014] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 [Wed Apr 16 15:29:33 2014] RDX: ffffffffa00090a8 RSI: 0000000000000001 RDI: ffffffffa0008337 [Wed Apr 16 15:29:33 2014] RBP: ffff8800db25dd50 R08: 0000000000000000 R09: 0000000000000000 [Wed Apr 16 15:29:33 2014] R10: ffffea000357b680 R11: ffffffff8113257a R12: ffffffffa000d000 [Wed Apr 16 15:29:33 2014] R13: ffffffffa00094c0 R14: ffffffffa0009510 R15: 0000000000000001 [Wed Apr 16 15:29:33 2014] FS: 00007fee30ce5700(0000) GS:ffff88021fc00000(0000) knlGS:0000000000000000 [Wed Apr 16 15:29:33 2014] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [Wed Apr 16 15:29:33 2014] CR2: ffffffffa0003000 CR3: 00000000d5eb1000 CR4: 00000000000006f0 [Wed Apr 16 15:29:33 2014] Stack: [Wed Apr 16 15:29:33 2014] ffffffffa000d02c 0000000000000000 ffff88021700d400 0000000000000000 [Wed Apr 16 15:29:33 2014] ffff8800db25dd40 ffffffff81647951 ffff8802162bd000 ffff88021541846c [Wed Apr 16 15:29:33 2014] 0000000000000000 ffffffff817dbe2d ffffffff817dbe2d 0000000000000001 [Wed Apr 16 15:29:33 2014] Call Trace: [Wed Apr 16 15:29:33 2014] [] ? rcu_torture_init+0x2c/0x8b4 [rcutorture] [Wed Apr 16 15:29:33 2014] [] ? netlink_broadcast_filtered+0x121/0x3a0 [Wed Apr 16 15:29:33 2014] [] ? mutex_lock+0xd/0x2a [Wed Apr 16 15:29:33 2014] [] ? mutex_lock+0xd/0x2a [Wed Apr 16 15:29:33 2014] [] ? trace_module_notify+0x62/0x1d0 [Wed Apr 16 15:29:33 2014] [] ? 0xffffffffa000cfff [Wed Apr 16 15:29:33 2014] [] do_one_initcall+0xfa/0x140 [Wed Apr 16 15:29:33 2014] [] ? __blocking_notifier_call_chain+0x5e/0x80 [Wed Apr 16 15:29:33 2014] [] load_module+0x1931/0x21b0 [Wed Apr 16 15:29:33 2014] [] ? 
show_initstate+0x50/0x50 [Wed Apr 16 15:29:33 2014] [] SyS_init_module+0x9e/0xc0 [Wed Apr 16 15:29:33 2014] [] system_call_fastpath+0x16/0x1b [Wed Apr 16 15:29:33 2014] Code: Bad RIP value. [Wed Apr 16 15:29:33 2014] RIP [] 0xffffffffa0003000 [Wed Apr 16 15:29:33 2014] RSP [Wed Apr 16 15:29:33 2014] CR2: ffffffffa0003000 [Wed Apr 16 15:29:33 2014] ---[ end trace 3e88c173037af84b ]--- This splat is due to the fact that torture_init_begin() and torture_init_end() are both marked with __init, despite their use at runtime. This commit therefore removes __init from both functions. Signed-off-by: Pranith Kumar Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/torture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index 0ed0b49d2ce1..40bb511cca48 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -599,7 +599,7 @@ static void torture_stutter_cleanup(void) * The runnable parameter points to a flag that controls whether or not * the test is currently runnable. If there is no such flag, pass in NULL. */ -bool __init torture_init_begin(char *ttype, bool v, int *runnable) +bool torture_init_begin(char *ttype, bool v, int *runnable) { mutex_lock(&fullstop_mutex); if (torture_type != NULL) { @@ -619,7 +619,7 @@ EXPORT_SYMBOL_GPL(torture_init_begin); /* * Tell the torture module that initialization is complete. */ -void __init torture_init_end(void) +void torture_init_end(void) { mutex_unlock(&fullstop_mutex); register_reboot_notifier(&torture_shutdown_nb); -- cgit From e534165bbf6a04d001748c573c7d6a7bae3713a5 Mon Sep 17 00:00:00 2001 From: Uma Sharma Date: Sun, 23 Mar 2014 22:32:09 -0700 Subject: rcu: Variable name changed in tree_plugin.h and used in tree.c The variable and struct both having the name "rcu_state" confuses sparse in some situations, so this commit changes the variable to "rcu_state_p" in order to avoid this confusion. This also makes things easier for human readers. Signed-off-by: Uma Sharma [ paulmck: Changed the declaration and several additional uses. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 16 ++++++++-------- kernel/rcu/tree_plugin.h | 16 ++++++++-------- 2 files changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3bbe48939e47..3e3f13e8b429 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -101,7 +101,7 @@ DEFINE_PER_CPU(struct rcu_data, sname##_data) RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); -static struct rcu_state *rcu_state; +static struct rcu_state *rcu_state_p; LIST_HEAD(rcu_struct_flavors); /* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ @@ -275,7 +275,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); */ void rcu_force_quiescent_state(void) { - force_quiescent_state(rcu_state); + force_quiescent_state(rcu_state_p); } EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); @@ -327,7 +327,7 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, switch (test_type) { case RCU_FLAVOR: - rsp = rcu_state; + rsp = rcu_state_p; break; case RCU_BH_FLAVOR: rsp = &rcu_bh_state; @@ -910,7 +910,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, * we will beat on the first one until it gets unstuck, then move * to the next. Only do this for the primary flavor of RCU. 
*/ - if (rdp->rsp == rcu_state && + if (rdp->rsp == rcu_state_p && ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { rdp->rsp->jiffies_resched += 5; resched_cpu(rdp->cpu); @@ -2660,7 +2660,7 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); void kfree_call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { - __call_rcu(head, func, rcu_state, -1, 1); + __call_rcu(head, func, rcu_state_p, -1, 1); } EXPORT_SYMBOL_GPL(kfree_call_rcu); @@ -2787,7 +2787,7 @@ unsigned long get_state_synchronize_rcu(void) * time-consuming work between get_state_synchronize_rcu() * and cond_synchronize_rcu(). */ - return smp_load_acquire(&rcu_state->gpnum); + return smp_load_acquire(&rcu_state_p->gpnum); } EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); @@ -2813,7 +2813,7 @@ void cond_synchronize_rcu(unsigned long oldstate) * Ensure that this load happens before any RCU-destructive * actions the caller might carry out after we return. */ - newstate = smp_load_acquire(&rcu_state->completed); + newstate = smp_load_acquire(&rcu_state_p->completed); if (ULONG_CMP_GE(oldstate, newstate)) synchronize_rcu(); } @@ -3354,7 +3354,7 @@ static int rcu_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; - struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); + struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); struct rcu_node *rnp = rdp->mynode; struct rcu_state *rsp; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 2e579c38bd91..29977ae84e7e 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -116,7 +116,7 @@ static void __init rcu_bootup_announce_oddness(void) #ifdef CONFIG_TREE_PREEMPT_RCU RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); -static struct rcu_state *rcu_state = &rcu_preempt_state; +static struct rcu_state *rcu_state_p = &rcu_preempt_state; static int rcu_preempted_readers_exp(struct rcu_node *rnp); @@ -947,7 +947,7 @@ void exit_rcu(void) #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ -static struct rcu_state *rcu_state = &rcu_sched_state; +static struct rcu_state *rcu_state_p = &rcu_sched_state; /* * Tell them what RCU they are running. @@ -1468,11 +1468,11 @@ static int __init rcu_spawn_kthreads(void) for_each_possible_cpu(cpu) per_cpu(rcu_cpu_has_work, cpu) = 0; BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); - rnp = rcu_get_root(rcu_state); - (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); + rnp = rcu_get_root(rcu_state_p); + (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); if (NUM_RCU_NODES > 1) { - rcu_for_each_leaf_node(rcu_state, rnp) - (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); + rcu_for_each_leaf_node(rcu_state_p, rnp) + (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); } return 0; } @@ -1480,12 +1480,12 @@ early_initcall(rcu_spawn_kthreads); static void rcu_prepare_kthreads(int cpu) { - struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); + struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ if (rcu_scheduler_fully_active) - (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); + (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); } #else /* #ifdef CONFIG_RCU_BOOST */ -- cgit From 122ff243f5f104194750ecbc76d5946dd1eec934 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 12 May 2014 16:04:53 -0700 Subject: ipv4: make ip_local_reserved_ports per netns ip_local_port_range is already per netns, so should ip_local_reserved_ports be. 
And since it is none by default we don't actually need it when we don't enable CONFIG_SYSCTL. By the way, rename inet_is_reserved_local_port() to inet_is_local_reserved_port() Cc: "David S. Miller" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/ip.h | 14 +++++++++++--- include/net/netns/ipv4.h | 4 ++++ kernel/sysctl.c | 4 ++-- net/ipv4/af_inet.c | 8 +------- net/ipv4/inet_connection_sock.c | 5 +---- net/ipv4/inet_hashtables.c | 2 +- net/ipv4/sysctl_net_ipv4.c | 31 ++++++++++++++----------------- net/ipv4/udp.c | 2 +- net/sctp/socket.c | 5 +++-- 9 files changed, 38 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/include/net/ip.h b/include/net/ip.h index 14c50a1650ef..512bcd5dabac 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -208,11 +208,19 @@ static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_o void inet_get_local_port_range(struct net *net, int *low, int *high); -extern unsigned long *sysctl_local_reserved_ports; -static inline int inet_is_reserved_local_port(int port) +#if CONFIG_SYSCTL +static inline int inet_is_local_reserved_port(struct net *net, int port) { - return test_bit(port, sysctl_local_reserved_ports); + if (!net->ipv4.sysctl_local_reserved_ports) + return 0; + return test_bit(port, net->ipv4.sysctl_local_reserved_ports); } +#else +static inline int inet_is_local_reserved_port(struct net *net, int port) +{ + return 0; +} +#endif extern int sysctl_ip_nonlocal_bind; diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 2f0cfad66666..aec5e12f9f19 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -84,6 +84,10 @@ struct netns_ipv4 { atomic_t dev_addr_genid; +#ifdef CONFIG_SYSCTL + unsigned long *sysctl_local_reserved_ports; +#endif + #ifdef CONFIG_IP_MROUTE #ifndef CONFIG_IP_MROUTE_MULTIPLE_TABLES struct mr_table *mrt; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 74f5b580fe34..e36ae4b15726 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2501,11 +2501,11 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, bool first = 1; size_t left = *lenp; unsigned long bitmap_len = table->maxlen; - unsigned long *bitmap = (unsigned long *) table->data; + unsigned long *bitmap = *(unsigned long **) table->data; unsigned long *tmp_bitmap = NULL; char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c; - if (!bitmap_len || !left || (*ppos && !write)) { + if (!bitmap || !bitmap_len || !left || (*ppos && !write)) { *lenp = 0; return 0; } diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 211c0cc6c3d3..279132bcadd9 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1705,13 +1705,9 @@ static int __init inet_init(void) BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb)); - sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); - if (!sysctl_local_reserved_ports) - goto out; - rc = proto_register(&tcp_prot, 1); if (rc) - goto out_free_reserved_ports; + goto out; rc = proto_register(&udp_prot, 1); if (rc) @@ -1821,8 +1817,6 @@ out_unregister_udp_proto: proto_unregister(&udp_prot); out_unregister_tcp_proto: proto_unregister(&tcp_prot); -out_free_reserved_ports: - kfree(sysctl_local_reserved_ports); goto out; } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 12e502cbfdc7..14d02ea905b6 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -29,9 +29,6 @@ const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; 
EXPORT_SYMBOL(inet_csk_timer_bug_msg); #endif -unsigned long *sysctl_local_reserved_ports; -EXPORT_SYMBOL(sysctl_local_reserved_ports); - void inet_get_local_port_range(struct net *net, int *low, int *high) { unsigned int seq; @@ -113,7 +110,7 @@ again: smallest_size = -1; do { - if (inet_is_reserved_local_port(rover)) + if (inet_is_local_reserved_port(net, rover)) goto next_nolock; head = &hashinfo->bhash[inet_bhashfn(net, rover, hashinfo->bhash_size)]; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 8b9cf279450d..83331f1b86ac 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -500,7 +500,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, local_bh_disable(); for (i = 1; i <= remaining; i++) { port = low + (i + offset) % remaining; - if (inet_is_reserved_local_port(port)) + if (inet_is_local_reserved_port(net, port)) continue; head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)]; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index a33b9fbc1d80..79a007c52558 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -436,13 +436,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "ip_local_reserved_ports", - .data = NULL, /* initialized in sysctl_ipv4_init */ - .maxlen = 65536, - .mode = 0644, - .proc_handler = proc_do_large_bitmap, - }, { .procname = "igmp_max_memberships", .data = &sysctl_igmp_max_memberships, @@ -824,6 +817,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = ipv4_local_port_range, }, + { + .procname = "ip_local_reserved_ports", + .data = &init_net.ipv4.sysctl_local_reserved_ports, + .maxlen = 65536, + .mode = 0644, + .proc_handler = proc_do_large_bitmap, + }, { .procname = "ip_no_pmtu_disc", .data = &init_net.ipv4.sysctl_ip_no_pmtu_disc, @@ -876,8 +876,14 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) if (net->ipv4.ipv4_hdr == NULL) goto err_reg; + net->ipv4.sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); + if (!net->ipv4.sysctl_local_reserved_ports) + goto err_ports; + return 0; +err_ports: + unregister_net_sysctl_table(net->ipv4.ipv4_hdr); err_reg: if (!net_eq(net, &init_net)) kfree(table); @@ -889,6 +895,7 @@ static __net_exit void ipv4_sysctl_exit_net(struct net *net) { struct ctl_table *table; + kfree(net->ipv4.sysctl_local_reserved_ports); table = net->ipv4.ipv4_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.ipv4_hdr); kfree(table); @@ -902,16 +909,6 @@ static __net_initdata struct pernet_operations ipv4_sysctl_ops = { static __init int sysctl_ipv4_init(void) { struct ctl_table_header *hdr; - struct ctl_table *i; - - for (i = ipv4_table; i->procname; i++) { - if (strcmp(i->procname, "ip_local_reserved_ports") == 0) { - i->data = sysctl_local_reserved_ports; - break; - } - } - if (!i->procname) - return -EINVAL; hdr = register_net_sysctl(&init_net, "net/ipv4", ipv4_table); if (hdr == NULL) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 54ea0a3a48f1..6729ea97a59d 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -246,7 +246,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, do { if (low <= snum && snum <= high && !test_bit(snum >> udptable->log, bitmap) && - !inet_is_reserved_local_port(snum)) + !inet_is_local_reserved_port(net, snum)) goto found; snum += rand; } while (snum != first); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index e37b2cbbf177..2af76eaba8f7 100644 --- a/net/sctp/socket.c 
+++ b/net/sctp/socket.c @@ -5946,8 +5946,9 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr) /* Search for an available port. */ int low, high, remaining, index; unsigned int rover; + struct net *net = sock_net(sk); - inet_get_local_port_range(sock_net(sk), &low, &high); + inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; rover = prandom_u32() % remaining + low; @@ -5955,7 +5956,7 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr) rover++; if ((rover < low) || (rover > high)) rover = low; - if (inet_is_reserved_local_port(rover)) + if (inet_is_local_reserved_port(net, rover)) continue; index = sctp_phashfn(sock_net(sk), rover); head = &sctp_port_hashtable[index]; -- cgit From 4449bf927b61bdb4389393c6fea6837214d1ace7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 6 May 2014 13:10:24 -0400 Subject: tracing: Add __bitmask() macro to trace events to cpumasks and other bitmasks Being able to show a cpumask of events can be useful as some events may affect only some CPUs. There is no standard way to record the cpumask and converting it to a string is rather expensive during the trace as traces happen in hotpaths. It would be better to record the raw event mask and be able to parse it at print time. The following macros were added for use with the TRACE_EVENT() macro: __bitmask() __assign_bitmask() __get_bitmask() To test this, I added this to the sched_migrate_task event, which looked like this: TRACE_EVENT(sched_migrate_task, TP_PROTO(struct task_struct *p, int dest_cpu, const struct cpumask *cpus), TP_ARGS(p, dest_cpu, cpus), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( int, prio ) __field( int, orig_cpu ) __field( int, dest_cpu ) __bitmask( cpumask, num_possible_cpus() ) ), TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; __entry->orig_cpu = task_cpu(p); __entry->dest_cpu = dest_cpu; __assign_bitmask(cpumask, cpumask_bits(cpus), num_possible_cpus()); ), TP_printk("comm=%s pid=%d prio=%d orig_cpu=%d dest_cpu=%d cpumask=%s", __entry->comm, __entry->pid, __entry->prio, __entry->orig_cpu, __entry->dest_cpu, __get_bitmask(cpumask)) ); With the output of: ksmtuned-3613 [003] d..2 485.220508: sched_migrate_task: comm=ksmtuned pid=3615 prio=120 orig_cpu=3 dest_cpu=2 cpumask=00000000,0000000f migration/1-13 [001] d..5 485.221202: sched_migrate_task: comm=ksmtuned pid=3614 prio=120 orig_cpu=1 dest_cpu=0 cpumask=00000000,0000000f awk-3615 [002] d.H5 485.221747: sched_migrate_task: comm=rcu_preempt pid=7 prio=120 orig_cpu=0 dest_cpu=1 cpumask=00000000,000000ff migration/2-18 [002] d..5 485.222062: sched_migrate_task: comm=ksmtuned pid=3615 prio=120 orig_cpu=2 dest_cpu=3 cpumask=00000000,0000000f Link: http://lkml.kernel.org/r/1399377998-14870-6-git-send-email-javi.merino@arm.com Link: http://lkml.kernel.org/r/20140506132238.22e136d1@gandalf.local.home Suggested-by: Javi Merino Tested-by: Javi Merino Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 3 +++ include/linux/trace_seq.h | 10 ++++++++ include/trace/ftrace.h | 57 +++++++++++++++++++++++++++++++++++++++++++- kernel/trace/trace_output.c | 41 +++++++++++++++++++++++++++++++ 4 files changed, 110 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index d16da3e53bc7..cff3106ffe2c 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -38,6 +38,9 
@@ const char *ftrace_print_symbols_seq_u64(struct trace_seq *p, *symbol_array); #endif +const char *ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, + unsigned int bitmask_size); + const char *ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int len); diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index a32d86ec8bf2..136116924d8d 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -46,6 +46,9 @@ extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, extern void *trace_seq_reserve(struct trace_seq *s, size_t len); extern int trace_seq_path(struct trace_seq *s, const struct path *path); +extern int trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, + int nmaskbits); + #else /* CONFIG_TRACING */ static inline int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) { @@ -57,6 +60,13 @@ trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) return 0; } +static inline int +trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, + int nmaskbits) +{ + return 0; +} + static inline int trace_print_seq(struct seq_file *m, struct trace_seq *s) { return 0; diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 0a1a4f7caf09..9b7a989dcbcc 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -53,6 +53,9 @@ #undef __string #define __string(item, src) __dynamic_array(char, item, -1) +#undef __bitmask +#define __bitmask(item, nr_bits) __dynamic_array(char, item, -1) + #undef TP_STRUCT__entry #define TP_STRUCT__entry(args...) args @@ -128,6 +131,9 @@ #undef __string #define __string(item, src) __dynamic_array(char, item, -1) +#undef __bitmask +#define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1) + #undef DECLARE_EVENT_CLASS #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ struct ftrace_data_offsets_##call { \ @@ -200,6 +206,15 @@ #undef __get_str #define __get_str(field) (char *)__get_dynamic_array(field) +#undef __get_bitmask +#define __get_bitmask(field) \ + ({ \ + void *__bitmask = __get_dynamic_array(field); \ + unsigned int __bitmask_size; \ + __bitmask_size = (__entry->__data_loc_##field >> 16) & 0xffff; \ + ftrace_print_bitmask_seq(p, __bitmask, __bitmask_size); \ + }) + #undef __print_flags #define __print_flags(flag, delim, flag_array...) \ ({ \ @@ -322,6 +337,9 @@ static struct trace_event_functions ftrace_event_type_funcs_##call = { \ #undef __string #define __string(item, src) __dynamic_array(char, item, -1) +#undef __bitmask +#define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1) + #undef DECLARE_EVENT_CLASS #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, func, print) \ static int notrace __init \ @@ -372,6 +390,29 @@ ftrace_define_fields_##call(struct ftrace_event_call *event_call) \ #define __string(item, src) __dynamic_array(char, item, \ strlen((src) ? (const char *)(src) : "(null)") + 1) +/* + * __bitmask_size_in_bytes_raw is the number of bytes needed to hold + * num_possible_cpus(). + */ +#define __bitmask_size_in_bytes_raw(nr_bits) \ + (((nr_bits) + 7) / 8) + +#define __bitmask_size_in_longs(nr_bits) \ + ((__bitmask_size_in_bytes_raw(nr_bits) + \ + ((BITS_PER_LONG / 8) - 1)) / (BITS_PER_LONG / 8)) + +/* + * __bitmask_size_in_bytes is the number of bytes needed to hold + * num_possible_cpus() padded out to the nearest long. This is what + * is saved in the buffer, just to be consistent. 
+ */ +#define __bitmask_size_in_bytes(nr_bits) \ + (__bitmask_size_in_longs(nr_bits) * (BITS_PER_LONG / 8)) + +#undef __bitmask +#define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, \ + __bitmask_size_in_longs(nr_bits)) + #undef DECLARE_EVENT_CLASS #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ static inline notrace int ftrace_get_offsets_##call( \ @@ -513,12 +554,22 @@ static inline notrace int ftrace_get_offsets_##call( \ __entry->__data_loc_##item = __data_offsets.item; #undef __string -#define __string(item, src) __dynamic_array(char, item, -1) \ +#define __string(item, src) __dynamic_array(char, item, -1) #undef __assign_str #define __assign_str(dst, src) \ strcpy(__get_str(dst), (src) ? (const char *)(src) : "(null)"); +#undef __bitmask +#define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1) + +#undef __get_bitmask +#define __get_bitmask(field) (char *)__get_dynamic_array(field) + +#undef __assign_bitmask +#define __assign_bitmask(dst, src, nr_bits) \ + memcpy(__get_bitmask(dst), (src), __bitmask_size_in_bytes(nr_bits)) + #undef TP_fast_assign #define TP_fast_assign(args...) args @@ -586,6 +637,7 @@ static inline void ftrace_test_probe_##call(void) \ #undef __print_hex #undef __get_dynamic_array #undef __get_str +#undef __get_bitmask #undef TP_printk #define TP_printk(fmt, args...) "\"" fmt "\", " __stringify(args) @@ -651,6 +703,9 @@ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call #undef __get_str #define __get_str(field) (char *)__get_dynamic_array(field) +#undef __get_bitmask +#define __get_bitmask(field) (char *)__get_dynamic_array(field) + #undef __perf_addr #define __perf_addr(a) (__addr = (a)) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index a436de18aa99..f3dad80c20b2 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -125,6 +125,34 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) } EXPORT_SYMBOL_GPL(trace_seq_printf); +/** + * trace_seq_bitmask - put a list of longs as a bitmask print output + * @s: trace sequence descriptor + * @maskp: points to an array of unsigned longs that represent a bitmask + * @nmaskbits: The number of bits that are valid in @maskp + * + * Returns 0 if the buffer has no free space left, 1 otherwise. + * + * Writes an ASCII representation of the bitmask into @s.
+ */ +int +trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, + int nmaskbits) +{ + int len = (PAGE_SIZE - 1) - s->len; + int ret; + + if (s->full || !len) + return 0; + + ret = bitmap_scnprintf(s->buffer + s->len, len, maskp, nmaskbits); + s->len += ret; + + return 1; +} +EXPORT_SYMBOL_GPL(trace_seq_bitmask); + /** * trace_seq_vprintf - sequence printing of trace information * @s: trace sequence descriptor @@ -398,6 +426,19 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, EXPORT_SYMBOL(ftrace_print_symbols_seq_u64); #endif +const char * +ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, + unsigned int bitmask_size) +{ + const char *ret = p->buffer + p->len; + + trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8); + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL_GPL(ftrace_print_bitmask_seq); + const char * ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) { -- cgit From 8f577cadf7181243d336be9aba40c1bcc02c4c54 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 13 May 2014 19:50:47 -0700 Subject: seccomp: JIT compile seccomp filter Take advantage of the internal BPF JIT. 05-sim-long_jumps.c of libseccomp was used as a micro-benchmark: seccomp_rule_add_exact(ctx,... seccomp_rule_add_exact(ctx,... rc = seccomp_load(ctx); for (i = 0; i < 10000000; i++) syscall(...); $ sudo sysctl net.core.bpf_jit_enable=1 $ time ./bench real 0m2.769s user 0m1.136s sys 0m1.624s $ sudo sysctl net.core.bpf_jit_enable=0 $ time ./bench real 0m5.825s user 0m1.268s sys 0m4.548s Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/seccomp.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index b35c21503a36..7e02d624cc50 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -54,8 +54,7 @@ struct seccomp_filter { atomic_t usage; struct seccomp_filter *prev; - unsigned short len; /* Instruction count */ - struct sock_filter_int insnsi[]; + struct sk_filter *prog; }; /* Limit any path through the tree to 256KB worth of instructions. */ @@ -189,7 +188,8 @@ static u32 seccomp_run_filters(int syscall) * value always takes priority (ignoring the DATA).
*/ for (f = current->seccomp.filter; f; f = f->prev) { - u32 cur_ret = sk_run_filter_int_seccomp(&sd, f->insnsi); + u32 cur_ret = SK_RUN_FILTER(f->prog, (void *)&sd); + if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) ret = cur_ret; } @@ -215,7 +215,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) return -EINVAL; for (filter = current->seccomp.filter; filter; filter = filter->prev) - total_insns += filter->len + 4; /* include a 4 instr penalty */ + total_insns += filter->prog->len + 4; /* include a 4 instr penalty */ if (total_insns > MAX_INSNS_PER_PATH) return -ENOMEM; @@ -256,19 +256,27 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) /* Allocate a new seccomp_filter */ ret = -ENOMEM; - filter = kzalloc(sizeof(struct seccomp_filter) + - sizeof(struct sock_filter_int) * new_len, + filter = kzalloc(sizeof(struct seccomp_filter), GFP_KERNEL|__GFP_NOWARN); if (!filter) goto free_prog; - ret = sk_convert_filter(fp, fprog->len, filter->insnsi, &new_len); - if (ret) + filter->prog = kzalloc(sk_filter_size(new_len), + GFP_KERNEL|__GFP_NOWARN); + if (!filter->prog) goto free_filter; + + ret = sk_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len); + if (ret) + goto free_filter_prog; kfree(fp); atomic_set(&filter->usage, 1); - filter->len = new_len; + filter->prog->len = new_len; + filter->prog->bpf_func = (void *)sk_run_filter_int_seccomp; + + /* JIT internal BPF into native HW instructions */ + bpf_int_jit_compile(filter->prog); /* * If there is an existing filter, make it the prev and don't drop its @@ -278,6 +286,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) current->seccomp.filter = filter; return 0; +free_filter_prog: + kfree(filter->prog); free_filter: kfree(filter); free_prog: @@ -330,6 +340,7 @@ void put_seccomp_filter(struct task_struct *tsk) while (orig && atomic_dec_and_test(&orig->usage)) { struct seccomp_filter *freeme = orig; orig = orig->prev; + bpf_jit_free(freeme->prog); kfree(freeme); } } -- cgit From 1f0b63866fc1be700260547be8edf8e6f0af37f2 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 15 May 2014 23:29:57 +0200 Subject: ACPI / PM: Hold ACPI scan lock over the "freeze" sleep state The "freeze" sleep state suffers from the same issue that was addressed by commit ad07277e82de (ACPI / PM: Hold acpi_scan_lock over system PM transitions) for ACPI sleep states, that is, things break if ->remove() is called for devices whose system resume callbacks haven't been executed yet. It also can be addressed in the same way, by holding the ACPI scan lock over the "freeze" sleep state and PM transitions to and from that state, but ->begin() and ->end() platform operations for the "freeze" sleep state are needed for this purpose. This change has been tested on Acer Aspire S5 with Thunderbolt. Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/sleep.c | 18 ++++++++++++++++++ include/linux/suspend.h | 7 +++++++ kernel/power/suspend.c | 15 +++++++++++++++ 3 files changed, 40 insertions(+) (limited to 'kernel') diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index 2281ca31c1bc..c11e3795431b 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -612,6 +612,22 @@ static const struct platform_suspend_ops acpi_suspend_ops_old = { .recover = acpi_pm_finish, }; +static int acpi_freeze_begin(void) +{ + acpi_scan_lock_acquire(); + return 0; +} + +static void acpi_freeze_end(void) +{ + acpi_scan_lock_release(); +} + +static const struct platform_freeze_ops acpi_freeze_ops = { + .begin = acpi_freeze_begin, + .end = acpi_freeze_end, +}; + static void acpi_sleep_suspend_setup(void) { int i; @@ -622,7 +638,9 @@ static void acpi_sleep_suspend_setup(void) suspend_set_ops(old_suspend_ordering ? &acpi_suspend_ops_old : &acpi_suspend_ops); + freeze_set_ops(&acpi_freeze_ops); } + #else /* !CONFIG_SUSPEND */ static inline void acpi_sleep_suspend_setup(void) {} #endif /* !CONFIG_SUSPEND */ diff --git a/include/linux/suspend.h b/include/linux/suspend.h index f73cabf59012..91d66fd8dce1 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -187,6 +187,11 @@ struct platform_suspend_ops { void (*recover)(void); }; +struct platform_freeze_ops { + int (*begin)(void); + void (*end)(void); +}; + #ifdef CONFIG_SUSPEND /** * suspend_set_ops - set platform dependent suspend operations @@ -194,6 +199,7 @@ struct platform_suspend_ops { */ extern void suspend_set_ops(const struct platform_suspend_ops *ops); extern int suspend_valid_only_mem(suspend_state_t state); +extern void freeze_set_ops(const struct platform_freeze_ops *ops); extern void freeze_wake(void); /** @@ -220,6 +226,7 @@ extern int pm_suspend(suspend_state_t state); static inline void suspend_set_ops(const struct platform_suspend_ops *ops) {} static inline int pm_suspend(suspend_state_t state) { return -ENOSYS; } +static inline void freeze_set_ops(const struct platform_freeze_ops *ops) {} static inline void freeze_wake(void) {} #endif /* !CONFIG_SUSPEND */ diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 8233cd4047d7..73a905f83972 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -38,6 +38,7 @@ const char *const pm_states[PM_SUSPEND_MAX] = { }; static const struct platform_suspend_ops *suspend_ops; +static const struct platform_freeze_ops *freeze_ops; static bool need_suspend_ops(suspend_state_t state) { @@ -47,6 +48,13 @@ static bool need_suspend_ops(suspend_state_t state) static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); static bool suspend_freeze_wake; +void freeze_set_ops(const struct platform_freeze_ops *ops) +{ + lock_system_sleep(); + freeze_ops = ops; + unlock_system_sleep(); +} + static void freeze_begin(void) { suspend_freeze_wake = false; @@ -269,6 +277,10 @@ int suspend_devices_and_enter(suspend_state_t state) error = suspend_ops->begin(state); if (error) goto Close; + } else if (state == PM_SUSPEND_FREEZE && freeze_ops->begin) { + error = freeze_ops->begin(); + if (error) + goto Close; } suspend_console(); suspend_test_start(); @@ -294,6 +306,9 @@ int suspend_devices_and_enter(suspend_state_t state) Close: if (need_suspend_ops(state) && suspend_ops->end) suspend_ops->end(); + else if (state == PM_SUSPEND_FREEZE && freeze_ops->end) + freeze_ops->end(); + trace_machine_suspend(PWR_EVENT_EXIT); return error; -- cgit From 7b6ef1262549f6afc5c881aaef80beb8fd15f908 Mon Sep 17 00:00:00 2001 From: 
Thomas Gleixner Date: Wed, 7 May 2014 15:44:05 +0000 Subject: genirq: Provide generic hwirq allocation facility Not really the solution to the problem, but at least it confines the mess in the core code and allows us to get rid of the create/destroy_irq variants from hell, i.e. 3 implementations with different semantics plus the x86-specific variants __create_irqs and create_irq_nr which have been invented in another circle of hell. x86: x86 should be converted to irq domains and I'm deliberately making it impossible to do the multi-vector MSI support by adding more crap to the current mess. It's not that hard to do and I'm really tired of the trainwrecks which have been invented by band-aid engineering so far. Any attempt to do multi-vector MSI or ioapic hotplug without converting to irq domains is hereby NAKed. tile: Might use irq domains as well, but it has a very limited interrupt space, so handling it via this functionality might be the right thing to do even in the long run. ia64: That's a hopeless case, as I doubt that anyone has the stomach to rewrite the homebrew dynamic allocation facilities. I stared at it for a couple of hours and gave up. The create/destroy_irq mess could be made private to itanic right away if it weren't for the iommu/dmar driver being shared with x86. So to do that I'm going to add a separate ia64-specific implementation later in order not to deep-six itanic right away. Signed-off-by: Thomas Gleixner Reviewed-by: Grant Likely Cc: Tony Luck Cc: Peter Zijlstra Cc: Chris Metcalf Cc: Fenghua Yu Cc: x86@kernel.org Link: http://lkml.kernel.org/r/20140507154334.208629358@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 15 +++++++++++++++ kernel/irq/Kconfig | 5 +++++ kernel/irq/irqdesc.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 5c57efb863d0..c75dd161d37f 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -637,6 +637,21 @@ static inline int irq_reserve_irq(unsigned int irq) return irq_reserve_irqs(irq, 1); } +#ifdef CONFIG_GENERIC_IRQ_LEGACY_ALLOC_HWIRQ +unsigned int irq_alloc_hwirqs(int cnt, int node); +static inline unsigned int irq_alloc_hwirq(int node) +{ + return irq_alloc_hwirqs(1, node); +} +void irq_free_hwirqs(unsigned int from, int cnt); +static inline void irq_free_hwirq(unsigned int irq) +{ + return irq_free_hwirqs(irq, 1); +} +int arch_setup_hwirq(unsigned int irq, int node); +void arch_teardown_hwirq(unsigned int irq); +#endif + #ifndef irq_reg_writel # define irq_reg_writel(val, addr) writel(val, addr) #endif diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 07cbdfea9ae2..a83f10e406c1 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -17,6 +17,11 @@ config GENERIC_IRQ_SHOW config GENERIC_IRQ_SHOW_LEVEL bool +# Facility to allocate a hardware interrupt. This is legacy support +# and should not be used in new code. Use irq domains instead.
+config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ + bool + # Support for delayed migration from interrupt context config GENERIC_PENDING_IRQ bool diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index bb07f2928f4b..f388ade5e792 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -396,6 +396,57 @@ err: } EXPORT_SYMBOL_GPL(__irq_alloc_descs); +#ifdef CONFIG_GENERIC_IRQ_LEGACY_ALLOC_HWIRQ +/** + * irq_alloc_hwirqs - Allocate an irq descriptor and initialize the hardware + * @cnt: number of interrupts to allocate + * @node: node on which to allocate + * + * Returns an interrupt number > 0, or 0 if the allocation fails. + */ +unsigned int irq_alloc_hwirqs(int cnt, int node) +{ + int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL); + + if (irq < 0) + return 0; + + for (i = irq; cnt > 0; i++, cnt--) { + if (arch_setup_hwirq(i, node)) + goto err; + irq_clear_status_flags(i, _IRQ_NOREQUEST); + } + return irq; + +err: + for (i--; i >= irq; i--) { + irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE); + arch_teardown_hwirq(i); + } + irq_free_descs(irq, cnt); + return 0; +} +EXPORT_SYMBOL_GPL(irq_alloc_hwirqs); + +/** + * irq_free_hwirqs - Free irq descriptors and clean up the hardware + * @from: Free from irq number + * @cnt: number of interrupts to free + */ +void irq_free_hwirqs(unsigned int from, int cnt) +{ + int i; + + for (i = from; cnt > 0; i++, cnt--) { + irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE); + arch_teardown_hwirq(i); + } + irq_free_descs(from, cnt); +} +EXPORT_SYMBOL_GPL(irq_free_hwirqs); +#endif + /** * irq_reserve_irqs - mark irqs allocated * @from: mark from irq number -- cgit From f63b6a05f2b11537612266a8b27a61f412344a1d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 7 May 2014 15:44:21 +0000 Subject: genirq: Replace reserve_irqs in core code We want to get rid of the public interface. Signed-off-by: Thomas Gleixner Reviewed-by: Grant Likely Tested-by: Tony Luck Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20140507154340.061990194@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 5 ++--- kernel/irq/internals.h | 6 ++++++ kernel/irq/irqdesc.c | 7 +++++++ 3 files changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6397df2d6945..a2b28a2fd7b1 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -40,10 +40,9 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip) irq_put_desc_unlock(desc, flags); /* * For !CONFIG_SPARSE_IRQ make the irq show up in - * allocated_irqs. For the CONFIG_SPARSE_IRQ case, it is - * already marked, and this call is harmless.
*/ - irq_reserve_irq(irq); + irq_mark_irq(irq); return 0; } EXPORT_SYMBOL(irq_set_chip); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index ddf1ffeb79f1..c4065e3edbc3 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -76,6 +76,12 @@ extern void mask_irq(struct irq_desc *desc); extern void unmask_irq(struct irq_desc *desc); extern void unmask_threaded_irq(struct irq_desc *desc); +#ifdef CONFIG_SPARSE_IRQ +static inline void irq_mark_irq(unsigned int irq) { } +#else +extern void irq_mark_irq(unsigned int irq); +#endif + extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index f388ade5e792..24029348729b 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -299,6 +299,13 @@ static int irq_expand_nr_irqs(unsigned int nr) return -ENOMEM; } +void irq_mark_irq(unsigned int irq) +{ + mutex_lock(&sparse_irq_lock); + bitmap_set(allocated_irqs, irq, 1); + mutex_unlock(&sparse_irq_lock); +} + #endif /* !CONFIG_SPARSE_IRQ */ /** -- cgit From 1d008353ba088fdec0b2a944e140ff9154a5fb20 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 7 May 2014 15:44:21 +0000 Subject: genirq: Remove irq_reserve_irq[s] No more users. And it's not going to come back. If you need hotplugable irq chips, use irq domains. Signed-off-by: Thomas Gleixner Reviewed-and-acked-by: Grant Likely Tested-by: Tony Luck Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20140507154340.302183048@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 7 ------- kernel/irq/irqdesc.c | 25 ------------------------- 2 files changed, 32 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index ac9634286f42..2110f46fcafa 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -617,18 +617,11 @@ int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, irq_alloc_descs(-1, from, cnt, node) void irq_free_descs(unsigned int irq, unsigned int cnt); -int irq_reserve_irqs(unsigned int from, unsigned int cnt); - static inline void irq_free_desc(unsigned int irq) { irq_free_descs(irq, 1); } -static inline int irq_reserve_irq(unsigned int irq) -{ - return irq_reserve_irqs(irq, 1); -} - #ifdef CONFIG_GENERIC_IRQ_LEGACY_ALLOC_HWIRQ unsigned int irq_alloc_hwirqs(int cnt, int node); static inline unsigned int irq_alloc_hwirq(int node) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 24029348729b..d514ed6080e1 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -454,31 +454,6 @@ void irq_free_hwirqs(unsigned int from, int cnt) EXPORT_SYMBOL_GPL(irq_free_hwirqs); #endif -/** - * irq_reserve_irqs - mark irqs allocated - * @from: mark from irq number - * @cnt: number of irqs to mark - * - * Returns 0 on success or an appropriate error code - */ -int irq_reserve_irqs(unsigned int from, unsigned int cnt) -{ - unsigned int start; - int ret = 0; - - if (!cnt || (from + cnt) > nr_irqs) - return -EINVAL; - - mutex_lock(&sparse_irq_lock); - start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); - if (start == from) - bitmap_set(allocated_irqs, start, cnt); - else - ret = -EEXIST; - mutex_unlock(&sparse_irq_lock); - return ret; -} - /** * irq_get_next_irq - get next allocated irq number * @offset: where to start the search -- cgit From c940e01c94e73a2a5318f1b82038e0746aaec753 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 7 May 
2014 15:44:22 +0000 Subject: genirq: Replace dynamic_irq_init/cleanup Create a new interface and confine it with a config switch which makes clear that this is just legacy support and not to be used for new code. Signed-off-by: Thomas Gleixner Reviewed-by: Grant Likely Tested-by: Tony Luck Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20140507154340.574437049@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 4 ++++ kernel/irq/Kconfig | 4 ++++ kernel/irq/irqdesc.c | 7 +++++++ 3 files changed, 15 insertions(+) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 2110f46fcafa..8ff71d14365a 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -637,6 +637,10 @@ int arch_setup_hwirq(unsigned int irq, int node); void arch_teardown_hwirq(unsigned int irq); #endif +#ifdef CONFIG_GENERIC_IRQ_LEGACY +void irq_init_desc(unsigned int irq); +#endif + #ifndef irq_reg_writel # define irq_reg_writel(val, addr) writel(val, addr) #endif diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index a83f10e406c1..d269cecdfbf0 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -5,6 +5,10 @@ menu "IRQ subsystem" config MAY_HAVE_SPARSE_IRQ bool +# Legacy support, required for itanic +config GENERIC_IRQ_LEGACY + bool + # Enable the generic irq autoprobe mechanism config GENERIC_IRQ_PROBE bool diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index d514ed6080e1..7f267799a717 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -306,6 +306,13 @@ void irq_mark_irq(unsigned int irq) mutex_unlock(&sparse_irq_lock); } +#ifdef CONFIG_GENERIC_IRQ_LEGACY +void irq_init_desc(unsigned int irq) +{ + dynamic_irq_cleanup(irq); +} +#endif + #endif /* !CONFIG_SPARSE_IRQ */ /** -- cgit From d8179bc0db8d0c9654d5de43de2874bf6d0a58fa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 7 May 2014 15:44:23 +0000 Subject: genirq: Remove dynamic_irq mess No more users. Get rid of the cruft. Signed-off-by: Thomas Gleixner Reviewed-by: Grant Likely Tested-by: Tony Luck Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20140507154341.012847637@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 10 ---------- kernel/irq/irqdesc.c | 23 +++++++---------------- 2 files changed, 7 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 8ff71d14365a..0d998d8b01d8 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -525,16 +525,6 @@ static inline void irq_set_percpu_devid_flags(unsigned int irq) IRQ_NOPROBE | IRQ_PER_CPU_DEVID); } -/* - * Dynamic irq helper functions. Obsolete. Use irq_alloc_desc* and - * irq_free_desc instead. 
- */ -extern void dynamic_irq_cleanup(unsigned int irq); -static inline void dynamic_irq_init(unsigned int irq) -{ - dynamic_irq_cleanup(irq); -} - /* Set/get chip/data for an IRQ: */ extern int irq_set_chip(unsigned int irq, struct irq_chip *chip); extern int irq_set_handler_data(unsigned int irq, void *data); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 7f267799a717..7339e42a85ab 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -278,7 +278,12 @@ EXPORT_SYMBOL(irq_to_desc); static void free_desc(unsigned int irq) { - dynamic_irq_cleanup(irq); + struct irq_desc *desc = irq_to_desc(irq); + unsigned long flags; + + raw_spin_lock_irqsave(&desc->lock, flags); + desc_set_defaults(irq, desc, desc_node(desc), NULL); + raw_spin_unlock_irqrestore(&desc->lock, flags); } static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, @@ -309,7 +314,7 @@ void irq_mark_irq(unsigned int irq) #ifdef CONFIG_GENERIC_IRQ_LEGACY void irq_init_desc(unsigned int irq) { - dynamic_irq_cleanup(irq); + free_desc(irq); } #endif @@ -522,20 +527,6 @@ int irq_set_percpu_devid(unsigned int irq) return 0; } -/** - * dynamic_irq_cleanup - cleanup a dynamically allocated irq - * @irq: irq number to initialize - */ -void dynamic_irq_cleanup(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - - raw_spin_lock_irqsave(&desc->lock, flags); - desc_set_defaults(irq, desc, desc_node(desc), NULL); - raw_spin_unlock_irqrestore(&desc->lock, flags); -} - void kstat_incr_irq_this_cpu(unsigned int irq) { kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); -- cgit From 3b514d24e200fcdcde0a57c354a51d3677a86743 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:47 -0400 Subject: cgroup: skip refcnting on normal root csses and cgrp_dfl_root self css 9395a4500404 ("cgroup: enable refcnting for root csses") enabled reference counting for root csses (cgroup_subsys_states) so that cgroup's self csses can be used to manage the lifetime of the containing cgroups. Unfortunately, this change was incorrect. During early init, the cgrp_dfl_root self css refcnt is used. A percpu_ref can't be initialized during early init, and its initialization is deferred till cgroup_init() time. This means that the cpu controller was using a percpu_ref which wasn't properly initialized. Due to the way percpu variables are laid out on x86, this didn't blow up immediately on x86 but ended up incrementing and decrementing the percpu variable at offset zero, whatever it may be; however, on other archs, this caused a fault and an early boot failure. As cgroup self csses for root cgroups of non-dfl hierarchies need working refcounting, we can't revert 9395a4500404. This patch adds CSS_NO_REF, which explicitly inhibits reference counting on the css, and sets it on all normal (non-self) root csses and on the cgrp_dfl_root self css. v2: cgrp_dfl_root.self is the offending one. Set the flag on it.
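For illustration only (not part of the patch), the effect of CSS_NO_REF boils down to a flag test in front of every ref operation, which makes statically allocated, never-destroyed objects safe to "get" before their refcount is ever initialized. A minimal user-space analogue, with hypothetical names (struct obj, obj_get/obj_put, OBJ_NO_REF) standing in for css, css_get/css_put and the new flag:

#include <stdatomic.h>
#include <stdio.h>

#define OBJ_NO_REF (1 << 0)	/* plays the role of CSS_NO_REF */

struct obj {
	unsigned int flags;
	atomic_int refcnt;	/* stands in for the css percpu_ref */
};

static void obj_get(struct obj *o)
{
	if (!(o->flags & OBJ_NO_REF))	/* permanent objects skip refcounting */
		atomic_fetch_add(&o->refcnt, 1);
}

static void obj_put(struct obj *o)
{
	if (!(o->flags & OBJ_NO_REF))
		atomic_fetch_sub(&o->refcnt, 1);
}

/* a "root": lives forever, flagged before any refcount init runs */
static struct obj root = { .flags = OBJ_NO_REF };

int main(void)
{
	obj_get(&root);	/* no-op: safe even though refcnt was never set up */
	obj_put(&root);
	printf("root refcnt untouched: %d\n", atomic_load(&root.refcnt));
	return 0;
}

This mirrors the early-init window described above: css_get() on the cgrp_dfl_root self css becomes a no-op instead of touching an uninitialized percpu_ref.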
Signed-off-by: Tejun Heo Reported-by: Stephen Warren Tested-by: Stephen Warren Fixes: 9395a4500404 ("cgroup: enable refcnting for root csses") --- include/linux/cgroup.h | 11 ++++++++--- kernel/cgroup.c | 11 +++++++++-- 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 76dadd77a120..1737db0c63fe 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -77,6 +77,7 @@ struct cgroup_subsys_state { /* bits in struct cgroup_subsys_state flags field */ enum { + CSS_NO_REF = (1 << 0), /* no reference counting for this css */ CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */ }; @@ -88,7 +89,8 @@ enum { */ static inline void css_get(struct cgroup_subsys_state *css) { - percpu_ref_get(&css->refcnt); + if (!(css->flags & CSS_NO_REF)) + percpu_ref_get(&css->refcnt); } /** @@ -103,7 +105,9 @@ static inline void css_get(struct cgroup_subsys_state *css) */ static inline bool css_tryget_online(struct cgroup_subsys_state *css) { - return percpu_ref_tryget_live(&css->refcnt); + if (!(css->flags & CSS_NO_REF)) + return percpu_ref_tryget_live(&css->refcnt); + return true; } /** @@ -114,7 +118,8 @@ static inline bool css_tryget_online(struct cgroup_subsys_state *css) */ static inline void css_put(struct cgroup_subsys_state *css) { - percpu_ref_put(&css->refcnt); + if (!(css->flags & CSS_NO_REF)) + percpu_ref_put(&css->refcnt); } /* bits in struct cgroup flags field */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c01e8e8dfad0..0343d7ee6d62 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4593,11 +4593,17 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_and_link_css(css, ss, &cgrp_dfl_root.cgrp); + + /* + * Root csses are never destroyed and we can't initialize + * percpu_ref during early init. Disable refcnting. + */ + css->flags |= CSS_NO_REF; + if (early) { /* allocation can't be done safely during early init */ css->id = 1; } else { - BUG_ON(percpu_ref_init(&css->refcnt, css_release)); css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); BUG_ON(css->id < 0); } @@ -4636,6 +4642,8 @@ int __init cgroup_init_early(void) int i; init_cgroup_root(&cgrp_dfl_root, &opts); + cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF; + RCU_INIT_POINTER(init_task.cgroups, &init_css_set); for_each_subsys(ss, i) { @@ -4684,7 +4692,6 @@ int __init cgroup_init(void) struct cgroup_subsys_state *css = init_css_set.subsys[ss->id]; - BUG_ON(percpu_ref_init(&css->refcnt, css_release)); css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); BUG_ON(css->id < 0); -- cgit From 5c9d535b893f30266ea29fe377cb9b002fcd76aa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:48 -0400 Subject: cgroup: remove css_parent() cgroup in general is moving towards using cgroup_subsys_state as the fundamental structural component and css_parent() was introduced to convert from using cgroup->parent to css->parent. It was quite some time ago and we're moving forward with making css more prominent. This patch drops the trivial wrapper css_parent() and let the users dereference css->parent. While at it, explicitly mark fields of css which are public and immutable. v2: New usage from device_cgroup.c converted. Signed-off-by: Tejun Heo Acked-by: Michal Hocko Acked-by: Neil Horman Acked-by: "David S. 
Miller" Acked-by: Li Zefan Cc: Vivek Goyal Cc: Jens Axboe Cc: Peter Zijlstra Cc: Johannes Weiner --- block/blk-cgroup.h | 2 +- include/linux/cgroup.h | 29 +++++++++++------------------ kernel/cgroup.c | 8 ++++---- kernel/cgroup_freezer.c | 2 +- kernel/cpuset.c | 2 +- kernel/sched/core.c | 2 +- kernel/sched/cpuacct.c | 2 +- mm/hugetlb_cgroup.c | 2 +- mm/memcontrol.c | 14 +++++++------- net/core/netclassid_cgroup.c | 2 +- net/core/netprio_cgroup.c | 2 +- security/device_cgroup.c | 8 ++++---- 12 files changed, 34 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 371fe8e92ab5..d692b29c083a 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -204,7 +204,7 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) */ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) { - return css_to_blkcg(css_parent(&blkcg->css)); + return css_to_blkcg(blkcg->css.parent); } /** diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 1737db0c63fe..2549493d518d 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -48,22 +48,28 @@ enum cgroup_subsys_id { }; #undef SUBSYS -/* Per-subsystem/per-cgroup state maintained by the system. */ +/* + * Per-subsystem/per-cgroup state maintained by the system. This is the + * fundamental structural building block that controllers deal with. + * + * Fields marked with "PI:" are public and immutable and may be accessed + * directly without synchronization. + */ struct cgroup_subsys_state { - /* the cgroup that this css is attached to */ + /* PI: the cgroup that this css is attached to */ struct cgroup *cgroup; - /* the cgroup subsystem that this css is attached to */ + /* PI: the cgroup subsystem that this css is attached to */ struct cgroup_subsys *ss; /* reference count - access via css_[try]get() and css_put() */ struct percpu_ref refcnt; - /* the parent css */ + /* PI: the parent css */ struct cgroup_subsys_state *parent; /* - * Subsys-unique ID. 0 is unused and root is always 1. The + * PI: Subsys-unique ID. 0 is unused and root is always 1. The * matching css can be looked up using css_from_id(). */ int id; @@ -669,19 +675,6 @@ struct cgroup_subsys { #include #undef SUBSYS -/** - * css_parent - find the parent css - * @css: the target cgroup_subsys_state - * - * Return the parent css of @css. This function is guaranteed to return - * non-NULL parent as long as @css isn't the root. 
- */ -static inline -struct cgroup_subsys_state *css_parent(struct cgroup_subsys_state *css) -{ - return css->parent; -} - /** * task_css_set_check - obtain a task's css_set with extra access conditions * @task: the task to obtain css_set for diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0343d7ee6d62..929bbbc539e9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3176,10 +3176,10 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, /* no child, visit my or the closest ancestor's next sibling */ while (pos != root) { - next = css_next_child(pos, css_parent(pos)); + next = css_next_child(pos, pos->parent); if (next) return next; - pos = css_parent(pos); + pos = pos->parent; } return NULL; @@ -3261,12 +3261,12 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, return NULL; /* if there's an unvisited sibling, visit its leftmost descendant */ - next = css_next_child(pos, css_parent(pos)); + next = css_next_child(pos, pos->parent); if (next) return css_leftmost_descendant(next); /* no sibling left, visit parent */ - return css_parent(pos); + return pos->parent; } static bool cgroup_has_live_children(struct cgroup *cgrp) diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 6b4e60e33a9a..a79e40f9d700 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -59,7 +59,7 @@ static inline struct freezer *task_freezer(struct task_struct *task) static struct freezer *parent_freezer(struct freezer *freezer) { - return css_freezer(css_parent(&freezer->css)); + return css_freezer(freezer->css.parent); } bool cgroup_freezing(struct task_struct *task) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 2f4b08b8db24..5b2a31082f4f 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -124,7 +124,7 @@ static inline struct cpuset *task_cs(struct task_struct *task) static inline struct cpuset *parent_cs(struct cpuset *cs) { - return css_cs(css_parent(&cs->css)); + return css_cs(cs->css.parent); } #ifdef CONFIG_NUMA diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 268a45ea238c..ac61ad1a5f9f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7586,7 +7586,7 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); - struct task_group *parent = css_tg(css_parent(css)); + struct task_group *parent = css_tg(css->parent); if (parent) sched_online_group(tg, parent); diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index c143ee380e3a..9cf350c94ec4 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -46,7 +46,7 @@ static inline struct cpuacct *task_ca(struct task_struct *tsk) static inline struct cpuacct *parent_ca(struct cpuacct *ca) { - return css_ca(css_parent(&ca->css)); + return css_ca(ca->css.parent); } static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index a380681ab3cf..493f758445e7 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -52,7 +52,7 @@ static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) static inline struct hugetlb_cgroup * parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg) { - return hugetlb_cgroup_from_css(css_parent(&h_cg->css)); + return hugetlb_cgroup_from_css(h_cg->css.parent); } static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b638a79209ee..a5e0417b4f9a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1540,7 
+1540,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) int mem_cgroup_swappiness(struct mem_cgroup *memcg) { /* root ? */ - if (!css_parent(&memcg->css)) + if (!memcg->css.parent) return vm_swappiness; return memcg->swappiness; @@ -4909,7 +4909,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, { int retval = 0; struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent_memcg = mem_cgroup_from_css(css_parent(&memcg->css)); + struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent); mutex_lock(&memcg_create_mutex); @@ -5207,8 +5207,8 @@ static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, if (!memcg->use_hierarchy) goto out; - while (css_parent(&memcg->css)) { - memcg = mem_cgroup_from_css(css_parent(&memcg->css)); + while (memcg->css.parent) { + memcg = mem_cgroup_from_css(memcg->css.parent); if (!memcg->use_hierarchy) break; tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); @@ -5443,7 +5443,7 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); + struct mem_cgroup *parent = mem_cgroup_from_css(memcg->css.parent); if (val > 100 || !parent) return -EINVAL; @@ -5790,7 +5790,7 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); + struct mem_cgroup *parent = mem_cgroup_from_css(memcg->css.parent); /* cannot set to root cgroup and only 0 and 1 are allowed */ if (!parent || !((val == 0) || (val == 1))) @@ -6407,7 +6407,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css)); + struct mem_cgroup *parent = mem_cgroup_from_css(css->parent); if (css->id > MEM_CGROUP_ID_MAX) return -ENOSPC; diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 22931e1b99b4..30d903b19c62 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -42,7 +42,7 @@ cgrp_css_alloc(struct cgroup_subsys_state *parent_css) static int cgrp_css_online(struct cgroup_subsys_state *css) { struct cgroup_cls_state *cs = css_cls_state(css); - struct cgroup_cls_state *parent = css_cls_state(css_parent(css)); + struct cgroup_cls_state *parent = css_cls_state(css->parent); if (parent) cs->classid = parent->classid; diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index b990cefd906b..2f385b9bccc0 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -140,7 +140,7 @@ cgrp_css_alloc(struct cgroup_subsys_state *parent_css) static int cgrp_css_online(struct cgroup_subsys_state *css) { - struct cgroup_subsys_state *parent_css = css_parent(css); + struct cgroup_subsys_state *parent_css = css->parent; struct net_device *dev; int ret = 0; diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 7dbac4061b1c..ce14a31b1337 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -182,7 +182,7 @@ static inline bool is_devcg_online(const struct dev_cgroup *devcg) static int devcgroup_online(struct cgroup_subsys_state *css) { struct dev_cgroup *dev_cgroup = css_to_devcgroup(css); - struct dev_cgroup *parent_dev_cgroup = 
css_to_devcgroup(css_parent(css)); + struct dev_cgroup *parent_dev_cgroup = css_to_devcgroup(css->parent); int ret = 0; mutex_lock(&devcgroup_mutex); @@ -455,7 +455,7 @@ static bool verify_new_ex(struct dev_cgroup *dev_cgroup, static int parent_has_perm(struct dev_cgroup *childcg, struct dev_exception_item *ex) { - struct dev_cgroup *parent = css_to_devcgroup(css_parent(&childcg->css)); + struct dev_cgroup *parent = css_to_devcgroup(childcg->css.parent); if (!parent) return 1; @@ -476,7 +476,7 @@ static int parent_has_perm(struct dev_cgroup *childcg, static bool parent_allows_removal(struct dev_cgroup *childcg, struct dev_exception_item *ex) { - struct dev_cgroup *parent = css_to_devcgroup(css_parent(&childcg->css)); + struct dev_cgroup *parent = css_to_devcgroup(childcg->css.parent); if (!parent) return true; @@ -614,7 +614,7 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup, char temp[12]; /* 11 + 1 characters needed for a u32 */ int count, rc = 0; struct dev_exception_item ex; - struct dev_cgroup *parent = css_to_devcgroup(css_parent(&devcgroup->css)); + struct dev_cgroup *parent = css_to_devcgroup(devcgroup->css.parent); if (!capable(CAP_SYS_ADMIN)) return -EPERM; -- cgit From d51f39b05ce0008118c45945e681b20484990571 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:48 -0400 Subject: cgroup: remove cgroup->parent cgroup->parent is redundant as cgroup->self.parent can also be used to determine the parent cgroup and we're moving towards using cgroup_subsys_states as the fundamental structural blocks. This patch introduces cgroup_parent() which follows cgroup->self.parent and removes cgroup->parent. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 1 - kernel/cgroup.c | 52 +++++++++++++++++++++++++++++--------------------- 2 files changed, 30 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 2549493d518d..fd538f4c2bb6 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -178,7 +178,6 @@ struct cgroup { struct list_head sibling; /* my parent's children */ struct list_head children; /* my children */ - struct cgroup *parent; /* my parent */ struct kernfs_node *kn; /* cgroup kernfs entry */ struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 929bbbc539e9..8c67a739aea4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -218,6 +218,15 @@ static void cgroup_idr_remove(struct idr *idr, int id) spin_unlock_bh(&cgroup_idr_lock); } +static struct cgroup *cgroup_parent(struct cgroup *cgrp) +{ + struct cgroup_subsys_state *parent_css = cgrp->self.parent; + + if (parent_css) + return container_of(parent_css, struct cgroup, self); + return NULL; +} + /** * cgroup_css - obtain a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest @@ -260,9 +269,9 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, if (!(cgrp->root->subsys_mask & (1 << ss->id))) return NULL; - while (cgrp->parent && - !(cgrp->parent->child_subsys_mask & (1 << ss->id))) - cgrp = cgrp->parent; + while (cgroup_parent(cgrp) && + !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id))) + cgrp = cgroup_parent(cgrp); return cgroup_css(cgrp, ss); } @@ -307,7 +316,7 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) while (cgrp) { if (cgrp == ancestor) return true; - cgrp = cgrp->parent; + cgrp = cgroup_parent(cgrp); } return false; } @@ -454,7 +463,7 @@ static 
void cgroup_update_populated(struct cgroup *cgrp, bool populated) if (cgrp->populated_kn) kernfs_notify(cgrp->populated_kn); - cgrp = cgrp->parent; + cgrp = cgroup_parent(cgrp); } while (cgrp); } @@ -2018,7 +2027,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, * Except for the root, child_subsys_mask must be zero for a cgroup * with tasks so that child cgroups don't compete against tasks. */ - if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && dst_cgrp->parent && + if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) && dst_cgrp->child_subsys_mask) return -EBUSY; @@ -2427,7 +2436,7 @@ static int cgroup_controllers_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - cgroup_print_ss_mask(seq, cgrp->parent->child_subsys_mask); + cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask); return 0; } @@ -2610,8 +2619,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, /* unavailable or not enabled on the parent? */ if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || - (cgrp->parent && - !(cgrp->parent->child_subsys_mask & (1 << ssid)))) { + (cgroup_parent(cgrp) && + !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) { ret = -ENOENT; goto out_unlock; } @@ -2640,7 +2649,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, * Except for the root, child_subsys_mask must be zero for a cgroup * with tasks so that child cgroups don't compete against tasks. */ - if (enable && cgrp->parent && !list_empty(&cgrp->cset_links)) { + if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) { ret = -EBUSY; goto out_unlock; } @@ -2898,9 +2907,9 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], continue; if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) continue; - if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) + if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp)) continue; - if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) + if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp)) continue; if (is_add) { @@ -4092,14 +4101,14 @@ static void css_free_work_fn(struct work_struct *work) atomic_dec(&cgrp->root->nr_cgrps); cgroup_pidlist_destroy_all(cgrp); - if (cgrp->parent) { + if (cgroup_parent(cgrp)) { /* * We get a ref to the parent, and put the ref when * this cgroup is being freed, so it's guaranteed * that the parent won't be destroyed before its * children. */ - cgroup_put(cgrp->parent); + cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); kfree(cgrp); } else { @@ -4163,8 +4172,8 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css->ss = ss; css->flags = 0; - if (cgrp->parent) { - css->parent = cgroup_css(cgrp->parent, ss); + if (cgroup_parent(cgrp)) { + css->parent = cgroup_css(cgroup_parent(cgrp), ss); css_get(css->parent); } @@ -4218,7 +4227,7 @@ static void offline_css(struct cgroup_subsys_state *css) */ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { - struct cgroup *parent = cgrp->parent; + struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *css; int err; @@ -4251,7 +4260,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) goto err_clear_dir; if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && - parent->parent) { + cgroup_parent(parent)) { pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. 
Nested cgroups may change behavior in the future.\n", current->comm, current->pid, ss->name); if (!strcmp(ss->name, "memory")) @@ -4309,7 +4318,6 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, init_cgroup_housekeeping(cgrp); - cgrp->parent = parent; cgrp->self.parent = &parent->self; cgrp->root = root; @@ -4336,7 +4344,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, cgrp->serial_nr = cgroup_serial_nr_next++; /* allocation complete, commit to creation */ - list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); + list_add_tail_rcu(&cgrp->sibling, &cgroup_parent(cgrp)->children); atomic_inc(&root->nr_cgrps); cgroup_get(parent); @@ -4531,8 +4539,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ kernfs_remove(cgrp->kn); - set_bit(CGRP_RELEASABLE, &cgrp->parent->flags); - check_for_release(cgrp->parent); + set_bit(CGRP_RELEASABLE, &cgroup_parent(cgrp)->flags); + check_for_release(cgroup_parent(cgrp)); /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); -- cgit From d5c419b68e368fdd9f1857bf8d4bb4480edb9b80 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:48 -0400 Subject: cgroup: move cgroup->sibling and ->children into cgroup_subsys_state We're moving towards using cgroup_subsys_states as the fundamental structural blocks. Let's move cgroup->sibling and ->children into cgroup_subsys_state. This is pure move without functional change and only cgroup->self's fields are actually used. Other csses will make use of the fields later. While at it, update init_and_link_css() so that it zeroes the whole css before initializing it and remove explicit zeroing of ->flags. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 11 ++++------- kernel/cgroup.c | 38 ++++++++++++++++++++------------------ 2 files changed, 24 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index fd538f4c2bb6..cf8ba26b7c6e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -68,6 +68,10 @@ struct cgroup_subsys_state { /* PI: the parent css */ struct cgroup_subsys_state *parent; + /* siblings list anchored at the parent's ->children */ + struct list_head sibling; + struct list_head children; + /* * PI: Subsys-unique ID. 0 is unused and root is always 1. The * matching css can be looked up using css_from_id(). @@ -171,13 +175,6 @@ struct cgroup { */ int populated_cnt; - /* - * We link our 'sibling' struct into our parent's 'children'. - * Our children link their 'sibling' into our 'children'. 
- */ - struct list_head sibling; /* my parent's children */ - struct list_head children; /* my children */ - struct kernfs_node *kn; /* cgroup kernfs entry */ struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8c67a739aea4..5385839e727b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -378,7 +378,7 @@ static int notify_on_release(const struct cgroup *cgrp) /* iterate over child cgrps, lock should be held throughout iteration */ #define cgroup_for_each_live_child(child, cgrp) \ - list_for_each_entry((child), &(cgrp)->children, sibling) \ + list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ if (({ lockdep_assert_held(&cgroup_mutex); \ cgroup_is_dead(child); })) \ ; \ @@ -870,7 +870,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) mutex_lock(&cgroup_mutex); BUG_ON(atomic_read(&root->nr_cgrps)); - BUG_ON(!list_empty(&cgrp->children)); + BUG_ON(!list_empty(&cgrp->self.children)); /* Rebind all subsystems back to the default hierarchy */ rebind_subsystems(&cgrp_dfl_root, root->subsys_mask); @@ -1432,7 +1432,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) } /* remounting is not allowed for populated hierarchies */ - if (!list_empty(&root->cgrp.children)) { + if (!list_empty(&root->cgrp.self.children)) { ret = -EBUSY; goto out_unlock; } @@ -1512,8 +1512,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) struct cgroup_subsys *ss; int ssid; - INIT_LIST_HEAD(&cgrp->sibling); - INIT_LIST_HEAD(&cgrp->children); + INIT_LIST_HEAD(&cgrp->self.sibling); + INIT_LIST_HEAD(&cgrp->self.children); INIT_LIST_HEAD(&cgrp->cset_links); INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); @@ -1612,7 +1612,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) link_css_set(&tmp_links, cset, root_cgrp); up_write(&css_set_rwsem); - BUG_ON(!list_empty(&root_cgrp->children)); + BUG_ON(!list_empty(&root_cgrp->self.children)); BUG_ON(atomic_read(&root->nr_cgrps) != 1); kernfs_activate(root_cgrp->kn); @@ -3128,11 +3128,11 @@ css_next_child(struct cgroup_subsys_state *pos_css, * cgroup is removed or iteration and removal race. */ if (!pos) { - next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling); + next = list_entry_rcu(cgrp->self.children.next, struct cgroup, self.sibling); } else if (likely(!cgroup_is_dead(pos))) { - next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); + next = list_entry_rcu(pos->self.sibling.next, struct cgroup, self.sibling); } else { - list_for_each_entry_rcu(next, &cgrp->children, sibling) + list_for_each_entry_rcu(next, &cgrp->self.children, self.sibling) if (next->serial_nr > pos->serial_nr) break; } @@ -3142,12 +3142,12 @@ css_next_child(struct cgroup_subsys_state *pos_css, * the next sibling; however, it might have @ss disabled. If so, * fast-forward to the next enabled one. 
*/ - while (&next->sibling != &cgrp->children) { + while (&next->self.sibling != &cgrp->self.children) { struct cgroup_subsys_state *next_css = cgroup_css(next, parent_css->ss); if (next_css) return next_css; - next = list_entry_rcu(next->sibling.next, struct cgroup, sibling); + next = list_entry_rcu(next->self.sibling.next, struct cgroup, self.sibling); } return NULL; } @@ -3283,7 +3283,7 @@ static bool cgroup_has_live_children(struct cgroup *cgrp) struct cgroup *child; rcu_read_lock(); - list_for_each_entry_rcu(child, &cgrp->children, sibling) { + list_for_each_entry_rcu(child, &cgrp->self.children, self.sibling) { if (!cgroup_is_dead(child)) { rcu_read_unlock(); return true; @@ -4144,7 +4144,7 @@ static void css_release_work_fn(struct work_struct *work) } else { /* cgroup release path */ mutex_lock(&cgroup_mutex); - list_del_rcu(&cgrp->sibling); + list_del_rcu(&cgrp->self.sibling); mutex_unlock(&cgroup_mutex); cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); @@ -4168,9 +4168,11 @@ static void init_and_link_css(struct cgroup_subsys_state *css, { cgroup_get(cgrp); + memset(css, 0, sizeof(*css)); css->cgroup = cgrp; css->ss = ss; - css->flags = 0; + INIT_LIST_HEAD(&css->sibling); + INIT_LIST_HEAD(&css->children); if (cgroup_parent(cgrp)) { css->parent = cgroup_css(cgroup_parent(cgrp), ss); @@ -4344,7 +4346,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, cgrp->serial_nr = cgroup_serial_nr_next++; /* allocation complete, commit to creation */ - list_add_tail_rcu(&cgrp->sibling, &cgroup_parent(cgrp)->children); + list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children); atomic_inc(&root->nr_cgrps); cgroup_get(parent); @@ -4507,9 +4509,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) return -EBUSY; /* - * Make sure there's no live children. We can't test ->children - * emptiness as dead children linger on it while being destroyed; - * otherwise, "rmdir parent/child parent" may fail with -EBUSY. + * Make sure there's no live children. We can't test emptiness of + * ->self.children as dead children linger on it while being + * drained; otherwise, "rmdir parent/child parent" may fail. */ if (cgroup_has_live_children(cgrp)) return -EBUSY; -- cgit From 1fed1b2e36ba1aa0257004a97e75bbdb70f216b5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:49 -0400 Subject: cgroup: link all cgroup_subsys_states in their sibling lists Currently, while all csses have ->children and ->sibling, only the self csses of cgroups make use of them. This patch makes all other csses link themselves on the sibling lists too. This will be used to update css iteration.
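As a rough sketch of what this enables (illustrative, not from the patch; visit_children() and the visit callback are hypothetical), the children of any css, not just the self css, can now be walked directly under RCU through the newly populated links:

/*
 * Walk the ->children list that this patch starts populating for
 * every css. Field names follow the kernel code above; the caller
 * supplies the per-child callback.
 */
static void visit_children(struct cgroup_subsys_state *parent_css,
			   void (*visit)(struct cgroup_subsys_state *))
{
	struct cgroup_subsys_state *child;

	rcu_read_lock();
	list_for_each_entry_rcu(child, &parent_css->children, sibling)
		visit(child);
	rcu_read_unlock();
}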
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5385839e727b..dcb06e181ce4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4138,19 +4138,21 @@ static void css_release_work_fn(struct work_struct *work) struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; + mutex_lock(&cgroup_mutex); + + list_del_rcu(&css->sibling); + if (ss) { /* css release path */ cgroup_idr_remove(&ss->css_idr, css->id); } else { /* cgroup release path */ - mutex_lock(&cgroup_mutex); - list_del_rcu(&cgrp->self.sibling); - mutex_unlock(&cgroup_mutex); - cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; } + mutex_unlock(&cgroup_mutex); + call_rcu(&css->rcu_head, css_free_rcu_fn); } @@ -4230,12 +4232,13 @@ static void offline_css(struct cgroup_subsys_state *css) static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { struct cgroup *parent = cgroup_parent(cgrp); + struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); struct cgroup_subsys_state *css; int err; lockdep_assert_held(&cgroup_mutex); - css = ss->css_alloc(cgroup_css(parent, ss)); + css = ss->css_alloc(parent_css); if (IS_ERR(css)) return PTR_ERR(css); @@ -4255,11 +4258,12 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) goto err_free_id; /* @css is ready to be brought online now, make it visible */ + list_add_tail_rcu(&css->sibling, &parent_css->children); cgroup_idr_replace(&ss->css_idr, css, css->id); err = online_css(css); if (err) - goto err_clear_dir; + goto err_list_del; if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && cgroup_parent(parent)) { @@ -4272,7 +4276,8 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) return 0; -err_clear_dir: +err_list_del: + list_del_rcu(&css->sibling); cgroup_clear_dir(css->cgroup, 1 << css->ss->id); err_free_id: cgroup_idr_remove(&ss->css_idr, css->id); -- cgit From 0cb51d71c1fa9234afe4213089844be76ec1765a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:49 -0400 Subject: cgroup: move cgroup->serial_nr into cgroup_subsys_state We're moving towards using cgroup_subsys_states as the fundamental structural blocks. All csses including the cgroup->self and actual ones now form trees through css->children and ->sibling which follow the same rules as what cgroup->children and ->sibling followed. This patch moves cgroup->serial_nr which is used to implement css iteration into css. Note that all csses, regardless of their types, allocate their serial numbers from the same monotonically increasing counter. This doesn't affect the ordering needed by css iteration or cause any other material behavior changes. This will be used to update css iteration. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 16 ++++++++-------- kernel/cgroup.c | 20 +++++++++++--------- 2 files changed, 19 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index cf8ba26b7c6e..ebe7ce49f4b7 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -80,6 +80,14 @@ struct cgroup_subsys_state { unsigned int flags; + /* + * Monotonically increasing unique serial number which defines a + * uniform order among all csses. It's guaranteed that all + * ->children lists are in the ascending order of ->serial_nr and + * used to allow interrupting and resuming iterations. 
+ */ + u64 serial_nr; + /* percpu_ref killing and RCU release */ struct rcu_head rcu_head; struct work_struct destroy_work; @@ -178,14 +186,6 @@ struct cgroup { struct kernfs_node *kn; /* cgroup kernfs entry */ struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */ - /* - * Monotonically increasing unique serial number which defines a - * uniform order among all cgroups. It's guaranteed that all - * ->children lists are in the ascending order of ->serial_nr. - * It's used to allow interrupting and resuming iterations. - */ - u64 serial_nr; - /* the bitmask of subsystems enabled on the child cgroups */ unsigned int child_subsys_mask; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index dcb06e181ce4..d5af128ec1ec 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -157,14 +157,13 @@ static int cgroup_root_count; static DEFINE_IDR(cgroup_hierarchy_idr); /* - * Assign a monotonically increasing serial number to cgroups. It - * guarantees cgroups with bigger numbers are newer than those with smaller - * numbers. Also, as cgroups are always appended to the parent's - * ->children list, it guarantees that sibling cgroups are always sorted in - * the ascending serial number order on the list. Protected by - * cgroup_mutex. + * Assign a monotonically increasing serial number to csses. It guarantees + * cgroups with bigger numbers are newer than those with smaller numbers. + * Also, as csses are always appended to the parent's ->children list, it + * guarantees that sibling csses are always sorted in the ascending serial + * number order on the list. Protected by cgroup_mutex. */ -static u64 cgroup_serial_nr_next = 1; +static u64 css_serial_nr_next = 1; /* This flag indicates whether tasks in the fork and exit paths should * check for fork/exit handlers to call. This avoids us having to do @@ -3133,7 +3132,7 @@ css_next_child(struct cgroup_subsys_state *pos_css, next = list_entry_rcu(pos->self.sibling.next, struct cgroup, self.sibling); } else { list_for_each_entry_rcu(next, &cgrp->self.children, self.sibling) - if (next->serial_nr > pos->serial_nr) + if (next->self.serial_nr > pos->self.serial_nr) break; } @@ -4168,6 +4167,8 @@ static void css_release(struct percpu_ref *ref) static void init_and_link_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, struct cgroup *cgrp) { + lockdep_assert_held(&cgroup_mutex); + cgroup_get(cgrp); memset(css, 0, sizeof(*css)); @@ -4175,6 +4176,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css->ss = ss; INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); + css->serial_nr = css_serial_nr_next++; if (cgroup_parent(cgrp)) { css->parent = cgroup_css(cgroup_parent(cgrp), ss); @@ -4348,7 +4350,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, */ kernfs_get(kn); - cgrp->serial_nr = cgroup_serial_nr_next++; + cgrp->self.serial_nr = css_serial_nr_next++; /* allocation complete, commit to creation */ list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children); -- cgit From de3f034182ecbf0efbcef7ab8b253c6c3049a592 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:49 -0400 Subject: cgroup: introduce CSS_RELEASED and reduce css iteration fallback window css iterations allow the caller to drop RCU read lock. As long as the caller keeps the current position accessible, it can simply re-grab RCU read lock later and continue iteration. 
This is achieved by using CGRP_DEAD to detect whether the current position's next pointer is safe to dereference and, if not, re-iterate from the beginning to the next position using ->serial_nr. CGRP_DEAD is used as the marker to invalidate the next pointer and the only requirement is that the marker is set before the next sibling starts its RCU grace period. Because CGRP_DEAD is set at the end of cgroup_destroy_locked() but the cgroup is unlinked when the reference count reaches zero, we currently have a rather large window where this fallback re-iteration logic can be triggered. This patch introduces CSS_RELEASED which is set when a css is unlinked from its sibling list. This still keeps the re-iteration logic working while drastically reducing the window of its activation. While at it, rewrite the comment in css_next_child() to reflect the new flag and better explain the synchronization. This will also enable iterating csses directly instead of through cgroups. v2: CSS_RELEASED now assigned to 1 << 2 as 1 << 0 is used by CSS_NO_REF. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 1 + kernel/cgroup.c | 41 ++++++++++++++++++++--------------------- 2 files changed, 21 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ebe7ce49f4b7..5375582ea5f6 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -97,6 +97,7 @@ struct cgroup_subsys_state { enum { CSS_NO_REF = (1 << 0), /* no reference counting for this css */ CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */ + CSS_RELEASED = (1 << 2), /* refcnt reached zero, released */ }; /** diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d5af128ec1ec..5544e685f2da 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3108,27 +3108,28 @@ css_next_child(struct cgroup_subsys_state *pos_css, cgroup_assert_mutex_or_rcu_locked(); /* - * @pos could already have been removed. Once a cgroup is removed, - * its ->sibling.next is no longer updated when its next sibling - * changes. As CGRP_DEAD assertion is serialized and happens - * before the cgroup is taken off the ->sibling list, if we see it - * unasserted, it's guaranteed that the next sibling hasn't - * finished its grace period even if it's already removed, and thus - * safe to dereference from this RCU critical section. If - * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed - * to be visible as %true here. + * @pos could already have been unlinked from the sibling list. + * Once a cgroup is removed, its ->sibling.next is no longer + * updated when its next sibling changes. CSS_RELEASED is set when + * @pos is taken off list, at which time its next pointer is valid, + * and, as releases are serialized, the one pointed to by the next + * pointer is guaranteed to not have started release yet. This + * implies that if we observe !CSS_RELEASED on @pos in this RCU + * critical section, the one pointed to by its next pointer is + * guaranteed to not have finished its RCU grace period even if we + * have dropped rcu_read_lock() in between iterations. * - * If @pos is dead, its next pointer can't be dereferenced; - * however, as each cgroup is given a monotonically increasing - * unique serial number and always appended to the sibling list, - * the next one can be found by walking the parent's children until - * we see a cgroup with higher serial number than @pos's.
While - * this path can be slower, it's taken only when either the current - * cgroup is removed or iteration and removal race. + * If @pos has CSS_RELEASED set, its next pointer can't be + * dereferenced; however, as each css is given a monotonically + * increasing unique serial number and always appended to the + * sibling list, the next one can be found by walking the parent's + * children until the first css with higher serial number than + * @pos's. While this path can be slower, it happens iff iteration + * races against release and the race window is very small. */ if (!pos) { next = list_entry_rcu(cgrp->self.children.next, struct cgroup, self.sibling); - } else if (likely(!cgroup_is_dead(pos))) { + } else if (likely(!(pos->self.flags & CSS_RELEASED))) { next = list_entry_rcu(pos->self.sibling.next, struct cgroup, self.sibling); } else { list_for_each_entry_rcu(next, &cgrp->self.children, self.sibling) @@ -4139,6 +4140,7 @@ static void css_release_work_fn(struct work_struct *work) mutex_lock(&cgroup_mutex); + css->flags |= CSS_RELEASED; list_del_rcu(&css->sibling); if (ss) { @@ -4525,10 +4527,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* * Mark @cgrp dead. This prevents further task migration and child - * creation by disabling cgroup_lock_live_group(). Note that - * CGRP_DEAD assertion is depended upon by css_next_child() to - * resume iteration after dropping RCU read lock. See - * css_next_child() for details. + * creation by disabling cgroup_lock_live_group(). */ set_bit(CGRP_DEAD, &cgrp->flags); -- cgit From c2931b70a32c705b9bd5762f5044f9eac8a52bb3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:51 -0400 Subject: cgroup: iterate cgroup_subsys_states directly Currently, css_next_child() is implemented as finding the next child cgroup which has the css enabled, which used to be the only way to do it as only cgroups participated in sibling lists and thus could be iteratd. This works as long as what's required during iteration is not missing online csses; however, it turns out that there are use cases where offlined but not yet released csses need to be iterated. This is difficult to implement through cgroup iteration the unified hierarchy as there may be multiple dying csses for the same subsystem associated with single cgroup. After the recent changes, the cgroup self and regular csses behave identically in how they're linked and unlinked from the sibling lists including assertion of CSS_RELEASED and css_next_child() can simply switch to iterating csses directly. This both simplifies the logic and ensures that all visible non-released csses are included in the iteration whether there are multiple dying csses for a subsystem or not. As all other iterators depend on css_next_child() for sibling iteration, this changes behaviors of all css iterators. Add and update explanations on the css states which are included in traversal to all iterators. As css iteration could always contain offlined csses, this shouldn't break any of the current users and new usages which need iteration of all on and offline csses can make use of the new semantics. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Johannes Weiner --- include/linux/cgroup.h | 44 ++++++++++++++++++++--------------- kernel/cgroup.c | 62 ++++++++++++++++++++++++++++++-------------------- 2 files changed, 63 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 5375582ea5f6..f2ff578fc03a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -764,14 +764,14 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); * @pos: the css * to use as the loop cursor * @parent: css whose children to walk * - * Walk @parent's children. Must be called under rcu_read_lock(). A child - * css which hasn't finished ->css_online() or already has finished - * ->css_offline() may show up during traversal and it's each subsystem's - * responsibility to verify that each @pos is alive. + * Walk @parent's children. Must be called under rcu_read_lock(). * - * If a subsystem synchronizes against the parent in its ->css_online() and - * before starting iterating, a css which finished ->css_online() is - * guaranteed to be visible in the future iterations. + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. * * It is allowed to temporarily drop RCU read lock during iteration. The * caller is responsible for ensuring that @pos remains accessible until @@ -794,17 +794,16 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos); * @root: css whose descendants to walk * * Walk @root's descendants. @root is included in the iteration and the - * first node to be visited. Must be called under rcu_read_lock(). A - * descendant css which hasn't finished ->css_online() or already has - * finished ->css_offline() may show up during traversal and it's each - * subsystem's responsibility to verify that each @pos is alive. + * first node to be visited. Must be called under rcu_read_lock(). * - * If a subsystem synchronizes against the parent in its ->css_online() and - * before starting iterating, and synchronizes against @pos on each - * iteration, any descendant css which finished ->css_online() is - * guaranteed to be visible in the future iterations. + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. * - * In other words, the following guarantees that a descendant can't escape + * For example, the following guarantees that a descendant can't escape * state updates of its ancestors. * * my_online(@css) @@ -860,8 +859,17 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, * * Similar to css_for_each_descendant_pre() but performs post-order * traversal instead. @root is included in the iteration and the last - * node to be visited. Note that the walk visibility guarantee described - * in pre-order walk doesn't apply the same to post-order walks. + * node to be visited. 
+ * + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. + * + * Note that the walk visibility guarantee example described in pre-order + * walk doesn't apply the same to post-order walks. */ #define css_for_each_descendant_post(pos, css) \ for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5544e685f2da..097a1fc1e1e8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3089,21 +3089,25 @@ static int cgroup_task_count(const struct cgroup *cgrp) /** * css_next_child - find the next child of a given css - * @pos_css: the current position (%NULL to initiate traversal) - * @parent_css: css whose children to walk + * @pos: the current position (%NULL to initiate traversal) + * @parent: css whose children to walk * - * This function returns the next child of @parent_css and should be called + * This function returns the next child of @parent and should be called * under either cgroup_mutex or RCU read lock. The only requirement is - * that @parent_css and @pos_css are accessible. The next sibling is - * guaranteed to be returned regardless of their states. + * that @parent and @pos are accessible. The next sibling is guaranteed to + * be returned regardless of their states. + * + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. */ -struct cgroup_subsys_state * -css_next_child(struct cgroup_subsys_state *pos_css, - struct cgroup_subsys_state *parent_css) +struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *parent) { - struct cgroup *pos = pos_css ? pos_css->cgroup : NULL; - struct cgroup *cgrp = parent_css->cgroup; - struct cgroup *next; + struct cgroup_subsys_state *next; cgroup_assert_mutex_or_rcu_locked(); @@ -3128,27 +3132,21 @@ css_next_child(struct cgroup_subsys_state *pos_css, * races against release and the race window is very small. */ if (!pos) { - next = list_entry_rcu(cgrp->self.children.next, struct cgroup, self.sibling); - } else if (likely(!(pos->self.flags & CSS_RELEASED))) { - next = list_entry_rcu(pos->self.sibling.next, struct cgroup, self.sibling); + next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling); + } else if (likely(!(pos->flags & CSS_RELEASED))) { + next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling); } else { - list_for_each_entry_rcu(next, &cgrp->self.children, self.sibling) - if (next->self.serial_nr > pos->self.serial_nr) + list_for_each_entry_rcu(next, &parent->children, sibling) + if (next->serial_nr > pos->serial_nr) break; } /* * @next, if not pointing to the head, can be dereferenced and is - * the next sibling; however, it might have @ss disabled. If so, - * fast-forward to the next enabled one. + * the next sibling. 
*/ - while (&next->self.sibling != &cgrp->self.children) { - struct cgroup_subsys_state *next_css = cgroup_css(next, parent_css->ss); - - if (next_css) - return next_css; - next = list_entry_rcu(next->self.sibling.next, struct cgroup, self.sibling); - } + if (&next->sibling != &parent->children) + return next; return NULL; } @@ -3165,6 +3163,13 @@ css_next_child(struct cgroup_subsys_state *pos_css, * doesn't require the whole traversal to be contained in a single critical * section. This function will return the correct next descendant as long * as both @pos and @root are accessible and @pos is a descendant of @root. + * + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. */ struct cgroup_subsys_state * css_next_descendant_pre(struct cgroup_subsys_state *pos, @@ -3252,6 +3257,13 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos) * section. This function will return the correct next descendant as long * as both @pos and @cgroup are accessible and @pos is a descendant of * @cgroup. + * + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. */ struct cgroup_subsys_state * css_next_descendant_post(struct cgroup_subsys_state *pos, -- cgit From 184faf32328c65c9d86b19577b8d8b90bdd2cd2e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:51 -0400 Subject: cgroup: use CSS_ONLINE instead of CGRP_DEAD Use CSS_ONLINE on the self css to indicate whether a cgroup has been killed instead of CGRP_DEAD. This will allow re-using css online test for cgroup liveliness test. This doesn't introduce any functional change. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 2 -- kernel/cgroup.c | 7 ++++--- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index f2ff578fc03a..51a339c99eb6 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -143,8 +143,6 @@ static inline void css_put(struct cgroup_subsys_state *css) /* bits in struct cgroup flags field */ enum { - /* Control Group is dead */ - CGRP_DEAD, /* * Control Group has previously had a child cgroup or a task, * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 097a1fc1e1e8..004004fd0ded 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -278,7 +278,7 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, /* convenient tests for these bits */ static inline bool cgroup_is_dead(const struct cgroup *cgrp) { - return test_bit(CGRP_DEAD, &cgrp->flags); + return !(cgrp->self.flags & CSS_ONLINE); } struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) @@ -1518,6 +1518,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); cgrp->self.cgroup = cgrp; + cgrp->self.flags |= CSS_ONLINE; for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); @@ -4541,13 +4542,13 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * Mark @cgrp dead. This prevents further task migration and child * creation by disabling cgroup_lock_live_group(). */ - set_bit(CGRP_DEAD, &cgrp->flags); + cgrp->self.flags &= ~CSS_ONLINE; /* initiate massacre of all css's */ for_each_css(css, ssid, cgrp) kill_css(css); - /* CGRP_DEAD is set, remove from ->release_list for the last time */ + /* CSS_ONLINE is clear, remove from ->release_list for the last time */ raw_spin_lock(&release_list_lock); if (!list_empty(&cgrp->release_list)) list_del_init(&cgrp->release_list); -- cgit From f3d4650015301d1c880df4523f7e7ef320a38aab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:52 -0400 Subject: cgroup: convert cgroup_has_live_children() into css_has_online_children() Now that cgroup liveliness and css onliness are the same state, convert cgroup_has_live_children() into css_has_online_children() so that it can be used for actual csses too. The function now uses css_for_each_child() for iteration and is published. 
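A hypothetical caller sketch (the helper is real, the surrounding check is illustrative): a path that must not proceed while any child css is still online can now simply do

	if (css_has_online_children(&cgrp->self))
		return -EBUSY;

which is exactly the shape cgroup_destroy_locked() takes in the hunk below.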
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 2 ++ kernel/cgroup.c | 32 ++++++++++++++++++++------------ 2 files changed, 22 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 51a339c99eb6..b76999954beb 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -873,6 +873,8 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ (pos) = css_next_descendant_post((pos), (css))) +bool css_has_online_children(struct cgroup_subsys_state *css); + /* A css_task_iter should be treated as an opaque object */ struct css_task_iter { struct cgroup_subsys *ss; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 004004fd0ded..082bb842b11a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -175,7 +175,6 @@ static int need_forkexit_callback __read_mostly; static struct cftype cgroup_base_files[]; static void cgroup_put(struct cgroup *cgrp); -static bool cgroup_has_live_children(struct cgroup *cgrp); static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask); static int cgroup_destroy_locked(struct cgroup *cgrp); @@ -1769,7 +1768,7 @@ static void cgroup_kill_sb(struct super_block *sb) * This prevents new mounts by disabling percpu_ref_tryget_live(). * cgroup_mount() may wait for @root's release. */ - if (cgroup_has_live_children(&root->cgrp)) + if (css_has_online_children(&root->cgrp.self)) cgroup_put(&root->cgrp); else percpu_ref_kill(&root->cgrp.self.refcnt); @@ -3291,19 +3290,28 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, return pos->parent; } -static bool cgroup_has_live_children(struct cgroup *cgrp) +/** + * css_has_online_children - does a css have online children + * @css: the target css + * + * Returns %true if @css has any online children; otherwise, %false. This + * function can be called from any context but the caller is responsible + * for synchronizing against on/offlining as necessary. + */ +bool css_has_online_children(struct cgroup_subsys_state *css) { - struct cgroup *child; + struct cgroup_subsys_state *child; + bool ret = false; rcu_read_lock(); - list_for_each_entry_rcu(child, &cgrp->self.children, self.sibling) { - if (!cgroup_is_dead(child)) { - rcu_read_unlock(); - return true; + css_for_each_child(child, css) { + if (css->flags & CSS_ONLINE) { + ret = true; + break; } } rcu_read_unlock(); - return false; + return ret; } /** @@ -4535,7 +4543,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * ->self.children as dead children linger on it while being * drained; otherwise, "rmdir parent/child parent" may fail. */ - if (cgroup_has_live_children(cgrp)) + if (css_has_online_children(&cgrp->self)) return -EBUSY; /* @@ -5014,8 +5022,8 @@ void cgroup_exit(struct task_struct *tsk) static void check_for_release(struct cgroup *cgrp) { - if (cgroup_is_releasable(cgrp) && - list_empty(&cgrp->cset_links) && !cgroup_has_live_children(cgrp)) { + if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) && + !css_has_online_children(&cgrp->self)) { /* * Control Group is currently removeable. If it's not * already queued for a userspace notification, queue -- cgit From f6514be5fe7fe796041b673bad769510414ff2b9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 14 May 2014 19:08:46 +0300 Subject: PM / hibernate: Fix memory corruption in resumedelay_setup() In the original code "resume_delay" is an int so on 64 bits, the call to kstrtoul() will cause memory corruption. 
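Schematically (illustration only): kstrtoul() stores a full unsigned long through the pointer it is given, which is 8 bytes on 64-bit, while the target was a 4-byte int:

	static int resume_delay;				/* 4 bytes */
	kstrtoul(str, 0, (unsigned long *)&resume_delay);	/* writes 8 bytes */

So the store spills into whatever happens to sit next to the variable.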
We may as well fix a style issue here and make "resume_delay" unsigned int, since that's what we pass to ssleep(). Fixes: 317cf7e5e85e (PM / hibernate: convert simple_strtoul to kstrtoul) Signed-off-by: Dan Carpenter Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 2377ff72994c..df88d55dc436 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -35,7 +35,7 @@ static int nocompress; static int noresume; static int resume_wait; -static int resume_delay; +static unsigned int resume_delay; static char resume_file[256] = CONFIG_PM_STD_PARTITION; dev_t swsusp_resume_device; sector_t swsusp_resume_block; @@ -1115,7 +1115,7 @@ static int __init resumewait_setup(char *str) static int __init resumedelay_setup(char *str) { - int rc = kstrtoul(str, 0, (unsigned long *)&resume_delay); + int rc = kstrtouint(str, 0, &resume_delay); if (rc) return rc; -- cgit From 8c7260c0d9a6fa327546af724fb48375df493326 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Fri, 16 May 2014 23:26:02 +0200 Subject: sparc64: fix sparse warning in tsb.c Fix the following warning: tsb.c:290:5: warning: symbol 'sysctl_tsb_ratio' was not declared. Should it be static? Add an extern declaration in asm/setup.h and remove the local declaration in kernel/sysctl.c. Signed-off-by: Sam Ravnborg Signed-off-by: David S. Miller --- arch/sparc/include/asm/setup.h | 1 + arch/sparc/mm/tsb.c | 1 + kernel/sysctl.c | 4 ---- 3 files changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/arch/sparc/include/asm/setup.h b/arch/sparc/include/asm/setup.h index b5f5686ca57e..0c4f69528d0d 100644 --- a/arch/sparc/include/asm/setup.h +++ b/arch/sparc/include/asm/setup.h @@ -52,6 +52,7 @@ unsigned long safe_compute_effective_address(struct pt_regs *, unsigned int); int handle_ldf_stq(u32 insn, struct pt_regs *regs); void handle_ld_nf(u32 insn, struct pt_regs *regs); +extern int sysctl_tsb_ratio; #endif void sun_do_break(void); diff --git a/arch/sparc/mm/tsb.c b/arch/sparc/mm/tsb.c index f5d506fdddad..33ca9360f591 100644 --- a/arch/sparc/mm/tsb.c +++ b/arch/sparc/mm/tsb.c @@ -9,6 +9,7 @@ #include <asm/page.h> #include <asm/pgtable.h> #include <asm/mmu_context.h> +#include <asm/setup.h> #include <asm/tsb.h> #include <asm/tlb.h> #include <asm/oplib.h> diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 74f5b580fe34..3c202d00f6e8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -152,10 +152,6 @@ static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); #ifdef CONFIG_SPARC #endif -#ifdef CONFIG_SPARC64 -extern int sysctl_tsb_ratio; -#endif - #ifdef __hppa__ extern int pwrsw_enabled; #endif -- cgit From 866293ee54227584ffcb4a42f69c1f365974ba7f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 12 May 2014 20:45:34 +0000 Subject: futex: Add another early deadlock detection check Dave Jones' trinity syscall fuzzer exposed an issue in the deadlock detection code of rtmutex: http://lkml.kernel.org/r/20140429151655.GA14277@redhat.com That underlying issue has been fixed with a patch to the rtmutex code, but the futex code must not call into rtmutex in that case because - it can detect that issue early - it avoids a different and more complex fixup for backing out If the user space variable got manipulated to 0x80000000 which means no lock holder, but the waiters bit set and an active pi_state in the kernel is found, we can figure out the recursive locking issue by looking at the pi_state owner.
If that is the current task, then we can safely return -EDEADLK. The check should have been added in commit 59fa62451 (futex: Handle futex_pi OWNER_DIED take over correctly) already, but I did not see the above issue caused by user space manipulation back then. Signed-off-by: Thomas Gleixner Cc: Dave Jones Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Darren Hart Cc: Davidlohr Bueso Cc: Steven Rostedt Cc: Clark Williams Cc: Paul McKenney Cc: Lai Jiangshan Cc: Roland McGrath Cc: Carlos ODonell Cc: Jakub Jelinek Cc: Michael Kerrisk Cc: Sebastian Andrzej Siewior Link: http://lkml.kernel.org/r/20140512201701.097349971@linutronix.de Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org --- kernel/futex.c | 47 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 5f589279e462..7c68225e3967 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -745,7 +745,8 @@ void exit_pi_state_list(struct task_struct *curr) static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, - union futex_key *key, struct futex_pi_state **ps) + union futex_key *key, struct futex_pi_state **ps, + struct task_struct *task) { struct futex_pi_state *pi_state = NULL; struct futex_q *this, *next; @@ -786,6 +787,16 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, return -EINVAL; } + /* + * Protect against a corrupted uval. If uval + * is 0x80000000 then pid is 0 and the waiter + * bit is set. So the deadlock check in the + * calling code has failed and we did not fall + * into the check above due to !pid. + */ + if (task && pi_state->owner == task) + return -EDEADLK; + atomic_inc(&pi_state->refcount); *ps = pi_state; @@ -935,7 +946,7 @@ retry: * We dont have the lock. Look up the PI state (or create it if * we are the first waiter): */ - ret = lookup_pi_state(uval, hb, key, ps); + ret = lookup_pi_state(uval, hb, key, ps, task); if (unlikely(ret)) { switch (ret) { @@ -1347,7 +1358,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, * * Return: * 0 - failed to acquire the lock atomically; - * 1 - acquired the lock; + * >0 - acquired the lock, return value is vpid of the top_waiter * <0 - error */ static int futex_proxy_trylock_atomic(u32 __user *pifutex, @@ -1358,7 +1369,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, { struct futex_q *top_waiter = NULL; u32 curval; - int ret; + int ret, vpid; if (get_futex_value_locked(&curval, pifutex)) return -EFAULT; @@ -1386,11 +1397,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, * the contended case or if set_waiters is 1. The pi_state is returned * in ps in contended cases. */ + vpid = task_pid_vnr(top_waiter->task); ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, set_waiters); - if (ret == 1) + if (ret == 1) { requeue_pi_wake_futex(top_waiter, key2, hb2); - + return vpid; + } return ret; } @@ -1421,7 +1434,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, struct futex_pi_state *pi_state = NULL; struct futex_hash_bucket *hb1, *hb2; struct futex_q *this, *next; - u32 curval2; if (requeue_pi) { /* @@ -1509,16 +1521,25 @@ retry_private: * At this point the top_waiter has either taken uaddr2 or is * waiting on it. If the former, then the pi_state will not * exist yet, look it up one more time to ensure we have a - * reference to it. + * reference to it. If the lock was taken, ret contains the + * vpid of the top waiter task. 
*/ - if (ret == 1) { + if (ret > 0) { WARN_ON(pi_state); drop_count++; task_count++; - ret = get_futex_value_locked(&curval2, uaddr2); - if (!ret) - ret = lookup_pi_state(curval2, hb2, &key2, - &pi_state); + /* + * If we acquired the lock, then the user + * space value of uaddr2 should be vpid. It + * cannot be changed by the top waiter as it + * is blocked on hb2 lock if it tries to do + * so. If something fiddled with it behind our + * back the pi state lookup might unearth + * it. So we rather use the known value than + * rereading and handing potential crap to + * lookup_pi_state. + */ + ret = lookup_pi_state(ret, hb2, &key2, &pi_state, NULL); } switch (ret) { -- cgit From f0d71b3dcb8332f7971b5f2363632573e6d9486a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 12 May 2014 20:45:35 +0000 Subject: futex: Prevent attaching to kernel threads We happily allow userspace to declare a random kernel thread to be the owner of a user space PI futex. Found while analysing the fallout of Dave Jones' syscall fuzzer. We should also validate the thread group for private futexes and find some fast way to validate whether the "alleged" owner has RW access on the file which backs the SHM, but that's a separate issue. Signed-off-by: Thomas Gleixner Cc: Dave Jones Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Darren Hart Cc: Davidlohr Bueso Cc: Steven Rostedt Cc: Clark Williams Cc: Paul McKenney Cc: Lai Jiangshan Cc: Roland McGrath Cc: Carlos ODonell Cc: Jakub Jelinek Cc: Michael Kerrisk Cc: Sebastian Andrzej Siewior Link: http://lkml.kernel.org/r/20140512201701.194824402@linutronix.de Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org --- kernel/futex.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 7c68225e3967..81dbe773ce4c 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -814,6 +814,11 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, if (!p) return -ESRCH; + if (!p->mm) { + put_task_struct(p); + return -EPERM; + } + /* * We need to look at the task state flags to figure out, * whether the task is exiting. To protect against the do_exit -- cgit From 0819b2e30ccb93edf04876237b6205eef84ec8d2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 May 2014 20:23:48 +0200 Subject: perf: Limit perf_event_attr::sample_period to 63 bits Vince reported that using a large sample_period (one with bit 63 set) results in wreckage since, while the sample_period is fundamentally unsigned (negative periods don't make sense), the way we implement things very much relies on signed logic. So limit sample_period to 63 bits to avoid tripping over this.
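Illustratively (a hypothetical userspace snippet, not from the patch), this is the kind of attribute that is now rejected up front:

	struct perf_event_attr attr = {
		.type		= PERF_TYPE_HARDWARE,
		.size		= sizeof(attr),
		.config		= PERF_COUNT_HW_CPU_CYCLES,
		.sample_period	= 1ULL << 63,	/* negative as s64 */
	};
	/* perf_event_open() on this attr now fails with -EINVAL */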
Reported-by: Vince Weaver Signed-off-by: Peter Zijlstra Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/n/tip-p25fhunibl4y3qi0zuqmyf4b@git.kernel.org Signed-off-by: Thomas Gleixner --- kernel/events/core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 71232844f235..1d1ec6453a08 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7029,6 +7029,9 @@ SYSCALL_DEFINE5(perf_event_open, if (attr.freq) { if (attr.sample_freq > sysctl_perf_event_sample_rate) return -EINVAL; + } else { + if (attr.sample_period & (1ULL << 63)) + return -EINVAL; } /* -- cgit From 39af6b1678afa5880dda7e375cf3f9d395087f6d Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 7 Apr 2014 11:04:08 +0200 Subject: perf: Prevent false warning in perf_swevent_add The perf cpu offline callback takes down all cpu context events and releases swhash->swevent_hlist. This could race with task context software event being just scheduled on this cpu via perf_swevent_add while cpu hotplug code already cleaned up event's data. The race happens in the gap between the cpu notifier code and the cpu being actually taken down. Note that only cpu ctx events are terminated in the perf cpu hotplug code. It's easily reproduced with: $ perf record -e faults perf bench sched pipe while putting one of the cpus offline: # echo 0 > /sys/devices/system/cpu/cpu1/online Console emits following warning: WARNING: CPU: 1 PID: 2845 at kernel/events/core.c:5672 perf_swevent_add+0x18d/0x1a0() Modules linked in: CPU: 1 PID: 2845 Comm: sched-pipe Tainted: G W 3.14.0+ #256 Hardware name: Intel Corporation Montevina platform/To be filled by O.E.M., BIOS AMVACRB1.86C.0066.B00.0805070703 05/07/2008 0000000000000009 ffff880077233ab8 ffffffff81665a23 0000000000200005 0000000000000000 ffff880077233af8 ffffffff8104732c 0000000000000046 ffff88007467c800 0000000000000002 ffff88007a9cf2a0 0000000000000001 Call Trace: [] dump_stack+0x4f/0x7c [] warn_slowpath_common+0x8c/0xc0 [] warn_slowpath_null+0x1a/0x20 [] perf_swevent_add+0x18d/0x1a0 [] event_sched_in.isra.75+0x9e/0x1f0 [] group_sched_in+0x6a/0x1f0 [] ? sched_clock_local+0x25/0xa0 [] ctx_sched_in+0x1f6/0x450 [] perf_event_sched_in+0x6b/0xa0 [] perf_event_context_sched_in+0x7b/0xc0 [] __perf_event_task_sched_in+0x43e/0x460 [] ? put_lock_stats.isra.18+0xe/0x30 [] finish_task_switch+0xb8/0x100 [] __schedule+0x30e/0xad0 [] ? pipe_read+0x3e2/0x560 [] ? preempt_schedule_irq+0x3e/0x70 [] ? preempt_schedule_irq+0x3e/0x70 [] preempt_schedule_irq+0x44/0x70 [] retint_kernel+0x20/0x30 [] ? lockdep_sys_exit+0x1a/0x90 [] lockdep_sys_exit_thunk+0x35/0x67 [] ? sysret_check+0x5/0x56 Fixing this by tracking the cpu hotplug state and displaying the WARN only if current cpu is initialized properly. 
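The window, roughly (illustrative interleaving; with this change, the swhash->online flag set and cleared in the hotplug callbacks lets perf_swevent_add() tell the benign case apart):

	/*
	 *  CPU going down                    task running there
	 *  --------------------------       --------------------------------
	 *  perf_event_exit_cpu()
	 *    swhash->online = false;
	 *    swevent_hlist_release()
	 *                                   perf_swevent_add()
	 *                                     find_swevent_head() -> NULL
	 *                                     !swhash->online -> no WARN
	 */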
Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: stable@vger.kernel.org Reported-by: Fengguang Wu Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1396861448-10097-1-git-send-email-jolsa@redhat.com Signed-off-by: Thomas Gleixner --- kernel/events/core.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 1d1ec6453a08..feb1329ca331 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5419,6 +5419,9 @@ struct swevent_htable { /* Recursion avoidance in each contexts */ int recursion[PERF_NR_CONTEXTS]; + + /* Keeps track of cpu being initialized/exited */ + bool online; }; static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); @@ -5665,8 +5668,14 @@ static int perf_swevent_add(struct perf_event *event, int flags) hwc->state = !(flags & PERF_EF_START); head = find_swevent_head(swhash, event); - if (WARN_ON_ONCE(!head)) + if (!head) { + /* + * We can race with cpu hotplug code. Do not + * WARN if the cpu just got unplugged. + */ + WARN_ON_ONCE(swhash->online); return -EINVAL; + } hlist_add_head_rcu(&event->hlist_entry, head); @@ -7845,6 +7854,7 @@ static void perf_event_init_cpu(int cpu) struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); mutex_lock(&swhash->hlist_mutex); + swhash->online = true; if (swhash->hlist_refcount > 0) { struct swevent_hlist *hlist; @@ -7902,6 +7912,7 @@ static void perf_event_exit_cpu(int cpu) perf_event_exit_cpu_context(cpu); mutex_lock(&swhash->hlist_mutex); + swhash->online = false; swevent_hlist_release(swhash); mutex_unlock(&swhash->hlist_mutex); } -- cgit From b69cf53640da2b86439596118cfa95233154ee76 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 14 Mar 2014 10:50:33 +0100 Subject: perf: Fix a race between ring_buffer_detach() and ring_buffer_attach() Alexander noticed that we use RCU iteration on rb->event_list but do not use list_{add,del}_rcu() to add/remove entries to that list, nor do we observe proper grace periods when re-using the entries. Merge ring_buffer_detach() into ring_buffer_attach() such that attaching to the NULL buffer is detaching. Furthermore, ensure that between any 'detach' and 'attach' of the same event we observe the required grace period, but only when strictly required. In effect this means that only ioctl(.request = PERF_EVENT_IOC_SET_OUTPUT) will wait for a grace period, while the normal initial attach and final detach will not be delayed. This patch should, I think, do the right thing under all circumstances: the 'normal' cases all should never see the extra grace period, but the two cases: 1) PERF_EVENT_IOC_SET_OUTPUT on an event which already has a ring_buffer set, will now observe the required grace period between removing itself from the old and attaching itself to the new buffer. This case is 'simple' in that both buffers are present in perf_event_set_output(), hence one could think an unconditional synchronize_rcu() would be sufficient; however... 2) an event that has a buffer attached, the buffer is destroyed (munmap) and then the event is attached to a new/different buffer using PERF_EVENT_IOC_SET_OUTPUT. This case is more complex because the buffer destruction does: ring_buffer_attach(.rb = NULL) followed by the ioctl() doing: ring_buffer_attach(.rb = foo); and we still need to observe the grace period between these two calls due to us reusing the event->rb_entry list_head.
In order to make 2 happen we use Paul's latest cond_synchronize_rcu() call. Cc: Paul Mackerras Cc: Stephane Eranian Cc: Andi Kleen Cc: "Paul E. McKenney" Cc: Ingo Molnar Cc: Frederic Weisbecker Cc: Mike Galbraith Reported-by: Alexander Shishkin Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140507123526.GD13658@twins.programming.kicks-ass.net Signed-off-by: Thomas Gleixner --- include/linux/perf_event.h | 2 + kernel/events/core.c | 109 ++++++++++++++++++++------------------------- 2 files changed, 51 insertions(+), 60 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 3356abcfff18..3ef6ea12806a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -402,6 +402,8 @@ struct perf_event { struct ring_buffer *rb; struct list_head rb_entry; + unsigned long rcu_batches; + int rcu_pending; /* poll related */ wait_queue_head_t waitq; diff --git a/kernel/events/core.c b/kernel/events/core.c index feb1329ca331..440eefc67397 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3192,7 +3192,8 @@ static void free_event_rcu(struct rcu_head *head) } static void ring_buffer_put(struct ring_buffer *rb); -static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); +static void ring_buffer_attach(struct perf_event *event, + struct ring_buffer *rb); static void unaccount_event_cpu(struct perf_event *event, int cpu) { @@ -3252,8 +3253,6 @@ static void free_event(struct perf_event *event) unaccount_event(event); if (event->rb) { - struct ring_buffer *rb; - /* * Can happen when we close an event with re-directed output. * @@ -3261,12 +3260,7 @@ static void free_event(struct perf_event *event) * over us; possibly making our ring_buffer_put() the last. */ mutex_lock(&event->mmap_mutex); - rb = event->rb; - if (rb) { - rcu_assign_pointer(event->rb, NULL); - ring_buffer_detach(event, rb); - ring_buffer_put(rb); /* could be last */ - } + ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); } @@ -3850,28 +3844,47 @@ unlock: static void ring_buffer_attach(struct perf_event *event, struct ring_buffer *rb) { + struct ring_buffer *old_rb = NULL; unsigned long flags; - if (!list_empty(&event->rb_entry)) - return; + if (event->rb) { + /* + * Should be impossible, we set this when removing + * event->rb_entry and wait/clear when adding event->rb_entry. 
+ */ + WARN_ON_ONCE(event->rcu_pending); - spin_lock_irqsave(&rb->event_lock, flags); - if (list_empty(&event->rb_entry)) - list_add(&event->rb_entry, &rb->event_list); - spin_unlock_irqrestore(&rb->event_lock, flags); -} + old_rb = event->rb; + event->rcu_batches = get_state_synchronize_rcu(); + event->rcu_pending = 1; -static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb) -{ - unsigned long flags; + spin_lock_irqsave(&old_rb->event_lock, flags); + list_del_rcu(&event->rb_entry); + spin_unlock_irqrestore(&old_rb->event_lock, flags); + } - if (list_empty(&event->rb_entry)) - return; + if (event->rcu_pending && rb) { + cond_synchronize_rcu(event->rcu_batches); + event->rcu_pending = 0; + } - spin_lock_irqsave(&rb->event_lock, flags); - list_del_init(&event->rb_entry); - wake_up_all(&event->waitq); - spin_unlock_irqrestore(&rb->event_lock, flags); + if (rb) { + spin_lock_irqsave(&rb->event_lock, flags); + list_add_rcu(&event->rb_entry, &rb->event_list); + spin_unlock_irqrestore(&rb->event_lock, flags); + } + + rcu_assign_pointer(event->rb, rb); + + if (old_rb) { + ring_buffer_put(old_rb); + /* + * Since we detached before setting the new rb, so that we + * could attach the new rb, we could have missed a wakeup. + * Provide it now. + */ + wake_up_all(&event->waitq); + } } static void ring_buffer_wakeup(struct perf_event *event) @@ -3940,7 +3953,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; - struct ring_buffer *rb = event->rb; + struct ring_buffer *rb = ring_buffer_get(event); struct user_struct *mmap_user = rb->mmap_user; int mmap_locked = rb->mmap_locked; unsigned long size = perf_data_size(rb); @@ -3948,18 +3961,14 @@ static void perf_mmap_close(struct vm_area_struct *vma) atomic_dec(&rb->mmap_count); if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) - return; + goto out_put; - /* Detach current event from the buffer. */ - rcu_assign_pointer(event->rb, NULL); - ring_buffer_detach(event, rb); + ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); /* If there's still other mmap()s of this buffer, we're done. */ - if (atomic_read(&rb->mmap_count)) { - ring_buffer_put(rb); /* can't be last */ - return; - } + if (atomic_read(&rb->mmap_count)) + goto out_put; /* * No other mmap()s, detach from all other events that might redirect @@ -3989,11 +3998,9 @@ again: * still restart the iteration to make sure we're not now * iterating the wrong list. 
*/ - if (event->rb == rb) { - rcu_assign_pointer(event->rb, NULL); - ring_buffer_detach(event, rb); - ring_buffer_put(rb); /* can't be last, we still have one */ - } + if (event->rb == rb) + ring_buffer_attach(event, NULL); + mutex_unlock(&event->mmap_mutex); put_event(event); @@ -4018,6 +4025,7 @@ again: vma->vm_mm->pinned_vm -= mmap_locked; free_uid(mmap_user); +out_put: ring_buffer_put(rb); /* could be last */ } @@ -4135,7 +4143,6 @@ again: vma->vm_mm->pinned_vm += extra; ring_buffer_attach(event, rb); - rcu_assign_pointer(event->rb, rb); perf_event_init_userpage(event); perf_event_update_userpage(event); @@ -6934,7 +6941,7 @@ err_size: static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event) { - struct ring_buffer *rb = NULL, *old_rb = NULL; + struct ring_buffer *rb = NULL; int ret = -EINVAL; if (!output_event) @@ -6962,8 +6969,6 @@ set: if (atomic_read(&event->mmap_count)) goto unlock; - old_rb = event->rb; - if (output_event) { /* get the rb we want to redirect to */ rb = ring_buffer_get(output_event); @@ -6971,23 +6976,7 @@ set: goto unlock; } - if (old_rb) - ring_buffer_detach(event, old_rb); - - if (rb) - ring_buffer_attach(event, rb); - - rcu_assign_pointer(event->rb, rb); - - if (old_rb) { - ring_buffer_put(old_rb); - /* - * Since we detached before setting the new rb, so that we - * could attach the new rb, we could have missed a wakeup. - * Provide it now. - */ - wake_up_all(&event->waitq); - } + ring_buffer_attach(event, rb); ret = 0; unlock: -- cgit From 12665b35b0b48c9583ee1b8f0a403dc708fb4a92 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 10 May 2014 13:10:59 +0200 Subject: perf/events/core: Drop unused variable after cleanup ... in 3a497f48637 ("perf: Simplify perf_event_exit_task_context()") Signed-off-by: Borislav Petkov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1399720259-28275-1-git-send-email-bp@alien8.de Signed-off-by: Thomas Gleixner --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 7ab734fbaeeb..ed50b0943213 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7431,7 +7431,7 @@ __perf_event_exit_task(struct perf_event *child_event, static void perf_event_exit_task_context(struct task_struct *child, int ctxn) { - struct perf_event *child_event, *tmp; + struct perf_event *child_event; struct perf_event_context *child_ctx; unsigned long flags; -- cgit From 61f38db3e3c0e4c3be0858750e2cabeadaecac0c Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Sat, 26 Apr 2014 23:15:35 -0700 Subject: rcu: Provide API to suppress stall warnings while sysrq runs Some sysrq handlers can run for a long time because they dump a lot of data onto a serial console. Having RCU stall warnings pop up in the middle of them only makes the problem worse. This commit provides rcu_sysrq_start() and rcu_sysrq_end() APIs to temporarily suppress RCU CPU stall warnings while a sysrq request is handled. Signed-off-by: Rik van Riel [ paulmck: Fix TINY_RCU build error. ] Signed-off-by: Paul E.
McKenney --- include/linux/rcupdate.h | 12 ++++++++++++ kernel/rcu/update.c | 12 ++++++++++++ 2 files changed, 24 insertions(+) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 9ccd644c1234..5a75d19aa661 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -248,6 +248,18 @@ void rcu_idle_exit(void); void rcu_irq_enter(void); void rcu_irq_exit(void); +#ifdef CONFIG_RCU_STALL_COMMON +void rcu_sysrq_start(void); +void rcu_sysrq_end(void); +#else /* #ifdef CONFIG_RCU_STALL_COMMON */ +static inline void rcu_sysrq_start(void) +{ +} +static inline void rcu_sysrq_end(void) +{ +} +#endif /* #else #ifdef CONFIG_RCU_STALL_COMMON */ + #ifdef CONFIG_RCU_USER_QS void rcu_user_enter(void); void rcu_user_exit(void); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index ed7a0d72562c..a2aeb4df0f60 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -320,6 +320,18 @@ int rcu_jiffies_till_stall_check(void) return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; } +void rcu_sysrq_start(void) +{ + if (!rcu_cpu_stall_suppress) + rcu_cpu_stall_suppress = 2; +} + +void rcu_sysrq_end(void) +{ + if (rcu_cpu_stall_suppress == 2) + rcu_cpu_stall_suppress = 0; +} + static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) { rcu_cpu_stall_suppress = 1; -- cgit From 5533e0114425dcdb878f11b291f2727af8667a7c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 19:33:07 -0400 Subject: cgroup: disallow debug controller on the default hierarchy The debug controller, as its name suggests, exposes cgroup core internals to userland to aid debugging. Unfortunately, except for the name, there's no provision to prevent its usage in production configurations and the controller is widely enabled and mounted, leaking internal details to userland. Like most other debug information, the information exposed by debug isn't interesting even for debugging itself once the related parts are working reliably. This controller has no reason for existing. This patch implements cgrp_dfl_root_inhibit_ss_mask which can suppress specific subsystems on the default hierarchy and adds the debug subsystem to it so that it can be gradually deprecated as usages move towards the unified hierarchy. Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 2 ++ include/linux/cgroup_subsys.h | 11 +++++++---- kernel/cgroup.c | 21 ++++++++++++++++++--- 3 files changed, 27 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4afe544d3547..8a111dd42d7a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -305,6 +305,8 @@ enum { * the flag is not created. * * - blkcg: blk-throttle becomes properly hierarchical. + * + * - debug: disallowed on the default hierarchy. */ CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 768fe44e19f0..98c4f9b12b03 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -7,10 +7,6 @@ SUBSYS(cpuset) #endif -#if IS_ENABLED(CONFIG_CGROUP_DEBUG) -SUBSYS(debug) -#endif - #if IS_ENABLED(CONFIG_CGROUP_SCHED) SUBSYS(cpu) #endif @@ -50,6 +46,13 @@ SUBSYS(net_prio) #if IS_ENABLED(CONFIG_CGROUP_HUGETLB) SUBSYS(hugetlb) #endif + +/* + * The following subsystems are not supported on the default hierarchy. + */ +#if IS_ENABLED(CONFIG_CGROUP_DEBUG) +SUBSYS(debug) +#endif /* * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS.
*/ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 082bb842b11a..a5f75ac4e793 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -148,6 +148,13 @@ struct cgroup_root cgrp_dfl_root; */ static bool cgrp_dfl_root_visible; +/* some controllers are not supported in the default hierarchy */ +static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0 +#ifdef CONFIG_CGROUP_DEBUG + | (1 << debug_cgrp_id) +#endif + ; + /* The list of hierarchy roots */ static LIST_HEAD(cgroup_roots); @@ -1126,6 +1133,7 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask) static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) { struct cgroup_subsys *ss; + unsigned int tmp_ss_mask; int ssid, i, ret; lockdep_assert_held(&cgroup_mutex); @@ -1143,7 +1151,12 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) return -EBUSY; } - ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask); + /* skip creating root files on dfl_root for inhibited subsystems */ + tmp_ss_mask = ss_mask; + if (dst_root == &cgrp_dfl_root) + tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask; + + ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask); if (ret) { if (dst_root != &cgrp_dfl_root) return ret; @@ -2426,7 +2439,8 @@ static int cgroup_root_controllers_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - cgroup_print_ss_mask(seq, cgrp->root->subsys_mask); + cgroup_print_ss_mask(seq, cgrp->root->subsys_mask & + ~cgrp_dfl_root_inhibit_ss_mask); return 0; } @@ -2564,7 +2578,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, if (tok[0] == '\0') continue; for_each_subsys(ss, ssid) { - if (ss->disabled || strcmp(tok + 1, ss->name)) + if (ss->disabled || strcmp(tok + 1, ss->name) || + ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask)) continue; if (*tok == '+') { -- cgit From 049d595a4db3b3a20fc252298010f0545ef659a3 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 9 May 2014 19:18:08 +0100 Subject: PM / OPP: Make OPP invisible to users in Kconfig The OPP code is an in-kernel library selected by its users; there is no architecture code required to implement it, and enabling it without a user just increases the kernel size. Since the users select rather than depend on it, just remove the ability to directly set the option from Kconfig. Signed-off-by: Mark Brown Acked-by: Nishanth Menon Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/power/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 2fac9cc79b3d..9a83d780facd 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -257,8 +257,7 @@ config ARCH_HAS_OPP bool config PM_OPP - bool "Operating Performance Point (OPP) Layer library" - depends on ARCH_HAS_OPP + bool ---help--- SOCs have a standard set of tuples consisting of frequency and voltage pairs that the device will support per voltage domain. This -- cgit From 9625ab1727743f6a164df26b7b1eeeced7380b42 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 20 May 2014 17:46:27 +0800 Subject: workqueue: use manager lock only to protect worker_idr worker_idr is highly bound to managers and is always/only accessed in manager lock context. So we don't need pool->lock for it.
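With manager_mutex as the sole protection, the allocation can simply block instead of doing the idr_preload()/pool->lock dance, roughly (sketch of the resulting pattern, not the patch itself):

	/* caller holds pool->manager_mutex; sketch only */
	id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_KERNEL);
	if (id < 0)
		goto fail;
	/* ... allocate and set up the worker ... */
	idr_replace(&pool->worker_idr, worker, worker->id);

which is exactly what the hunks below do in create_worker().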
Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 34 ++++++---------------------------- 1 file changed, 6 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c8411085466f..910d963f6b76 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -124,8 +124,7 @@ enum { * cpu or grabbing pool->lock is enough for read access. If * POOL_DISASSOCIATED is set, it's identical to L. * - * MG: pool->manager_mutex and pool->lock protected. Writes require both - * locks. Reads can happen under either lock. + * M: pool->manager_mutex protected. * * PL: wq_pool_mutex protected. * @@ -164,7 +163,7 @@ struct worker_pool { /* see manage_workers() for details on the two manager mutexes */ struct mutex manager_arb; /* manager arbitration */ struct mutex manager_mutex; /* manager exclusion */ - struct idr worker_idr; /* MG: worker IDs and iteration */ + struct idr worker_idr; /* M: worker IDs and iteration */ struct workqueue_attrs *attrs; /* I: worker attributes */ struct hlist_node hash_node; /* PL: unbound_pool_hash node */ @@ -340,16 +339,6 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, lockdep_is_held(&wq->mutex), \ "sched RCU or wq->mutex should be held") -#ifdef CONFIG_LOCKDEP -#define assert_manager_or_pool_lock(pool) \ - WARN_ONCE(debug_locks && \ - !lockdep_is_held(&(pool)->manager_mutex) && \ - !lockdep_is_held(&(pool)->lock), \ - "pool->manager_mutex or ->lock should be held") -#else -#define assert_manager_or_pool_lock(pool) do { } while (0) -#endif - #define for_each_cpu_worker_pool(pool, cpu) \ for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ @@ -378,14 +367,14 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, * @wi: integer used for iteration * @pool: worker_pool to iterate workers of * - * This must be called with either @pool->manager_mutex or ->lock held. + * This must be called with @pool->manager_mutex. * * The if/else clause exists only for the lockdep assertion and can be * ignored. */ #define for_each_pool_worker(worker, wi, pool) \ idr_for_each_entry(&(pool)->worker_idr, (worker), (wi)) \ - if (({ assert_manager_or_pool_lock((pool)); false; })) { } \ + if (({ lockdep_assert_held(&pool->manager_mutex); false; })) { } \ else /** @@ -1725,13 +1714,7 @@ static struct worker *create_worker(struct worker_pool *pool) * ID is needed to determine kthread name. Allocate ID first * without installing the pointer. 
*/ - idr_preload(GFP_KERNEL); - spin_lock_irq(&pool->lock); - - id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_NOWAIT); - - spin_unlock_irq(&pool->lock); - idr_preload_end(); + id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_KERNEL); if (id < 0) goto fail; @@ -1773,18 +1756,13 @@ static struct worker *create_worker(struct worker_pool *pool) worker->flags |= WORKER_UNBOUND; /* successful, commit the pointer to idr */ - spin_lock_irq(&pool->lock); idr_replace(&pool->worker_idr, worker, worker->id); - spin_unlock_irq(&pool->lock); return worker; fail: - if (id >= 0) { - spin_lock_irq(&pool->lock); + if (id >= 0) idr_remove(&pool->worker_idr, id); - spin_unlock_irq(&pool->lock); - } kfree(worker); return NULL; } -- cgit From 73eb7fe73ae303996187fff38b1c162f1df0e9d1 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 20 May 2014 17:46:28 +0800 Subject: workqueue: destroy_worker() should destroy idle workers only We used to have a CPU-online failure path where a worker was created for the CPU coming online and, if the online operation failed, was shut down without ever being started. That behavior has since changed: the first worker is now created and started at the same time for the CPU coming online. This means the code already guarantees that destroy_worker() only ever sees idle workers, and we don't want to allow it to destroy a non-idle worker in the future; that would be bug-prone and extremely hard to verify. We should therefore make destroy_worker() refuse anything but idle workers explicitly. Since destroy_worker() already destroys only idle workers, this patch does not change any functionality; it only updates the comments and the sanity check code. The sanity check now refuses to destroy a worker if !(worker->flags & WORKER_IDLE). A worker that has entered idle must already have been started, so the "worker->flags & WORKER_STARTED" check is removed; after this removal WORKER_STARTED is entirely unused, so WORKER_STARTED is removed too. In the comments for create_worker(), "Create a new worker which is bound..." becomes "... which is attached...", since we now call this behavior attaching. tj: Minor description / comment updates. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 910d963f6b76..189d79be8091 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -73,7 +73,6 @@ enum { POOL_FREEZING = 1 << 3, /* freeze in progress */ /* worker flags */ - WORKER_STARTED = 1 << 0, /* started */ WORKER_DIE = 1 << 1, /* die die die */ WORKER_IDLE = 1 << 2, /* is idle */ WORKER_PREP = 1 << 3, /* preparing to run works */ @@ -1692,9 +1691,8 @@ static struct worker *alloc_worker(void) * create_worker - create a new workqueue worker * @pool: pool the new worker will belong to * - * Create a new worker which is bound to @pool. The returned worker - * can be started by calling start_worker() or destroyed using - * destroy_worker(). + * Create a new worker which is attached to @pool. The new worker must be + * started by start_worker(). * * CONTEXT: * Might sleep. Does GFP_KERNEL allocations. @@ -1778,7 +1776,6 @@ fail: */ static void start_worker(struct worker *worker) { - worker->flags |= WORKER_STARTED; worker->pool->nr_workers++; worker_enter_idle(worker); wake_up_process(worker->task); @@ -1814,7 +1811,8 @@ static int create_and_start_worker(struct worker_pool *pool) * destroy_worker - destroy a workqueue worker * @worker: worker to be destroyed * - * Destroy @worker and adjust @pool stats accordingly. + * Destroy @worker and adjust @pool stats accordingly. The worker should + * be idle. * * CONTEXT: * spin_lock_irq(pool->lock) which is released and regrabbed. @@ -1828,13 +1826,12 @@ static void destroy_worker(struct worker *worker) /* sanity check frenzy */ if (WARN_ON(worker->current_work) || - WARN_ON(!list_empty(&worker->scheduled))) + WARN_ON(!list_empty(&worker->scheduled)) || + WARN_ON(!(worker->flags & WORKER_IDLE))) return; - if (worker->flags & WORKER_STARTED) - pool->nr_workers--; - if (worker->flags & WORKER_IDLE) - pool->nr_idle--; + pool->nr_workers--; + pool->nr_idle--; /* * Once WORKER_DIE is set, the kworker may destroy itself at any -- cgit From 60f5a4bcf852b5dec698b08cd34efc302ea72f2b Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 20 May 2014 17:46:29 +0800 Subject: workqueue: async worker destruction Worker destruction consists of these steps: adjust the pool's stats, remove the worker from the idle list, detach the worker from the pool, kthread_stop() to wait for the worker's task to exit, and free the worker struct. There is no essential work left to do after kthread_stop(), which means destroy_worker() doesn't actually need to wait for the worker's task to exit; we can remove kthread_stop() and free the worker struct in the worker's exit path instead. However, put_unbound_pool() still needs to wait for all workers' destruction before destroying the pool; otherwise the exiting workers may access the pool after it has become invalid. So we also move the "detach the worker" step to the exit path and let put_unbound_pool() synchronize with it via detach_completion. The "detach the worker" code is wrapped in a new function, worker_detach_from_pool(). Although worker_detach_from_pool() is called only once (from worker_thread()) after this patch, wrapping it is worthwhile for several reasons: 1) the detach code is not short enough to open-code in worker_thread(); 2) the name worker_detach_from_pool() is self-documenting, and we add comments above the function; 3) it will be shared by rescuers in a later patch, which lets rescuers and normal workers use the same attach/detach framework. The worker ID is freed at detach time, which happens before the worker is fully dead, so the dying worker's ID may be reused for a new worker; the dying worker's task name is therefore changed to "kworker/dying" to avoid two or more workers having the same name. Since "detach the worker" is moved out of destroy_worker(), destroy_worker() no longer requires manager_mutex: the lockdep_assert_held(&pool->manager_mutex) in destroy_worker() is removed, and destroy_worker() is no longer protected by manager_mutex in put_unbound_pool(). tj: Minor description updates.
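The handshake between put_unbound_pool() and the exiting workers is a standard completion pattern; a condensed sketch of the scheme implemented by the diff below (two fragments, not complete functions):

  /* put_unbound_pool() side: publish a completion, then wait for it */
  DECLARE_COMPLETION_ONSTACK(detach_completion);

  mutex_lock(&pool->manager_mutex);
  if (!idr_is_empty(&pool->worker_idr))
          pool->detach_completion = &detach_completion;
  mutex_unlock(&pool->manager_mutex);

  if (pool->detach_completion)
          wait_for_completion(pool->detach_completion);

  /* exiting worker side: the last worker to detach fires the completion */
  struct completion *detach_done = NULL;

  mutex_lock(&pool->manager_mutex);
  idr_remove(&pool->worker_idr, worker->id);
  if (idr_is_empty(&pool->worker_idr))
          detach_done = pool->detach_completion;
  mutex_unlock(&pool->manager_mutex);

  if (detach_done)
          complete(detach_done);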
Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 62 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 189d79be8091..c458d73022bb 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -163,6 +163,7 @@ struct worker_pool { struct mutex manager_arb; /* manager arbitration */ struct mutex manager_mutex; /* manager exclusion */ struct idr worker_idr; /* M: worker IDs and iteration */ + struct completion *detach_completion; /* all workers detached */ struct workqueue_attrs *attrs; /* I: worker attributes */ struct hlist_node hash_node; /* PL: unbound_pool_hash node */ @@ -1687,6 +1688,30 @@ static struct worker *alloc_worker(void) return worker; } +/** + * worker_detach_from_pool() - detach a worker from its pool + * @worker: worker which is attached to its pool + * @pool: the pool @worker is attached to + * + * Undo the attaching which had been done in create_worker(). The caller + * worker shouldn't access to the pool after detached except it has other + * reference to the pool. + */ +static void worker_detach_from_pool(struct worker *worker, + struct worker_pool *pool) +{ + struct completion *detach_completion = NULL; + + mutex_lock(&pool->manager_mutex); + idr_remove(&pool->worker_idr, worker->id); + if (idr_is_empty(&pool->worker_idr)) + detach_completion = pool->detach_completion; + mutex_unlock(&pool->manager_mutex); + + if (detach_completion) + complete(detach_completion); +} + /** * create_worker - create a new workqueue worker * @pool: pool the new worker will belong to @@ -1815,13 +1840,12 @@ static int create_and_start_worker(struct worker_pool *pool) * be idle. * * CONTEXT: - * spin_lock_irq(pool->lock) which is released and regrabbed. + * spin_lock_irq(pool->lock). */ static void destroy_worker(struct worker *worker) { struct worker_pool *pool = worker->pool; - lockdep_assert_held(&pool->manager_mutex); lockdep_assert_held(&pool->lock); /* sanity check frenzy */ @@ -1833,24 +1857,9 @@ static void destroy_worker(struct worker *worker) pool->nr_workers--; pool->nr_idle--; - /* - * Once WORKER_DIE is set, the kworker may destroy itself at any - * point. Pin to ensure the task stays until we're done with it. - */ - get_task_struct(worker->task); - list_del_init(&worker->entry); worker->flags |= WORKER_DIE; - - idr_remove(&pool->worker_idr, worker->id); - - spin_unlock_irq(&pool->lock); - - kthread_stop(worker->task); - put_task_struct(worker->task); - kfree(worker); - - spin_lock_irq(&pool->lock); + wake_up_process(worker->task); } static void idle_worker_timeout(unsigned long __pool) @@ -2289,6 +2298,10 @@ woke_up: spin_unlock_irq(&pool->lock); WARN_ON_ONCE(!list_empty(&worker->entry)); worker->task->flags &= ~PF_WQ_WORKER; + + set_task_comm(worker->task, "kworker/dying"); + worker_detach_from_pool(worker, pool); + kfree(worker); return 0; } @@ -3560,6 +3573,7 @@ static void rcu_free_pool(struct rcu_head *rcu) */ static void put_unbound_pool(struct worker_pool *pool) { + DECLARE_COMPLETION_ONSTACK(detach_completion); struct worker *worker; lockdep_assert_held(&wq_pool_mutex); @@ -3583,15 +3597,21 @@ static void put_unbound_pool(struct worker_pool *pool) * manager_mutex. 
*/ mutex_lock(&pool->manager_arb); - mutex_lock(&pool->manager_mutex); - spin_lock_irq(&pool->lock); + spin_lock_irq(&pool->lock); while ((worker = first_worker(pool))) destroy_worker(worker); WARN_ON(pool->nr_workers || pool->nr_idle); - spin_unlock_irq(&pool->lock); + + mutex_lock(&pool->manager_mutex); + if (!idr_is_empty(&pool->worker_idr)) + pool->detach_completion = &detach_completion; mutex_unlock(&pool->manager_mutex); + + if (pool->detach_completion) + wait_for_completion(pool->detach_completion); + mutex_unlock(&pool->manager_arb); /* shut down the timers */ -- cgit From 3347fc9f36e7e5d3ebe504fc4034745b5d8971d3 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 20 May 2014 17:46:30 +0800 Subject: workqueue: destroy worker directly in the idle timeout handler Since destroy_worker() doesn't need to sleep nor require manager_mutex, destroy_worker() can be directly called in the idle timeout handler, it helps us remove POOL_MANAGE_WORKERS and maybe_destroy_worker() and simplify the manage_workers() After POOL_MANAGE_WORKERS is removed, worker_thread() doesn't need to test whether it needs to manage after processed works. So we can remove the test branch. Signed-off-by: Lai Jiangshan --- kernel/workqueue.c | 67 ++++-------------------------------------------------- 1 file changed, 5 insertions(+), 62 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c458d73022bb..938836bc6207 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -68,7 +68,6 @@ enum { * manager_mutex to avoid changing binding state while * create_worker() is in progress. */ - POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ POOL_FREEZING = 1 << 3, /* freeze in progress */ @@ -752,13 +751,6 @@ static bool need_to_create_worker(struct worker_pool *pool) return need_more_worker(pool) && !may_start_working(pool); } -/* Do I need to be the manager? */ -static bool need_to_manage_workers(struct worker_pool *pool) -{ - return need_to_create_worker(pool) || - (pool->flags & POOL_MANAGE_WORKERS); -} - /* Do we have too many workers and should some go away? */ static bool too_many_workers(struct worker_pool *pool) { @@ -1868,7 +1860,7 @@ static void idle_worker_timeout(unsigned long __pool) spin_lock_irq(&pool->lock); - if (too_many_workers(pool)) { + while (too_many_workers(pool)) { struct worker *worker; unsigned long expires; @@ -1876,13 +1868,12 @@ static void idle_worker_timeout(unsigned long __pool) worker = list_entry(pool->idle_list.prev, struct worker, entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; - if (time_before(jiffies, expires)) + if (time_before(jiffies, expires)) { mod_timer(&pool->idle_timer, expires); - else { - /* it's been idle for too long, wake up manager */ - pool->flags |= POOL_MANAGE_WORKERS; - wake_up_worker(pool); + break; } + + destroy_worker(worker); } spin_unlock_irq(&pool->lock); @@ -2000,44 +1991,6 @@ restart: return true; } -/** - * maybe_destroy_worker - destroy workers which have been idle for a while - * @pool: pool to destroy workers for - * - * Destroy @pool workers which have been idle for longer than - * IDLE_WORKER_TIMEOUT. - * - * LOCKING: - * spin_lock_irq(pool->lock) which may be released and regrabbed - * multiple times. Called only from manager. - * - * Return: - * %false if no action was taken and pool->lock stayed locked, %true - * otherwise. 
- */ -static bool maybe_destroy_workers(struct worker_pool *pool) -{ - bool ret = false; - - while (too_many_workers(pool)) { - struct worker *worker; - unsigned long expires; - - worker = list_entry(pool->idle_list.prev, struct worker, entry); - expires = worker->last_active + IDLE_WORKER_TIMEOUT; - - if (time_before(jiffies, expires)) { - mod_timer(&pool->idle_timer, expires); - break; - } - - destroy_worker(worker); - ret = true; - } - - return ret; -} - /** * manage_workers - manage worker pool * @worker: self @@ -2101,13 +2054,6 @@ static bool manage_workers(struct worker *worker) ret = true; } - pool->flags &= ~POOL_MANAGE_WORKERS; - - /* - * Destroy and then create so that may_start_working() is true - * on return. - */ - ret |= maybe_destroy_workers(pool); ret |= maybe_create_worker(pool); mutex_unlock(&pool->manager_mutex); @@ -2349,9 +2295,6 @@ recheck: worker_set_flags(worker, WORKER_PREP, false); sleep: - if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker)) - goto recheck; - /* * pool->lock is held and there's no work to process and no need to * manage, sleep. Workers are woken up only while holding -- cgit From da028469ba173e9c634b6ecf80bb0c69c7d1024d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 20 May 2014 17:46:31 +0800 Subject: workqueue: separate iteration role from worker_idr worker_idr has the iteration (iterating for attached workers) and worker ID duties. These two duties don't have to be tied together. We can separate them and use a list for tracking attached workers and iteration. Before this separation, it wasn't possible to add rescuer workers to worker_idr due to rescuer workers couldn't allocate ID dynamically because ID-allocation depends on memory-allocation, which rescuer can't depend on. After separation, we can easily add the rescuer workers to the list for iteration without any memory-allocation. It is required when we attach the rescuer worker to the pool in later patch. tj: Minor description updates. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 28 +++++++++++++++------------- kernel/workqueue_internal.h | 2 ++ 2 files changed, 17 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 938836bc6207..a6c38b717957 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -161,7 +161,8 @@ struct worker_pool { /* see manage_workers() for details on the two manager mutexes */ struct mutex manager_arb; /* manager arbitration */ struct mutex manager_mutex; /* manager exclusion */ - struct idr worker_idr; /* M: worker IDs and iteration */ + struct idr worker_idr; /* M: worker IDs */ + struct list_head workers; /* M: attached workers */ struct completion *detach_completion; /* all workers detached */ struct workqueue_attrs *attrs; /* I: worker attributes */ @@ -363,7 +364,6 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, /** * for_each_pool_worker - iterate through all workers of a worker_pool * @worker: iteration cursor - * @wi: integer used for iteration * @pool: worker_pool to iterate workers of * * This must be called with @pool->manager_mutex. @@ -371,8 +371,8 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, * The if/else clause exists only for the lockdep assertion and can be * ignored. 
*/ -#define for_each_pool_worker(worker, wi, pool) \ - idr_for_each_entry(&(pool)->worker_idr, (worker), (wi)) \ +#define for_each_pool_worker(worker, pool) \ + list_for_each_entry((worker), &(pool)->workers, node) \ if (({ lockdep_assert_held(&pool->manager_mutex); false; })) { } \ else @@ -1674,6 +1674,7 @@ static struct worker *alloc_worker(void) if (worker) { INIT_LIST_HEAD(&worker->entry); INIT_LIST_HEAD(&worker->scheduled); + INIT_LIST_HEAD(&worker->node); /* on creation a worker is in !idle && prep state */ worker->flags = WORKER_PREP; } @@ -1696,7 +1697,8 @@ static void worker_detach_from_pool(struct worker *worker, mutex_lock(&pool->manager_mutex); idr_remove(&pool->worker_idr, worker->id); - if (idr_is_empty(&pool->worker_idr)) + list_del(&worker->node); + if (list_empty(&pool->workers)) detach_completion = pool->detach_completion; mutex_unlock(&pool->manager_mutex); @@ -1772,6 +1774,8 @@ static struct worker *create_worker(struct worker_pool *pool) /* successful, commit the pointer to idr */ idr_replace(&pool->worker_idr, worker, worker->id); + /* successful, attach the worker to the pool */ + list_add_tail(&worker->node, &pool->workers); return worker; @@ -3483,6 +3487,7 @@ static int init_worker_pool(struct worker_pool *pool) mutex_init(&pool->manager_arb); mutex_init(&pool->manager_mutex); idr_init(&pool->worker_idr); + INIT_LIST_HEAD(&pool->workers); INIT_HLIST_NODE(&pool->hash_node); pool->refcnt = 1; @@ -3548,7 +3553,7 @@ static void put_unbound_pool(struct worker_pool *pool) spin_unlock_irq(&pool->lock); mutex_lock(&pool->manager_mutex); - if (!idr_is_empty(&pool->worker_idr)) + if (!list_empty(&pool->workers)) pool->detach_completion = &detach_completion; mutex_unlock(&pool->manager_mutex); @@ -4533,7 +4538,6 @@ static void wq_unbind_fn(struct work_struct *work) int cpu = smp_processor_id(); struct worker_pool *pool; struct worker *worker; - int wi; for_each_cpu_worker_pool(pool, cpu) { WARN_ON_ONCE(cpu != smp_processor_id()); @@ -4548,7 +4552,7 @@ static void wq_unbind_fn(struct work_struct *work) * before the last CPU down must be on the cpu. After * this, they may become diasporas. */ - for_each_pool_worker(worker, wi, pool) + for_each_pool_worker(worker, pool) worker->flags |= WORKER_UNBOUND; pool->flags |= POOL_DISASSOCIATED; @@ -4594,7 +4598,6 @@ static void wq_unbind_fn(struct work_struct *work) static void rebind_workers(struct worker_pool *pool) { struct worker *worker; - int wi; lockdep_assert_held(&pool->manager_mutex); @@ -4605,13 +4608,13 @@ static void rebind_workers(struct worker_pool *pool) * of all workers first and then clear UNBOUND. As we're called * from CPU_ONLINE, the following shouldn't fail. 
*/ - for_each_pool_worker(worker, wi, pool) + for_each_pool_worker(worker, pool) WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask) < 0); spin_lock_irq(&pool->lock); - for_each_pool_worker(worker, wi, pool) { + for_each_pool_worker(worker, pool) { unsigned int worker_flags = worker->flags; /* @@ -4663,7 +4666,6 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) { static cpumask_t cpumask; struct worker *worker; - int wi; lockdep_assert_held(&pool->manager_mutex); @@ -4677,7 +4679,7 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) return; /* as we're called from CPU_ONLINE, the following shouldn't fail */ - for_each_pool_worker(worker, wi, pool) + for_each_pool_worker(worker, pool) WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask) < 0); } diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 7e2204db0b1a..8888e0672442 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -37,6 +37,8 @@ struct worker { struct task_struct *task; /* I: worker task */ struct worker_pool *pool; /* I: the associated pool */ /* L: for rescuers */ + struct list_head node; /* M: anchored at pool->workers */ + /* M: runs through worker->node */ unsigned long last_active; /* L: last active timestamp */ unsigned int flags; /* X: flags */ -- cgit From 7cda9aae0596d871a8d7a6888d7b447c60e5ab30 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 20 May 2014 17:46:32 +0800 Subject: workqueue: convert worker_idr to worker_ida We no longer iterate workers via worker_idr and worker_idr is used only for allocating/freeing ID, so we can convert it to worker_ida. By using ida_simple_get/remove(), worker_ida doesn't require external synchronization, so we don't need manager_mutex to protect it and the ID-removal code is allowed to be moved out from worker_detach_from_pool(). In a later patch, worker_detach_from_pool() will be used in rescuers which don't have IDs, so we move the ID-removal code out from worker_detach_from_pool() into worker_thread(). tj: Minor description updates. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a6c38b717957..092f2098746d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -161,10 +161,11 @@ struct worker_pool { /* see manage_workers() for details on the two manager mutexes */ struct mutex manager_arb; /* manager arbitration */ struct mutex manager_mutex; /* manager exclusion */ - struct idr worker_idr; /* M: worker IDs */ struct list_head workers; /* M: attached workers */ struct completion *detach_completion; /* all workers detached */ + struct ida worker_ida; /* worker IDs for task name */ + struct workqueue_attrs *attrs; /* I: worker attributes */ struct hlist_node hash_node; /* PL: unbound_pool_hash node */ int refcnt; /* PL: refcnt for unbound pools */ @@ -1696,7 +1697,6 @@ static void worker_detach_from_pool(struct worker *worker, struct completion *detach_completion = NULL; mutex_lock(&pool->manager_mutex); - idr_remove(&pool->worker_idr, worker->id); list_del(&worker->node); if (list_empty(&pool->workers)) detach_completion = pool->detach_completion; @@ -1727,11 +1727,8 @@ static struct worker *create_worker(struct worker_pool *pool) lockdep_assert_held(&pool->manager_mutex); - /* - * ID is needed to determine kthread name. 
Allocate ID first - * without installing the pointer. - */ - id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_KERNEL); + /* ID is needed to determine kthread name */ + id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL); if (id < 0) goto fail; @@ -1772,8 +1769,6 @@ static struct worker *create_worker(struct worker_pool *pool) if (pool->flags & POOL_DISASSOCIATED) worker->flags |= WORKER_UNBOUND; - /* successful, commit the pointer to idr */ - idr_replace(&pool->worker_idr, worker, worker->id); /* successful, attach the worker to the pool */ list_add_tail(&worker->node, &pool->workers); @@ -1781,7 +1776,7 @@ static struct worker *create_worker(struct worker_pool *pool) fail: if (id >= 0) - idr_remove(&pool->worker_idr, id); + ida_simple_remove(&pool->worker_ida, id); kfree(worker); return NULL; } @@ -2250,6 +2245,7 @@ woke_up: worker->task->flags &= ~PF_WQ_WORKER; set_task_comm(worker->task, "kworker/dying"); + ida_simple_remove(&pool->worker_ida, worker->id); worker_detach_from_pool(worker, pool); kfree(worker); return 0; @@ -3486,9 +3482,9 @@ static int init_worker_pool(struct worker_pool *pool) mutex_init(&pool->manager_arb); mutex_init(&pool->manager_mutex); - idr_init(&pool->worker_idr); INIT_LIST_HEAD(&pool->workers); + ida_init(&pool->worker_ida); INIT_HLIST_NODE(&pool->hash_node); pool->refcnt = 1; @@ -3503,7 +3499,7 @@ static void rcu_free_pool(struct rcu_head *rcu) { struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); - idr_destroy(&pool->worker_idr); + ida_destroy(&pool->worker_ida); free_workqueue_attrs(pool->attrs); kfree(pool); } -- cgit From 4d757c5c81edba2052aae10d5b36dfcb9902b141 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 20 May 2014 17:46:33 +0800 Subject: workqueue: narrow the protection range of manager_mutex In create_worker(), as pool->worker_ida now uses ida_simple_get()/ida_simple_put() and doesn't require external synchronization, it doesn't need manager_mutex. struct worker allocation and kthread allocation are not visible by any one before attached, so they don't need manager_mutex either. The above operations are before the attaching operation which attaches the worker to the pool. Between attaching and starting the worker, the worker is already attached to the pool, so the cpu hotplug will handle cpu-binding for the worker correctly and we don't need the manager_mutex after attaching. The conclusion is that only the attaching operation needs manager_mutex, so we narrow the protection section of manager_mutex in create_worker(). Some comments about manager_mutex are removed, because we will rename it to attach_mutex and add worker_attach_to_pool() later which will be self-explanatory. tj: Minor description updates. 
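The underlying rule is a general one: state that only the current thread can see (a freshly allocated struct, a not-yet-started kthread) needs no lock; only the step that publishes the object into shared state does, which is why the diff below keeps just the attach step under the mutex. A minimal sketch of that shape with illustrative names, not the actual workqueue code:

  struct item {
          struct list_head node;
  };

  struct item_pool {
          struct mutex attach_mutex;
          struct list_head items;
  };

  static struct item *item_create(struct item_pool *pool)
  {
          struct item *it;

          /* private setup: nobody else can see 'it' yet, no lock needed */
          it = kzalloc(sizeof(*it), GFP_KERNEL);
          if (!it)
                  return NULL;

          /* publication is the only step that touches shared state */
          mutex_lock(&pool->attach_mutex);
          list_add_tail(&it->node, &pool->items);
          mutex_unlock(&pool->attach_mutex);

          return it;
  }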
Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 35 +++++------------------------------ 1 file changed, 5 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 092f2098746d..d6b31ff60c52 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1725,8 +1725,6 @@ static struct worker *create_worker(struct worker_pool *pool) int id = -1; char id_buf[16]; - lockdep_assert_held(&pool->manager_mutex); - /* ID is needed to determine kthread name */ id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL); if (id < 0) @@ -1755,6 +1753,8 @@ static struct worker *create_worker(struct worker_pool *pool) /* prevent userland from meddling with cpumask of workqueue workers */ worker->task->flags |= PF_NO_SETAFFINITY; + mutex_lock(&pool->manager_mutex); + /* * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any * online CPUs. It'll be re-applied when any of the CPUs come up. @@ -1762,7 +1762,7 @@ static struct worker *create_worker(struct worker_pool *pool) set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); /* - * The caller is responsible for ensuring %POOL_DISASSOCIATED + * The pool->manager_mutex ensures %POOL_DISASSOCIATED * remains stable across this function. See the comments above the * flag definition for details. */ @@ -1772,6 +1772,8 @@ static struct worker *create_worker(struct worker_pool *pool) /* successful, attach the worker to the pool */ list_add_tail(&worker->node, &pool->workers); + mutex_unlock(&pool->manager_mutex); + return worker; fail: @@ -1809,8 +1811,6 @@ static int create_and_start_worker(struct worker_pool *pool) { struct worker *worker; - mutex_lock(&pool->manager_mutex); - worker = create_worker(pool); if (worker) { spin_lock_irq(&pool->lock); @@ -1818,8 +1818,6 @@ static int create_and_start_worker(struct worker_pool *pool) spin_unlock_irq(&pool->lock); } - mutex_unlock(&pool->manager_mutex); - return worker ? 0 : -ENOMEM; } @@ -2019,8 +2017,6 @@ static bool manage_workers(struct worker *worker) bool ret = false; /* - * Managership is governed by two mutexes - manager_arb and - * manager_mutex. manager_arb handles arbitration of manager role. * Anyone who successfully grabs manager_arb wins the arbitration * and becomes the manager. mutex_trylock() on pool->manager_arb * failure while holding pool->lock reliably indicates that someone @@ -2029,33 +2025,12 @@ static bool manage_workers(struct worker *worker) * grabbing manager_arb is responsible for actually performing * manager duties. If manager_arb is grabbed and released without * actual management, the pool may stall indefinitely. - * - * manager_mutex is used for exclusion of actual management - * operations. The holder of manager_mutex can be sure that none - * of management operations, including creation and destruction of - * workers, won't take place until the mutex is released. Because - * manager_mutex doesn't interfere with manager role arbitration, - * it is guaranteed that the pool's management, while may be - * delayed, won't be disturbed by someone else grabbing - * manager_mutex. */ if (!mutex_trylock(&pool->manager_arb)) return ret; - /* - * With manager arbitration won, manager_mutex would be free in - * most cases. trylock first without dropping @pool->lock. 
- */ - if (unlikely(!mutex_trylock(&pool->manager_mutex))) { - spin_unlock_irq(&pool->lock); - mutex_lock(&pool->manager_mutex); - spin_lock_irq(&pool->lock); - ret = true; - } - ret |= maybe_create_worker(pool); - mutex_unlock(&pool->manager_mutex); mutex_unlock(&pool->manager_arb); return ret; } -- cgit From 92f9c5c40cc67ffcc5ac7f55fdbd6ae8afc7e0b4 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 20 May 2014 17:46:34 +0800 Subject: workqueue: rename manager_mutex to attach_mutex manager_mutex is only used to protect the attaching for the pool and the pool->workers list. It protects the pool->workers and operations based on this list, such as: cpu-binding for the workers in the pool->workers the operations to set/clear WORKER_UNBOUND So let's rename manager_mutex to attach_mutex to better reflect its role. This patch is a pure rename. tj: Minor command and description updates. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 44 ++++++++++++++++++++++---------------------- kernel/workqueue_internal.h | 4 ++-- 2 files changed, 24 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d6b31ff60c52..38b9ea7c204c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -65,7 +65,7 @@ enum { * be executing on any CPU. The pool behaves as an unbound one. * * Note that DISASSOCIATED should be flipped only while holding - * manager_mutex to avoid changing binding state while + * attach_mutex to avoid changing binding state while * create_worker() is in progress. */ POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ @@ -122,7 +122,7 @@ enum { * cpu or grabbing pool->lock is enough for read access. If * POOL_DISASSOCIATED is set, it's identical to L. * - * M: pool->manager_mutex protected. + * A: pool->attach_mutex protected. * * PL: wq_pool_mutex protected. * @@ -160,8 +160,8 @@ struct worker_pool { /* see manage_workers() for details on the two manager mutexes */ struct mutex manager_arb; /* manager arbitration */ - struct mutex manager_mutex; /* manager exclusion */ - struct list_head workers; /* M: attached workers */ + struct mutex attach_mutex; /* attach/detach exclusion */ + struct list_head workers; /* A: attached workers */ struct completion *detach_completion; /* all workers detached */ struct ida worker_ida; /* worker IDs for task name */ @@ -367,14 +367,14 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, * @worker: iteration cursor * @pool: worker_pool to iterate workers of * - * This must be called with @pool->manager_mutex. + * This must be called with @pool->attach_mutex. * * The if/else clause exists only for the lockdep assertion and can be * ignored. 
*/ #define for_each_pool_worker(worker, pool) \ list_for_each_entry((worker), &(pool)->workers, node) \ - if (({ lockdep_assert_held(&pool->manager_mutex); false; })) { } \ + if (({ lockdep_assert_held(&pool->attach_mutex); false; })) { } \ else /** @@ -1696,11 +1696,11 @@ static void worker_detach_from_pool(struct worker *worker, { struct completion *detach_completion = NULL; - mutex_lock(&pool->manager_mutex); + mutex_lock(&pool->attach_mutex); list_del(&worker->node); if (list_empty(&pool->workers)) detach_completion = pool->detach_completion; - mutex_unlock(&pool->manager_mutex); + mutex_unlock(&pool->attach_mutex); if (detach_completion) complete(detach_completion); @@ -1753,7 +1753,7 @@ static struct worker *create_worker(struct worker_pool *pool) /* prevent userland from meddling with cpumask of workqueue workers */ worker->task->flags |= PF_NO_SETAFFINITY; - mutex_lock(&pool->manager_mutex); + mutex_lock(&pool->attach_mutex); /* * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any @@ -1762,7 +1762,7 @@ static struct worker *create_worker(struct worker_pool *pool) set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); /* - * The pool->manager_mutex ensures %POOL_DISASSOCIATED + * The pool->attach_mutex ensures %POOL_DISASSOCIATED * remains stable across this function. See the comments above the * flag definition for details. */ @@ -1772,7 +1772,7 @@ static struct worker *create_worker(struct worker_pool *pool) /* successful, attach the worker to the pool */ list_add_tail(&worker->node, &pool->workers); - mutex_unlock(&pool->manager_mutex); + mutex_unlock(&pool->attach_mutex); return worker; @@ -3456,7 +3456,7 @@ static int init_worker_pool(struct worker_pool *pool) (unsigned long)pool); mutex_init(&pool->manager_arb); - mutex_init(&pool->manager_mutex); + mutex_init(&pool->attach_mutex); INIT_LIST_HEAD(&pool->workers); ida_init(&pool->worker_ida); @@ -3513,7 +3513,7 @@ static void put_unbound_pool(struct worker_pool *pool) /* * Become the manager and destroy all workers. Grabbing * manager_arb prevents @pool's workers from blocking on - * manager_mutex. + * attach_mutex. */ mutex_lock(&pool->manager_arb); @@ -3523,10 +3523,10 @@ static void put_unbound_pool(struct worker_pool *pool) WARN_ON(pool->nr_workers || pool->nr_idle); spin_unlock_irq(&pool->lock); - mutex_lock(&pool->manager_mutex); + mutex_lock(&pool->attach_mutex); if (!list_empty(&pool->workers)) pool->detach_completion = &detach_completion; - mutex_unlock(&pool->manager_mutex); + mutex_unlock(&pool->attach_mutex); if (pool->detach_completion) wait_for_completion(pool->detach_completion); @@ -4513,11 +4513,11 @@ static void wq_unbind_fn(struct work_struct *work) for_each_cpu_worker_pool(pool, cpu) { WARN_ON_ONCE(cpu != smp_processor_id()); - mutex_lock(&pool->manager_mutex); + mutex_lock(&pool->attach_mutex); spin_lock_irq(&pool->lock); /* - * We've blocked all manager operations. Make all workers + * We've blocked all attach/detach operations. Make all workers * unbound and set DISASSOCIATED. Before this, all workers * except for the ones which are still executing works from * before the last CPU down must be on the cpu. 
After @@ -4529,7 +4529,7 @@ static void wq_unbind_fn(struct work_struct *work) pool->flags |= POOL_DISASSOCIATED; spin_unlock_irq(&pool->lock); - mutex_unlock(&pool->manager_mutex); + mutex_unlock(&pool->attach_mutex); /* * Call schedule() so that we cross rq->lock and thus can @@ -4570,7 +4570,7 @@ static void rebind_workers(struct worker_pool *pool) { struct worker *worker; - lockdep_assert_held(&pool->manager_mutex); + lockdep_assert_held(&pool->attach_mutex); /* * Restore CPU affinity of all workers. As all idle workers should @@ -4638,7 +4638,7 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) static cpumask_t cpumask; struct worker *worker; - lockdep_assert_held(&pool->manager_mutex); + lockdep_assert_held(&pool->attach_mutex); /* is @cpu allowed for @pool? */ if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) @@ -4683,7 +4683,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb, mutex_lock(&wq_pool_mutex); for_each_pool(pool, pi) { - mutex_lock(&pool->manager_mutex); + mutex_lock(&pool->attach_mutex); if (pool->cpu == cpu) { spin_lock_irq(&pool->lock); @@ -4695,7 +4695,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb, restore_unbound_workers_cpumask(pool, cpu); } - mutex_unlock(&pool->manager_mutex); + mutex_unlock(&pool->attach_mutex); } /* update NUMA affinity of unbound workqueues */ diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 8888e0672442..45215870ac6c 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -37,8 +37,8 @@ struct worker { struct task_struct *task; /* I: worker task */ struct worker_pool *pool; /* I: the associated pool */ /* L: for rescuers */ - struct list_head node; /* M: anchored at pool->workers */ - /* M: runs through worker->node */ + struct list_head node; /* A: anchored at pool->workers */ + /* A: runs through worker->node */ unsigned long last_active; /* L: last active timestamp */ unsigned int flags; /* X: flags */ -- cgit From 4736cbf7a4b2a6bbb50a809a6933fb7eb29dc38f Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 20 May 2014 17:46:35 +0800 Subject: workqueue: separate pool-attaching code out from create_worker() Currently, the code to attach a new worker to its pool is embedded in create_worker(). Separating this code out will make the codes clearer and will allow rescuers to share the code path later. tj: Description and comment updates. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 61 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 38b9ea7c204c..b1de6ac4a0e3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -66,7 +66,7 @@ enum { * * Note that DISASSOCIATED should be flipped only while holding * attach_mutex to avoid changing binding state while - * create_worker() is in progress. + * worker_attach_to_pool() is in progress. */ POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ POOL_FREEZING = 1 << 3, /* freeze in progress */ @@ -1682,14 +1682,47 @@ static struct worker *alloc_worker(void) return worker; } +/** + * worker_attach_to_pool() - attach a worker to a pool + * @worker: worker to be attached + * @pool: the target pool + * + * Attach @worker to @pool. Once attached, the %WORKER_UNBOUND flag and + * cpu-binding of @worker are kept coordinated with the pool across + * cpu-[un]hotplugs. 
+ */ +static void worker_attach_to_pool(struct worker *worker, + struct worker_pool *pool) +{ + mutex_lock(&pool->attach_mutex); + + /* + * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any + * online CPUs. It'll be re-applied when any of the CPUs come up. + */ + set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); + + /* + * The pool->attach_mutex ensures %POOL_DISASSOCIATED remains + * stable across this function. See the comments above the + * flag definition for details. + */ + if (pool->flags & POOL_DISASSOCIATED) + worker->flags |= WORKER_UNBOUND; + + list_add_tail(&worker->node, &pool->workers); + + mutex_unlock(&pool->attach_mutex); +} + /** * worker_detach_from_pool() - detach a worker from its pool * @worker: worker which is attached to its pool * @pool: the pool @worker is attached to * - * Undo the attaching which had been done in create_worker(). The caller - * worker shouldn't access to the pool after detached except it has other - * reference to the pool. + * Undo the attaching which had been done in worker_attach_to_pool(). The + * caller worker shouldn't access to the pool after detached except it has + * other reference to the pool. */ static void worker_detach_from_pool(struct worker *worker, struct worker_pool *pool) @@ -1753,26 +1786,8 @@ static struct worker *create_worker(struct worker_pool *pool) /* prevent userland from meddling with cpumask of workqueue workers */ worker->task->flags |= PF_NO_SETAFFINITY; - mutex_lock(&pool->attach_mutex); - - /* - * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any - * online CPUs. It'll be re-applied when any of the CPUs come up. - */ - set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); - - /* - * The pool->attach_mutex ensures %POOL_DISASSOCIATED - * remains stable across this function. See the comments above the - * flag definition for details. - */ - if (pool->flags & POOL_DISASSOCIATED) - worker->flags |= WORKER_UNBOUND; - /* successful, attach the worker to the pool */ - list_add_tail(&worker->node, &pool->workers); - - mutex_unlock(&pool->attach_mutex); + worker_attach_to_pool(worker, pool); return worker; -- cgit From 51697d393922eb643e78ac5db86e8fa5a45b469a Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 20 May 2014 17:46:36 +0800 Subject: workqueue: use generic attach/detach routine for rescuers There are several problems with the code that rescuers use to bind themselve to the target pool's cpumask. 1) It is very different from how the normal workers bind to cpumask, increasing code complexity and maintenance overhead. 2) The code of cpu-binding for rescuers is complicated. 3) If one or more cpu hotplugs happen while a rescuer is processing its scheduled work items, the rescuer may not stay bound to the cpumask of the pool. This is an allowed behavior, but is still hairy. It will be better if the cpumask of the rescuer is always kept synchronized with the pool across cpu hotplugs. Using generic attach/detach routine will solve the above problems and results in much simpler code. tj: Minor description updates. 
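With the generic routines in place, the rescuer's per-pool cycle takes the same shape as an ordinary worker's lifetime, compressed into a single mayday pass. In outline (simplified from the diff below; the mayday bookkeeping is omitted):

  worker_attach_to_pool(rescuer, pool);

  spin_lock_irq(&pool->lock);
  /* ... move the mayday work items to rescuer->scheduled and run them ... */
  spin_unlock_irq(&pool->lock);

  worker_detach_from_pool(rescuer, pool);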
Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 74 ++++++------------------------------------------------ 1 file changed, 8 insertions(+), 66 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b1de6ac4a0e3..6603923b7ede 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1603,70 +1603,6 @@ static void worker_leave_idle(struct worker *worker) list_del_init(&worker->entry); } -/** - * worker_maybe_bind_and_lock - try to bind %current to worker_pool and lock it - * @pool: target worker_pool - * - * Bind %current to the cpu of @pool if it is associated and lock @pool. - * - * Works which are scheduled while the cpu is online must at least be - * scheduled to a worker which is bound to the cpu so that if they are - * flushed from cpu callbacks while cpu is going down, they are - * guaranteed to execute on the cpu. - * - * This function is to be used by unbound workers and rescuers to bind - * themselves to the target cpu and may race with cpu going down or - * coming online. kthread_bind() can't be used because it may put the - * worker to already dead cpu and set_cpus_allowed_ptr() can't be used - * verbatim as it's best effort and blocking and pool may be - * [dis]associated in the meantime. - * - * This function tries set_cpus_allowed() and locks pool and verifies the - * binding against %POOL_DISASSOCIATED which is set during - * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker - * enters idle state or fetches works without dropping lock, it can - * guarantee the scheduling requirement described in the first paragraph. - * - * CONTEXT: - * Might sleep. Called without any lock but returns with pool->lock - * held. - * - * Return: - * %true if the associated pool is online (@worker is successfully - * bound), %false if offline. - */ -static bool worker_maybe_bind_and_lock(struct worker_pool *pool) -__acquires(&pool->lock) -{ - while (true) { - /* - * The following call may fail, succeed or succeed - * without actually migrating the task to the cpu if - * it races with cpu hotunplug operation. Verify - * against POOL_DISASSOCIATED. - */ - if (!(pool->flags & POOL_DISASSOCIATED)) - set_cpus_allowed_ptr(current, pool->attrs->cpumask); - - spin_lock_irq(&pool->lock); - if (pool->flags & POOL_DISASSOCIATED) - return false; - if (task_cpu(current) == pool->cpu && - cpumask_equal(¤t->cpus_allowed, pool->attrs->cpumask)) - return true; - spin_unlock_irq(&pool->lock); - - /* - * We've raced with CPU hot[un]plug. Give it a breather - * and retry migration. cond_resched() is required here; - * otherwise, we might deadlock against cpu_stop trying to - * bring down the CPU on non-preemptive kernel. - */ - cpu_relax(); - cond_resched(); - } -} - static struct worker *alloc_worker(void) { struct worker *worker; @@ -2361,8 +2297,9 @@ repeat: spin_unlock_irq(&wq_mayday_lock); - /* migrate to the target cpu if possible */ - worker_maybe_bind_and_lock(pool); + worker_attach_to_pool(rescuer, pool); + + spin_lock_irq(&pool->lock); rescuer->pool = pool; /* @@ -2375,6 +2312,11 @@ repeat: move_linked_works(work, scheduled, &n); process_scheduled_works(rescuer); + spin_unlock_irq(&pool->lock); + + worker_detach_from_pool(rescuer, pool); + + spin_lock_irq(&pool->lock); /* * Put the reference grabbed by send_mayday(). 
@pool won't -- cgit From ccdb594653b1c0d5b0f3e6f8bf764c544ba0606d Mon Sep 17 00:00:00 2001 From: Robert Elliott Date: Tue, 20 May 2014 17:10:36 -0500 Subject: tracing: Eliminate duplicate TRACE_GRAPH_PRINT_xx defines Eliminate duplicate TRACE_GRAPH_PRINT_xx defines in trace_functions_graph.c that are already in trace.h. Add TRACE_GRAPH_PRINT_IRQS to trace.h, which is the only one that is missing. Link: http://lkml.kernel.org/p/20140520221031.8359.24733.stgit@beardog.cce.hp.com Signed-off-by: Robert Elliott Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 1 + kernel/trace/trace_functions_graph.c | 9 --------- 2 files changed, 1 insertion(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3b3e09e61f33..68050633255e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -724,6 +724,7 @@ extern unsigned long trace_flags; #define TRACE_GRAPH_PRINT_PROC 0x8 #define TRACE_GRAPH_PRINT_DURATION 0x10 #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 +#define TRACE_GRAPH_PRINT_IRQS 0x40 #define TRACE_GRAPH_PRINT_FILL_SHIFT 28 #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index b86dd4d8c6a6..af08dd531cb8 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -38,15 +38,6 @@ struct fgraph_data { #define TRACE_GRAPH_INDENT 2 -/* Flag options */ -#define TRACE_GRAPH_PRINT_OVERRUN 0x1 -#define TRACE_GRAPH_PRINT_CPU 0x2 -#define TRACE_GRAPH_PRINT_OVERHEAD 0x4 -#define TRACE_GRAPH_PRINT_PROC 0x8 -#define TRACE_GRAPH_PRINT_DURATION 0x10 -#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 -#define TRACE_GRAPH_PRINT_IRQS 0x40 - static unsigned int max_depth; static struct tracer_opt trace_opts[] = { -- cgit From 607e3a29203633eaec7334f2f58f9653df788a06 Mon Sep 17 00:00:00 2001 From: Robert Elliott Date: Tue, 20 May 2014 17:10:51 -0500 Subject: tracing: Add funcgraph_tail option to print function name after closing braces In the function-graph tracer, add a funcgraph_tail option to print the function name on all } lines, not just functions whose first line is no longer in the trace buffer. If a function calls other traced functions, its total time appears on its } line. This change allows grep to be used to determine the function for which the line corresponds. Update Documentation/trace/ftrace.txt to describe this new option. Link: http://lkml.kernel.org/p/20140520221041.8359.6782.stgit@beardog.cce.hp.com Signed-off-by: Robert Elliott Signed-off-by: Steven Rostedt --- Documentation/trace/ftrace.txt | 26 ++++++++++++++++++++++++++ kernel/trace/trace.h | 1 + kernel/trace/trace_functions_graph.c | 9 ++++++--- 3 files changed, 33 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index bd365988e8d8..2479b2a0c77c 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -2003,6 +2003,32 @@ want, depending on your needs. 360.774530 | 1) 0.594 us | __phys_addr(); +The function name is always displayed after the closing bracket +for a function if the start of that function is not in the +trace buffer. + +Display of the function name after the closing bracket may be +enabled for functions whose start is in the trace buffer, +allowing easier searching with grep for function durations. +It is default disabled. 
+ + hide: echo nofuncgraph-tail > trace_options + show: echo funcgraph-tail > trace_options + + Example with nofuncgraph-tail (default): + 0) | putname() { + 0) | kmem_cache_free() { + 0) 0.518 us | __phys_addr(); + 0) 1.757 us | } + 0) 2.861 us | } + + Example with funcgraph-tail: + 0) | putname() { + 0) | kmem_cache_free() { + 0) 0.518 us | __phys_addr(); + 0) 1.757 us | } /* kmem_cache_free() */ + 0) 2.861 us | } /* putname() */ + You can put some comments on specific functions by using trace_printk() For example, if you want to put a comment inside the __might_sleep() function, you just have to include diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 68050633255e..217207ad60b3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -725,6 +725,7 @@ extern unsigned long trace_flags; #define TRACE_GRAPH_PRINT_DURATION 0x10 #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 #define TRACE_GRAPH_PRINT_IRQS 0x40 +#define TRACE_GRAPH_PRINT_TAIL 0x80 #define TRACE_GRAPH_PRINT_FILL_SHIFT 28 #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index af08dd531cb8..4de3e57f723c 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -55,11 +55,13 @@ static struct tracer_opt trace_opts[] = { { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, /* Display interrupts */ { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, + /* Display function name after trailing } */ + { TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) }, { } /* Empty entry */ }; static struct tracer_flags tracer_flags = { - /* Don't display overruns and proc by default */ + /* Don't display overruns, proc, or tail by default */ .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS, .opts = trace_opts @@ -1167,9 +1169,10 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, * If the return function does not have a matching entry, * then the entry was lost. Instead of just printing * the '}' and letting the user guess what function this - * belongs to, write out the function name. + * belongs to, write out the function name. Always do + * that if the funcgraph-tail option is enabled. */ - if (func_match) { + if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) { ret = trace_seq_puts(s, "}\n"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; -- cgit From 5fe821a9dee241fa450703ab7015d970ee0cfb8d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 19 May 2014 14:56:14 -0700 Subject: net: filter: cleanup invocation of internal BPF Kernel API for classic BPF socket filters is: sk_unattached_filter_create() - validate classic BPF, convert, JIT SK_RUN_FILTER() - run it sk_unattached_filter_destroy() - destroy socket filter Cleanup internal BPF kernel API as following: sk_filter_select_runtime() - final step of internal BPF creation. Try to JIT internal BPF program, if JIT is not available select interpreter SK_RUN_FILTER() - run it sk_filter_free() - free internal BPF program Disallow direct calls to BPF interpreter. Execution of the BPF program should be done with SK_RUN_FILTER() macro. 
Example of internal BPF create, run, destroy: struct sk_filter *fp; fp = kzalloc(sk_filter_size(prog_len), GFP_KERNEL); memcpy(fp->insni, prog, prog_len * sizeof(fp->insni[0])); fp->len = prog_len; sk_filter_select_runtime(fp); SK_RUN_FILTER(fp, ctx); sk_filter_free(fp); Sockets, seccomp, testsuite, tracing are using different ways to populate sk_filter, so first steps of program creation are not common. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/filter.h | 6 ++---- kernel/seccomp.c | 6 ++---- lib/test_bpf.c | 4 ++-- net/core/filter.c | 44 ++++++++++++++++++++++++++++---------------- 4 files changed, 34 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/include/linux/filter.h b/include/linux/filter.h index 9d5ae0a2c954..7977b3958e25 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -184,10 +184,8 @@ static inline unsigned int sk_filter_size(unsigned int proglen) int sk_filter(struct sock *sk, struct sk_buff *skb); -u32 sk_run_filter_int_seccomp(const struct seccomp_data *ctx, - const struct sock_filter_int *insni); -u32 sk_run_filter_int_skb(const struct sk_buff *ctx, - const struct sock_filter_int *insni); +void sk_filter_select_runtime(struct sk_filter *fp); +void sk_filter_free(struct sk_filter *fp); int sk_convert_filter(struct sock_filter *prog, int len, struct sock_filter_int *new_prog, int *new_len); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 7e02d624cc50..1036b6f2fded 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -273,10 +273,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) atomic_set(&filter->usage, 1); filter->prog->len = new_len; - filter->prog->bpf_func = (void *)sk_run_filter_int_seccomp; - /* JIT internal BPF into native HW instructions */ - bpf_int_jit_compile(filter->prog); + sk_filter_select_runtime(filter->prog); /* * If there is an existing filter, make it the prev and don't drop its @@ -340,7 +338,7 @@ void put_seccomp_filter(struct task_struct *tsk) while (orig && atomic_dec_and_test(&orig->usage)) { struct seccomp_filter *freeme = orig; orig = orig->prev; - bpf_jit_free(freeme->prog); + sk_filter_free(freeme->prog); kfree(freeme); } } diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 3603ebcd5d65..e160934430eb 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -1489,7 +1489,7 @@ static __init int test_bpf(void) memcpy(fp_ext->insns, tests[i].insns_int, fprog.len * 8); fp->len = fprog.len; - fp->bpf_func = sk_run_filter_int_skb; + sk_filter_select_runtime(fp); } else { err = sk_unattached_filter_create(&fp, &fprog); if (tests[i].data_type == EXPECTED_FAIL) { @@ -1516,7 +1516,7 @@ static __init int test_bpf(void) if (tests[i].data_type != SKB_INT) sk_unattached_filter_destroy(fp); else - kfree(fp); + sk_filter_free(fp); if (err) { pr_cont("FAIL %d\n", err); diff --git a/net/core/filter.c b/net/core/filter.c index 32c5b44c537e..7067cb240d3e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -153,7 +153,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) * keep, 0 for none. @ctx is the data we are operating on, @insn is the * array of filter instructions. 
 */ -unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn) +static unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn) { u64 stack[MAX_BPF_STACK / sizeof(u64)]; u64 regs[MAX_BPF_REG], tmp; @@ -571,15 +571,6 @@ load_byte: return 0; } -u32 sk_run_filter_int_seccomp(const struct seccomp_data *ctx, - const struct sock_filter_int *insni) - __attribute__ ((alias ("__sk_run_filter"))); - -u32 sk_run_filter_int_skb(const struct sk_buff *ctx, - const struct sock_filter_int *insni) - __attribute__ ((alias ("__sk_run_filter"))); -EXPORT_SYMBOL_GPL(sk_run_filter_int_skb); - /* Helper to find the offset of pkt_type in sk_buff structure. We want * to make sure its still a 3bit field starting at a byte boundary; * taken from arch/x86/net/bpf_jit_comp.c. @@ -1397,7 +1388,7 @@ static void sk_filter_release_rcu(struct rcu_head *rcu) struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); sk_release_orig_filter(fp); - bpf_jit_free(fp); + sk_filter_free(fp); } /** @@ -1497,7 +1488,6 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp, goto out_err_free; } - fp->bpf_func = sk_run_filter_int_skb; fp->len = new_len; /* 2nd pass: remap sock_filter insns into sock_filter_int insns. */ @@ -1510,6 +1500,8 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp, */ goto out_err_free; + sk_filter_select_runtime(fp); + kfree(old_prog); return fp; @@ -1528,6 +1520,29 @@ void __weak bpf_int_jit_compile(struct sk_filter *prog) { } +/** + * sk_filter_select_runtime - select execution runtime for BPF program + * @fp: sk_filter populated with internal BPF program + * + * try to JIT internal BPF program, if JIT is not available select interpreter + * BPF program will be executed via SK_RUN_FILTER() macro + */ +void sk_filter_select_runtime(struct sk_filter *fp) +{ + fp->bpf_func = (void *) __sk_run_filter; + + /* Probe if internal BPF can be JITed */ + bpf_int_jit_compile(fp); +} +EXPORT_SYMBOL_GPL(sk_filter_select_runtime); + +/* free internal BPF program */ +void sk_filter_free(struct sk_filter *fp) +{ + bpf_jit_free(fp); +} +EXPORT_SYMBOL_GPL(sk_filter_free); + static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp, struct sock *sk) { @@ -1548,12 +1563,9 @@ static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp, /* JIT compiler couldn't process this filter, so do the * internal BPF translation for the optimized interpreter. */ - if (!fp->jited) { + if (!fp->jited) fp = __sk_migrate_filter(fp, sk); - /* Probe if internal BPF can be jit-ed */ - bpf_int_jit_compile(fp); - } return fp; } -- cgit From 143cf23df25b7082cd706c3c53188e741e7881c3 Mon Sep 17 00:00:00 2001 From: Michael Kerrisk Date: Fri, 9 May 2014 16:54:15 +0200 Subject: sched: Make sched_setattr() correctly return -EFBIG The documented[1] behavior of sched_attr() in the proposed man page text is: sched_attr::size must be set to the size of the structure, as in sizeof(struct sched_attr). If the provided structure is smaller than the kernel structure, any additional fields are assumed '0'. If the provided structure is larger than the kernel structure, the kernel verifies that all additional fields are '0'; if not, the syscall will fail with -E2BIG. As currently implemented, sched_copy_attr() returns -EFBIG for this case, but the logic in sys_sched_setattr() converts that error to -EFAULT. This patch fixes the behavior.
[1] http://thread.gmane.org/gmane.linux.kernel/1615615/focus=1697760 Signed-off-by: Michael Kerrisk Signed-off-by: Peter Zijlstra Cc: Cc: Linus Torvalds Link: http://lkml.kernel.org/r/536CEC17.9070903@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 13584f1cccfc..f2205f02eb70 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3658,8 +3658,9 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, if (!uattr || pid < 0 || flags) return -EINVAL; - if (sched_copy_attr(uattr, &attr)) - return -EFAULT; + retval = sched_copy_attr(uattr, &attr); + if (retval) + return retval; rcu_read_lock(); retval = -ESRCH; -- cgit From dbdb22754fde671dc93d2fae06f8be113d47f2fb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 May 2014 10:49:03 +0200 Subject: sched: Disallow sched_attr::sched_policy < 0 The scheduler uses policy=-1 to preserve the current policy state to implement sys_sched_setparam(), this got exposed to userspace by accident through sys_sched_setattr(), cure this. Reported-by: Michael Kerrisk Signed-off-by: Peter Zijlstra Acked-by: Michael Kerrisk Cc: Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140509085311.GJ30445@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f2205f02eb70..cdefcf7c5925 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3662,6 +3662,9 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, if (retval) return retval; + if (attr.sched_policy < 0) + return -EINVAL; + rcu_read_lock(); retval = -ESRCH; p = find_process_by_pid(pid); -- cgit From ce5f7f8200ca2504f6f290044393d73ca314965a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 12 May 2014 22:50:34 +0200 Subject: sched/deadline: Change sched_getparam() behaviour vs SCHED_DEADLINE The way we read POSIX one should only call sched_getparam() when sched_getscheduler() returns either SCHED_FIFO or SCHED_RR. Given that we currently return sched_param::sched_priority=0 for all others, extend the same behaviour to SCHED_DEADLINE. 
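A hedged userspace illustration of the resulting behaviour (sketch only; 'pid' names an existing SCHED_DEADLINE task, headers and error handling are omitted):

	struct sched_param sp;

	if (sched_getparam(pid, &sp) == 0)
		/* previously this returned -EINVAL for SCHED_DEADLINE tasks;
		 * now, as for SCHED_OTHER and friends, sched_priority reads 0 */
		printf("sched_priority = %d\n", sp.sched_priority);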
Requested-by: Michael Kerrisk Signed-off-by: Peter Zijlstra Acked-by: Michael Kerrisk Cc: Dario Faggioli Cc: linux-man Cc: "Michael Kerrisk (man-pages)" Cc: Juri Lelli Cc: Linus Torvalds Cc: Link: http://lkml.kernel.org/r/20140512205034.GH13467@laptop.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cdefcf7c5925..f3f08bf94355 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3713,7 +3713,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) */ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) { - struct sched_param lp; + struct sched_param lp = { .sched_priority = 0 }; struct task_struct *p; int retval; @@ -3730,11 +3730,8 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) if (retval) goto out_unlock; - if (task_has_dl_policy(p)) { - retval = -EINVAL; - goto out_unlock; - } - lp.sched_priority = p->rt_priority; + if (task_has_rt_policy(p)) + lp.sched_priority = p->rt_priority; rcu_read_unlock(); /* -- cgit From b0827819b0da4acfbc1df1e05edcf50efd07cbd1 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 13 May 2014 14:11:31 +0200 Subject: sched/deadline: Restrict user params max value to 2^63 ns Michael Kerrisk noticed that creating SCHED_DEADLINE reservations with certain parameters (e.g., a runtime of something near 2^64 ns) can cause a system freeze for some amount of time. The problem is that in the interface we have u64 sched_runtime; while internally we need to have a signed runtime (to cope with budget overruns) s64 runtime; At the time we set up a new dl_entity we copy the first value into the second. The cast produces negative values when sched_runtime is too big, and this causes the scheduler to go crazy right from the start. Moreover, considering how we deal with deadline wraparound, (s64)(a - b) < 0, we also have to restrict acceptable values for sched_{deadline,period}. This patch fixes this by checking that user parameters are always below 2^63 ns (still large enough for everyone). It also rewrites the other conditions that we check, since in __checkparam_dl we don't have to deal with deadline wraparounds and what we have now erroneously fails when the difference between values is too big. Reported-by: Michael Kerrisk Suggested-by: Peter Zijlstra Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Cc: Cc: Dario Faggioli Cc: Dave Jones Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140513141131.20d944f81633ee937f256385@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f3f08bf94355..44e00abece09 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3195,17 +3195,40 @@ __getparam_dl(struct task_struct *p, struct sched_attr *attr) * We ask for the deadline not being zero, and greater or equal * than the runtime, as well as the period of being zero or * greater than deadline. Furthermore, we have to be sure that - * user parameters are above the internal resolution (1us); we - * check sched_runtime only since it is always the smaller one.
+ * user parameters are above the internal resolution of 1us (we + * check sched_runtime only since it is always the smaller one) and + * below 2^63 ns (we have to check both sched_deadline and + * sched_period, as the latter can be zero). */ static bool __checkparam_dl(const struct sched_attr *attr) { - return attr && attr->sched_deadline != 0 && - (attr->sched_period == 0 || - (s64)(attr->sched_period - attr->sched_deadline) >= 0) && - (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 && - attr->sched_runtime >= (2 << (DL_SCALE - 1)); + /* deadline != 0 */ + if (attr->sched_deadline == 0) + return false; + + /* + * Since we truncate DL_SCALE bits, make sure we're at least + * that big. + */ + if (attr->sched_runtime < (1ULL << DL_SCALE)) + return false; + + /* + * Since we use the MSB for wrap-around and sign issues, make + * sure it's not set (mind that period can be equal to zero). + */ + if (attr->sched_deadline & (1ULL << 63) || + attr->sched_period & (1ULL << 63)) + return false; + + /* runtime <= deadline <= period (if period != 0) */ + if ((attr->sched_period != 0 && + attr->sched_period < attr->sched_deadline) || + attr->sched_deadline < attr->sched_runtime) + return false; + + return true; } /* -- cgit From 944770ab54babaef29d9d1dc8189898b3ee8afcf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 14 May 2014 16:13:56 +0200 Subject: sched/deadline: Replace NR_CPUS arrays Tejun reported that his resume was failing due to order-3 allocations from sched_domain building. Replace the NR_CPUS arrays in there with a dynamically allocated array. Reported-by: Tejun Heo Signed-off-by: Peter Zijlstra Acked-by: Juri Lelli Cc: Johannes Weiner Cc: Linus Torvalds Link: http://lkml.kernel.org/n/tip-kat4gl1m5a6dwy6nzuqox45e@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/cpudeadline.c | 33 ++++++++++++++++++++++++--------- kernel/sched/cpudeadline.h | 6 +++--- 2 files changed, 27 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index ab001b5d5048..bd95963dae80 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -13,6 +13,7 @@ #include #include +#include #include "cpudeadline.h" static inline int parent(int i) @@ -39,8 +40,10 @@ static void cpudl_exchange(struct cpudl *cp, int a, int b) { int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; - swap(cp->elements[a], cp->elements[b]); - swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]); + swap(cp->elements[a].cpu, cp->elements[b].cpu); + swap(cp->elements[a].dl , cp->elements[b].dl ); + + swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx); } static void cpudl_heapify(struct cpudl *cp, int idx) @@ -140,7 +143,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) WARN_ON(!cpu_present(cpu)); raw_spin_lock_irqsave(&cp->lock, flags); - old_idx = cp->cpu_to_idx[cpu]; + old_idx = cp->elements[cpu].idx; if (!is_valid) { /* remove item */ if (old_idx == IDX_INVALID) { @@ -155,8 +158,8 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; cp->elements[old_idx].cpu = new_cpu; cp->size--; - cp->cpu_to_idx[new_cpu] = old_idx; - cp->cpu_to_idx[cpu] = IDX_INVALID; + cp->elements[new_cpu].idx = old_idx; + cp->elements[cpu].idx = IDX_INVALID; while (old_idx > 0 && dl_time_before( cp->elements[parent(old_idx)].dl, cp->elements[old_idx].dl)) { @@ -173,7 +176,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) cp->size++; 
cp->elements[cp->size - 1].dl = 0; cp->elements[cp->size - 1].cpu = cpu; - cp->cpu_to_idx[cpu] = cp->size - 1; + cp->elements[cpu].idx = cp->size - 1; cpudl_change_key(cp, cp->size - 1, dl); cpumask_clear_cpu(cpu, cp->free_cpus); } else { @@ -195,10 +198,21 @@ int cpudl_init(struct cpudl *cp) memset(cp, 0, sizeof(*cp)); raw_spin_lock_init(&cp->lock); cp->size = 0; - for (i = 0; i < NR_CPUS; i++) - cp->cpu_to_idx[i] = IDX_INVALID; - if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) + + cp->elements = kcalloc(nr_cpu_ids, + sizeof(struct cpudl_item), + GFP_KERNEL); + if (!cp->elements) + return -ENOMEM; + + if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) { + kfree(cp->elements); return -ENOMEM; + } + + for_each_possible_cpu(i) + cp->elements[i].idx = IDX_INVALID; + cpumask_setall(cp->free_cpus); return 0; @@ -211,4 +225,5 @@ int cpudl_init(struct cpudl *cp) void cpudl_cleanup(struct cpudl *cp) { free_cpumask_var(cp->free_cpus); + kfree(cp->elements); } diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index a202789a412c..538c9796ad4a 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -5,17 +5,17 @@ #define IDX_INVALID -1 -struct array_item { +struct cpudl_item { u64 dl; int cpu; + int idx; }; struct cpudl { raw_spinlock_t lock; int size; - int cpu_to_idx[NR_CPUS]; - struct array_item elements[NR_CPUS]; cpumask_var_t free_cpus; + struct cpudl_item *elements; }; -- cgit From 4dac0b638310d2e92f6e19958b73d4c97c9734bb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 14 May 2014 16:04:26 +0200 Subject: sched/cpupri: Replace NR_CPUS arrays Tejun reported that his resume was failing due to order-3 allocations from sched_domain building. Replace the NR_CPUS arrays in there with a dynamically allocated array. Reported-by: Tejun Heo Signed-off-by: Peter Zijlstra Cc: Johannes Weiner Cc: Steven Rostedt Cc: Linus Torvalds Link: http://lkml.kernel.org/n/tip-7cysnkw1gik45r864t1nkudh@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/cpupri.c | 7 +++++++ kernel/sched/cpupri.h | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 3031bac8aa3e..8834243abee2 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "cpupri.h" /* Convert between a 140 based task->prio, and our 102 based cpupri */ @@ -218,8 +219,13 @@ int cpupri_init(struct cpupri *cp) goto cleanup; } + cp->cpu_to_pri = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL); + if (!cp->cpu_to_pri) + goto cleanup; + for_each_possible_cpu(i) cp->cpu_to_pri[i] = CPUPRI_INVALID; + return 0; cleanup: @@ -236,6 +242,7 @@ void cpupri_cleanup(struct cpupri *cp) { int i; + kfree(cp->cpu_to_pri); for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) free_cpumask_var(cp->pri_to_cpu[i].mask); } diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index f6d756173491..6b033347fdfd 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -17,7 +17,7 @@ struct cpupri_vec { struct cpupri { struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; - int cpu_to_pri[NR_CPUS]; + int *cpu_to_pri; }; #ifdef CONFIG_SMP -- cgit From 6acbfb96976fc3350e30d964acb1dbbdf876d55e Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 16 May 2014 11:50:42 +0800 Subject: sched: Fix hotplug vs. set_cpus_allowed_ptr() Lai found that: WARNING: CPU: 1 PID: 13 at arch/x86/kernel/smp.c:124 native_smp_send_reschedule+0x2d/0x4b() ... 
migration_cpu_stop+0x1d/0x22 was caused by set_cpus_allowed_ptr() assuming that cpu_active_mask is always a sub-set of cpu_online_mask. This isn't true since 5fbd036b552f ("sched: Cleanup cpu_active madness"). So set active and online at the same time to avoid this particular problem. Fixes: 5fbd036b552f ("sched: Cleanup cpu_active madness") Signed-off-by: Lai Jiangshan Signed-off-by: Peter Zijlstra Cc: Andrew Morton Cc: Gautham R. Shenoy Cc: Linus Torvalds Cc: Michael wang Cc: Paul Gortmaker Cc: Rafael J. Wysocki Cc: Srivatsa S. Bhat Cc: Toshi Kani Link: http://lkml.kernel.org/r/53758B12.8060609@cn.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/cpu.c | 6 ++++-- kernel/sched/core.c | 1 - 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index a9e710eef0e2..247979a1b815 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -726,10 +726,12 @@ void set_cpu_present(unsigned int cpu, bool present) void set_cpu_online(unsigned int cpu, bool online) { - if (online) + if (online) { cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits)); - else + cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits)); + } else { cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits)); + } } void set_cpu_active(unsigned int cpu, bool active) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 44e00abece09..86f3890c3d08 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5076,7 +5076,6 @@ static int sched_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { switch (action & ~CPU_TASKS_FROZEN) { - case CPU_STARTING: case CPU_DOWN_FAILED: set_cpu_active((long)hcpu, true); return NOTIFY_OK; -- cgit From 3944a9274ef6cda0cc282daf0739832f661670f7 Mon Sep 17 00:00:00 2001 From: Ben Segall Date: Thu, 15 May 2014 15:59:20 -0700 Subject: sched: Fix exec_start/task_hot on migrated tasks task_hot checks exec_start on any runnable task, but if it has been migrated since it last ran, then exec_start is a clock_task from another cpu. If the old cpu's clock_task was sufficiently far ahead of this cpu's, then the task will not be considered for another migration until it has run. Instead, reset exec_start whenever a task is migrated, since it is presumably no longer hot anyway. Signed-off-by: Ben Segall [ Made it compile. ] Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140515225920.7179.13924.stgit@sword-of-the-dawn.mtv.corp.google.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 28ccf502c63c..dd3fa14a2998 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4544,6 +4544,9 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu) atomic_long_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); } + + /* We have migrated, no longer consider this task hot */ + se->exec_start = 0; } #endif /* CONFIG_SMP */ -- cgit From e78c7bca56dab5ce4f22694f99115ec07e4935f6 Mon Sep 17 00:00:00 2001 From: Michael Kerrisk Date: Fri, 9 May 2014 16:54:28 +0200 Subject: sched: Simplify return logic in sched_copy_attr() The logic in this function is a little contorted, clean it up: * Rather than having chained gotos for the -EFBIG case, just return -EFBIG directly. * Now, the label 'out' is no longer needed, and 'ret' must be zero by the time we fall through to this point, so just return 0.
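Schematically, the cleanup turns the chained-goto error path into direct returns (a generic sketch of the pattern, not the exact kernel code):

	/* before: error exit bounces through a label */
	err_size:
		put_user(sizeof(*attr), &uattr->size);
		ret = -E2BIG;
		goto out;
	out:
		return ret;

	/* after: return the result where it is known */
	err_size:
		put_user(sizeof(*attr), &uattr->size);
		return -E2BIG;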
Signed-off-by: Michael Kerrisk Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/536CEC24.9080201@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2551b6db470e..2318fc484d8b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3657,13 +3657,11 @@ static int sched_copy_attr(struct sched_attr __user *uattr, */ attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); -out: - return ret; + return 0; err_size: put_user(sizeof(*attr), &uattr->size); - ret = -E2BIG; - goto out; + return -E2BIG; } /** -- cgit From 22400674945c562cf3c176d6c4178cd545e2dec9 Mon Sep 17 00:00:00 2001 From: Michael Kerrisk Date: Fri, 9 May 2014 16:54:33 +0200 Subject: sched: Simplify return logic in sched_read_attr() Gotos are chained pointlessly here, and the 'out' label can be dispensed with. Signed-off-by: Michael Kerrisk Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/536CEC29.9090503@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2318fc484d8b..a78c5b62a470 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3821,7 +3821,7 @@ static int sched_read_attr(struct sched_attr __user *uattr, for (; addr < end; addr++) { if (*addr) - goto err_size; + return -EFBIG; } attr->size = usize; @@ -3831,12 +3831,7 @@ static int sched_read_attr(struct sched_attr __user *uattr, if (ret) return -EFAULT; -out: - return ret; - -err_size: - ret = -E2BIG; - goto out; + return 0; } /** -- cgit From 8bf21433f38b020c3d8a3805d1d7fb73d7b40c01 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 14 May 2014 11:40:37 -0400 Subject: sched: Call select_idle_sibling() when not affine_sd On smaller systems, the top level sched domain will be an affine domain, and select_idle_sibling is invoked for every SD_WAKE_AFFINE wakeup. This seems to be working well. On larger systems, with the node distance between far away NUMA nodes being > RECLAIM_DISTANCE, select_idle_sibling is only called if the waker and the wakee are on nodes less than RECLAIM_DISTANCE apart. This patch leaves in place the policy of not pulling the task across nodes on such systems, while fixing the issue that select_idle_sibling is not called at all in certain circumstances. The code will look for an idle CPU in the same CPU package as the CPU where the task ran previously. 
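Schematically, the wakeup path after this change looks as follows (a simplified sketch of select_task_rq_fair(); the actual diff is below):

	if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
		prev_cpu = cpu;			/* pull towards the waker */

	if (sd_flag & SD_BALANCE_WAKE) {
		/* now reached even when affine_sd is NULL, so we still
		 * scan prev_cpu's package for an idle CPU */
		new_cpu = select_idle_sibling(p, prev_cpu);
		goto unlock;
	}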
Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: morten.rasmussen@arm.com Cc: george.mccollister@gmail.com Cc: ktkhai@parallels.com Cc: Mel Gorman Cc: Mike Galbraith Link: http://lkml.kernel.org/r/20140514114037.2d93266f@annuminas.surriel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dd3fa14a2998..429164d117ea 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4473,10 +4473,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f sd = tmp; } - if (affine_sd) { - if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) - prev_cpu = cpu; + if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync)) + prev_cpu = cpu; + if (sd_flag & SD_BALANCE_WAKE) { new_cpu = select_idle_sibling(p, prev_cpu); goto unlock; } -- cgit From c515db8cd311ef77b2dc7cbd6b695022655bb0f3 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 13 May 2014 11:11:01 +0200 Subject: sched/numa: Fix initialization of sched_domain_topology for NUMA Jet Chen has reported kernel panics when booting qemu-system-x86_64 with the kvm64 cpu. A panic occurred while building the sched_domain. In sched_init_numa, we create a new topology table in which both default levels and numa levels are copied. The last row of the table must have a null pointer in the mask field. The current implementation doesn't add this last row in the computation of the table size. So we add 1 row to the allocation size that will be used as the last row of the table. The kzalloc will ensure that the mask field is NULL. Reported-by: Jet Chen Tested-by: Jet Chen Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra Cc: fengguang.wu@intel.com Link: http://lkml.kernel.org/r/1399972261-25693-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a78c5b62a470..45d077ed24fb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6231,7 +6231,7 @@ static void sched_init_numa(void) /* Compute default topology size */ for (i = 0; sched_domain_topology[i].mask; i++); - tl = kzalloc((i + level) * + tl = kzalloc((i + level + 1) * sizeof(struct sched_domain_topology_level), GFP_KERNEL); if (!tl) return; -- cgit From caffcdd8d27ba78730d5540396ce72ad022aff2c Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 30 Apr 2014 14:39:38 +0100 Subject: sched: Do not zero sg->cpumask and sg->sgp->power in build_sched_groups() There is no need to zero struct sched_group member cpumask and struct sched_group_power member power since both structures are already allocated as zeroed memory in __sdt_alloc(). This patch has been tested with BUG_ON(!cpumask_empty(sched_group_cpus(sg))); and BUG_ON(sg->sgp->power); in build_sched_groups() on ARM TC2 and INTEL i5 M520 platforms including CPU hotplug scenarios.
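For reference, a sketch of the allocation side in __sdt_alloc() that already guarantees zeroed memory (paraphrased from the kernel of this era, not quoted from this patch):

	sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
			  GFP_KERNEL, cpu_to_node(j));
	sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
			   GFP_KERNEL, cpu_to_node(j));
	/* kzalloc_node() returns zeroed memory: the group's cpumask is
	 * already empty and sgp->power is already 0 */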
Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1398865178-12577-1-git-send-email-dietmar.eggemann@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 45d077ed24fb..6340c601475d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5794,8 +5794,6 @@ build_sched_groups(struct sched_domain *sd, int cpu) continue; group = get_group(i, sdd, &sg); - cpumask_clear(sched_group_cpus(sg)); - sg->sgp->power = 0; cpumask_setall(sched_group_mask(sg)); for_each_cpu(j, span) { -- cgit From a9467fa3cd2d5bf39e7cb7d0706d29d7ef4df212 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Thu, 8 May 2014 18:35:15 +0900 Subject: sched: Use clamp() and clamp_val() to make sys_nice() more readable Suggested-by: Kees Cook Signed-off-by: Dongsheng Yang Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1399541715-19568-1-git-send-email-yangds.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6340c601475d..f5605b6deea4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3057,17 +3057,10 @@ SYSCALL_DEFINE1(nice, int, increment) * We don't have to worry. Conceptually one call occurs first * and we have a single winner. */ - if (increment < -40) - increment = -40; - if (increment > 40) - increment = 40; - + increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); nice = task_nice(current) + increment; - if (nice < MIN_NICE) - nice = MIN_NICE; - if (nice > MAX_NICE) - nice = MAX_NICE; + nice = clamp_val(nice, MIN_NICE, MAX_NICE); if (increment < 0 && !can_nice(current, nice)) return -EPERM; -- cgit From 52a08ef1f13a11289c9e18cd4cfb4e51c024058b Mon Sep 17 00:00:00 2001 From: Jason Low Date: Thu, 8 May 2014 17:49:22 -0700 Subject: sched: Fix the rq->next_balance logic in rebalance_domains() and idle_balance() Currently, in idle_balance(), we update rq->next_balance when we pull_tasks. However, it is also important to update this in the !pulled_tasks case too. When the CPU is "busy" (the CPU isn't idle), rq->next_balance gets computed using sd->busy_factor (so we increase the balance interval when the CPU is busy). However, when the CPU goes idle, rq->next_balance could still be set to a large value that was computed with the sd->busy_factor. Thus, we need to also update rq->next_balance in idle_balance() in the cases where !pulled_tasks too, so that rq->next_balance gets updated without taking the busy_factor into account when the CPU is about to go idle. This patch makes rq->next_balance get updated independently of whether or not we pulled_task. Also, we add logic to ensure that we always traverse at least 1 of the sched domains to get a proper next_balance value for updating rq->next_balance. Additionally, since load_balance() modifies the sd->balance_interval, we need to re-obtain the sched domain's interval after the call to load_balance() in rebalance_domains() before we update rq->next_balance. This patch adds and uses 2 new helper functions, update_next_balance() and get_sd_balance_interval() to update next_balance and obtain the sched domain's balance_interval. 
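As a worked example of the interval logic (illustrative numbers: a sd->balance_interval of 64 ms and a busy_factor of 32 are assumed):

	interval = get_sd_balance_interval(sd, cpu_busy);
	/*
	 * cpu_busy == 1: 64 ms * 32 = 2048 ms, scaled to jiffies and
	 *                clamped to [1, max_load_balance_interval]
	 * cpu_busy == 0: just 64 ms, scaled and clamped the same way
	 *
	 * so a next_balance computed while busy can lie seconds in the
	 * future, which is why idle_balance() must recompute it with
	 * cpu_busy == 0.
	 */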
Signed-off-by: Jason Low Reviewed-by: Preeti U Murthy Signed-off-by: Peter Zijlstra Cc: daniel.lezcano@linaro.org Cc: alex.shi@linaro.org Cc: efault@gmx.de Cc: vincent.guittot@linaro.org Cc: morten.rasmussen@arm.com Cc: aswin@hp.com Link: http://lkml.kernel.org/r/1399596562.2200.7.camel@j-VirtualBox Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 69 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 46 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 429164d117ea..26ec6686a00b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6672,17 +6672,44 @@ out: return ld_moved; } +static inline unsigned long +get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) +{ + unsigned long interval = sd->balance_interval; + + if (cpu_busy) + interval *= sd->busy_factor; + + /* scale ms to jiffies */ + interval = msecs_to_jiffies(interval); + interval = clamp(interval, 1UL, max_load_balance_interval); + + return interval; +} + +static inline void +update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance) +{ + unsigned long interval, next; + + interval = get_sd_balance_interval(sd, cpu_busy); + next = sd->last_balance + interval; + + if (time_after(*next_balance, next)) + *next_balance = next; +} + /* * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. */ static int idle_balance(struct rq *this_rq) { + unsigned long next_balance = jiffies + HZ; + int this_cpu = this_rq->cpu; struct sched_domain *sd; int pulled_task = 0; - unsigned long next_balance = jiffies + HZ; u64 curr_cost = 0; - int this_cpu = this_rq->cpu; idle_enter_fair(this_rq); @@ -6692,8 +6719,15 @@ static int idle_balance(struct rq *this_rq) */ this_rq->idle_stamp = rq_clock(this_rq); - if (this_rq->avg_idle < sysctl_sched_migration_cost) + if (this_rq->avg_idle < sysctl_sched_migration_cost) { + rcu_read_lock(); + sd = rcu_dereference_check_sched_domain(this_rq->sd); + if (sd) + update_next_balance(sd, 0, &next_balance); + rcu_read_unlock(); + goto out; + } /* * Drop the rq->lock, but keep IRQ/preempt disabled. @@ -6703,15 +6737,16 @@ static int idle_balance(struct rq *this_rq) update_blocked_averages(this_cpu); rcu_read_lock(); for_each_domain(this_cpu, sd) { - unsigned long interval; int continue_balancing = 1; u64 t0, domain_cost; if (!(sd->flags & SD_LOAD_BALANCE)) continue; - if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) + if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { + update_next_balance(sd, 0, &next_balance); break; + } if (sd->flags & SD_BALANCE_NEWIDLE) { t0 = sched_clock_cpu(this_cpu); @@ -6727,9 +6762,7 @@ static int idle_balance(struct rq *this_rq) curr_cost += domain_cost; } - interval = msecs_to_jiffies(sd->balance_interval); - if (time_after(next_balance, sd->last_balance + interval)) - next_balance = sd->last_balance + interval; + update_next_balance(sd, 0, &next_balance); /* * Stop searching for tasks to pull if there are @@ -6753,15 +6786,11 @@ static int idle_balance(struct rq *this_rq) if (this_rq->cfs.h_nr_running && !pulled_task) pulled_task = 1; - if (pulled_task || time_after(jiffies, this_rq->next_balance)) { - /* - * We are going idle. next_balance may be set based on - * a busy processor. So reset next_balance. 
- */ +out: + /* Move the next balance forward */ + if (time_after(this_rq->next_balance, next_balance)) this_rq->next_balance = next_balance; - } -out: /* Is there a task of a high priority class? */ if (this_rq->nr_running != this_rq->cfs.h_nr_running) pulled_task = -1; @@ -7044,16 +7073,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) break; } - interval = sd->balance_interval; - if (idle != CPU_IDLE) - interval *= sd->busy_factor; - - /* scale ms to jiffies */ - interval = msecs_to_jiffies(interval); - interval = clamp(interval, 1UL, max_load_balance_interval); + interval = get_sd_balance_interval(sd, idle != CPU_IDLE); need_serialize = sd->flags & SD_SERIALIZE; - if (need_serialize) { if (!spin_trylock(&balancing)) goto out; @@ -7069,6 +7091,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; } sd->last_balance = jiffies; + interval = get_sd_balance_interval(sd, idle != CPU_IDLE); } if (need_serialize) spin_unlock(&balancing); -- cgit From 72465447867b9de6b5cdea5d10f9781585136270 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 9 May 2014 03:00:14 +0400 Subject: sched, nohz: Change rq->nr_running to always use wrappers Sometimes ->nr_running may cross 2 but an interrupt is not being sent to the rq's cpu. In this case we don't re-enable the timer. Looks like this may be the reason for rare unexpected effects, if nohz is enabled. The patch replaces all places that directly change nr_running and makes add_nr_running() care about crossing the border. Signed-off-by: Kirill Tkhai Acked-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140508225830.2469.97461.stgit@localhost Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 4 ++-- kernel/sched/fair.c | 8 ++++---- kernel/sched/rt.c | 4 ++-- kernel/sched/sched.h | 12 +++++++----- kernel/sched/stop_task.c | 4 ++-- 5 files changed, 17 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 800e99b99075..e0a04ae1e0dd 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -741,7 +741,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_prio(prio)); dl_rq->dl_nr_running++; - inc_nr_running(rq_of_dl_rq(dl_rq)); + add_nr_running(rq_of_dl_rq(dl_rq), 1); inc_dl_deadline(dl_rq, deadline); inc_dl_migration(dl_se, dl_rq); @@ -755,7 +755,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_prio(prio)); WARN_ON(!dl_rq->dl_nr_running); dl_rq->dl_nr_running--; - dec_nr_running(rq_of_dl_rq(dl_rq)); + sub_nr_running(rq_of_dl_rq(dl_rq), 1); dec_dl_deadline(dl_rq, dl_se->deadline); dec_dl_migration(dl_se, dl_rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 26ec6686a00b..f7cac2ba62ea 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3325,7 +3325,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) } if (!se) - rq->nr_running -= task_delta; + sub_nr_running(rq, task_delta); cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq); @@ -3376,7 +3376,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) } if (!se) - rq->nr_running += task_delta; + add_nr_running(rq, task_delta); /* determine whether we need to wake up potentially idle cpu */ if (rq->curr == rq->idle && rq->cfs.nr_running) @@ -3908,7 +3908,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) { update_rq_runnable_avg(rq, rq->nr_running); - inc_nr_running(rq); +
add_nr_running(rq, 1); } hrtick_update(rq); } @@ -3968,7 +3968,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } if (!se) { - dec_nr_running(rq); + sub_nr_running(rq, 1); update_rq_runnable_avg(rq, 1); } hrtick_update(rq); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 7795e292f4c9..0ebfd7a29472 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -973,7 +973,7 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq) BUG_ON(!rq->nr_running); - rq->nr_running -= rt_rq->rt_nr_running; + sub_nr_running(rq, rt_rq->rt_nr_running); rt_rq->rt_queued = 0; } @@ -989,7 +989,7 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq) if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running) return; - rq->nr_running += rt_rq->rt_nr_running; + add_nr_running(rq, rt_rq->rt_nr_running); rt_rq->rt_queued = 1; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b2cbe81308af..600e2291a75c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1206,12 +1206,14 @@ extern void update_idle_cpu_load(struct rq *this_rq); extern void init_task_runnable_average(struct task_struct *p); -static inline void inc_nr_running(struct rq *rq) +static inline void add_nr_running(struct rq *rq, unsigned count) { - rq->nr_running++; + unsigned prev_nr = rq->nr_running; + + rq->nr_running = prev_nr + count; #ifdef CONFIG_NO_HZ_FULL - if (rq->nr_running == 2) { + if (prev_nr < 2 && rq->nr_running >= 2) { if (tick_nohz_full_cpu(rq->cpu)) { /* Order rq->nr_running write against the IPI */ smp_wmb(); @@ -1221,9 +1223,9 @@ static inline void inc_nr_running(struct rq *rq) #endif } -static inline void dec_nr_running(struct rq *rq) +static inline void sub_nr_running(struct rq *rq, unsigned count) { - rq->nr_running--; + rq->nr_running -= count; } static inline void rq_last_tick_reset(struct rq *rq) diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index d6ce65dde541..bfe0edadbfbb 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -41,13 +41,13 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev) static void enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) { - inc_nr_running(rq); + add_nr_running(rq, 1); } static void dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) { - dec_nr_running(rq); + sub_nr_running(rq, 1); } static void yield_task_stop(struct rq *rq) -- cgit From a803f0261bb2bb57aab5542af3174db43b2a3887 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Thu, 8 May 2014 13:47:39 -0500 Subject: sched: Initialize rq->age_stamp on processor start If the sched_clock time starts at a large value, the kernel will spin in sched_avg_update for a long time while rq->age_stamp catches up with rq->clock. The comment in kernel/sched/clock.c says that there is no strict promise that it starts at zero. So initialize rq->age_stamp when a cpu starts up to avoid this. I was seeing long delays on a simulator that didn't start the clock at zero. This might also be an issue on reboots on processors that don't re-initialize the timer to zero on reset, and when using kexec. 
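The spin being avoided, sketched from sched_avg_update() (simplified; the compiler-barrier details are omitted):

	s64 period = sched_avg_period();

	while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
		rq->age_stamp += period;	/* one step per elapsed period */
		rq->rt_avg /= 2;
	}
	/* with age_stamp == 0 and a sched_clock() that starts very large,
	 * this loop has to step through the whole offset before it exits */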
Signed-off-by: Corey Minyard Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1399574859-11714-1-git-send-email-minyard@acm.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f5605b6deea4..da302ca98f60 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5089,10 +5089,20 @@ static struct notifier_block migration_notifier = { .priority = CPU_PRI_MIGRATION, }; +static void __cpuinit set_cpu_rq_start_time(void) +{ + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + rq->age_stamp = sched_clock_cpu(cpu); +} + static int sched_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { switch (action & ~CPU_TASKS_FROZEN) { + case CPU_STARTING: + set_cpu_rq_start_time(); + return NOTIFY_OK; case CPU_DOWN_FAILED: set_cpu_active((long)hcpu, true); return NOTIFY_OK; @@ -6970,6 +6980,7 @@ void __init sched_init(void) if (cpu_isolated_map == NULL) zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); idle_thread_set_boot_cpu(); + set_cpu_rq_start_time(); #endif init_sched_fair_class(); -- cgit From 7aa2c016db2162defff77f6f5731bff3f25e5175 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Thu, 8 May 2014 18:33:49 +0900 Subject: sched: Consolidate open coded implementations of nice level frobbing into nice_to_rlimit() and rlimit_to_nice() Signed-off-by: Dongsheng Yang Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/a568a1e3cc8e78648f41b5035fa5e381d36274da.1399532322.git.yangds.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- drivers/staging/android/binder.c | 2 +- include/linux/sched/prio.h | 16 ++++++++++++++++ kernel/sched/core.c | 2 +- kernel/sys.c | 6 +++--- 4 files changed, 21 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/drivers/staging/android/binder.c b/drivers/staging/android/binder.c index 179b21b66504..9311bb67ec35 100644 --- a/drivers/staging/android/binder.c +++ b/drivers/staging/android/binder.c @@ -436,7 +436,7 @@ static void binder_set_nice(long nice) set_user_nice(current, nice); return; } - min_nice = 20 - current->signal->rlim[RLIMIT_NICE].rlim_cur; + min_nice = rlimit_to_nice(current->signal->rlim[RLIMIT_NICE].rlim_cur); binder_debug(BINDER_DEBUG_PRIORITY_CAP, "%d: nice value %ld not allowed use %ld instead\n", current->pid, nice, min_nice); diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h index ac322583c820..d9cf5a5762d9 100644 --- a/include/linux/sched/prio.h +++ b/include/linux/sched/prio.h @@ -41,4 +41,20 @@ #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) +/* + * Convert nice value [19,-20] to rlimit style value [1,40]. + */ +static inline long nice_to_rlimit(long nice) +{ + return (MAX_NICE - nice + 1); +} + +/* + * Convert rlimit style value [1,40] to nice value [-20, 19]. 
+ */ +static inline long rlimit_to_nice(long prio) +{ + return (MAX_NICE - prio + 1); +} + #endif /* _SCHED_PRIO_H */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index da302ca98f60..321d800e4baa 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3033,7 +3033,7 @@ EXPORT_SYMBOL(set_user_nice); int can_nice(const struct task_struct *p, const int nice) { /* convert nice value [19,-20] to rlimit style value [1,40] */ - int nice_rlim = 20 - nice; + int nice_rlim = nice_to_rlimit(nice); return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || capable(CAP_SYS_NICE)); diff --git a/kernel/sys.c b/kernel/sys.c index fba0f29401ea..66a751ebf9d9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -250,7 +250,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) else p = current; if (p) { - niceval = 20 - task_nice(p); + niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } @@ -261,7 +261,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) else pgrp = task_pgrp(current); do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { - niceval = 20 - task_nice(p); + niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); @@ -277,7 +277,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) do_each_thread(g, p) { if (uid_eq(task_uid(p), uid)) { - niceval = 20 - task_nice(p); + niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } -- cgit From 4027d080854d1be96ef134a1c3024d5276114db6 Mon Sep 17 00:00:00 2001 From: "xiaofeng.yan" Date: Fri, 9 May 2014 03:21:27 +0000 Subject: sched/rt: Fix 'struct sched_dl_entity' and dl_task_time() comments, to match the current upstream code Signed-off-by: xiaofeng.yan Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1399605687-18094-1-git-send-email-xiaofeng.yan@huawei.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++-- kernel/sched/deadline.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 725eef121c9f..0f91d00efd87 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1175,8 +1175,8 @@ struct sched_dl_entity { /* * Original scheduling parameters. Copied here from sched_attr - * during sched_setscheduler2(), they will remain the same until - * the next sched_setscheduler2(). + * during sched_setattr(), they will remain the same until + * the next sched_setattr(). */ u64 dl_runtime; /* maximum runtime for each instance */ u64 dl_deadline; /* relative deadline of each instance */ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index e0a04ae1e0dd..f9ca7d19781a 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -520,7 +520,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * We need to take care of a possible races here. In fact, the * task might have changed its scheduling policy to something * different from SCHED_DEADLINE or changed its reservation - * parameters (through sched_setscheduler()). + * parameters (through sched_setattr()). */ if (!dl_task(p) || dl_se->dl_new) goto unlock; -- cgit From e63da03639cc9e6e83b62e7ef8ffdbb92421416a Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 14 May 2014 13:22:21 -0400 Subject: sched/numa: Allow task switch if load imbalance improves Currently the NUMA balancing code only allows moving tasks between NUMA nodes when the load on both nodes is in balance. This breaks down when the load was imbalanced to begin with. 
Allow tasks to be moved between NUMA nodes if the imbalance is small, or if the new imbalance is smaller than the original one. Suggested-by: Peter Zijlstra Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: mgorman@suse.de Cc: chegu_vinod@hp.com Signed-off-by: Ingo Molnar Link: http://lkml.kernel.org/r/20140514132221.274b3463@annuminas.surriel.com --- kernel/sched/fair.c | 46 ++++++++++++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f7cac2ba62ea..b899613f2bc6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1095,6 +1095,34 @@ static void task_numa_assign(struct task_numa_env *env, env->best_cpu = env->dst_cpu; } +static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, + long src_load, long dst_load, + struct task_numa_env *env) +{ + long imb, old_imb; + + /* We care about the slope of the imbalance, not the direction. */ + if (dst_load < src_load) + swap(dst_load, src_load); + + /* Is the difference below the threshold? */ + imb = dst_load * 100 - src_load * env->imbalance_pct; + if (imb <= 0) + return false; + + /* + * The imbalance is above the allowed threshold. + * Compare it with the old imbalance. + */ + if (orig_dst_load < orig_src_load) + swap(orig_dst_load, orig_src_load); + + old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct; + + /* Would this change make things worse? */ + return (old_imb > imb); +} + /* * This checks if the overall compute and NUMA accesses of the system would * be improved if the source tasks was migrated to the target dst_cpu taking @@ -1107,7 +1135,8 @@ static void task_numa_compare(struct task_numa_env *env, struct rq *src_rq = cpu_rq(env->src_cpu); struct rq *dst_rq = cpu_rq(env->dst_cpu); struct task_struct *cur; - long dst_load, src_load; + long orig_src_load, src_load; + long orig_dst_load, dst_load; long load; long imp = (groupimp > 0) ? groupimp : taskimp; @@ -1181,13 +1210,13 @@ static void task_numa_compare(struct task_numa_env *env, * In the overloaded case, try and keep the load balanced. */ balance: - dst_load = env->dst_stats.load; - src_load = env->src_stats.load; + orig_dst_load = env->dst_stats.load; + orig_src_load = env->src_stats.load; /* XXX missing power terms */ load = task_h_load(env->p); - dst_load += load; - src_load -= load; + dst_load = orig_dst_load + load; + src_load = orig_src_load - load; if (cur) { load = task_h_load(cur); @@ -1195,11 +1224,8 @@ balance: src_load += load; } - /* make src_load the smaller */ - if (dst_load < src_load) - swap(dst_load, src_load); - - if (src_load * env->imbalance_pct < dst_load * 100) + if (load_too_imbalanced(orig_src_load, orig_dst_load, + src_load, dst_load, env)) goto unlock; assign: -- cgit From b1ad065e65f56103db8b97edbd218a271ff5b1bb Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 15 May 2014 13:03:06 -0400 Subject: sched/numa: Update migrate_improves/degrades_locality() Update the migrate_improves/degrades_locality() functions with knowledge of pseudo-interleaving. Do not consider moving tasks around within the set of a group's active nodes as improving or degrading locality. Instead, leave the load balancer free to balance the load between a numa_group's active nodes. Also, switch from the group/task_weight functions to the group/task_fault functions. The "weight" functions involve a division, but both calls use the same divisor, so there's no point in doing that from these functions.
On a 4 node (x10 core) system, performance of SPECjbb2005 seems unaffected, though the number of migrations with 2 8-warehouse wide instances seems to have almost halved, due to the scheduler running each instance on a single node. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: mgorman@suse.de Cc: chegu_vinod@hp.com Link: http://lkml.kernel.org/r/20140515130306.61aae7db@cuia.bos.redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 42 +++++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b899613f2bc6..503f750c2d25 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5123,6 +5123,7 @@ task_hot(struct task_struct *p, u64 now) /* Returns true if the destination node has incurred more faults */ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) { + struct numa_group *numa_group = rcu_dereference(p->numa_group); int src_nid, dst_nid; if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || @@ -5136,21 +5137,29 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) if (src_nid == dst_nid) return false; - /* Always encourage migration to the preferred node. */ - if (dst_nid == p->numa_preferred_nid) - return true; + if (numa_group) { + /* Task is already in the group's interleave set. */ + if (node_isset(src_nid, numa_group->active_nodes)) + return false; + + /* Task is moving into the group's interleave set. */ + if (node_isset(dst_nid, numa_group->active_nodes)) + return true; - /* If both task and group weight improve, this move is a winner. */ - if (task_weight(p, dst_nid) > task_weight(p, src_nid) && - group_weight(p, dst_nid) > group_weight(p, src_nid)) + return group_faults(p, dst_nid) > group_faults(p, src_nid); + } + + /* Encourage migration to the preferred node. */ + if (dst_nid == p->numa_preferred_nid) return true; - return false; + return task_faults(p, dst_nid) > task_faults(p, src_nid); } static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { + struct numa_group *numa_group = rcu_dereference(p->numa_group); int src_nid, dst_nid; if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) @@ -5165,16 +5174,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) if (src_nid == dst_nid) return false; + if (numa_group) { + /* Task is moving within/into the group's interleave set. */ + if (node_isset(dst_nid, numa_group->active_nodes)) + return false; + + /* Task is moving out of the group's interleave set. */ + if (node_isset(src_nid, numa_group->active_nodes)) + return true; + + return group_faults(p, dst_nid) < group_faults(p, src_nid); + } + /* Migrating away from the preferred node is always bad. */ if (src_nid == p->numa_preferred_nid) return true; - /* If either task or group weight get worse, don't do it. */ - if (task_weight(p, dst_nid) < task_weight(p, src_nid) || - group_weight(p, dst_nid) < group_weight(p, src_nid)) - return true; - - return false; + return task_faults(p, dst_nid) < task_faults(p, src_nid); } #else -- cgit From 096aa33863a5e48de52d2ff30e0801b7487944f4 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 16 May 2014 00:13:32 -0400 Subject: sched/numa: Decay ->wakee_flips instead of zeroing Affine wakeups have the potential to interfere with NUMA placement. If a task wakes up too many other tasks, affine wakeups will get disabled. 
However, regardless of how many other tasks it wakes up, it gets re-enabled once a second, potentially interfering with NUMA placement of other tasks. By decaying wakee_flips in half instead of zeroing it, we can avoid that problem for some workloads. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: chegu_vinod@hp.com Cc: umgwanakikbuti@gmail.com Link: http://lkml.kernel.org/r/20140516001332.67f91af2@annuminas.surriel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 503f750c2d25..c9617b73bcc0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4065,7 +4065,7 @@ static void record_wakee(struct task_struct *p) * about the loss. */ if (jiffies > current->wakee_flip_decay_ts + HZ) { - current->wakee_flips = 0; + current->wakee_flips >>= 1; current->wakee_flip_decay_ts = jiffies; } -- cgit From 1037de36edae30f1ddc0a2532decd50f92ac4901 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 22 May 2014 16:44:07 +0800 Subject: workqueue: rename first_worker() to first_idle_worker() first_worker() actually returns the first idle worker, so the name first_idle_worker(), which is self-commenting, is better. All the callers of first_worker() expect it to return an idle worker; the name first_idle_worker(), with the "idle" notation, makes reviewers happier. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6603923b7ede..dd107b83f45f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -773,8 +773,8 @@ static bool too_many_workers(struct worker_pool *pool) * Wake up functions. */ -/* Return the first worker. Safe with preemption disabled */ -static struct worker *first_worker(struct worker_pool *pool) +/* Return the first idle worker. Safe with preemption disabled */ +static struct worker *first_idle_worker(struct worker_pool *pool) { if (unlikely(list_empty(&pool->idle_list))) return NULL; @@ -793,7 +793,7 @@ static struct worker *first_worker(struct worker_pool *pool) */ static void wake_up_worker(struct worker_pool *pool) { - struct worker *worker = first_worker(pool); + struct worker *worker = first_idle_worker(pool); if (likely(worker)) wake_up_process(worker->task); @@ -867,7 +867,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) */ if (atomic_dec_and_test(&pool->nr_running) && !list_empty(&pool->worklist)) - to_wakeup = first_worker(pool); + to_wakeup = first_idle_worker(pool); return to_wakeup ? to_wakeup->task : NULL; } @@ -3475,7 +3475,7 @@ static void put_unbound_pool(struct worker_pool *pool) mutex_lock(&pool->manager_arb); spin_lock_irq(&pool->lock); - while ((worker = first_worker(pool))) + while ((worker = first_idle_worker(pool))) destroy_worker(worker); WARN_ON(pool->nr_workers || pool->nr_idle); spin_unlock_irq(&pool->lock); -- cgit From 74b414ead1133972817d3ce7b934356150d03a7d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 22 May 2014 19:01:16 +0800 Subject: workqueue: remove the confusing POOL_FREEZING Currently, the global freezing state is propagated to worker_pools via POOL_FREEZING and then to each workqueue; however, the middle step - propagation through worker_pools - can be skipped as long as one or more max_active adjustments happen for each workqueue after the update to the global state is visible.
The global workqueue freezing state and the max_active adjustments during workqueue creation and [un]freezing are serialized with wq_pool_mutex, so it's trivial to guarantee that max_actives stay in sync with global freezing state. POOL_FREEZING is unnecessary and makes the code more confusing and complicates freeze_workqueues_begin() and thaw_workqueues() by requiring them to walk through all pools. Remove POOL_FREEZING and use workqueue_freezing directly instead. tj: Description and comment updates. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 32 +++++++------------------------- 1 file changed, 7 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index dd107b83f45f..bc3c18892b7d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -69,7 +69,6 @@ enum { * worker_attach_to_pool() is in progress. */ POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ - POOL_FREEZING = 1 << 3, /* freeze in progress */ /* worker flags */ WORKER_DIE = 1 << 1, /* die die die */ @@ -3533,9 +3532,6 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) if (!pool || init_worker_pool(pool) < 0) goto fail; - if (workqueue_freezing) - pool->flags |= POOL_FREEZING; - lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ copy_workqueue_attrs(pool->attrs, attrs); @@ -3642,7 +3638,12 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) spin_lock_irq(&pwq->pool->lock); - if (!freezable || !(pwq->pool->flags & POOL_FREEZING)) { + /* + * During [un]freezing, the caller is responsible for ensuring that + * this function is called at least once after @workqueue_freezing + * is updated and visible. + */ + if (!freezable || !workqueue_freezing) { pwq->max_active = wq->saved_max_active; while (!list_empty(&pwq->delayed_works) && @@ -4751,24 +4752,14 @@ EXPORT_SYMBOL_GPL(work_on_cpu); */ void freeze_workqueues_begin(void) { - struct worker_pool *pool; struct workqueue_struct *wq; struct pool_workqueue *pwq; - int pi; mutex_lock(&wq_pool_mutex); WARN_ON_ONCE(workqueue_freezing); workqueue_freezing = true; - /* set FREEZING */ - for_each_pool(pool, pi) { - spin_lock_irq(&pool->lock); - WARN_ON_ONCE(pool->flags & POOL_FREEZING); - pool->flags |= POOL_FREEZING; - spin_unlock_irq(&pool->lock); - } - list_for_each_entry(wq, &workqueues, list) { mutex_lock(&wq->mutex); for_each_pwq(pwq, wq) @@ -4838,21 +4829,13 @@ void thaw_workqueues(void) { struct workqueue_struct *wq; struct pool_workqueue *pwq; - struct worker_pool *pool; - int pi; mutex_lock(&wq_pool_mutex); if (!workqueue_freezing) goto out_unlock; - /* clear FREEZING */ - for_each_pool(pool, pi) { - spin_lock_irq(&pool->lock); - WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); - pool->flags &= ~POOL_FREEZING; - spin_unlock_irq(&pool->lock); - } + workqueue_freezing = false; /* restore max_active and repopulate worklist */ list_for_each_entry(wq, &workqueues, list) { @@ -4862,7 +4845,6 @@ void thaw_workqueues(void) mutex_unlock(&wq->mutex); } - workqueue_freezing = false; out_unlock: mutex_unlock(&wq_pool_mutex); } -- cgit From e4c729664339e4be352d4c7434a5c6184285148d Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 14 Apr 2014 15:38:11 -0600 Subject: resources: Clarify sanity check message The resource map sanity check message is a bit confusing. 
Change it to be more readable: -resource map sanity check conflict: 0xfed10000 0xfed15fff 0xfed10000 0xfed13fff pnp 00:01 +resource sanity check: requesting [mem 0xfed10000-0xfed15fff], which spans more than pnp 00:01 [mem 0xfed10000-0xfed13fff] Signed-off-by: Bjorn Helgaas --- kernel/resource.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 8957d686e29b..3c2237ac32db 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1288,13 +1288,10 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size) if (p->flags & IORESOURCE_BUSY) continue; - printk(KERN_WARNING "resource map sanity check conflict: " - "0x%llx 0x%llx 0x%llx 0x%llx %s\n", + printk(KERN_WARNING "resource sanity check: requesting [mem %#010llx-%#010llx], which spans more than %s %pR\n", (unsigned long long)addr, (unsigned long long)(addr + size - 1), - (unsigned long long)p->start, - (unsigned long long)p->end, - p->name); + p->name, p); err = -1; break; } -- cgit From 72e6ae285a1dbff553734985bedadf409d99c02d Mon Sep 17 00:00:00 2001 From: Victor Kamensky Date: Tue, 29 Apr 2014 04:20:52 +0100 Subject: ARM: 8043/1: uprobes need icache flush after xol write After an instruction is written into the xol area, on the ARM v7 architecture the code needs to flush both the dcache and the icache to sync them up for the given set of addresses. A bare 'flush_dcache_page(page)' call is not enough - a stale instruction can still sit in the icache for the given xol area slot address. Introduce the weak function arch_uprobe_copy_ixol, which by default calls the uprobes copy_to_page function and then flush_dcache_page, and define an ARM-specific version that handles the xol slot copy in the ARM-specific way. The flush_uprobe_xol_access function reuses the implementation of flush_ptrace_access and takes care of writing instructions to user land address space on the variety of different cache types on ARM CPUs. Because flush_uprobe_xol_access does not have a vma around, flush_ptrace_access was split into two parts: one that retrieves the set of conditions from the vma, and a common one that receives those conditions as flags. Note that the ARM cache flush functions need the kernel address through which the instruction write happened, so instead of using the uprobes copy_to_page function the code now explicitly maps the page and does the memcpy. Note that the arch_uprobe_copy_ixol function, in a similar way to copy_to_user_page, uses preempt_disable/preempt_enable. Signed-off-by: Victor Kamensky Acked-by: Oleg Nesterov Reviewed-by: David A.
Long Signed-off-by: Russell King --- arch/arm/include/asm/cacheflush.h | 2 ++ arch/arm/kernel/uprobes.c | 20 ++++++++++++++++++++ arch/arm/mm/flush.c | 33 ++++++++++++++++++++++++++++----- include/linux/uprobes.h | 3 +++ kernel/events/uprobes.c | 25 +++++++++++++++++-------- 5 files changed, 70 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index 00af9fe435e6..fd43f7f55b70 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -487,4 +487,6 @@ int set_memory_rw(unsigned long addr, int numpages); int set_memory_x(unsigned long addr, int numpages); int set_memory_nx(unsigned long addr, int numpages); +void flush_uprobe_xol_access(struct page *page, unsigned long uaddr, + void *kaddr, unsigned long len); #endif diff --git a/arch/arm/kernel/uprobes.c b/arch/arm/kernel/uprobes.c index f9bacee973bf..56adf9c1fde0 100644 --- a/arch/arm/kernel/uprobes.c +++ b/arch/arm/kernel/uprobes.c @@ -113,6 +113,26 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, return 0; } +void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, + void *src, unsigned long len) +{ + void *xol_page_kaddr = kmap_atomic(page); + void *dst = xol_page_kaddr + (vaddr & ~PAGE_MASK); + + preempt_disable(); + + /* Initialize the slot */ + memcpy(dst, src, len); + + /* flush caches (dcache/icache) */ + flush_uprobe_xol_access(page, vaddr, dst, len); + + preempt_enable(); + + kunmap_atomic(xol_page_kaddr); +} + + int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { struct uprobe_task *utask = current->utask; diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c index 3387e60e4ea3..43d54f5b26b9 100644 --- a/arch/arm/mm/flush.c +++ b/arch/arm/mm/flush.c @@ -104,17 +104,20 @@ void flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr, unsig #define flush_icache_alias(pfn,vaddr,len) do { } while (0) #endif +#define FLAG_PA_IS_EXEC 1 +#define FLAG_PA_CORE_IN_MM 2 + static void flush_ptrace_access_other(void *args) { __flush_icache_all(); } -static -void flush_ptrace_access(struct vm_area_struct *vma, struct page *page, - unsigned long uaddr, void *kaddr, unsigned long len) +static inline +void __flush_ptrace_access(struct page *page, unsigned long uaddr, void *kaddr, + unsigned long len, unsigned int flags) { if (cache_is_vivt()) { - if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm))) { + if (flags & FLAG_PA_CORE_IN_MM) { unsigned long addr = (unsigned long)kaddr; __cpuc_coherent_kern_range(addr, addr + len); } @@ -128,7 +131,7 @@ void flush_ptrace_access(struct vm_area_struct *vma, struct page *page, } /* VIPT non-aliasing D-cache */ - if (vma->vm_flags & VM_EXEC) { + if (flags & FLAG_PA_IS_EXEC) { unsigned long addr = (unsigned long)kaddr; if (icache_is_vipt_aliasing()) flush_icache_alias(page_to_pfn(page), uaddr, len); @@ -140,6 +143,26 @@ void flush_ptrace_access(struct vm_area_struct *vma, struct page *page, } } +static +void flush_ptrace_access(struct vm_area_struct *vma, struct page *page, + unsigned long uaddr, void *kaddr, unsigned long len) +{ + unsigned int flags = 0; + if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm))) + flags |= FLAG_PA_CORE_IN_MM; + if (vma->vm_flags & VM_EXEC) + flags |= FLAG_PA_IS_EXEC; + __flush_ptrace_access(page, uaddr, kaddr, len, flags); +} + +void flush_uprobe_xol_access(struct page *page, unsigned long uaddr, + void *kaddr, unsigned long len) +{ + unsigned int flags = 
FLAG_PA_CORE_IN_MM|FLAG_PA_IS_EXEC; + + __flush_ptrace_access(page, uaddr, kaddr, len, flags); +} + /* * Copy user data from/to a page which is mapped into a different * processes address space. Really, we want to allow our "user diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index edff2b97b864..c52f827ba6ce 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -32,6 +32,7 @@ struct vm_area_struct; struct mm_struct; struct inode; struct notifier_block; +struct page; #define UPROBE_HANDLER_REMOVE 1 #define UPROBE_HANDLER_MASK 1 @@ -127,6 +128,8 @@ extern int arch_uprobe_exception_notify(struct notifier_block *self, unsigned l extern void arch_uprobe_abort_xol(struct arch_uprobe *aup, struct pt_regs *regs); extern unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs); extern bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs); +extern void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, + void *src, unsigned long len); #else /* !CONFIG_UPROBES */ struct uprobes_state { }; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 04709b66369d..4968213c63fa 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1296,14 +1296,8 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe) if (unlikely(!xol_vaddr)) return 0; - /* Initialize the slot */ - copy_to_page(area->page, xol_vaddr, - &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); - /* - * We probably need flush_icache_user_range() but it needs vma. - * This should work on supported architectures too. - */ - flush_dcache_page(area->page); + arch_uprobe_copy_ixol(area->page, xol_vaddr, + &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); return xol_vaddr; } @@ -1346,6 +1340,21 @@ static void xol_free_insn_slot(struct task_struct *tsk) } } +void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, + void *src, unsigned long len) +{ + /* Initialize the slot */ + copy_to_page(page, vaddr, src, len); + + /* + * We probably need flush_icache_user_range() but it needs vma. + * This should work on most of architectures by default. If + * architecture needs to do something different it can define + * its own version of the function. + */ + flush_dcache_page(page); +} + /** * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs * @regs: Reflects the saved state of the task after it has hit a breakpoint -- cgit From 27ddcc6596e50cb8f03d2e83248897667811d8f6 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 26 May 2014 13:40:47 +0200 Subject: PM / sleep: Add state field to pm_states[] entries To allow sleep states corresponding to the "mem", "standby" and "freeze" labels to be different from the pm_states[] indexes of those strings, introduce struct pm_sleep_state, consisting of a string label and a state number, and turn pm_states[] into an array of objects of that type. This modification should not lead to any functional changes. Signed-off-by: Rafael J.
Wysocki --- kernel/power/main.c | 16 ++++++++-------- kernel/power/power.h | 7 ++++++- kernel/power/suspend.c | 12 ++++++------ kernel/power/suspend_test.c | 22 ++++++++++------------ 4 files changed, 30 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 6271bc4073ef..8e818432253c 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -293,12 +293,12 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, { char *s = buf; #ifdef CONFIG_SUSPEND - int i; + suspend_state_t i; + + for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) + if (valid_state(i)) + s += sprintf(s,"%s ", pm_states[i].label); - for (i = 0; i < PM_SUSPEND_MAX; i++) { - if (pm_states[i] && valid_state(i)) - s += sprintf(s,"%s ", pm_states[i]); - } #endif #ifdef CONFIG_HIBERNATION s += sprintf(s, "%s\n", "disk"); @@ -314,7 +314,7 @@ static suspend_state_t decode_state(const char *buf, size_t n) { #ifdef CONFIG_SUSPEND suspend_state_t state = PM_SUSPEND_MIN; - const char * const *s; + struct pm_sleep_state *s; #endif char *p; int len; @@ -328,7 +328,7 @@ static suspend_state_t decode_state(const char *buf, size_t n) #ifdef CONFIG_SUSPEND for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) - if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) + if (len == strlen(s->label) && !strncmp(buf, s->label, len)) return state; #endif @@ -448,7 +448,7 @@ static ssize_t autosleep_show(struct kobject *kobj, #ifdef CONFIG_SUSPEND if (state < PM_SUSPEND_MAX) return sprintf(buf, "%s\n", valid_state(state) ? - pm_states[state] : "error"); + pm_states[state].label : "error"); #endif #ifdef CONFIG_HIBERNATION return sprintf(buf, "disk\n"); diff --git a/kernel/power/power.h b/kernel/power/power.h index 15f37ea08719..99539c5da844 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -178,8 +178,13 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *, unsigned int, char *); #ifdef CONFIG_SUSPEND +struct pm_sleep_state { + const char *label; + suspend_state_t state; +}; + /* kernel/power/suspend.c */ -extern const char *const pm_states[]; +extern struct pm_sleep_state pm_states[]; extern bool valid_state(suspend_state_t state); extern int suspend_devices_and_enter(suspend_state_t state); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 155721f7f909..5d93b138a2d4 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -31,10 +31,10 @@ #include "power.h" -const char *const pm_states[PM_SUSPEND_MAX] = { - [PM_SUSPEND_FREEZE] = "freeze", - [PM_SUSPEND_STANDBY] = "standby", - [PM_SUSPEND_MEM] = "mem", +struct pm_sleep_state pm_states[PM_SUSPEND_MAX] = { + [PM_SUSPEND_FREEZE] = { "freeze", PM_SUSPEND_FREEZE }, + [PM_SUSPEND_STANDBY] = { "standby", PM_SUSPEND_STANDBY }, + [PM_SUSPEND_MEM] = { "mem", PM_SUSPEND_MEM }, }; static const struct platform_suspend_ops *suspend_ops; @@ -343,7 +343,7 @@ static int enter_state(suspend_state_t state) sys_sync(); printk("done.\n"); - pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); + pr_debug("PM: Preparing system for %s sleep\n", pm_states[state].label); error = suspend_prepare(state); if (error) goto Unlock; @@ -351,7 +351,7 @@ static int enter_state(suspend_state_t state) if (suspend_test(TEST_FREEZER)) goto Finish; - pr_debug("PM: Entering %s sleep\n", pm_states[state]); + pr_debug("PM: Entering %s sleep\n", pm_states[state].label); pm_restrict_gfp_mask(); error = suspend_devices_and_enter(state); pm_restore_gfp_mask(); diff --git 
a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 9b2a1d58558d..d4e3ab167a73 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -92,13 +92,13 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) } if (state == PM_SUSPEND_MEM) { - printk(info_test, pm_states[state]); + printk(info_test, pm_states[state].label); status = pm_suspend(state); if (status == -ENODEV) state = PM_SUSPEND_STANDBY; } if (state == PM_SUSPEND_STANDBY) { - printk(info_test, pm_states[state]); + printk(info_test, pm_states[state].label); status = pm_suspend(state); } if (status < 0) @@ -136,18 +136,16 @@ static char warn_bad_state[] __initdata = static int __init setup_test_suspend(char *value) { - unsigned i; + suspend_state_t i; /* "=mem" ==> "mem" */ value++; - for (i = 0; i < PM_SUSPEND_MAX; i++) { - if (!pm_states[i]) - continue; - if (strcmp(pm_states[i], value) != 0) - continue; - test_state = (__force suspend_state_t) i; - return 0; - } + for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) + if (!strcmp(pm_states[i].label, value)) { + test_state = pm_states[i].state; + return 0; + } + printk(warn_bad_state, value); return 0; } @@ -165,7 +163,7 @@ static int __init test_suspend(void) if (test_state == PM_SUSPEND_ON) goto done; if (!valid_state(test_state)) { - printk(warn_bad_state, pm_states[test_state]); + printk(warn_bad_state, pm_states[test_state].label); goto done; } -- cgit From 43e8317b0bba1d6eb85f38a4a233d82d7c20d732 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 26 May 2014 13:40:53 +0200 Subject: PM / sleep: Use valid_state() for platform-dependent sleep states only Use the observation that, for platform-dependent sleep states (PM_SUSPEND_STANDBY, PM_SUSPEND_MEM), a given state is either always supported or always unsupported and store that information in pm_states[] instead of calling valid_state() every time we need to check it. Also do not use valid_state() for PM_SUSPEND_FREEZE, which is always valid, and move the pm_test_level validity check for PM_SUSPEND_FREEZE directly into enter_state(). Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 9 ++++--- kernel/power/power.h | 2 -- kernel/power/suspend.c | 60 ++++++++++++++++++++++----------------------- kernel/power/suspend_test.c | 2 +- 4 files changed, 36 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 8e818432253c..9f51f0ab3d86 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -296,7 +296,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, suspend_state_t i; for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) - if (valid_state(i)) + if (pm_states[i].state) s += sprintf(s,"%s ", pm_states[i].label); #endif @@ -328,8 +328,9 @@ static suspend_state_t decode_state(const char *buf, size_t n) #ifdef CONFIG_SUSPEND for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) - if (len == strlen(s->label) && !strncmp(buf, s->label, len)) - return state; + if (s->state && len == strlen(s->label) + && !strncmp(buf, s->label, len)) + return s->state; #endif return PM_SUSPEND_ON; @@ -447,7 +448,7 @@ static ssize_t autosleep_show(struct kobject *kobj, #ifdef CONFIG_SUSPEND if (state < PM_SUSPEND_MAX) - return sprintf(buf, "%s\n", valid_state(state) ? + return sprintf(buf, "%s\n", pm_states[state].state ? 
pm_states[state].label : "error"); #endif #ifdef CONFIG_HIBERNATION diff --git a/kernel/power/power.h b/kernel/power/power.h index 99539c5da844..c60f13b5270a 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -186,14 +186,12 @@ struct pm_sleep_state { /* kernel/power/suspend.c */ extern struct pm_sleep_state pm_states[]; -extern bool valid_state(suspend_state_t state); extern int suspend_devices_and_enter(suspend_state_t state); #else /* !CONFIG_SUSPEND */ static inline int suspend_devices_and_enter(suspend_state_t state) { return -ENOSYS; } -static inline bool valid_state(suspend_state_t state) { return false; } #endif /* !CONFIG_SUSPEND */ #ifdef CONFIG_PM_TEST_SUSPEND diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 5d93b138a2d4..00aca60904b0 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -32,9 +32,9 @@ #include "power.h" struct pm_sleep_state pm_states[PM_SUSPEND_MAX] = { - [PM_SUSPEND_FREEZE] = { "freeze", PM_SUSPEND_FREEZE }, - [PM_SUSPEND_STANDBY] = { "standby", PM_SUSPEND_STANDBY }, - [PM_SUSPEND_MEM] = { "mem", PM_SUSPEND_MEM }, + [PM_SUSPEND_FREEZE] = { .label = "freeze", .state = PM_SUSPEND_FREEZE }, + [PM_SUSPEND_STANDBY] = { .label = "standby", }, + [PM_SUSPEND_MEM] = { .label = "mem", }, }; static const struct platform_suspend_ops *suspend_ops; @@ -68,42 +68,34 @@ void freeze_wake(void) } EXPORT_SYMBOL_GPL(freeze_wake); +static bool valid_state(suspend_state_t state) +{ + /* + * PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states need low level + * support and need to be valid to the low level + * implementation, no valid callback implies that none are valid. + */ + return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); +} + /** * suspend_set_ops - Set the global suspend method table. * @ops: Suspend operations to use. */ void suspend_set_ops(const struct platform_suspend_ops *ops) { + suspend_state_t i; + lock_system_sleep(); + suspend_ops = ops; + for (i = PM_SUSPEND_STANDBY; i <= PM_SUSPEND_MEM; i++) + pm_states[i].state = valid_state(i) ? i : 0; + unlock_system_sleep(); } EXPORT_SYMBOL_GPL(suspend_set_ops); -bool valid_state(suspend_state_t state) -{ - if (state == PM_SUSPEND_FREEZE) { -#ifdef CONFIG_PM_DEBUG - if (pm_test_level != TEST_NONE && - pm_test_level != TEST_FREEZER && - pm_test_level != TEST_DEVICES && - pm_test_level != TEST_PLATFORM) { - printk(KERN_WARNING "Unsupported pm_test mode for " - "freeze state, please choose " - "none/freezer/devices/platform.\n"); - return false; - } -#endif - return true; - } - /* - * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel - * support and need to be valid to the lowlevel - * implementation, no valid callback implies that none are valid. - */ - return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); -} - /** * suspend_valid_only_mem - Generic memory-only valid callback. 
* @@ -330,9 +322,17 @@ static int enter_state(suspend_state_t state) { int error; - if (!valid_state(state)) - return -ENODEV; - + if (state == PM_SUSPEND_FREEZE) { +#ifdef CONFIG_PM_DEBUG + if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) { + pr_warning("PM: Unsupported test mode for freeze state," + "please choose none/freezer/devices/platform.\n"); + return -EAGAIN; + } +#endif + } else if (!valid_state(state)) { + return -EINVAL; + } if (!mutex_trylock(&pm_mutex)) return -EBUSY; diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index d4e3ab167a73..269b097e78ea 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -162,7 +162,7 @@ static int __init test_suspend(void) /* PM is initialized by now; is that state testable? */ if (test_state == PM_SUSPEND_ON) goto done; - if (!valid_state(test_state)) { + if (!pm_states[test_state].state) { printk(warn_bad_state, pm_states[test_state].label); goto done; } -- cgit From 0399d4db3edf5c58b6ec7f672f089f5085e49ed5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 26 May 2014 13:40:59 +0200 Subject: PM / sleep: Introduce command line argument for sleep state enumeration On some systems the platform supports neither PM_SUSPEND_MEM nor PM_SUSPEND_STANDBY, so PM_SUSPEND_FREEZE is the only available system sleep state. However, some user space frameworks only use the "mem" and (sometimes) "standby" sleep state labels, so the users of those systems need to modify user space in order to be able to use system suspend at all and that is not always possible. For this reason, add a new kernel command line argument, relative_sleep_states, allowing the users of those systems to change the way in which the kernel assigns labels to system sleep states. Namely, for relative_sleep_states=1, the "mem", "standby" and "freeze" labels will enumerate the available system sleep states from the deepest to the shallowest, respectively, so that "mem" is always present in /sys/power/state and the other state strings may or may not be present depending on what is supported by the platform. Update system sleep states documentation to reflect this change. Signed-off-by: Rafael J. Wysocki --- Documentation/ABI/testing/sysfs-power | 29 ++++++++---- Documentation/kernel-parameters.txt | 7 +++ Documentation/power/states.txt | 87 ++++++++++++++++++++++------------- kernel/power/main.c | 12 ++--- kernel/power/suspend.c | 32 ++++++++++++- 5 files changed, 119 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power index 64c9276e9421..f4551816329e 100644 --- a/Documentation/ABI/testing/sysfs-power +++ b/Documentation/ABI/testing/sysfs-power @@ -7,19 +7,30 @@ Description: subsystem. What: /sys/power/state -Date: August 2006 +Date: May 2014 Contact: Rafael J. Wysocki Description: - The /sys/power/state file controls the system power state. - Reading from this file returns what states are supported, - which is hard-coded to 'freeze' (Low-Power Idle), 'standby' - (Power-On Suspend), 'mem' (Suspend-to-RAM), and 'disk' - (Suspend-to-Disk). + The /sys/power/state file controls system sleep states. + Reading from this file returns the available sleep state + labels, which may be "mem", "standby", "freeze" and "disk" + (hibernation).
The meanings of the first three labels depend on + the relative_sleep_states command line argument as follows: + 1) relative_sleep_states = 1 + "mem", "standby", "freeze" represent non-hibernation sleep + states from the deepest ("mem", always present) to the + shallowest ("freeze"). "standby" and "freeze" may or may + not be present depending on the capabilities of the + platform. "freeze" can only be present if "standby" is + present. + 2) relative_sleep_states = 0 (default) + "mem" - "suspend-to-RAM", present if supported. + "standby" - "power-on suspend", present if supported. + "freeze" - "suspend-to-idle", always present. Writing to this file one of these strings causes the system to - transition into that state. Please see the file - Documentation/power/states.txt for a description of each of - these states. + transition into the corresponding state, if available. See + Documentation/power/states.txt for a description of what + "suspend-to-RAM", "power-on suspend" and "suspend-to-idle" mean. What: /sys/power/disk Date: September 2006 diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 43842177b771..e19a88b63eeb 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2889,6 +2889,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted. [KNL, SMP] Set scheduler's default relax_domain_level. See Documentation/cgroups/cpusets.txt. + relative_sleep_states= + [SUSPEND] Use sleep state labeling where the deepest + state available other than hibernation is always "mem". + Format: { "0" | "1" } + 0 -- Traditional sleep state labels. + 1 -- Relative sleep state labels. + reserve= [KNL,BUGS] Force the kernel to ignore some iomem area reservetop= [X86-32] diff --git a/Documentation/power/states.txt b/Documentation/power/states.txt index 442d43df9b25..50f3ef9177c1 100644 --- a/Documentation/power/states.txt +++ b/Documentation/power/states.txt @@ -1,62 +1,87 @@ +System Power Management Sleep States -System Power Management States +(C) 2014 Intel Corp., Rafael J. Wysocki +The kernel supports up to four system sleep states generically, although three +of them depend on the platform support code to implement the low-level details +for each state. -The kernel supports four power management states generically, though -one is generic and the other three are dependent on platform support -code to implement the low-level details for each state. -This file describes each state, what they are -commonly called, what ACPI state they map to, and what string to write -to /sys/power/state to enter that state +The states are represented by strings that can be read or written to the +/sys/power/state file. Those strings may be "mem", "standby", "freeze" and +"disk", where the last one always represents hibernation (Suspend-To-Disk) and +the meaning of the remaining ones depends on the relative_sleep_states command +line argument. -state: Freeze / Low-Power Idle +For relative_sleep_states=1, the strings "mem", "standby" and "freeze" label the +available non-hibernation sleep states from the deepest to the shallowest, +respectively. In that case, "mem" is always present in /sys/power/state, +because there is at least one non-hibernation sleep state in every system. If +the given system supports two non-hibernation sleep states, "standby" is present +in /sys/power/state in addition to "mem". 
If the system supports three +non-hibernation sleep states, "freeze" will be present in /sys/power/state in +addition to "mem" and "standby". + +For relative_sleep_states=0, which is the default, the following descriptions +apply. + +state: Suspend-To-Idle ACPI state: S0 -String: "freeze" +Label: "freeze" -This state is a generic, pure software, light-weight, low-power state. -It allows more energy to be saved relative to idle by freezing user +This state is a generic, pure software, light-weight, system sleep state. +It allows more energy to be saved relative to runtime idle by freezing user space and putting all I/O devices into low-power states (possibly lower-power than available at run time), such that the processors can spend more time in their idle states. -This state can be used for platforms without Standby/Suspend-to-RAM + +This state can be used for platforms without Power-On Suspend/Suspend-to-RAM support, or it can be used in addition to Suspend-to-RAM (memory sleep) -to provide reduced resume latency. +to provide reduced resume latency. It is always supported. State: Standby / Power-On Suspend ACPI State: S1 -String: "standby" +Label: "standby" -This state offers minimal, though real, power savings, while providing -a very low-latency transition back to a working system. No operating -state is lost (the CPU retains power), so the system easily starts up +This state, if supported, offers moderate, though real, power savings, while +providing a relatively low-latency transition back to a working system. No +operating state is lost (the CPU retains power), so the system easily starts up again where it left off. -We try to put devices in a low-power state equivalent to D1, which -also offers low power savings, but low resume latency. Not all devices -support D1, and those that don't are left on. +In addition to freezing user space and putting all I/O devices into low-power +states, which is done for Suspend-To-Idle too, nonboot CPUs are taken offline +and all low-level system functions are suspended during transitions into this +state. For this reason, it should allow more energy to be saved relative to +Suspend-To-Idle, but the resume latency will generally be greater than for that +state. State: Suspend-to-RAM ACPI State: S3 -String: "mem" +Label: "mem" -This state offers significant power savings as everything in the -system is put into a low-power state, except for memory, which is -placed in self-refresh mode to retain its contents. +This state, if supported, offers significant power savings as everything in the +system is put into a low-power state, except for memory, which should be placed +into the self-refresh mode to retain its contents. All of the steps carried out +when entering Power-On Suspend are also carried out during transitions to STR. +Additional operations may take place depending on the platform capabilities. In +particular, on ACPI systems the kernel passes control to the BIOS (platform +firmware) as the last step during STR transitions and that usually results in +powering down some more low-level components that aren't directly controlled by +the kernel. -System and device state is saved and kept in memory. All devices are -suspended and put into D3. In many cases, all peripheral buses lose -power when entering STR, so devices must be able to handle the -transition back to the On state. +System and device state is saved and kept in memory. All devices are suspended +and put into low-power states. 
In many cases, all peripheral buses lose power +when entering STR, so devices must be able to handle the transition back to the +"on" state. -For at least ACPI, STR requires some minimal boot-strapping code to -resume the system from STR. This may be true on other platforms. +For at least ACPI, STR requires some minimal boot-strapping code to resume the +system from it. This may be the case on other platforms too. State: Suspend-to-disk ACPI State: S4 -String: "disk" +Label: "disk" This state offers the greatest power savings, and can be used even in the absence of low-level platform support for power management. This diff --git a/kernel/power/main.c b/kernel/power/main.c index 9f51f0ab3d86..573410d6647e 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -279,14 +279,14 @@ static inline void pm_print_times_init(void) {} struct kobject *power_kobj; /** - * state - control system power state. + * state - control system sleep states. * - * show() returns what states are supported, which is hard-coded to - * 'freeze' (Low-Power Idle), 'standby' (Power-On Suspend), - * 'mem' (Suspend-to-RAM), and 'disk' (Suspend-to-Disk). + * show() returns available sleep state labels, which may be "mem", "standby", + * "freeze" and "disk" (hibernation). See Documentation/power/states.txt for a + * description of what they mean. * - * store() accepts one of those strings, translates it into the - * proper enumerated value, and initiates a suspend transition. + * store() accepts one of those strings, translates it into the proper + * enumerated value, and initiates a suspend transition. */ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 00aca60904b0..338a6f147974 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -78,6 +78,26 @@ static bool valid_state(suspend_state_t state) return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); } +/* + * If this is set, the "mem" label always corresponds to the deepest sleep state + * available, the "standby" label corresponds to the second deepest sleep state + * available (if any), and the "freeze" label corresponds to the remaining + * available sleep state (if there is one). + */ +static bool relative_states; + +static int __init sleep_states_setup(char *str) +{ + relative_states = !strncmp(str, "1", 1); + if (relative_states) { + pm_states[PM_SUSPEND_MEM].state = PM_SUSPEND_FREEZE; + pm_states[PM_SUSPEND_FREEZE].state = 0; + } + return 1; +} + +__setup("relative_sleep_states=", sleep_states_setup); + /** * suspend_set_ops - Set the global suspend method table. * @ops: Suspend operations to use. @@ -85,12 +105,20 @@ static bool valid_state(suspend_state_t state) void suspend_set_ops(const struct platform_suspend_ops *ops) { suspend_state_t i; + int j = PM_SUSPEND_MAX - 1; lock_system_sleep(); suspend_ops = ops; - for (i = PM_SUSPEND_STANDBY; i <= PM_SUSPEND_MEM; i++) - pm_states[i].state = valid_state(i) ? 
i : 0; + for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--) + if (valid_state(i)) + pm_states[j--].state = i; + else if (!relative_states) + pm_states[j--].state = 0; + + pm_states[j--].state = PM_SUSPEND_FREEZE; + while (j >= PM_SUSPEND_MIN) + pm_states[j--].state = 0; unlock_system_sleep(); } -- cgit From a257954bb38ab23dbf93df812b4b2c01cae29d8b Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 27 May 2014 16:07:37 +0800 Subject: genirq: Improve documentation to match current implementation Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: Tony Luck Cc: Joerg Roedel Cc: Paul Gortmaker Cc: Greg Kroah-Hartman Cc: Benjamin Herrenschmidt Cc: Grant Likely Cc: Rafael J. Wysocki Cc: Bjorn Helgaas Cc: Randy Dunlap Cc: Yinghai Lu Cc: Jiri Kosina Link: http://lkml.kernel.org/r/1401178092-1228-3-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner --- Documentation/IRQ-domain.txt | 3 +-- kernel/irq/internals.h | 2 +- kernel/irq/irqdomain.c | 6 +++--- 3 files changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/Documentation/IRQ-domain.txt b/Documentation/IRQ-domain.txt index 03df71aeb38c..8a8b82c9ca53 100644 --- a/Documentation/IRQ-domain.txt +++ b/Documentation/IRQ-domain.txt @@ -41,8 +41,7 @@ An interrupt controller driver creates and registers an irq_domain by calling one of the irq_domain_add_*() functions (each mapping method has a different allocator function, more on that later). The function will return a pointer to the irq_domain on success. The caller must -provide the allocator function with an irq_domain_ops structure with -the .map callback populated as a minimum. +provide the allocator function with an irq_domain_ops structure. In most cases, the irq_domain will begin empty without any mappings between hwirq and IRQ numbers. Mappings are added to the irq_domain diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index c4065e3edbc3..099ea2e0eb88 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -33,7 +33,7 @@ enum { }; /* - * Bit masks for desc->state + * Bit masks for desc->core_internal_state__do_not_mess_with_it * * IRQS_AUTODETECT - autodetection in progress * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index f14033700c25..eb5e10e32e05 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -27,14 +27,14 @@ static struct irq_domain *irq_default_domain; * __irq_domain_add() - Allocate a new irq_domain data structure * @of_node: optional device-tree node of the interrupt controller * @size: Size of linear map; 0 for radix mapping only + * @hwirq_max: Maximum number of interrupts supported by controller * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no * direct mapping * @ops: map/unmap domain callbacks * @host_data: Controller private data pointer * - * Allocates and initialize and irq_domain structure. Caller is expected to - * register allocated irq_domain with irq_domain_register(). Returns pointer - * to IRQ domain, or NULL on failure. + * Allocates and initialize and irq_domain structure. + * Returns pointer to IRQ domain, or NULL on failure.
*/ struct irq_domain *__irq_domain_add(struct device_node *of_node, int size, irq_hw_number_t hwirq_max, int direct_max, -- cgit From 26fc9cd200ec839e0b3095e05ae018f27314e7aa Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Sat, 26 Apr 2014 15:40:28 +0800 Subject: kernfs: move the last knowledge of sysfs out from kernfs There is still one residue of sysfs remaining: the sb_magic SYSFS_MAGIC. However this should be kernfs user specific, so this patch moves it out. Kerrnfs user should specify their magic number while mouting. Signed-off-by: Jianyu Zhan Acked-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/mount.c | 11 ++++++----- fs/sysfs/mount.c | 4 +++- include/linux/kernfs.h | 13 ++++++++----- kernel/cgroup.c | 4 +++- 4 files changed, 20 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index f25a7c0c3cdc..d171b98a6cdd 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -62,7 +62,7 @@ struct kernfs_root *kernfs_root_from_sb(struct super_block *sb) return NULL; } -static int kernfs_fill_super(struct super_block *sb) +static int kernfs_fill_super(struct super_block *sb, unsigned long magic) { struct kernfs_super_info *info = kernfs_info(sb); struct inode *inode; @@ -71,7 +71,7 @@ static int kernfs_fill_super(struct super_block *sb) info->sb = sb; sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; - sb->s_magic = SYSFS_MAGIC; + sb->s_magic = magic; sb->s_op = &kernfs_sops; sb->s_time_gran = 1; @@ -132,6 +132,7 @@ const void *kernfs_super_ns(struct super_block *sb) * @fs_type: file_system_type of the fs being mounted * @flags: mount flags specified for the mount * @root: kernfs_root of the hierarchy being mounted + * @magic: file system specific magic number * @new_sb_created: tell the caller if we allocated a new superblock * @ns: optional namespace tag of the mount * @@ -143,8 +144,8 @@ const void *kernfs_super_ns(struct super_block *sb) * The return value can be passed to the vfs layer verbatim. 
*/ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, - struct kernfs_root *root, bool *new_sb_created, - const void *ns) + struct kernfs_root *root, unsigned long magic, + bool *new_sb_created, const void *ns) { struct super_block *sb; struct kernfs_super_info *info; @@ -169,7 +170,7 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, if (!sb->s_root) { struct kernfs_super_info *info = kernfs_info(sb); - error = kernfs_fill_super(sb); + error = kernfs_fill_super(sb, magic); if (error) { deactivate_locked_super(sb); return ERR_PTR(error); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 8794423f7efb..8a49486bf30c 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -13,6 +13,7 @@ #define DEBUG #include +#include #include #include #include @@ -38,7 +39,8 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, } ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); - root = kernfs_mount_ns(fs_type, flags, sysfs_root, &new_sb, ns); + root = kernfs_mount_ns(fs_type, flags, sysfs_root, + SYSFS_MAGIC, &new_sb, ns); if (IS_ERR(root) || !new_sb) kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); return root; diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index c841688a78a3..17aa1cce6f8e 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -301,8 +301,8 @@ void kernfs_notify(struct kernfs_node *kn); const void *kernfs_super_ns(struct super_block *sb); struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, - struct kernfs_root *root, bool *new_sb_created, - const void *ns); + struct kernfs_root *root, unsigned long magic, + bool *new_sb_created, const void *ns); void kernfs_kill_sb(struct super_block *sb); void kernfs_init(void); @@ -395,7 +395,8 @@ static inline const void *kernfs_super_ns(struct super_block *sb) static inline struct dentry * kernfs_mount_ns(struct file_system_type *fs_type, int flags, - struct kernfs_root *root, bool *new_sb_created, const void *ns) + struct kernfs_root *root, unsigned long magic, + bool *new_sb_created, const void *ns) { return ERR_PTR(-ENOSYS); } static inline void kernfs_kill_sb(struct super_block *sb) { } @@ -453,9 +454,11 @@ static inline int kernfs_rename(struct kernfs_node *kn, static inline struct dentry * kernfs_mount(struct file_system_type *fs_type, int flags, - struct kernfs_root *root, bool *new_sb_created) + struct kernfs_root *root, unsigned long magic, + bool *new_sb_created) { - return kernfs_mount_ns(fs_type, flags, root, new_sb_created, NULL); + return kernfs_mount_ns(fs_type, flags, root, + magic, new_sb_created, NULL); } #endif /* __LINUX_KERNFS_H */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3f1ca934a237..ceee0c54c6a4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -1604,7 +1605,8 @@ out_unlock: if (ret) return ERR_PTR(ret); - dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb); + dentry = kernfs_mount(fs_type, flags, root->kf_root, + CGROUP_SUPER_MAGIC, &new_sb); if (IS_ERR(dentry) || !new_sb) cgroup_put(&root->cgrp); return dentry; -- cgit From 011e4b02f1da156ac7fea28a9da878f3c23af739 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. 
Bhat" Date: Tue, 27 May 2014 16:25:34 +0530 Subject: powerpc, kexec: Fix "Processor X is stuck" issue during kexec from ST mode If we try to perform a kexec when the machine is in ST (Single-Threaded) mode (ppc64_cpu --smt=off), the kexec operation doesn't succeed properly, and we get the following messages during boot: [ 0.089866] POWER8 performance monitor hardware support registered [ 0.089985] power8-pmu: PMAO restore workaround active. [ 5.095419] Processor 1 is stuck. [ 10.097933] Processor 2 is stuck. [ 15.100480] Processor 3 is stuck. [ 20.102982] Processor 4 is stuck. [ 25.105489] Processor 5 is stuck. [ 30.108005] Processor 6 is stuck. [ 35.110518] Processor 7 is stuck. [ 40.113369] Processor 9 is stuck. [ 45.115879] Processor 10 is stuck. [ 50.118389] Processor 11 is stuck. [ 55.120904] Processor 12 is stuck. [ 60.123425] Processor 13 is stuck. [ 65.125970] Processor 14 is stuck. [ 70.128495] Processor 15 is stuck. [ 75.131316] Processor 17 is stuck. Note that only the sibling threads are stuck, while the primary threads (0, 8, 16 etc) boot just fine. Looking closer at the previous step of kexec, we observe that kexec tries to wakeup (bring online) the sibling threads of all the cores, before performing kexec: [ 9464.131231] Starting new kernel [ 9464.148507] kexec: Waking offline cpu 1. [ 9464.148552] kexec: Waking offline cpu 2. [ 9464.148600] kexec: Waking offline cpu 3. [ 9464.148636] kexec: Waking offline cpu 4. [ 9464.148671] kexec: Waking offline cpu 5. [ 9464.148708] kexec: Waking offline cpu 6. [ 9464.148743] kexec: Waking offline cpu 7. [ 9464.148779] kexec: Waking offline cpu 9. [ 9464.148815] kexec: Waking offline cpu 10. [ 9464.148851] kexec: Waking offline cpu 11. [ 9464.148887] kexec: Waking offline cpu 12. [ 9464.148922] kexec: Waking offline cpu 13. [ 9464.148958] kexec: Waking offline cpu 14. [ 9464.148994] kexec: Waking offline cpu 15. [ 9464.149030] kexec: Waking offline cpu 17. Instrumenting this piece of code revealed that the cpu_up() operation actually fails with -EBUSY. Thus, only the primary threads of all the cores are online during kexec, and hence this is a sure-shot receipe for disaster, as explained in commit e8e5c2155b (powerpc/kexec: Fix orphaned offline CPUs across kexec), as well as in the comment above wake_offline_cpus(). It turns out that cpu_up() was returning -EBUSY because the variable 'cpu_hotplug_disabled' was set to 1; and this disabling of CPU hotplug was done by migrate_to_reboot_cpu() inside kernel_kexec(). Now, migrate_to_reboot_cpu() was originally written with the assumption that any further code will not need to perform CPU hotplug, since we are anyway in the reboot path. However, kexec is clearly not such a case, since we depend on onlining CPUs, atleast on powerpc. So re-enable cpu-hotplug after returning from migrate_to_reboot_cpu() in the kexec path, to fix this regression in kexec on powerpc. Also, wrap the cpu_up() in powerpc kexec code within a WARN_ON(), so that we can catch such issues more easily in the future. Fixes: c97102ba963 (kexec: migrate to reboot cpu) Cc: stable@vger.kernel.org Signed-off-by: Srivatsa S. 
Bhat Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/machine_kexec_64.c | 2 +- kernel/kexec.c | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c index 59d229a2a3e0..879b3aacac32 100644 --- a/arch/powerpc/kernel/machine_kexec_64.c +++ b/arch/powerpc/kernel/machine_kexec_64.c @@ -237,7 +237,7 @@ static void wake_offline_cpus(void) if (!cpu_online(cpu)) { printk(KERN_INFO "kexec: Waking offline cpu %d.\n", cpu); - cpu_up(cpu); + WARN_ON(cpu_up(cpu)); } } } diff --git a/kernel/kexec.c b/kernel/kexec.c index c8380ad203bc..28c57069ef68 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1683,6 +1683,14 @@ int kernel_kexec(void) kexec_in_progress = true; kernel_restart_prepare(NULL); migrate_to_reboot_cpu(); + + /* + * migrate_to_reboot_cpu() disables CPU hotplug assuming that + * no further code needs to use CPU hotplug (which is true in + * the reboot case). However, the kexec path depends on using + * CPU hotplug again; so re-enable it here. + */ + cpu_hotplug_enable(); printk(KERN_EMERG "Starting new kernel\n"); machine_shutdown(); } -- cgit From 015af06e103fa47af29ada0f564301c81d4973b2 Mon Sep 17 00:00:00 2001 From: Valdis Kletnieks Date: Tue, 27 May 2014 14:28:59 -0400 Subject: kernel/workqueue.c: pr_warning/pr_warn & printk/pr_info This commit did an incorrect printk->pr_info conversion. If we were converting to pr_info() we should lose the log_level parameter. The problem is that this is called (indirectly) by show_regs_print_info(), which is called with various log_levels (from _INFO clear to _EMERG). So we leave it as a printk() call so the desired log_level is applied. Not a full revert, as the other half of the patch is correct. Signed-off-by: Valdis Kletnieks Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bc3c18892b7d..90a0fa592b72 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4440,7 +4440,7 @@ void print_worker_info(const char *log_lvl, struct task_struct *task) probe_kernel_read(desc, worker->desc, sizeof(desc) - 1); if (fn || name[0] || desc[0]) { - pr_info("%sWorkqueue: %s %pf", log_lvl, name, fn); + printk("%sWorkqueue: %s %pf", log_lvl, name, fn); if (desc[0]) pr_cont(" (%s)", desc); pr_cont("\n"); -- cgit From 397335f004f41e5fcf7a795e94eb3ab83411a17c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 22 May 2014 03:25:39 +0000 Subject: rtmutex: Fix deadlock detector for real The current deadlock detection logic does not work reliably due to the following early exit path: /* * Drop out, when the task has no waiters. Note, * top_waiter can be NULL, when we are in the deboosting * mode! */ if (top_waiter && (!task_has_pi_waiters(task) || top_waiter != task_top_pi_waiter(task))) goto out_unlock_pi; So this not only exits when the task has no waiters, it also exits unconditionally when the current waiter is not the top priority waiter of the task. So in a nested locking scenario, it might abort the lock chain walk and therefore miss a potential deadlock. Simple fix: Continue the chain walk, when deadlock detection is enabled. We also avoid the whole enqueue, if we detect the deadlock right away (A-A). This is not only an optimization: it also prevents the case where another waiter, coming in after the detection and before the task has undone the damage, observes the situation, detects the deadlock, and returns -EDEADLOCK, which is wrong because the other task is not actually in a deadlock situation.
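To make the A-A case concrete, here is a minimal sketch of the scenario the early check catches. The three-argument rt_mutex_timed_lock() with a detect_deadlock flag is assumed from the rtmutex API of this kernel era; treat the call as illustrative, not as part of the patch:

	struct rt_mutex m;
	int ret;

	rt_mutex_init(&m);
	rt_mutex_lock(&m);
	/*
	 * The task now blocks on a lock it already owns (A-A). With the
	 * early check in task_blocks_on_rt_mutex() this fails fast with
	 * -EDEADLK instead of enqueueing the task as a waiter on itself.
	 */
	ret = rt_mutex_timed_lock(&m, NULL, 1);	/* detect_deadlock = 1 */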
Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Reviewed-by: Steven Rostedt Cc: Lai Jiangshan Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20140522031949.725272460@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/locking/rtmutex.c | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index aa4dff04b594..a620d4d08ca6 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -343,9 +343,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * top_waiter can be NULL, when we are in the deboosting * mode! */ - if (top_waiter && (!task_has_pi_waiters(task) || - top_waiter != task_top_pi_waiter(task))) - goto out_unlock_pi; + if (top_waiter) { + if (!task_has_pi_waiters(task)) + goto out_unlock_pi; + /* + * If deadlock detection is off, we stop here if we + * are not the top pi waiter of the task. + */ + if (!detect_deadlock && top_waiter != task_top_pi_waiter(task)) + goto out_unlock_pi; + } /* * When deadlock detection is off then we check, if further @@ -361,7 +368,12 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, goto retry; } - /* Deadlock detection */ + /* + * Deadlock detection. If the lock is the same as the original + * lock which caused us to walk the lock chain or if the + * current lock is owned by the task which initiated the chain + * walk, we detected a deadlock. + */ if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); raw_spin_unlock(&lock->wait_lock); @@ -527,6 +539,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, unsigned long flags; int chain_walk = 0, res; + /* + * Early deadlock detection. We really don't want the task to + * enqueue on itself just to untangle the mess later. It's not + * only an optimization. We drop the locks, so another waiter + * can come in before the chain walk detects the deadlock. So + * the other will detect the deadlock and return -EDEADLOCK, + * which is wrong, as the other waiter is not in a deadlock + * situation. + */ + if (detect_deadlock && owner == task) + return -EDEADLK; + raw_spin_lock_irqsave(&task->pi_lock, flags); __rt_mutex_adjust_prio(task); waiter->task = task; -- cgit From 7fa21dd8bd191564a195291161d6b43db5d9c350 Mon Sep 17 00:00:00 2001 From: Stephen Chivers Date: Wed, 14 May 2014 08:04:39 +1000 Subject: printk/of_serial: fix serial console cessation part way through boot. Commit 5f5c9ae56c38942623f69c3e6dc6ec78e4da2076 "serial_core: Unregister console in uart_remove_one_port()" fixed a crash where a serial port was removed but not deregistered as a console. There is a side effect of that commit for platforms having serial consoles and of_serial configured (CONFIG_SERIAL_OF_PLATFORM). The serial console is disabled midway through the boot process. This cessation of the serial console affects PowerPC computers such as the MVME5100 and SAM440EP. The sequence is: bootconsole [udbg0] enabled .... serial8250/16550 driver initialises and registers its UARTs, one of these is the serial console. console [ttyS0] enabled .... of_serial probes "platform" devices, registering them as it goes.
One of these is the serial console. console [ttyS0] disabled. The disabling of the serial console is due to: a. unregister_console in printk not clearing the CON_ENABLED bit in the console flags, even though it has announced that the console is disabled; and b. of_platform_serial_probe in of_serial not setting the port type before it registers with serial8250_register_8250_port. This patch ensures that the serial console is re-enabled when of_serial registers a serial port that corresponds to the designated console. Signed-off-by: Stephen Chivers Tested-by: Stephen Chivers Acked-by: Geert Uytterhoeven [unregister_console] Cc: stable # 3.15 === The above failure was identified in Linux-3.15-rc2. Tested using MVME5100 and SAM440EP PowerPC computers with kernels built from Linux-3.15-rc5 and tty-next. The continued operation of the serial console is vital for computers such as the MVME5100 as that Single Board Computer does not have any graphical/display hardware. Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/of_serial.c | 1 + kernel/printk/printk.c | 1 + 2 files changed, 2 insertions(+) (limited to 'kernel') diff --git a/drivers/tty/serial/of_serial.c b/drivers/tty/serial/of_serial.c index 77ec6a147522..68d4455f3cf9 100644 --- a/drivers/tty/serial/of_serial.c +++ b/drivers/tty/serial/of_serial.c @@ -173,6 +173,7 @@ static int of_platform_serial_probe(struct platform_device *ofdev) { struct uart_8250_port port8250; memset(&port8250, 0, sizeof(port8250)); + port.type = port_type; port8250.port = port; if (port.fifosize) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a45b50962295..81c71617ea28 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2413,6 +2413,7 @@ int unregister_console(struct console *console) if (console_drivers != NULL && console->flags & CON_CONSDEV) console_drivers->flags |= CON_CONSDEV; + console->flags &= ~CON_ENABLED; console_unlock(); console_sysfs_notify(); return res; -- cgit From 2184db46e425c2b84a783eeead0e2b24ce67cd7f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 28 May 2014 13:14:40 -0400 Subject: tracing: Print nasty banner when trace_printk() is in use trace_printk() is used to debug fast paths within the kernel. Places that get called in any context (interrupt or NMI) or thousands of times a second. Something you do not want to do with a printk(). In order to make it completely lockless as it needs a temporary buffer to handle some of the string formatting, a page is created per cpu for every context (four per cpu; normal, softirq, irq, NMI). Since trace_printk() should only be used for debugging purposes, there's no reason to waste memory on these buffers on a production system. That means, trace_printk() should never be used unless a developer is debugging their kernel. There's macro magic to allocate the buffers if trace_printk() is used anywhere in the kernel. To help enforce that trace_printk() isn't used outside of development, when it is used, a nasty banner is displayed on bootup (or when a module is loaded that uses trace_printk() and the kernel core does not). Here's the banner: ********************************************************** ** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE ** ** ** ** trace_printk() being used. Allocating extra memory. ** ** ** ** This means that this is a DEBUG kernel and it is ** ** unsafe for produciton use. ** ** ** ** If you see this message and you are not debugging ** ** the kernel, report this immediately to your vendor!
** ** ** ** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE ** ********************************************************** That should hopefully keep developers from trying to sneak in a trace_printk() or two. Link: http://lkml.kernel.org/p/20140528131440.2283213c@gandalf.local.home Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 05431696b10c..eb228b9de170 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1975,7 +1975,21 @@ void trace_printk_init_buffers(void) if (alloc_percpu_trace_buffer()) return; - pr_info("ftrace: Allocated trace_printk buffers\n"); + /* trace_printk() is for debug use only. Don't use it in production. */ + + pr_warning("\n**********************************************************\n"); + pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); + pr_warning("** **\n"); + pr_warning("** trace_printk() being used. Allocating extra memory. **\n"); + pr_warning("** **\n"); + pr_warning("** This means that this is a DEBUG kernel and it is **\n"); + pr_warning("** unsafe for produciton use. **\n"); + pr_warning("** **\n"); + pr_warning("** If you see this message and you are not debugging **\n"); + pr_warning("** the kernel, report this immediately to your vendor! **\n"); + pr_warning("** **\n"); + pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); + pr_warning("**********************************************************\n"); /* Expand the buffers to set size */ tracing_update_buffers(); -- cgit From 81dc9f0ef21e40114cc895894c7acf3055f6d1fb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 29 May 2014 22:49:07 -0400 Subject: tracing: Add tracepoint benchmark tracepoint In order to help benchmark the time tracepoints take, a new config option is added called CONFIG_TRACEPOINT_BENCHMARK. When this option is set a tracepoint is created called "benchmark:benchmark_event". When the tracepoint is enabled, it kicks off a kernel thread that goes into an infinite loop (calling cond_resched() to let other tasks run), and calls the tracepoint. Each iteration will record the time it took to write to the tracepoint, and on the next iteration that data will be passed to the tracepoint itself. That is, the tracepoint will report the time it took to do the previous tracepoint. The string written to the tracepoint is a static string of 128 bytes to keep the time the same. The initial string is simply a write of "START". The second string records the cold cache time of the first write, which is not added to the rest of the calculations. As it is a tight loop, it benchmarks as hot cache. That's fine because we care most about hot paths that are probably in cache already.
An example of the output: START first=3672 [COLD CACHED] last=632 first=3672 max=632 min=632 avg=316 std=446 std^2=199712 last=278 first=3672 max=632 min=278 avg=303 std=316 std^2=100337 last=277 first=3672 max=632 min=277 avg=296 std=258 std^2=67064 last=273 first=3672 max=632 min=273 avg=292 std=224 std^2=50411 last=273 first=3672 max=632 min=273 avg=288 std=200 std^2=40389 last=281 first=3672 max=632 min=273 avg=287 std=183 std^2=33666 Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 30 +++++++ kernel/trace/Makefile | 3 + kernel/trace/trace_benchmark.c | 176 +++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_benchmark.h | 41 ++++++++++ 4 files changed, 250 insertions(+) create mode 100644 kernel/trace/trace_benchmark.c create mode 100644 kernel/trace/trace_benchmark.h (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8639819f6cef..d4409356f40d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -535,6 +535,36 @@ config MMIOTRACE_TEST Say N, unless you absolutely know what you are doing. +config TRACEPOINT_BENCHMARK + bool "Add tracepoint that benchmarks tracepoints" + help + This option creates the tracepoint "benchmark:benchmark_event". + When the tracepoint is enabled, it kicks off a kernel thread that + goes into an infinite loop (calling cond_resched() to let other tasks + run), and calls the tracepoint. Each iteration will record the time + it took to write to the tracepoint and the next iteration that + data will be passed to the tracepoint itself. That is, the tracepoint + will report the time it took to do the previous tracepoint. + The string written to the tracepoint is a static string of 128 bytes + to keep the time the same. The initial string is simply a write of + "START". The second string records the cold cache time of the first + write which is not added to the rest of the calculations. + + As it is a tight loop, it benchmarks as hot cache. That's fine because + we care most about hot paths that are probably in cache already.
+ + An example of the output: + + START + first=3672 [COLD CACHED] + last=632 first=3672 max=632 min=632 avg=316 std=446 std^2=199712 + last=278 first=3672 max=632 min=278 avg=303 std=316 std^2=100337 + last=277 first=3672 max=632 min=277 avg=296 std=258 std^2=67064 + last=273 first=3672 max=632 min=273 avg=292 std=224 std^2=50411 + last=273 first=3672 max=632 min=273 avg=288 std=200 std^2=40389 + last=281 first=3672 max=632 min=273 avg=287 std=183 std^2=33666 + + config RING_BUFFER_BENCHMARK tristate "Ring buffer benchmark stress tester" depends on RING_BUFFER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 1378e84fbe39..2611613f14f1 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -17,6 +17,7 @@ ifdef CONFIG_TRACING_BRANCHES KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING endif +CFLAGS_trace_benchmark.o := -I$(src) CFLAGS_trace_events_filter.o := -I$(src) obj-$(CONFIG_TRACE_CLOCK) += trace_clock.o @@ -62,4 +63,6 @@ endif obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o +obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o + libftrace-y := ftrace.o diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c new file mode 100644 index 000000000000..7dc1c42dfee2 --- /dev/null +++ b/kernel/trace/trace_benchmark.c @@ -0,0 +1,176 @@ +#include <linux/delay.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/trace_clock.h> + +#define CREATE_TRACE_POINTS +#include "trace_benchmark.h" + +static struct task_struct *bm_event_thread; + +static char bm_str[BENCHMARK_EVENT_STRLEN] = "START"; + +static u64 bm_total; +static u64 bm_totalsq; +static u64 bm_last; +static u64 bm_max; +static u64 bm_min; +static u64 bm_first; +static s64 bm_cnt; + +/* + * This gets called in a loop recording the time it took to write + * the tracepoint. What it writes is the time statistics of the last + * tracepoint write. As there is nothing to write the first time + * it simply writes "START". As the first write is cold cache and + * the rest is hot, we save off that time in bm_first and it is + * reported as "first", which is shown in the second write to the + * tracepoint. The "first" field is written within the statistics from + * then on but never changes. + */ +static void trace_do_benchmark(void) +{ + u64 start; + u64 stop; + u64 delta; + s64 stddev; + u64 seed; + u64 seedsq; + u64 last_seed; + unsigned int avg; + unsigned int std = 0; + + /* Only run if the tracepoint is actually active */ + if (!trace_benchmark_event_enabled()) + return; + + local_irq_disable(); + start = trace_clock_local(); + trace_benchmark_event(bm_str); + stop = trace_clock_local(); + local_irq_enable(); + + bm_cnt++; + + delta = stop - start; + + /* + * The first read is cold cached, keep it separate from the + * other calculations.
*/ + if (bm_cnt == 1) { + bm_first = delta; + scnprintf(bm_str, BENCHMARK_EVENT_STRLEN, + "first=%llu [COLD CACHED]", bm_first); + return; + } + + bm_last = delta; + + bm_total += delta; + bm_totalsq += delta * delta; + + if (delta > bm_max) + bm_max = delta; + if (!bm_min || delta < bm_min) + bm_min = delta; + + if (bm_cnt > 1) { + /* + * Apply Welford's method to calculate standard deviation: + * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) + */ + stddev = (u64)bm_cnt * bm_totalsq - bm_total * bm_total; + do_div(stddev, bm_cnt); + do_div(stddev, bm_cnt - 1); + } else + stddev = 0; + + delta = bm_total; + do_div(delta, bm_cnt); + avg = delta; + + if (stddev > 0) { + int i = 0; + /* + * stddev is the square of standard deviation but + * we want the actual number. Use the average + * as our seed to find the std. + * + * The next try is: + * x = (x + N/x) / 2 + * + * Where N is the squared number to find the square + * root of. + */ + seed = avg; + do { + last_seed = seed; + seed = stddev; + if (!last_seed) + break; + do_div(seed, last_seed); + seed += last_seed; + do_div(seed, 2); + } while (i++ < 10 && last_seed != seed); + + std = seed; + } + + scnprintf(bm_str, BENCHMARK_EVENT_STRLEN, + "last=%llu first=%llu max=%llu min=%llu avg=%u std=%d std^2=%lld", + bm_last, bm_first, bm_max, bm_min, avg, std, stddev); +} + +static int benchmark_event_kthread(void *arg) +{ + /* sleep a bit to make sure the tracepoint gets activated */ + msleep(100); + + while (!kthread_should_stop()) { + + trace_do_benchmark(); + + /* + * We don't go to sleep, but let others + * run as well. + */ + cond_resched(); + } + + return 0; +} + +/* + * When the benchmark tracepoint is enabled, it calls this + * function and the thread that calls the tracepoint is created. + */ +void trace_benchmark_reg(void) +{ + bm_event_thread = kthread_run(benchmark_event_kthread, + NULL, "event_benchmark"); + WARN_ON(!bm_event_thread); +} + +/* + * When the benchmark tracepoint is disabled, it calls this + * function and the thread that calls the tracepoint is deleted + * and all the numbers are reset. + */ +void trace_benchmark_unreg(void) +{ + if (!bm_event_thread) + return; + + kthread_stop(bm_event_thread); + + strcpy(bm_str, "START"); + bm_total = 0; + bm_totalsq = 0; + bm_last = 0; + bm_max = 0; + bm_min = 0; + bm_cnt = 0; + /* bm_first doesn't need to be reset but reset it anyway */ + bm_first = 0; +} diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h new file mode 100644 index 000000000000..3c1df1df4e29 --- /dev/null +++ b/kernel/trace/trace_benchmark.h @@ -0,0 +1,41 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM benchmark + +#if !defined(_TRACE_BENCHMARK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_BENCHMARK_H + +#include <linux/tracepoint.h> + +extern void trace_benchmark_reg(void); +extern void trace_benchmark_unreg(void); + +#define BENCHMARK_EVENT_STRLEN 128 + +TRACE_EVENT_FN(benchmark_event, + + TP_PROTO(const char *str), + + TP_ARGS(str), + + TP_STRUCT__entry( + __array( char, str, BENCHMARK_EVENT_STRLEN ) + ), + + TP_fast_assign( + memcpy(__entry->str, str, BENCHMARK_EVENT_STRLEN); + ), + + TP_printk("%s", __entry->str), + + trace_benchmark_reg, trace_benchmark_unreg +); + +#endif /* _TRACE_BENCHMARK_H */ + +#undef TRACE_INCLUDE_FILE +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_benchmark + +/* This part must be outside protection */ +#include <trace/define_trace.h> -- cgit From 42584c81c5adc1737a6fe0687facc5e62a5dc8c1 Mon Sep 17 00:00:00 2001 From: Yoshihiro YUNOMAE Date: Thu, 20 Feb 2014 17:44:31 +0900 Subject: tracing: Have saved_cmdlines use the seq_read infrastructure Current tracing_saved_cmdlines_read() implementation is naive; it allocates a large buffer, constructs output data to that buffer for each read operation, and then copies a portion of the buffer to the user space buffer. This has several issues such as slow memory allocation, high CPU usage, and even corruption of the output data. The seq_read infrastructure is made to handle this type of work. By converting it to use seq_read(), the code becomes smaller and simpler, as well as correct. Link: http://lkml.kernel.org/p/20140220084431.3839.51793.stgit@yunodevel Signed-off-by: Hidehiro Kawai Cc: Namhyung Kim Cc: Masami Hiramatsu Signed-off-by: Yoshihiro YUNOMAE Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: linux-kernel@vger.kernel.org Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 89 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 54 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index eb228b9de170..1855dae73f34 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3699,55 +3699,74 @@ static const struct file_operations tracing_readme_fops = { .llseek = generic_file_llseek, }; -static ssize_t -tracing_saved_cmdlines_read(struct file *file, char __user *ubuf, - size_t cnt, loff_t *ppos) +static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos) { - char *buf_comm; - char *file_buf; - char *buf; - int len = 0; - int pid; - int i; + unsigned int *ptr = v; - file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL); - if (!file_buf) - return -ENOMEM; + if (*pos || m->count) + ptr++; - buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL); - if (!buf_comm) { - kfree(file_buf); - return -ENOMEM; - } + (*pos)++; + + for (; ptr < &map_cmdline_to_pid[SAVED_CMDLINES]; ptr++) { + if (*ptr == -1 || *ptr == NO_CMDLINE_MAP) + continue; - buf = file_buf; + return ptr; + } - for (i = 0; i < SAVED_CMDLINES; i++) { - int r; + return NULL; +} - pid = map_cmdline_to_pid[i]; - if (pid == -1 || pid == NO_CMDLINE_MAP) - continue; +static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos) +{ + void *v; + loff_t l = 0; - trace_find_cmdline(pid, buf_comm); - r = sprintf(buf, "%d %s\n", pid, buf_comm); - buf += r; - len += r; + v = &map_cmdline_to_pid[0]; + while (l <= *pos) { + v = saved_cmdlines_next(m, v, &l); + if (!v) + return NULL; } - len = simple_read_from_buffer(ubuf, cnt, ppos, - file_buf, len); + return v; +} - kfree(file_buf); - kfree(buf_comm); +static void saved_cmdlines_stop(struct seq_file *m, void *v) +{ +} - return len; +static int saved_cmdlines_show(struct seq_file *m, void *v) +{ + char buf[TASK_COMM_LEN]; + unsigned int *pid = v; + + trace_find_cmdline(*pid, buf); + seq_printf(m, "%d %s\n", *pid, buf); + return 0; +} + +static const struct seq_operations tracing_saved_cmdlines_seq_ops = { + .start = saved_cmdlines_start, + .next = saved_cmdlines_next, + .stop = saved_cmdlines_stop, + .show = saved_cmdlines_show, +}; + +static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp) +{ + if (tracing_disabled) + return -ENODEV; + + return seq_open(filp, &tracing_saved_cmdlines_seq_ops); } static const struct file_operations tracing_saved_cmdlines_fops = {
- .open = tracing_open_generic, - .read = tracing_saved_cmdlines_read, - .llseek = generic_file_llseek, + .open = tracing_saved_cmdlines_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, }; static ssize_t -- cgit From 379cfdac37923653c9d4242d10052378b7563005 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 30 May 2014 09:42:39 -0400 Subject: tracing: Try again for saved cmdline if failed due to locking In order to prevent the saved cmdline cache from being filled when tracing is not active, the comms are only recorded after a trace event is recorded. The problem is, a comm can fail to be recorded if the trace_cmdline_lock is held. That lock is taken via a trylock to allow it to happen from any context (including NMI). If the lock fails to be taken, the comm is skipped. No big deal, as we will try again later. But! Because of the code that was added to only record after an event, we may not try again later as the recording is made as a one-shot per event per CPU. Only disable the recording of the comm if the comm is actually recorded. Fixes: 7ffbd48d5cab "tracing: Cache comms only after an event occurred" Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1855dae73f34..22a902e2ded9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1441,12 +1441,12 @@ static void tracing_stop_tr(struct trace_array *tr) void trace_stop_cmdline_recording(void); -static void trace_save_cmdline(struct task_struct *tsk) +static int trace_save_cmdline(struct task_struct *tsk) { unsigned pid, idx; if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) - return; + return 0; /* * It's not the end of the world if we don't get @@ -1455,7 +1455,7 @@ static void trace_save_cmdline(struct task_struct *tsk) * so if we miss here, then better luck next time. */ if (!arch_spin_trylock(&trace_cmdline_lock)) - return; + return 0; idx = map_pid_to_cmdline[tsk->pid]; if (idx == NO_CMDLINE_MAP) { @@ -1480,6 +1480,8 @@ static void trace_save_cmdline(struct task_struct *tsk) memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); arch_spin_unlock(&trace_cmdline_lock); + + return 1; } void trace_find_cmdline(int pid, char comm[]) @@ -1521,9 +1523,8 @@ void tracing_record_cmdline(struct task_struct *tsk) if (!__this_cpu_read(trace_cmdline_save)) return; - __this_cpu_write(trace_cmdline_save, false); - - trace_save_cmdline(tsk); + if (trace_save_cmdline(tsk)) + __this_cpu_write(trace_cmdline_save, false); } void -- cgit From 4c27e756bc019ec1c11232893af036fdae720a97 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 30 May 2014 10:49:46 -0400 Subject: tracing: Move locking of trace_cmdline_lock into start/stop seq calls With the conversion of the saved_cmdlines output to use seq_read, there is now a race between accessing the values of the saved_cmdlines and the writing to them. The trace_cmdline_lock needs to be taken at the start and stop of the seq calls. A new __trace_find_cmdline() call is created to allow the lookup to happen without taking the lock.
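The shape of the fix, as a minimal sketch (the callback names and the position-lookup helper are illustrative; the real hunks follow below): seq_read() guarantees that ->stop() runs after a successful ->start(), so taking the lock in start and dropping it in stop keeps every ->next()/->show() in between under the lock:

    /* Sketch: lock in ->start(), unlock in ->stop(). */
    static void *cmdlines_start_sketch(struct seq_file *m, loff_t *pos)
    {
            preempt_disable();
            arch_spin_lock(&trace_cmdline_lock);
            return lookup_entry(*pos);      /* hypothetical position lookup */
    }

    static void cmdlines_stop_sketch(struct seq_file *m, void *v)
    {
            arch_spin_unlock(&trace_cmdline_lock);
            preempt_enable();
    }

This is also why the unlocked __trace_find_cmdline() is needed: ->show() already runs between start and stop, so taking the non-recursive arch spinlock again there would deadlock.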
Fixes: 42584c81c5ad "tracing: Have saved_cmdlines use the seq_read infrastructure" Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 22a902e2ded9..626dbfde5d56 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1484,7 +1484,7 @@ static int trace_save_cmdline(struct task_struct *tsk) return 1; } -void trace_find_cmdline(int pid, char comm[]) +static void __trace_find_cmdline(int pid, char comm[]) { unsigned map; @@ -1503,13 +1503,19 @@ void trace_find_cmdline(int pid, char comm[]) return; } - preempt_disable(); - arch_spin_lock(&trace_cmdline_lock); map = map_pid_to_cmdline[pid]; if (map != NO_CMDLINE_MAP) strcpy(comm, saved_cmdlines[map]); else strcpy(comm, "<...>"); +} + +void trace_find_cmdline(int pid, char comm[]) +{ + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + + __trace_find_cmdline(pid, comm); arch_spin_unlock(&trace_cmdline_lock); preempt_enable(); @@ -3724,6 +3730,9 @@ static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos) void *v; loff_t l = 0; + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + v = &map_cmdline_to_pid[0]; while (l <= *pos) { v = saved_cmdlines_next(m, v, &l); @@ -3736,6 +3745,8 @@ static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos) static void saved_cmdlines_stop(struct seq_file *m, void *v) { + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); } static int saved_cmdlines_show(struct seq_file *m, void *v) @@ -3743,7 +3754,7 @@ static int saved_cmdlines_show(struct seq_file *m, void *v) char buf[TASK_COMM_LEN]; unsigned int *pid = v; - trace_find_cmdline(*pid, buf); + __trace_find_cmdline(*pid, buf); seq_printf(m, "%d %s\n", *pid, buf); return 0; } -- cgit From 057b0a7518e4b8fca26201715996d6d928a62300 Mon Sep 17 00:00:00 2001 From: Niv Yehezkel Date: Sat, 31 May 2014 06:26:01 -0400 Subject: PM / hibernate: fixed typo in comment Fix a trivial comment typo (s/mam/map) in kernel/power/swap.c. Signed-off-by: Niv Yehezkel Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 8c9a4819f798..aaa3261dea5d 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -567,7 +567,7 @@ static int lzo_compress_threadfn(void *data) /** * save_image_lzo - Save the suspend image data compressed with LZO. - * @handle: Swap mam handle to use for saving the image. + * @handle: Swap map handle to use for saving the image. * @snapshot: Image to read data from. * @nr_to_write: Number of pages to save. */ -- cgit From 3480593131e0b781287dae0139bf7ccee7cba7ff Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 29 May 2014 10:22:50 +0200 Subject: net: filter: get rid of BPF_S_* enum This patch finally allows us to get rid of the BPF_S_* enum. Currently, the code performs unnecessary encode and decode workarounds in seccomp and filter migration itself when a filter is being attached, in order to overcome the BPF_S_* encoding, which is no longer used by the new interpreter or the JIT compilers. Keeping it around would mean that we would also need to extend and maintain this enum and the related encoders/decoders in the future. We can get rid of all that and save ourselves these operations during filter attach. Naturally, the JIT compilers need to be updated by this as well.
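For readers following the per-arch hunks below: the raw opcodes the JITs now switch on are plain ORed bit fields, and bpf_anc_helper() folds the ancillary loads (BPF_LD | BPF_ABS with a magic negative k) into BPF_ANC | SKF_AD_* values so each JIT can still switch on a single code. A rough sketch of the idea — the field macros are the standard BPF uapi ones, and the helper body here is abbreviated, not the exact patch:

    /* Classic BPF opcode encoding: class, size/op and mode/source fields
     * are ORed together, e.g. BPF_LD | BPF_W | BPF_ABS. */
    #define BPF_CLASS(code) ((code) & 0x07)  /* BPF_LD, BPF_ALU, BPF_JMP, ... */
    #define BPF_SIZE(code)  ((code) & 0x18)  /* BPF_W, BPF_H, BPF_B */
    #define BPF_MODE(code)  ((code) & 0xe0)  /* BPF_ABS, BPF_IND, BPF_MEM, ... */
    #define BPF_OP(code)    ((code) & 0xf0)  /* BPF_ADD, BPF_SUB, ... */
    #define BPF_SRC(code)   ((code) & 0x08)  /* BPF_K or BPF_X */

    /* Abbreviated sketch of the helper's shape. */
    static u16 anc_helper_sketch(const struct sock_filter *f)
    {
            switch (f->code) {
            case BPF_LD | BPF_W | BPF_ABS:
            case BPF_LD | BPF_H | BPF_ABS:
            case BPF_LD | BPF_B | BPF_ABS:
                    switch (f->k) {
                    case SKF_AD_OFF + SKF_AD_PROTOCOL:
                            return BPF_ANC | SKF_AD_PROTOCOL;
                    /* ... one case per SKF_AD_* extension ... */
                    }
                    /* otherwise a normal packet load */
            default:
                    return f->code;
            }
    }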
Before the JIT conversion is done, each compiler checks whether A is loaded at startup to determine whether it needs to emit instructions to clear A first. Since BPF extensions are a subset of BPF_LD | BPF_{W,H,B} | BPF_ABS variants, case statements for extensions can be removed at that point. To ease and minimize code changes in the classic JITs, we have introduced bpf_anc_helper(). Tested with test_bpf on x86_64 (JIT, int), s390x (JIT, int), arm (JIT, int), i386 (int), ppc64 (JIT, int); for sparc we unfortunately didn't have access, but changes are analogous to the rest. Joint work with Alexei Starovoitov. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: Mircea Gherzan Cc: Kees Cook Acked-by: Chema Gonzalez Signed-off-by: David S. Miller --- arch/arm/net/bpf_jit_32.c | 139 ++++++++-------- arch/powerpc/net/bpf_jit_64.S | 2 +- arch/powerpc/net/bpf_jit_comp.c | 157 +++++++++--------- arch/s390/net/bpf_jit_comp.c | 163 +++++++++---------- arch/sparc/net/bpf_jit_comp.c | 154 +++++++++--------- include/linux/filter.h | 108 +++++-------- kernel/seccomp.c | 83 +++++----- net/core/filter.c | 341 +++++++++++++++------------------------- 8 files changed, 498 insertions(+), 649 deletions(-) (limited to 'kernel') diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 6f879c319a9d..fb5503ce016f 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -136,7 +136,7 @@ static u16 saved_regs(struct jit_ctx *ctx) u16 ret = 0; if ((ctx->skf->len > 1) || - (ctx->skf->insns[0].code == BPF_S_RET_A)) + (ctx->skf->insns[0].code == (BPF_RET | BPF_A))) ret |= 1 << r_A; #ifdef CONFIG_FRAME_POINTER @@ -164,18 +164,10 @@ static inline int mem_words_used(struct jit_ctx *ctx) static inline bool is_load_to_a(u16 inst) { switch (inst) { - case BPF_S_LD_W_LEN: - case BPF_S_LD_W_ABS: - case BPF_S_LD_H_ABS: - case BPF_S_LD_B_ABS: - case BPF_S_ANC_CPU: - case BPF_S_ANC_IFINDEX: - case BPF_S_ANC_MARK: - case BPF_S_ANC_PROTOCOL: - case BPF_S_ANC_RXHASH: - case BPF_S_ANC_VLAN_TAG: - case BPF_S_ANC_VLAN_TAG_PRESENT: - case BPF_S_ANC_QUEUE: + case BPF_LD | BPF_W | BPF_LEN: + case BPF_LD | BPF_W | BPF_ABS: + case BPF_LD | BPF_H | BPF_ABS: + case BPF_LD | BPF_B | BPF_ABS: return true; default: return false; @@ -215,7 +207,7 @@ static void build_prologue(struct jit_ctx *ctx) emit(ARM_MOV_I(r_X, 0), ctx); /* do not leak kernel data to userspace */ - if ((first_inst != BPF_S_RET_K) && !(is_load_to_a(first_inst))) + if ((first_inst != (BPF_RET | BPF_K)) && !(is_load_to_a(first_inst))) emit(ARM_MOV_I(r_A, 0), ctx); /* stack space for the BPF_MEM words */ @@ -480,36 +472,39 @@ static int build_body(struct jit_ctx *ctx) u32 k; for (i = 0; i < prog->len; i++) { + u16 code; + inst = &(prog->insns[i]); /* K as an immediate value operand */ k = inst->k; + code = bpf_anc_helper(inst); /* compute offsets only in the fake pass */ if (ctx->target == NULL) ctx->offsets[i] = ctx->idx * 4; - switch (inst->code) { - case BPF_S_LD_IMM: + switch (code) { + case BPF_LD | BPF_IMM: emit_mov_i(r_A, k, ctx); break; - case BPF_S_LD_W_LEN: + case BPF_LD | BPF_W | BPF_LEN: ctx->seen |= SEEN_SKB; BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); emit(ARM_LDR_I(r_A, r_skb, offsetof(struct sk_buff, len)), ctx); break; - case BPF_S_LD_MEM: + case BPF_LD | BPF_MEM: /* A = scratch[k] */ ctx->seen |= SEEN_MEM_WORD(k); emit(ARM_LDR_I(r_A, ARM_SP, SCRATCH_OFF(k)), ctx); break; - case BPF_S_LD_W_ABS: + case BPF_LD | BPF_W | BPF_ABS: load_order = 2;
goto load; - case BPF_S_LD_H_ABS: + case BPF_LD | BPF_H | BPF_ABS: load_order = 1; goto load; - case BPF_S_LD_B_ABS: + case BPF_LD | BPF_B | BPF_ABS: load_order = 0; load: /* the interpreter will deal with the negative K */ @@ -552,31 +547,31 @@ load_common: emit_err_ret(ARM_COND_NE, ctx); emit(ARM_MOV_R(r_A, ARM_R0), ctx); break; - case BPF_S_LD_W_IND: + case BPF_LD | BPF_W | BPF_IND: load_order = 2; goto load_ind; - case BPF_S_LD_H_IND: + case BPF_LD | BPF_H | BPF_IND: load_order = 1; goto load_ind; - case BPF_S_LD_B_IND: + case BPF_LD | BPF_B | BPF_IND: load_order = 0; load_ind: OP_IMM3(ARM_ADD, r_off, r_X, k, ctx); goto load_common; - case BPF_S_LDX_IMM: + case BPF_LDX | BPF_IMM: ctx->seen |= SEEN_X; emit_mov_i(r_X, k, ctx); break; - case BPF_S_LDX_W_LEN: + case BPF_LDX | BPF_W | BPF_LEN: ctx->seen |= SEEN_X | SEEN_SKB; emit(ARM_LDR_I(r_X, r_skb, offsetof(struct sk_buff, len)), ctx); break; - case BPF_S_LDX_MEM: + case BPF_LDX | BPF_MEM: ctx->seen |= SEEN_X | SEEN_MEM_WORD(k); emit(ARM_LDR_I(r_X, ARM_SP, SCRATCH_OFF(k)), ctx); break; - case BPF_S_LDX_B_MSH: + case BPF_LDX | BPF_B | BPF_MSH: /* x = ((*(frame + k)) & 0xf) << 2; */ ctx->seen |= SEEN_X | SEEN_DATA | SEEN_CALL; /* the interpreter should deal with the negative K */ @@ -606,113 +601,113 @@ load_ind: emit(ARM_AND_I(r_X, ARM_R0, 0x00f), ctx); emit(ARM_LSL_I(r_X, r_X, 2), ctx); break; - case BPF_S_ST: + case BPF_ST: ctx->seen |= SEEN_MEM_WORD(k); emit(ARM_STR_I(r_A, ARM_SP, SCRATCH_OFF(k)), ctx); break; - case BPF_S_STX: + case BPF_STX: update_on_xread(ctx); ctx->seen |= SEEN_MEM_WORD(k); emit(ARM_STR_I(r_X, ARM_SP, SCRATCH_OFF(k)), ctx); break; - case BPF_S_ALU_ADD_K: + case BPF_ALU | BPF_ADD | BPF_K: /* A += K */ OP_IMM3(ARM_ADD, r_A, r_A, k, ctx); break; - case BPF_S_ALU_ADD_X: + case BPF_ALU | BPF_ADD | BPF_X: update_on_xread(ctx); emit(ARM_ADD_R(r_A, r_A, r_X), ctx); break; - case BPF_S_ALU_SUB_K: + case BPF_ALU | BPF_SUB | BPF_K: /* A -= K */ OP_IMM3(ARM_SUB, r_A, r_A, k, ctx); break; - case BPF_S_ALU_SUB_X: + case BPF_ALU | BPF_SUB | BPF_X: update_on_xread(ctx); emit(ARM_SUB_R(r_A, r_A, r_X), ctx); break; - case BPF_S_ALU_MUL_K: + case BPF_ALU | BPF_MUL | BPF_K: /* A *= K */ emit_mov_i(r_scratch, k, ctx); emit(ARM_MUL(r_A, r_A, r_scratch), ctx); break; - case BPF_S_ALU_MUL_X: + case BPF_ALU | BPF_MUL | BPF_X: update_on_xread(ctx); emit(ARM_MUL(r_A, r_A, r_X), ctx); break; - case BPF_S_ALU_DIV_K: + case BPF_ALU | BPF_DIV | BPF_K: if (k == 1) break; emit_mov_i(r_scratch, k, ctx); emit_udiv(r_A, r_A, r_scratch, ctx); break; - case BPF_S_ALU_DIV_X: + case BPF_ALU | BPF_DIV | BPF_X: update_on_xread(ctx); emit(ARM_CMP_I(r_X, 0), ctx); emit_err_ret(ARM_COND_EQ, ctx); emit_udiv(r_A, r_A, r_X, ctx); break; - case BPF_S_ALU_OR_K: + case BPF_ALU | BPF_OR | BPF_K: /* A |= K */ OP_IMM3(ARM_ORR, r_A, r_A, k, ctx); break; - case BPF_S_ALU_OR_X: + case BPF_ALU | BPF_OR | BPF_X: update_on_xread(ctx); emit(ARM_ORR_R(r_A, r_A, r_X), ctx); break; - case BPF_S_ALU_XOR_K: + case BPF_ALU | BPF_XOR | BPF_K: /* A ^= K; */ OP_IMM3(ARM_EOR, r_A, r_A, k, ctx); break; - case BPF_S_ANC_ALU_XOR_X: - case BPF_S_ALU_XOR_X: + case BPF_ANC | SKF_AD_ALU_XOR_X: + case BPF_ALU | BPF_XOR | BPF_X: /* A ^= X */ update_on_xread(ctx); emit(ARM_EOR_R(r_A, r_A, r_X), ctx); break; - case BPF_S_ALU_AND_K: + case BPF_ALU | BPF_AND | BPF_K: /* A &= K */ OP_IMM3(ARM_AND, r_A, r_A, k, ctx); break; - case BPF_S_ALU_AND_X: + case BPF_ALU | BPF_AND | BPF_X: update_on_xread(ctx); emit(ARM_AND_R(r_A, r_A, r_X), ctx); break; - case BPF_S_ALU_LSH_K: + case BPF_ALU | BPF_LSH | 
BPF_K: if (unlikely(k > 31)) return -1; emit(ARM_LSL_I(r_A, r_A, k), ctx); break; - case BPF_S_ALU_LSH_X: + case BPF_ALU | BPF_LSH | BPF_X: update_on_xread(ctx); emit(ARM_LSL_R(r_A, r_A, r_X), ctx); break; - case BPF_S_ALU_RSH_K: + case BPF_ALU | BPF_RSH | BPF_K: if (unlikely(k > 31)) return -1; emit(ARM_LSR_I(r_A, r_A, k), ctx); break; - case BPF_S_ALU_RSH_X: + case BPF_ALU | BPF_RSH | BPF_X: update_on_xread(ctx); emit(ARM_LSR_R(r_A, r_A, r_X), ctx); break; - case BPF_S_ALU_NEG: + case BPF_ALU | BPF_NEG: /* A = -A */ emit(ARM_RSB_I(r_A, r_A, 0), ctx); break; - case BPF_S_JMP_JA: + case BPF_JMP | BPF_JA: /* pc += K */ emit(ARM_B(b_imm(i + k + 1, ctx)), ctx); break; - case BPF_S_JMP_JEQ_K: + case BPF_JMP | BPF_JEQ | BPF_K: /* pc += (A == K) ? pc->jt : pc->jf */ condt = ARM_COND_EQ; goto cmp_imm; - case BPF_S_JMP_JGT_K: + case BPF_JMP | BPF_JGT | BPF_K: /* pc += (A > K) ? pc->jt : pc->jf */ condt = ARM_COND_HI; goto cmp_imm; - case BPF_S_JMP_JGE_K: + case BPF_JMP | BPF_JGE | BPF_K: /* pc += (A >= K) ? pc->jt : pc->jf */ condt = ARM_COND_HS; cmp_imm: @@ -731,22 +726,22 @@ cond_jump: _emit(condt ^ 1, ARM_B(b_imm(i + inst->jf + 1, ctx)), ctx); break; - case BPF_S_JMP_JEQ_X: + case BPF_JMP | BPF_JEQ | BPF_X: /* pc += (A == X) ? pc->jt : pc->jf */ condt = ARM_COND_EQ; goto cmp_x; - case BPF_S_JMP_JGT_X: + case BPF_JMP | BPF_JGT | BPF_X: /* pc += (A > X) ? pc->jt : pc->jf */ condt = ARM_COND_HI; goto cmp_x; - case BPF_S_JMP_JGE_X: + case BPF_JMP | BPF_JGE | BPF_X: /* pc += (A >= X) ? pc->jt : pc->jf */ condt = ARM_COND_CS; cmp_x: update_on_xread(ctx); emit(ARM_CMP_R(r_A, r_X), ctx); goto cond_jump; - case BPF_S_JMP_JSET_K: + case BPF_JMP | BPF_JSET | BPF_K: /* pc += (A & K) ? pc->jt : pc->jf */ condt = ARM_COND_NE; /* not set iff all zeroes iff Z==1 iff EQ */ @@ -759,16 +754,16 @@ cmp_x: emit(ARM_TST_I(r_A, imm12), ctx); } goto cond_jump; - case BPF_S_JMP_JSET_X: + case BPF_JMP | BPF_JSET | BPF_X: /* pc += (A & X) ? 
pc->jt : pc->jf */ update_on_xread(ctx); condt = ARM_COND_NE; emit(ARM_TST_R(r_A, r_X), ctx); goto cond_jump; - case BPF_S_RET_A: + case BPF_RET | BPF_A: emit(ARM_MOV_R(ARM_R0, r_A), ctx); goto b_epilogue; - case BPF_S_RET_K: + case BPF_RET | BPF_K: if ((k == 0) && (ctx->ret0_fp_idx < 0)) ctx->ret0_fp_idx = i; emit_mov_i(ARM_R0, k, ctx); @@ -776,17 +771,17 @@ b_epilogue: if (i != ctx->skf->len - 1) emit(ARM_B(b_imm(prog->len, ctx)), ctx); break; - case BPF_S_MISC_TAX: + case BPF_MISC | BPF_TAX: /* X = A */ ctx->seen |= SEEN_X; emit(ARM_MOV_R(r_X, r_A), ctx); break; - case BPF_S_MISC_TXA: + case BPF_MISC | BPF_TXA: /* A = X */ update_on_xread(ctx); emit(ARM_MOV_R(r_A, r_X), ctx); break; - case BPF_S_ANC_PROTOCOL: + case BPF_ANC | SKF_AD_PROTOCOL: /* A = ntohs(skb->protocol) */ ctx->seen |= SEEN_SKB; BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, @@ -795,7 +790,7 @@ b_epilogue: emit(ARM_LDRH_I(r_scratch, r_skb, off), ctx); emit_swap16(r_A, r_scratch, ctx); break; - case BPF_S_ANC_CPU: + case BPF_ANC | SKF_AD_CPU: /* r_scratch = current_thread_info() */ OP_IMM3(ARM_BIC, r_scratch, ARM_SP, THREAD_SIZE - 1, ctx); /* A = current_thread_info()->cpu */ @@ -803,7 +798,7 @@ b_epilogue: off = offsetof(struct thread_info, cpu); emit(ARM_LDR_I(r_A, r_scratch, off), ctx); break; - case BPF_S_ANC_IFINDEX: + case BPF_ANC | SKF_AD_IFINDEX: /* A = skb->dev->ifindex */ ctx->seen |= SEEN_SKB; off = offsetof(struct sk_buff, dev); @@ -817,30 +812,30 @@ b_epilogue: off = offsetof(struct net_device, ifindex); emit(ARM_LDR_I(r_A, r_scratch, off), ctx); break; - case BPF_S_ANC_MARK: + case BPF_ANC | SKF_AD_MARK: ctx->seen |= SEEN_SKB; BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); off = offsetof(struct sk_buff, mark); emit(ARM_LDR_I(r_A, r_skb, off), ctx); break; - case BPF_S_ANC_RXHASH: + case BPF_ANC | SKF_AD_RXHASH: ctx->seen |= SEEN_SKB; BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); off = offsetof(struct sk_buff, hash); emit(ARM_LDR_I(r_A, r_skb, off), ctx); break; - case BPF_S_ANC_VLAN_TAG: - case BPF_S_ANC_VLAN_TAG_PRESENT: + case BPF_ANC | SKF_AD_VLAN_TAG: + case BPF_ANC | SKF_AD_VLAN_TAG_PRESENT: ctx->seen |= SEEN_SKB; BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); off = offsetof(struct sk_buff, vlan_tci); emit(ARM_LDRH_I(r_A, r_skb, off), ctx); - if (inst->code == BPF_S_ANC_VLAN_TAG) + if (code == (BPF_ANC | SKF_AD_VLAN_TAG)) OP_IMM3(ARM_AND, r_A, r_A, VLAN_VID_MASK, ctx); else OP_IMM3(ARM_AND, r_A, r_A, VLAN_TAG_PRESENT, ctx); break; - case BPF_S_ANC_QUEUE: + case BPF_ANC | SKF_AD_QUEUE: ctx->seen |= SEEN_SKB; BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); diff --git a/arch/powerpc/net/bpf_jit_64.S b/arch/powerpc/net/bpf_jit_64.S index e76eba74d9da..8f87d9217122 100644 --- a/arch/powerpc/net/bpf_jit_64.S +++ b/arch/powerpc/net/bpf_jit_64.S @@ -78,7 +78,7 @@ sk_load_byte_positive_offset: blr /* - * BPF_S_LDX_B_MSH: ldxb 4*([offset]&0xf) + * BPF_LDX | BPF_B | BPF_MSH: ldxb 4*([offset]&0xf) * r_addr is the offset value */ .globl sk_load_byte_msh diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 808ce1cae21a..6dcdadefd8d0 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -79,19 +79,11 @@ static void bpf_jit_build_prologue(struct sk_filter *fp, u32 *image, } switch (filter[0].code) { - case BPF_S_RET_K: - case BPF_S_LD_W_LEN: - case BPF_S_ANC_PROTOCOL: - case BPF_S_ANC_IFINDEX: - case BPF_S_ANC_MARK: - case BPF_S_ANC_RXHASH: - case BPF_S_ANC_VLAN_TAG: - case BPF_S_ANC_VLAN_TAG_PRESENT: - case 
BPF_S_ANC_CPU: - case BPF_S_ANC_QUEUE: - case BPF_S_LD_W_ABS: - case BPF_S_LD_H_ABS: - case BPF_S_LD_B_ABS: + case BPF_RET | BPF_K: + case BPF_LD | BPF_W | BPF_LEN: + case BPF_LD | BPF_W | BPF_ABS: + case BPF_LD | BPF_H | BPF_ABS: + case BPF_LD | BPF_B | BPF_ABS: /* first instruction sets A register (or is RET 'constant') */ break; default: @@ -144,6 +136,7 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image, for (i = 0; i < flen; i++) { unsigned int K = filter[i].k; + u16 code = bpf_anc_helper(&filter[i]); /* * addrs[] maps a BPF bytecode address into a real offset from @@ -151,35 +144,35 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image, */ addrs[i] = ctx->idx * 4; - switch (filter[i].code) { + switch (code) { /*** ALU ops ***/ - case BPF_S_ALU_ADD_X: /* A += X; */ + case BPF_ALU | BPF_ADD | BPF_X: /* A += X; */ ctx->seen |= SEEN_XREG; PPC_ADD(r_A, r_A, r_X); break; - case BPF_S_ALU_ADD_K: /* A += K; */ + case BPF_ALU | BPF_ADD | BPF_K: /* A += K; */ if (!K) break; PPC_ADDI(r_A, r_A, IMM_L(K)); if (K >= 32768) PPC_ADDIS(r_A, r_A, IMM_HA(K)); break; - case BPF_S_ALU_SUB_X: /* A -= X; */ + case BPF_ALU | BPF_SUB | BPF_X: /* A -= X; */ ctx->seen |= SEEN_XREG; PPC_SUB(r_A, r_A, r_X); break; - case BPF_S_ALU_SUB_K: /* A -= K */ + case BPF_ALU | BPF_SUB | BPF_K: /* A -= K */ if (!K) break; PPC_ADDI(r_A, r_A, IMM_L(-K)); if (K >= 32768) PPC_ADDIS(r_A, r_A, IMM_HA(-K)); break; - case BPF_S_ALU_MUL_X: /* A *= X; */ + case BPF_ALU | BPF_MUL | BPF_X: /* A *= X; */ ctx->seen |= SEEN_XREG; PPC_MUL(r_A, r_A, r_X); break; - case BPF_S_ALU_MUL_K: /* A *= K */ + case BPF_ALU | BPF_MUL | BPF_K: /* A *= K */ if (K < 32768) PPC_MULI(r_A, r_A, K); else { @@ -187,7 +180,7 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image, PPC_MUL(r_A, r_A, r_scratch1); } break; - case BPF_S_ALU_MOD_X: /* A %= X; */ + case BPF_ALU | BPF_MOD | BPF_X: /* A %= X; */ ctx->seen |= SEEN_XREG; PPC_CMPWI(r_X, 0); if (ctx->pc_ret0 != -1) { @@ -201,13 +194,13 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image, PPC_MUL(r_scratch1, r_X, r_scratch1); PPC_SUB(r_A, r_A, r_scratch1); break; - case BPF_S_ALU_MOD_K: /* A %= K; */ + case BPF_ALU | BPF_MOD | BPF_K: /* A %= K; */ PPC_LI32(r_scratch2, K); PPC_DIVWU(r_scratch1, r_A, r_scratch2); PPC_MUL(r_scratch1, r_scratch2, r_scratch1); PPC_SUB(r_A, r_A, r_scratch1); break; - case BPF_S_ALU_DIV_X: /* A /= X; */ + case BPF_ALU | BPF_DIV | BPF_X: /* A /= X; */ ctx->seen |= SEEN_XREG; PPC_CMPWI(r_X, 0); if (ctx->pc_ret0 != -1) { @@ -223,17 +216,17 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image, } PPC_DIVWU(r_A, r_A, r_X); break; - case BPF_S_ALU_DIV_K: /* A /= K */ + case BPF_ALU | BPF_DIV | BPF_K: /* A /= K */ if (K == 1) break; PPC_LI32(r_scratch1, K); PPC_DIVWU(r_A, r_A, r_scratch1); break; - case BPF_S_ALU_AND_X: + case BPF_ALU | BPF_AND | BPF_X: ctx->seen |= SEEN_XREG; PPC_AND(r_A, r_A, r_X); break; - case BPF_S_ALU_AND_K: + case BPF_ALU | BPF_AND | BPF_K: if (!IMM_H(K)) PPC_ANDI(r_A, r_A, K); else { @@ -241,51 +234,51 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image, PPC_AND(r_A, r_A, r_scratch1); } break; - case BPF_S_ALU_OR_X: + case BPF_ALU | BPF_OR | BPF_X: ctx->seen |= SEEN_XREG; PPC_OR(r_A, r_A, r_X); break; - case BPF_S_ALU_OR_K: + case BPF_ALU | BPF_OR | BPF_K: if (IMM_L(K)) PPC_ORI(r_A, r_A, IMM_L(K)); if (K >= 65536) PPC_ORIS(r_A, r_A, IMM_H(K)); break; - case BPF_S_ANC_ALU_XOR_X: - case BPF_S_ALU_XOR_X: /* A ^= X */ + case BPF_ANC | SKF_AD_ALU_XOR_X: + case BPF_ALU | BPF_XOR | BPF_X: /* 
A ^= X */ ctx->seen |= SEEN_XREG; PPC_XOR(r_A, r_A, r_X); break; - case BPF_S_ALU_XOR_K: /* A ^= K */ + case BPF_ALU | BPF_XOR | BPF_K: /* A ^= K */ if (IMM_L(K)) PPC_XORI(r_A, r_A, IMM_L(K)); if (K >= 65536) PPC_XORIS(r_A, r_A, IMM_H(K)); break; - case BPF_S_ALU_LSH_X: /* A <<= X; */ + case BPF_ALU | BPF_LSH | BPF_X: /* A <<= X; */ ctx->seen |= SEEN_XREG; PPC_SLW(r_A, r_A, r_X); break; - case BPF_S_ALU_LSH_K: + case BPF_ALU | BPF_LSH | BPF_K: if (K == 0) break; else PPC_SLWI(r_A, r_A, K); break; - case BPF_S_ALU_RSH_X: /* A >>= X; */ + case BPF_ALU | BPF_RSH | BPF_X: /* A >>= X; */ ctx->seen |= SEEN_XREG; PPC_SRW(r_A, r_A, r_X); break; - case BPF_S_ALU_RSH_K: /* A >>= K; */ + case BPF_ALU | BPF_RSH | BPF_K: /* A >>= K; */ if (K == 0) break; else PPC_SRWI(r_A, r_A, K); break; - case BPF_S_ALU_NEG: + case BPF_ALU | BPF_NEG: PPC_NEG(r_A, r_A); break; - case BPF_S_RET_K: + case BPF_RET | BPF_K: PPC_LI32(r_ret, K); if (!K) { if (ctx->pc_ret0 == -1) @@ -312,7 +305,7 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image, PPC_BLR(); } break; - case BPF_S_RET_A: + case BPF_RET | BPF_A: PPC_MR(r_ret, r_A); if (i != flen - 1) { if (ctx->seen) @@ -321,53 +314,53 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image, PPC_BLR(); } break; - case BPF_S_MISC_TAX: /* X = A */ + case BPF_MISC | BPF_TAX: /* X = A */ PPC_MR(r_X, r_A); break; - case BPF_S_MISC_TXA: /* A = X */ + case BPF_MISC | BPF_TXA: /* A = X */ ctx->seen |= SEEN_XREG; PPC_MR(r_A, r_X); break; /*** Constant loads/M[] access ***/ - case BPF_S_LD_IMM: /* A = K */ + case BPF_LD | BPF_IMM: /* A = K */ PPC_LI32(r_A, K); break; - case BPF_S_LDX_IMM: /* X = K */ + case BPF_LDX | BPF_IMM: /* X = K */ PPC_LI32(r_X, K); break; - case BPF_S_LD_MEM: /* A = mem[K] */ + case BPF_LD | BPF_MEM: /* A = mem[K] */ PPC_MR(r_A, r_M + (K & 0xf)); ctx->seen |= SEEN_MEM | (1<<(K & 0xf)); break; - case BPF_S_LDX_MEM: /* X = mem[K] */ + case BPF_LDX | BPF_MEM: /* X = mem[K] */ PPC_MR(r_X, r_M + (K & 0xf)); ctx->seen |= SEEN_MEM | (1<<(K & 0xf)); break; - case BPF_S_ST: /* mem[K] = A */ + case BPF_ST: /* mem[K] = A */ PPC_MR(r_M + (K & 0xf), r_A); ctx->seen |= SEEN_MEM | (1<<(K & 0xf)); break; - case BPF_S_STX: /* mem[K] = X */ + case BPF_STX: /* mem[K] = X */ PPC_MR(r_M + (K & 0xf), r_X); ctx->seen |= SEEN_XREG | SEEN_MEM | (1<<(K & 0xf)); break; - case BPF_S_LD_W_LEN: /* A = skb->len; */ + case BPF_LD | BPF_W | BPF_LEN: /* A = skb->len; */ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); PPC_LWZ_OFFS(r_A, r_skb, offsetof(struct sk_buff, len)); break; - case BPF_S_LDX_W_LEN: /* X = skb->len; */ + case BPF_LDX | BPF_W | BPF_LEN: /* X = skb->len; */ PPC_LWZ_OFFS(r_X, r_skb, offsetof(struct sk_buff, len)); break; /*** Ancillary info loads ***/ - case BPF_S_ANC_PROTOCOL: /* A = ntohs(skb->protocol); */ + case BPF_ANC | SKF_AD_PROTOCOL: /* A = ntohs(skb->protocol); */ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); PPC_NTOHS_OFFS(r_A, r_skb, offsetof(struct sk_buff, protocol)); break; - case BPF_S_ANC_IFINDEX: + case BPF_ANC | SKF_AD_IFINDEX: PPC_LD_OFFS(r_scratch1, r_skb, offsetof(struct sk_buff, dev)); PPC_CMPDI(r_scratch1, 0); @@ -384,33 +377,33 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image, PPC_LWZ_OFFS(r_A, r_scratch1, offsetof(struct net_device, ifindex)); break; - case BPF_S_ANC_MARK: + case BPF_ANC | SKF_AD_MARK: BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); PPC_LWZ_OFFS(r_A, r_skb, offsetof(struct sk_buff, mark)); break; - case BPF_S_ANC_RXHASH: + case BPF_ANC | SKF_AD_RXHASH: 
BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); PPC_LWZ_OFFS(r_A, r_skb, offsetof(struct sk_buff, hash)); break; - case BPF_S_ANC_VLAN_TAG: - case BPF_S_ANC_VLAN_TAG_PRESENT: + case BPF_ANC | SKF_AD_VLAN_TAG: + case BPF_ANC | SKF_AD_VLAN_TAG_PRESENT: BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); PPC_LHZ_OFFS(r_A, r_skb, offsetof(struct sk_buff, vlan_tci)); - if (filter[i].code == BPF_S_ANC_VLAN_TAG) + if (code == (BPF_ANC | SKF_AD_VLAN_TAG)) PPC_ANDI(r_A, r_A, VLAN_VID_MASK); else PPC_ANDI(r_A, r_A, VLAN_TAG_PRESENT); break; - case BPF_S_ANC_QUEUE: + case BPF_ANC | SKF_AD_QUEUE: BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); PPC_LHZ_OFFS(r_A, r_skb, offsetof(struct sk_buff, queue_mapping)); break; - case BPF_S_ANC_CPU: + case BPF_ANC | SKF_AD_CPU: #ifdef CONFIG_SMP /* * PACA ptr is r13: @@ -426,13 +419,13 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image, break; /*** Absolute loads from packet header/data ***/ - case BPF_S_LD_W_ABS: + case BPF_LD | BPF_W | BPF_ABS: func = CHOOSE_LOAD_FUNC(K, sk_load_word); goto common_load; - case BPF_S_LD_H_ABS: + case BPF_LD | BPF_H | BPF_ABS: func = CHOOSE_LOAD_FUNC(K, sk_load_half); goto common_load; - case BPF_S_LD_B_ABS: + case BPF_LD | BPF_B | BPF_ABS: func = CHOOSE_LOAD_FUNC(K, sk_load_byte); common_load: /* Load from [K]. */ @@ -449,13 +442,13 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image, break; /*** Indirect loads from packet header/data ***/ - case BPF_S_LD_W_IND: + case BPF_LD | BPF_W | BPF_IND: func = sk_load_word; goto common_load_ind; - case BPF_S_LD_H_IND: + case BPF_LD | BPF_H | BPF_IND: func = sk_load_half; goto common_load_ind; - case BPF_S_LD_B_IND: + case BPF_LD | BPF_B | BPF_IND: func = sk_load_byte; common_load_ind: /* @@ -473,31 +466,31 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image, PPC_BCC(COND_LT, exit_addr); break; - case BPF_S_LDX_B_MSH: + case BPF_LDX | BPF_B | BPF_MSH: func = CHOOSE_LOAD_FUNC(K, sk_load_byte_msh); goto common_load; break; /*** Jump and branches ***/ - case BPF_S_JMP_JA: + case BPF_JMP | BPF_JA: if (K != 0) PPC_JMP(addrs[i + 1 + K]); break; - case BPF_S_JMP_JGT_K: - case BPF_S_JMP_JGT_X: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGT | BPF_X: true_cond = COND_GT; goto cond_branch; - case BPF_S_JMP_JGE_K: - case BPF_S_JMP_JGE_X: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JGE | BPF_X: true_cond = COND_GE; goto cond_branch; - case BPF_S_JMP_JEQ_K: - case BPF_S_JMP_JEQ_X: + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JEQ | BPF_X: true_cond = COND_EQ; goto cond_branch; - case BPF_S_JMP_JSET_K: - case BPF_S_JMP_JSET_X: + case BPF_JMP | BPF_JSET | BPF_K: + case BPF_JMP | BPF_JSET | BPF_X: true_cond = COND_NE; /* Fall through */ cond_branch: @@ -508,20 +501,20 @@ static int bpf_jit_build_body(struct sk_filter *fp, u32 *image, break; } - switch (filter[i].code) { - case BPF_S_JMP_JGT_X: - case BPF_S_JMP_JGE_X: - case BPF_S_JMP_JEQ_X: + switch (code) { + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JEQ | BPF_X: ctx->seen |= SEEN_XREG; PPC_CMPLW(r_A, r_X); break; - case BPF_S_JMP_JSET_X: + case BPF_JMP | BPF_JSET | BPF_X: ctx->seen |= SEEN_XREG; PPC_AND_DOT(r_scratch1, r_A, r_X); break; - case BPF_S_JMP_JEQ_K: - case BPF_S_JMP_JGT_K: - case BPF_S_JMP_JGE_K: + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGE | BPF_K: if (K < 32768) PPC_CMPLWI(r_A, K); else { @@ -529,7 +522,7 @@ static int bpf_jit_build_body(struct 
sk_filter *fp, u32 *image, PPC_CMPLW(r_A, r_scratch1); } break; - case BPF_S_JMP_JSET_K: + case BPF_JMP | BPF_JSET | BPF_K: if (K < 32768) /* PPC_ANDI is /only/ dot-form */ PPC_ANDI(r_scratch1, r_A, K); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index e9f8fa9337fe..a2cbd875543a 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -269,27 +269,17 @@ static void bpf_jit_noleaks(struct bpf_jit *jit, struct sock_filter *filter) EMIT4(0xa7c80000); /* Clear A if the first register does not set it. */ switch (filter[0].code) { - case BPF_S_LD_W_ABS: - case BPF_S_LD_H_ABS: - case BPF_S_LD_B_ABS: - case BPF_S_LD_W_LEN: - case BPF_S_LD_W_IND: - case BPF_S_LD_H_IND: - case BPF_S_LD_B_IND: - case BPF_S_LD_IMM: - case BPF_S_LD_MEM: - case BPF_S_MISC_TXA: - case BPF_S_ANC_PROTOCOL: - case BPF_S_ANC_PKTTYPE: - case BPF_S_ANC_IFINDEX: - case BPF_S_ANC_MARK: - case BPF_S_ANC_QUEUE: - case BPF_S_ANC_HATYPE: - case BPF_S_ANC_RXHASH: - case BPF_S_ANC_CPU: - case BPF_S_ANC_VLAN_TAG: - case BPF_S_ANC_VLAN_TAG_PRESENT: - case BPF_S_RET_K: + case BPF_LD | BPF_W | BPF_ABS: + case BPF_LD | BPF_H | BPF_ABS: + case BPF_LD | BPF_B | BPF_ABS: + case BPF_LD | BPF_W | BPF_LEN: + case BPF_LD | BPF_W | BPF_IND: + case BPF_LD | BPF_H | BPF_IND: + case BPF_LD | BPF_B | BPF_IND: + case BPF_LD | BPF_IMM: + case BPF_LD | BPF_MEM: + case BPF_MISC | BPF_TXA: + case BPF_RET | BPF_K: /* first instruction sets A register */ break; default: /* A = 0 */ @@ -304,15 +294,18 @@ static int bpf_jit_insn(struct bpf_jit *jit, struct sock_filter *filter, unsigned int K; int offset; unsigned int mask; + u16 code; K = filter->k; - switch (filter->code) { - case BPF_S_ALU_ADD_X: /* A += X */ + code = bpf_anc_helper(filter); + + switch (code) { + case BPF_ALU | BPF_ADD | BPF_X: /* A += X */ jit->seen |= SEEN_XREG; /* ar %r5,%r12 */ EMIT2(0x1a5c); break; - case BPF_S_ALU_ADD_K: /* A += K */ + case BPF_ALU | BPF_ADD | BPF_K: /* A += K */ if (!K) break; if (K <= 16383) @@ -325,12 +318,12 @@ static int bpf_jit_insn(struct bpf_jit *jit, struct sock_filter *filter, /* a %r5,(%r13) */ EMIT4_DISP(0x5a50d000, EMIT_CONST(K)); break; - case BPF_S_ALU_SUB_X: /* A -= X */ + case BPF_ALU | BPF_SUB | BPF_X: /* A -= X */ jit->seen |= SEEN_XREG; /* sr %r5,%r12 */ EMIT2(0x1b5c); break; - case BPF_S_ALU_SUB_K: /* A -= K */ + case BPF_ALU | BPF_SUB | BPF_K: /* A -= K */ if (!K) break; if (K <= 16384) @@ -343,12 +336,12 @@ static int bpf_jit_insn(struct bpf_jit *jit, struct sock_filter *filter, /* s %r5,(%r13) */ EMIT4_DISP(0x5b50d000, EMIT_CONST(K)); break; - case BPF_S_ALU_MUL_X: /* A *= X */ + case BPF_ALU | BPF_MUL | BPF_X: /* A *= X */ jit->seen |= SEEN_XREG; /* msr %r5,%r12 */ EMIT4(0xb252005c); break; - case BPF_S_ALU_MUL_K: /* A *= K */ + case BPF_ALU | BPF_MUL | BPF_K: /* A *= K */ if (K <= 16383) /* mhi %r5,K */ EMIT4_IMM(0xa75c0000, K); @@ -359,7 +352,7 @@ static int bpf_jit_insn(struct bpf_jit *jit, struct sock_filter *filter, /* ms %r5,(%r13) */ EMIT4_DISP(0x7150d000, EMIT_CONST(K)); break; - case BPF_S_ALU_DIV_X: /* A /= X */ + case BPF_ALU | BPF_DIV | BPF_X: /* A /= X */ jit->seen |= SEEN_XREG | SEEN_RET0; /* ltr %r12,%r12 */ EMIT2(0x12cc); @@ -370,7 +363,7 @@ static int bpf_jit_insn(struct bpf_jit *jit, struct sock_filter *filter, /* dlr %r4,%r12 */ EMIT4(0xb997004c); break; - case BPF_S_ALU_DIV_K: /* A /= K */ + case BPF_ALU | BPF_DIV | BPF_K: /* A /= K */ if (K == 1) break; /* lhi %r4,0 */ @@ -378,7 +371,7 @@ static int bpf_jit_insn(struct bpf_jit *jit, struct sock_filter *filter, /* dl 
%r4,(%r13) */ EMIT6_DISP(0xe340d000, 0x0097, EMIT_CONST(K)); break; - case BPF_S_ALU_MOD_X: /* A %= X */ + case BPF_ALU | BPF_MOD | BPF_X: /* A %= X */ jit->seen |= SEEN_XREG | SEEN_RET0; /* ltr %r12,%r12 */ EMIT2(0x12cc); @@ -391,7 +384,7 @@ static int bpf_jit_insn(struct bpf_jit *jit, struct sock_filter *filter, /* lr %r5,%r4 */ EMIT2(0x1854); break; - case BPF_S_ALU_MOD_K: /* A %= K */ + case BPF_ALU | BPF_MOD | BPF_K: /* A %= K */ if (K == 1) { /* lhi %r5,0 */ EMIT4(0xa7580000); @@ -404,12 +397,12 @@ static int bpf_jit_insn(struct bpf_jit *jit, struct sock_filter *filter, /* lr %r5,%r4 */ EMIT2(0x1854); break; - case BPF_S_ALU_AND_X: /* A &= X */ + case BPF_ALU | BPF_AND | BPF_X: /* A &= X */ jit->seen |= SEEN_XREG; /* nr %r5,%r12 */ EMIT2(0x145c); break; - case BPF_S_ALU_AND_K: /* A &= K */ + case BPF_ALU | BPF_AND | BPF_K: /* A &= K */ if (test_facility(21)) /* nilf %r5, */ EMIT6_IMM(0xc05b0000, K); @@ -417,12 +410,12 @@ static int bpf_jit_insn(struct bpf_jit *jit, struct sock_filter *filter, /* n %r5,(%r13) */ EMIT4_DISP(0x5450d000, EMIT_CONST(K)); break; - case BPF_S_ALU_OR_X: /* A |= X */ + case BPF_ALU | BPF_OR | BPF_X: /* A |= X */ jit->seen |= SEEN_XREG; /* or %r5,%r12 */ EMIT2(0x165c); break; - case BPF_S_ALU_OR_K: /* A |= K */ + case BPF_ALU | BPF_OR | BPF_K: /* A |= K */ if (test_facility(21)) /* oilf %r5, */ EMIT6_IMM(0xc05d0000, K); @@ -430,55 +423,55 @@ static int bpf_jit_insn(struct bpf_jit *jit, struct sock_filter *filter, /* o %r5,(%r13) */ EMIT4_DISP(0x5650d000, EMIT_CONST(K)); break; - case BPF_S_ANC_ALU_XOR_X: /* A ^= X; */ - case BPF_S_ALU_XOR_X: + case BPF_ANC | SKF_AD_ALU_XOR_X: /* A ^= X; */ + case BPF_ALU | BPF_XOR | BPF_X: jit->seen |= SEEN_XREG; /* xr %r5,%r12 */ EMIT2(0x175c); break; - case BPF_S_ALU_XOR_K: /* A ^= K */ + case BPF_ALU | BPF_XOR | BPF_K: /* A ^= K */ if (!K) break; /* x %r5,(%r13) */ EMIT4_DISP(0x5750d000, EMIT_CONST(K)); break; - case BPF_S_ALU_LSH_X: /* A <<= X; */ + case BPF_ALU | BPF_LSH | BPF_X: /* A <<= X; */ jit->seen |= SEEN_XREG; /* sll %r5,0(%r12) */ EMIT4(0x8950c000); break; - case BPF_S_ALU_LSH_K: /* A <<= K */ + case BPF_ALU | BPF_LSH | BPF_K: /* A <<= K */ if (K == 0) break; /* sll %r5,K */ EMIT4_DISP(0x89500000, K); break; - case BPF_S_ALU_RSH_X: /* A >>= X; */ + case BPF_ALU | BPF_RSH | BPF_X: /* A >>= X; */ jit->seen |= SEEN_XREG; /* srl %r5,0(%r12) */ EMIT4(0x8850c000); break; - case BPF_S_ALU_RSH_K: /* A >>= K; */ + case BPF_ALU | BPF_RSH | BPF_K: /* A >>= K; */ if (K == 0) break; /* srl %r5,K */ EMIT4_DISP(0x88500000, K); break; - case BPF_S_ALU_NEG: /* A = -A */ + case BPF_ALU | BPF_NEG: /* A = -A */ /* lnr %r5,%r5 */ EMIT2(0x1155); break; - case BPF_S_JMP_JA: /* ip += K */ + case BPF_JMP | BPF_JA: /* ip += K */ offset = addrs[i + K] + jit->start - jit->prg; EMIT4_PCREL(0xa7f40000, offset); break; - case BPF_S_JMP_JGT_K: /* ip += (A > K) ? jt : jf */ + case BPF_JMP | BPF_JGT | BPF_K: /* ip += (A > K) ? jt : jf */ mask = 0x200000; /* jh */ goto kbranch; - case BPF_S_JMP_JGE_K: /* ip += (A >= K) ? jt : jf */ + case BPF_JMP | BPF_JGE | BPF_K: /* ip += (A >= K) ? jt : jf */ mask = 0xa00000; /* jhe */ goto kbranch; - case BPF_S_JMP_JEQ_K: /* ip += (A == K) ? jt : jf */ + case BPF_JMP | BPF_JEQ | BPF_K: /* ip += (A == K) ? jt : jf */ mask = 0x800000; /* je */ kbranch: /* Emit compare if the branch targets are different */ if (filter->jt != filter->jf) { @@ -511,7 +504,7 @@ branch: if (filter->jt == filter->jf) { EMIT4_PCREL(0xa7040000 | (mask ^ 0xf00000), offset); } break; - case BPF_S_JMP_JSET_K: /* ip += (A & K) ? 
jt : jf */ + case BPF_JMP | BPF_JSET | BPF_K: /* ip += (A & K) ? jt : jf */ mask = 0x700000; /* jnz */ /* Emit test if the branch targets are different */ if (filter->jt != filter->jf) { @@ -525,13 +518,13 @@ branch: if (filter->jt == filter->jf) { EMIT4_IMM(0xa7510000, K); } goto branch; - case BPF_S_JMP_JGT_X: /* ip += (A > X) ? jt : jf */ + case BPF_JMP | BPF_JGT | BPF_X: /* ip += (A > X) ? jt : jf */ mask = 0x200000; /* jh */ goto xbranch; - case BPF_S_JMP_JGE_X: /* ip += (A >= X) ? jt : jf */ + case BPF_JMP | BPF_JGE | BPF_X: /* ip += (A >= X) ? jt : jf */ mask = 0xa00000; /* jhe */ goto xbranch; - case BPF_S_JMP_JEQ_X: /* ip += (A == X) ? jt : jf */ + case BPF_JMP | BPF_JEQ | BPF_X: /* ip += (A == X) ? jt : jf */ mask = 0x800000; /* je */ xbranch: /* Emit compare if the branch targets are different */ if (filter->jt != filter->jf) { @@ -540,7 +533,7 @@ xbranch: /* Emit compare if the branch targets are different */ EMIT2(0x195c); } goto branch; - case BPF_S_JMP_JSET_X: /* ip += (A & X) ? jt : jf */ + case BPF_JMP | BPF_JSET | BPF_X: /* ip += (A & X) ? jt : jf */ mask = 0x700000; /* jnz */ /* Emit test if the branch targets are different */ if (filter->jt != filter->jf) { @@ -551,15 +544,15 @@ xbranch: /* Emit compare if the branch targets are different */ EMIT2(0x144c); } goto branch; - case BPF_S_LD_W_ABS: /* A = *(u32 *) (skb->data+K) */ + case BPF_LD | BPF_W | BPF_ABS: /* A = *(u32 *) (skb->data+K) */ jit->seen |= SEEN_DATAREF | SEEN_RET0 | SEEN_LOAD_WORD; offset = jit->off_load_word; goto load_abs; - case BPF_S_LD_H_ABS: /* A = *(u16 *) (skb->data+K) */ + case BPF_LD | BPF_H | BPF_ABS: /* A = *(u16 *) (skb->data+K) */ jit->seen |= SEEN_DATAREF | SEEN_RET0 | SEEN_LOAD_HALF; offset = jit->off_load_half; goto load_abs; - case BPF_S_LD_B_ABS: /* A = *(u8 *) (skb->data+K) */ + case BPF_LD | BPF_B | BPF_ABS: /* A = *(u8 *) (skb->data+K) */ jit->seen |= SEEN_DATAREF | SEEN_RET0 | SEEN_LOAD_BYTE; offset = jit->off_load_byte; load_abs: if ((int) K < 0) @@ -573,19 +566,19 @@ call_fn: /* lg %r1,(%r13) */ /* jnz */ EMIT4_PCREL(0xa7740000, (jit->ret0_ip - jit->prg)); break; - case BPF_S_LD_W_IND: /* A = *(u32 *) (skb->data+K+X) */ + case BPF_LD | BPF_W | BPF_IND: /* A = *(u32 *) (skb->data+K+X) */ jit->seen |= SEEN_DATAREF | SEEN_RET0 | SEEN_LOAD_IWORD; offset = jit->off_load_iword; goto call_fn; - case BPF_S_LD_H_IND: /* A = *(u16 *) (skb->data+K+X) */ + case BPF_LD | BPF_H | BPF_IND: /* A = *(u16 *) (skb->data+K+X) */ jit->seen |= SEEN_DATAREF | SEEN_RET0 | SEEN_LOAD_IHALF; offset = jit->off_load_ihalf; goto call_fn; - case BPF_S_LD_B_IND: /* A = *(u8 *) (skb->data+K+X) */ + case BPF_LD | BPF_B | BPF_IND: /* A = *(u8 *) (skb->data+K+X) */ jit->seen |= SEEN_DATAREF | SEEN_RET0 | SEEN_LOAD_IBYTE; offset = jit->off_load_ibyte; goto call_fn; - case BPF_S_LDX_B_MSH: + case BPF_LDX | BPF_B | BPF_MSH: /* X = (*(u8 *)(skb->data+K) & 0xf) << 2 */ jit->seen |= SEEN_RET0; if ((int) K < 0) { @@ -596,17 +589,17 @@ call_fn: /* lg %r1,(%r13) */ jit->seen |= SEEN_DATAREF | SEEN_LOAD_BMSH; offset = jit->off_load_bmsh; goto call_fn; - case BPF_S_LD_W_LEN: /* A = skb->len; */ + case BPF_LD | BPF_W | BPF_LEN: /* A = skb->len; */ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); /* l %r5,(%r2) */ EMIT4_DISP(0x58502000, offsetof(struct sk_buff, len)); break; - case BPF_S_LDX_W_LEN: /* X = skb->len; */ + case BPF_LDX | BPF_W | BPF_LEN: /* X = skb->len; */ jit->seen |= SEEN_XREG; /* l %r12,(%r2) */ EMIT4_DISP(0x58c02000, offsetof(struct sk_buff, len)); break; - case BPF_S_LD_IMM: /* A = K */ + case BPF_LD | 
BPF_IMM: /* A = K */ if (K <= 16383) /* lhi %r5,K */ EMIT4_IMM(0xa7580000, K); @@ -617,7 +610,7 @@ call_fn: /* lg %r1,(%r13) */ /* l %r5,(%r13) */ EMIT4_DISP(0x5850d000, EMIT_CONST(K)); break; - case BPF_S_LDX_IMM: /* X = K */ + case BPF_LDX | BPF_IMM: /* X = K */ jit->seen |= SEEN_XREG; if (K <= 16383) /* lhi %r12, */ @@ -629,29 +622,29 @@ call_fn: /* lg %r1,(%r13) */ /* l %r12,(%r13) */ EMIT4_DISP(0x58c0d000, EMIT_CONST(K)); break; - case BPF_S_LD_MEM: /* A = mem[K] */ + case BPF_LD | BPF_MEM: /* A = mem[K] */ jit->seen |= SEEN_MEM; /* l %r5,(%r15) */ EMIT4_DISP(0x5850f000, (jit->seen & SEEN_DATAREF) ? 160 + K*4 : K*4); break; - case BPF_S_LDX_MEM: /* X = mem[K] */ + case BPF_LDX | BPF_MEM: /* X = mem[K] */ jit->seen |= SEEN_XREG | SEEN_MEM; /* l %r12,(%r15) */ EMIT4_DISP(0x58c0f000, (jit->seen & SEEN_DATAREF) ? 160 + K*4 : K*4); break; - case BPF_S_MISC_TAX: /* X = A */ + case BPF_MISC | BPF_TAX: /* X = A */ jit->seen |= SEEN_XREG; /* lr %r12,%r5 */ EMIT2(0x18c5); break; - case BPF_S_MISC_TXA: /* A = X */ + case BPF_MISC | BPF_TXA: /* A = X */ jit->seen |= SEEN_XREG; /* lr %r5,%r12 */ EMIT2(0x185c); break; - case BPF_S_RET_K: + case BPF_RET | BPF_K: if (K == 0) { jit->seen |= SEEN_RET0; if (last) @@ -671,33 +664,33 @@ call_fn: /* lg %r1,(%r13) */ EMIT4_PCREL(0xa7f40000, jit->exit_ip - jit->prg); } break; - case BPF_S_RET_A: + case BPF_RET | BPF_A: /* llgfr %r2,%r5 */ EMIT4(0xb9160025); /* j */ EMIT4_PCREL(0xa7f40000, jit->exit_ip - jit->prg); break; - case BPF_S_ST: /* mem[K] = A */ + case BPF_ST: /* mem[K] = A */ jit->seen |= SEEN_MEM; /* st %r5,(%r15) */ EMIT4_DISP(0x5050f000, (jit->seen & SEEN_DATAREF) ? 160 + K*4 : K*4); break; - case BPF_S_STX: /* mem[K] = X : mov %ebx,off8(%rbp) */ + case BPF_STX: /* mem[K] = X : mov %ebx,off8(%rbp) */ jit->seen |= SEEN_XREG | SEEN_MEM; /* st %r12,(%r15) */ EMIT4_DISP(0x50c0f000, (jit->seen & SEEN_DATAREF) ? 
160 + K*4 : K*4); break; - case BPF_S_ANC_PROTOCOL: /* A = ntohs(skb->protocol); */ + case BPF_ANC | SKF_AD_PROTOCOL: /* A = ntohs(skb->protocol); */ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); /* lhi %r5,0 */ EMIT4(0xa7580000); /* icm %r5,3,(%r2) */ EMIT4_DISP(0xbf532000, offsetof(struct sk_buff, protocol)); break; - case BPF_S_ANC_IFINDEX: /* if (!skb->dev) return 0; - * A = skb->dev->ifindex */ + case BPF_ANC | SKF_AD_IFINDEX: /* if (!skb->dev) return 0; + * A = skb->dev->ifindex */ BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); jit->seen |= SEEN_RET0; /* lg %r1,(%r2) */ @@ -709,20 +702,20 @@ call_fn: /* lg %r1,(%r13) */ /* l %r5,(%r1) */ EMIT4_DISP(0x58501000, offsetof(struct net_device, ifindex)); break; - case BPF_S_ANC_MARK: /* A = skb->mark */ + case BPF_ANC | SKF_AD_MARK: /* A = skb->mark */ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); /* l %r5,(%r2) */ EMIT4_DISP(0x58502000, offsetof(struct sk_buff, mark)); break; - case BPF_S_ANC_QUEUE: /* A = skb->queue_mapping */ + case BPF_ANC | SKF_AD_QUEUE: /* A = skb->queue_mapping */ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); /* lhi %r5,0 */ EMIT4(0xa7580000); /* icm %r5,3,(%r2) */ EMIT4_DISP(0xbf532000, offsetof(struct sk_buff, queue_mapping)); break; - case BPF_S_ANC_HATYPE: /* if (!skb->dev) return 0; - * A = skb->dev->type */ + case BPF_ANC | SKF_AD_HATYPE: /* if (!skb->dev) return 0; + * A = skb->dev->type */ BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2); jit->seen |= SEEN_RET0; /* lg %r1,(%r2) */ @@ -736,20 +729,20 @@ call_fn: /* lg %r1,(%r13) */ /* icm %r5,3,(%r1) */ EMIT4_DISP(0xbf531000, offsetof(struct net_device, type)); break; - case BPF_S_ANC_RXHASH: /* A = skb->hash */ + case BPF_ANC | SKF_AD_RXHASH: /* A = skb->hash */ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); /* l %r5,(%r2) */ EMIT4_DISP(0x58502000, offsetof(struct sk_buff, hash)); break; - case BPF_S_ANC_VLAN_TAG: - case BPF_S_ANC_VLAN_TAG_PRESENT: + case BPF_ANC | SKF_AD_VLAN_TAG: + case BPF_ANC | SKF_AD_VLAN_TAG_PRESENT: BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); /* lhi %r5,0 */ EMIT4(0xa7580000); /* icm %r5,3,(%r2) */ EMIT4_DISP(0xbf532000, offsetof(struct sk_buff, vlan_tci)); - if (filter->code == BPF_S_ANC_VLAN_TAG) { + if (code == (BPF_ANC | SKF_AD_VLAN_TAG)) { /* nill %r5,0xefff */ EMIT4_IMM(0xa5570000, ~VLAN_TAG_PRESENT); } else { @@ -759,7 +752,7 @@ call_fn: /* lg %r1,(%r13) */ EMIT4_DISP(0x88500000, 12); } break; - case BPF_S_ANC_PKTTYPE: + case BPF_ANC | SKF_AD_PKTTYPE: if (pkt_type_offset < 0) goto out; /* lhi %r5,0 */ @@ -769,7 +762,7 @@ call_fn: /* lg %r1,(%r13) */ /* srl %r5,5 */ EMIT4_DISP(0x88500000, 5); break; - case BPF_S_ANC_CPU: /* A = smp_processor_id() */ + case BPF_ANC | SKF_AD_CPU: /* A = smp_processor_id() */ #ifdef CONFIG_SMP /* l %r5, */ EMIT4_DISP(0x58500000, offsetof(struct _lowcore, cpu_nr)); diff --git a/arch/sparc/net/bpf_jit_comp.c b/arch/sparc/net/bpf_jit_comp.c index a82c6b2a9780..c88cf147deed 100644 --- a/arch/sparc/net/bpf_jit_comp.c +++ b/arch/sparc/net/bpf_jit_comp.c @@ -415,20 +415,11 @@ void bpf_jit_compile(struct sk_filter *fp) emit_reg_move(O7, r_saved_O7); switch (filter[0].code) { - case BPF_S_RET_K: - case BPF_S_LD_W_LEN: - case BPF_S_ANC_PROTOCOL: - case BPF_S_ANC_PKTTYPE: - case BPF_S_ANC_IFINDEX: - case BPF_S_ANC_MARK: - case BPF_S_ANC_RXHASH: - case BPF_S_ANC_VLAN_TAG: - case BPF_S_ANC_VLAN_TAG_PRESENT: - case BPF_S_ANC_CPU: - case BPF_S_ANC_QUEUE: - case BPF_S_LD_W_ABS: - case 
BPF_S_LD_H_ABS: - case BPF_S_LD_B_ABS: + case BPF_RET | BPF_K: + case BPF_LD | BPF_W | BPF_LEN: + case BPF_LD | BPF_W | BPF_ABS: + case BPF_LD | BPF_H | BPF_ABS: + case BPF_LD | BPF_B | BPF_ABS: /* The first instruction sets the A register (or is * a "RET 'constant'") */ @@ -445,59 +436,60 @@ void bpf_jit_compile(struct sk_filter *fp) unsigned int t_offset; unsigned int f_offset; u32 t_op, f_op; + u16 code = bpf_anc_helper(&filter[i]); int ilen; - switch (filter[i].code) { - case BPF_S_ALU_ADD_X: /* A += X; */ + switch (code) { + case BPF_ALU | BPF_ADD | BPF_X: /* A += X; */ emit_alu_X(ADD); break; - case BPF_S_ALU_ADD_K: /* A += K; */ + case BPF_ALU | BPF_ADD | BPF_K: /* A += K; */ emit_alu_K(ADD, K); break; - case BPF_S_ALU_SUB_X: /* A -= X; */ + case BPF_ALU | BPF_SUB | BPF_X: /* A -= X; */ emit_alu_X(SUB); break; - case BPF_S_ALU_SUB_K: /* A -= K */ + case BPF_ALU | BPF_SUB | BPF_K: /* A -= K */ emit_alu_K(SUB, K); break; - case BPF_S_ALU_AND_X: /* A &= X */ + case BPF_ALU | BPF_AND | BPF_X: /* A &= X */ emit_alu_X(AND); break; - case BPF_S_ALU_AND_K: /* A &= K */ + case BPF_ALU | BPF_AND | BPF_K: /* A &= K */ emit_alu_K(AND, K); break; - case BPF_S_ALU_OR_X: /* A |= X */ + case BPF_ALU | BPF_OR | BPF_X: /* A |= X */ emit_alu_X(OR); break; - case BPF_S_ALU_OR_K: /* A |= K */ + case BPF_ALU | BPF_OR | BPF_K: /* A |= K */ emit_alu_K(OR, K); break; - case BPF_S_ANC_ALU_XOR_X: /* A ^= X; */ - case BPF_S_ALU_XOR_X: + case BPF_ANC | SKF_AD_ALU_XOR_X: /* A ^= X; */ + case BPF_ALU | BPF_XOR | BPF_X: emit_alu_X(XOR); break; - case BPF_S_ALU_XOR_K: /* A ^= K */ + case BPF_ALU | BPF_XOR | BPF_K: /* A ^= K */ emit_alu_K(XOR, K); break; - case BPF_S_ALU_LSH_X: /* A <<= X */ + case BPF_ALU | BPF_LSH | BPF_X: /* A <<= X */ emit_alu_X(SLL); break; - case BPF_S_ALU_LSH_K: /* A <<= K */ + case BPF_ALU | BPF_LSH | BPF_K: /* A <<= K */ emit_alu_K(SLL, K); break; - case BPF_S_ALU_RSH_X: /* A >>= X */ + case BPF_ALU | BPF_RSH | BPF_X: /* A >>= X */ emit_alu_X(SRL); break; - case BPF_S_ALU_RSH_K: /* A >>= K */ + case BPF_ALU | BPF_RSH | BPF_K: /* A >>= K */ emit_alu_K(SRL, K); break; - case BPF_S_ALU_MUL_X: /* A *= X; */ + case BPF_ALU | BPF_MUL | BPF_X: /* A *= X; */ emit_alu_X(MUL); break; - case BPF_S_ALU_MUL_K: /* A *= K */ + case BPF_ALU | BPF_MUL | BPF_K: /* A *= K */ emit_alu_K(MUL, K); break; - case BPF_S_ALU_DIV_K: /* A /= K with K != 0*/ + case BPF_ALU | BPF_DIV | BPF_K: /* A /= K with K != 0*/ if (K == 1) break; emit_write_y(G0); @@ -512,7 +504,7 @@ void bpf_jit_compile(struct sk_filter *fp) #endif emit_alu_K(DIV, K); break; - case BPF_S_ALU_DIV_X: /* A /= X; */ + case BPF_ALU | BPF_DIV | BPF_X: /* A /= X; */ emit_cmpi(r_X, 0); if (pc_ret0 > 0) { t_offset = addrs[pc_ret0 - 1]; @@ -544,10 +536,10 @@ void bpf_jit_compile(struct sk_filter *fp) #endif emit_alu_X(DIV); break; - case BPF_S_ALU_NEG: + case BPF_ALU | BPF_NEG: emit_neg(); break; - case BPF_S_RET_K: + case BPF_RET | BPF_K: if (!K) { if (pc_ret0 == -1) pc_ret0 = i; @@ -556,7 +548,7 @@ void bpf_jit_compile(struct sk_filter *fp) emit_loadimm(K, r_A); } /* Fallthrough */ - case BPF_S_RET_A: + case BPF_RET | BPF_A: if (seen_or_pass0) { if (i != flen - 1) { emit_jump(cleanup_addr); @@ -573,18 +565,18 @@ void bpf_jit_compile(struct sk_filter *fp) emit_jmpl(r_saved_O7, 8, G0); emit_reg_move(r_A, O0); /* delay slot */ break; - case BPF_S_MISC_TAX: + case BPF_MISC | BPF_TAX: seen |= SEEN_XREG; emit_reg_move(r_A, r_X); break; - case BPF_S_MISC_TXA: + case BPF_MISC | BPF_TXA: seen |= SEEN_XREG; emit_reg_move(r_X, r_A); break; - case BPF_S_ANC_CPU: + 
case BPF_ANC | SKF_AD_CPU: emit_load_cpu(r_A); break; - case BPF_S_ANC_PROTOCOL: + case BPF_ANC | SKF_AD_PROTOCOL: emit_skb_load16(protocol, r_A); break; #if 0 @@ -592,38 +584,38 @@ void bpf_jit_compile(struct sk_filter *fp) * a bit field even though we very much * know what we are doing here. */ - case BPF_S_ANC_PKTTYPE: + case BPF_ANC | SKF_AD_PKTTYPE: __emit_skb_load8(pkt_type, r_A); emit_alu_K(SRL, 5); break; #endif - case BPF_S_ANC_IFINDEX: + case BPF_ANC | SKF_AD_IFINDEX: emit_skb_loadptr(dev, r_A); emit_cmpi(r_A, 0); emit_branch(BNE_PTR, cleanup_addr + 4); emit_nop(); emit_load32(r_A, struct net_device, ifindex, r_A); break; - case BPF_S_ANC_MARK: + case BPF_ANC | SKF_AD_MARK: emit_skb_load32(mark, r_A); break; - case BPF_S_ANC_QUEUE: + case BPF_ANC | SKF_AD_QUEUE: emit_skb_load16(queue_mapping, r_A); break; - case BPF_S_ANC_HATYPE: + case BPF_ANC | SKF_AD_HATYPE: emit_skb_loadptr(dev, r_A); emit_cmpi(r_A, 0); emit_branch(BNE_PTR, cleanup_addr + 4); emit_nop(); emit_load16(r_A, struct net_device, type, r_A); break; - case BPF_S_ANC_RXHASH: + case BPF_ANC | SKF_AD_RXHASH: emit_skb_load32(hash, r_A); break; - case BPF_S_ANC_VLAN_TAG: - case BPF_S_ANC_VLAN_TAG_PRESENT: + case BPF_ANC | SKF_AD_VLAN_TAG: + case BPF_ANC | SKF_AD_VLAN_TAG_PRESENT: emit_skb_load16(vlan_tci, r_A); - if (filter[i].code == BPF_S_ANC_VLAN_TAG) { + if (code == (BPF_ANC | SKF_AD_VLAN_TAG)) { emit_andi(r_A, VLAN_VID_MASK, r_A); } else { emit_loadimm(VLAN_TAG_PRESENT, r_TMP); @@ -631,44 +623,44 @@ void bpf_jit_compile(struct sk_filter *fp) } break; - case BPF_S_LD_IMM: + case BPF_LD | BPF_IMM: emit_loadimm(K, r_A); break; - case BPF_S_LDX_IMM: + case BPF_LDX | BPF_IMM: emit_loadimm(K, r_X); break; - case BPF_S_LD_MEM: + case BPF_LD | BPF_MEM: emit_ldmem(K * 4, r_A); break; - case BPF_S_LDX_MEM: + case BPF_LDX | BPF_MEM: emit_ldmem(K * 4, r_X); break; - case BPF_S_ST: + case BPF_ST: emit_stmem(K * 4, r_A); break; - case BPF_S_STX: + case BPF_STX: emit_stmem(K * 4, r_X); break; #define CHOOSE_LOAD_FUNC(K, func) \ ((int)K < 0 ? ((int)K >= SKF_LL_OFF ? 
func##_negative_offset : func) : func##_positive_offset) - case BPF_S_LD_W_ABS: + case BPF_LD | BPF_W | BPF_ABS: func = CHOOSE_LOAD_FUNC(K, bpf_jit_load_word); common_load: seen |= SEEN_DATAREF; emit_loadimm(K, r_OFF); emit_call(func); break; - case BPF_S_LD_H_ABS: + case BPF_LD | BPF_H | BPF_ABS: func = CHOOSE_LOAD_FUNC(K, bpf_jit_load_half); goto common_load; - case BPF_S_LD_B_ABS: + case BPF_LD | BPF_B | BPF_ABS: func = CHOOSE_LOAD_FUNC(K, bpf_jit_load_byte); goto common_load; - case BPF_S_LDX_B_MSH: + case BPF_LDX | BPF_B | BPF_MSH: func = CHOOSE_LOAD_FUNC(K, bpf_jit_load_byte_msh); goto common_load; - case BPF_S_LD_W_IND: + case BPF_LD | BPF_W | BPF_IND: func = bpf_jit_load_word; common_load_ind: seen |= SEEN_DATAREF | SEEN_XREG; if (K) { @@ -683,13 +675,13 @@ common_load_ind: seen |= SEEN_DATAREF | SEEN_XREG; } emit_call(func); break; - case BPF_S_LD_H_IND: + case BPF_LD | BPF_H | BPF_IND: func = bpf_jit_load_half; goto common_load_ind; - case BPF_S_LD_B_IND: + case BPF_LD | BPF_B | BPF_IND: func = bpf_jit_load_byte; goto common_load_ind; - case BPF_S_JMP_JA: + case BPF_JMP | BPF_JA: emit_jump(addrs[i + K]); emit_nop(); break; @@ -700,14 +692,14 @@ common_load_ind: seen |= SEEN_DATAREF | SEEN_XREG; f_op = FOP; \ goto cond_branch - COND_SEL(BPF_S_JMP_JGT_K, BGU, BLEU); - COND_SEL(BPF_S_JMP_JGE_K, BGEU, BLU); - COND_SEL(BPF_S_JMP_JEQ_K, BE, BNE); - COND_SEL(BPF_S_JMP_JSET_K, BNE, BE); - COND_SEL(BPF_S_JMP_JGT_X, BGU, BLEU); - COND_SEL(BPF_S_JMP_JGE_X, BGEU, BLU); - COND_SEL(BPF_S_JMP_JEQ_X, BE, BNE); - COND_SEL(BPF_S_JMP_JSET_X, BNE, BE); + COND_SEL(BPF_JMP | BPF_JGT | BPF_K, BGU, BLEU); + COND_SEL(BPF_JMP | BPF_JGE | BPF_K, BGEU, BLU); + COND_SEL(BPF_JMP | BPF_JEQ | BPF_K, BE, BNE); + COND_SEL(BPF_JMP | BPF_JSET | BPF_K, BNE, BE); + COND_SEL(BPF_JMP | BPF_JGT | BPF_X, BGU, BLEU); + COND_SEL(BPF_JMP | BPF_JGE | BPF_X, BGEU, BLU); + COND_SEL(BPF_JMP | BPF_JEQ | BPF_X, BE, BNE); + COND_SEL(BPF_JMP | BPF_JSET | BPF_X, BNE, BE); cond_branch: f_offset = addrs[i + filter[i].jf]; t_offset = addrs[i + filter[i].jt]; @@ -719,20 +711,20 @@ cond_branch: f_offset = addrs[i + filter[i].jf]; break; } - switch (filter[i].code) { - case BPF_S_JMP_JGT_X: - case BPF_S_JMP_JGE_X: - case BPF_S_JMP_JEQ_X: + switch (code) { + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JEQ | BPF_X: seen |= SEEN_XREG; emit_cmp(r_A, r_X); break; - case BPF_S_JMP_JSET_X: + case BPF_JMP | BPF_JSET | BPF_X: seen |= SEEN_XREG; emit_btst(r_A, r_X); break; - case BPF_S_JMP_JEQ_K: - case BPF_S_JMP_JGT_K: - case BPF_S_JMP_JGE_K: + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGE | BPF_K: if (is_simm13(K)) { emit_cmpi(r_A, K); } else { @@ -740,7 +732,7 @@ cond_branch: f_offset = addrs[i + filter[i].jf]; emit_cmp(r_A, r_TMP); } break; - case BPF_S_JMP_JSET_K: + case BPF_JMP | BPF_JSET | BPF_K: if (is_simm13(K)) { emit_btsti(r_A, K); } else { diff --git a/include/linux/filter.h b/include/linux/filter.h index 625f4de9bdf2..49ef7a298c92 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -197,7 +197,6 @@ int sk_detach_filter(struct sock *sk); int sk_chk_filter(struct sock_filter *filter, unsigned int flen); int sk_get_filter(struct sock *sk, struct sock_filter __user *filter, unsigned int len); -void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to); void sk_filter_charge(struct sock *sk, struct sk_filter *fp); void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); @@ -205,6 +204,41 @@ void 
sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); void bpf_int_jit_compile(struct sk_filter *fp); +#define BPF_ANC BIT(15) + +static inline u16 bpf_anc_helper(const struct sock_filter *ftest) +{ + BUG_ON(ftest->code & BPF_ANC); + + switch (ftest->code) { + case BPF_LD | BPF_W | BPF_ABS: + case BPF_LD | BPF_H | BPF_ABS: + case BPF_LD | BPF_B | BPF_ABS: +#define BPF_ANCILLARY(CODE) case SKF_AD_OFF + SKF_AD_##CODE: \ + return BPF_ANC | SKF_AD_##CODE + switch (ftest->k) { + BPF_ANCILLARY(PROTOCOL); + BPF_ANCILLARY(PKTTYPE); + BPF_ANCILLARY(IFINDEX); + BPF_ANCILLARY(NLATTR); + BPF_ANCILLARY(NLATTR_NEST); + BPF_ANCILLARY(MARK); + BPF_ANCILLARY(QUEUE); + BPF_ANCILLARY(HATYPE); + BPF_ANCILLARY(RXHASH); + BPF_ANCILLARY(CPU); + BPF_ANCILLARY(ALU_XOR_X); + BPF_ANCILLARY(VLAN_TAG); + BPF_ANCILLARY(VLAN_TAG_PRESENT); + BPF_ANCILLARY(PAY_OFFSET); + BPF_ANCILLARY(RANDOM); + } + /* Fallthrough. */ + default: + return ftest->code; + } +} + #ifdef CONFIG_BPF_JIT #include #include @@ -224,86 +258,20 @@ static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen, } #else #include + static inline void bpf_jit_compile(struct sk_filter *fp) { } + static inline void bpf_jit_free(struct sk_filter *fp) { kfree(fp); } -#endif +#endif /* CONFIG_BPF_JIT */ static inline int bpf_tell_extensions(void) { return SKF_AD_MAX; } -enum { - BPF_S_RET_K = 1, - BPF_S_RET_A, - BPF_S_ALU_ADD_K, - BPF_S_ALU_ADD_X, - BPF_S_ALU_SUB_K, - BPF_S_ALU_SUB_X, - BPF_S_ALU_MUL_K, - BPF_S_ALU_MUL_X, - BPF_S_ALU_DIV_X, - BPF_S_ALU_MOD_K, - BPF_S_ALU_MOD_X, - BPF_S_ALU_AND_K, - BPF_S_ALU_AND_X, - BPF_S_ALU_OR_K, - BPF_S_ALU_OR_X, - BPF_S_ALU_XOR_K, - BPF_S_ALU_XOR_X, - BPF_S_ALU_LSH_K, - BPF_S_ALU_LSH_X, - BPF_S_ALU_RSH_K, - BPF_S_ALU_RSH_X, - BPF_S_ALU_NEG, - BPF_S_LD_W_ABS, - BPF_S_LD_H_ABS, - BPF_S_LD_B_ABS, - BPF_S_LD_W_LEN, - BPF_S_LD_W_IND, - BPF_S_LD_H_IND, - BPF_S_LD_B_IND, - BPF_S_LD_IMM, - BPF_S_LDX_W_LEN, - BPF_S_LDX_B_MSH, - BPF_S_LDX_IMM, - BPF_S_MISC_TAX, - BPF_S_MISC_TXA, - BPF_S_ALU_DIV_K, - BPF_S_LD_MEM, - BPF_S_LDX_MEM, - BPF_S_ST, - BPF_S_STX, - BPF_S_JMP_JA, - BPF_S_JMP_JEQ_K, - BPF_S_JMP_JEQ_X, - BPF_S_JMP_JGE_K, - BPF_S_JMP_JGE_X, - BPF_S_JMP_JGT_K, - BPF_S_JMP_JGT_X, - BPF_S_JMP_JSET_K, - BPF_S_JMP_JSET_X, - /* Ancillary data */ - BPF_S_ANC_PROTOCOL, - BPF_S_ANC_PKTTYPE, - BPF_S_ANC_IFINDEX, - BPF_S_ANC_NLATTR, - BPF_S_ANC_NLATTR_NEST, - BPF_S_ANC_MARK, - BPF_S_ANC_QUEUE, - BPF_S_ANC_HATYPE, - BPF_S_ANC_RXHASH, - BPF_S_ANC_CPU, - BPF_S_ANC_ALU_XOR_X, - BPF_S_ANC_VLAN_TAG, - BPF_S_ANC_VLAN_TAG_PRESENT, - BPF_S_ANC_PAY_OFFSET, - BPF_S_ANC_RANDOM, -}; - #endif /* __LINUX_FILTER_H__ */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 1036b6f2fded..44e69483b383 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -103,60 +103,59 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) u32 k = ftest->k; switch (code) { - case BPF_S_LD_W_ABS: + case BPF_LD | BPF_W | BPF_ABS: ftest->code = BPF_LDX | BPF_W | BPF_ABS; /* 32-bit aligned and not out of bounds. */ if (k >= sizeof(struct seccomp_data) || k & 3) return -EINVAL; continue; - case BPF_S_LD_W_LEN: + case BPF_LD | BPF_W | BPF_LEN: ftest->code = BPF_LD | BPF_IMM; ftest->k = sizeof(struct seccomp_data); continue; - case BPF_S_LDX_W_LEN: + case BPF_LDX | BPF_W | BPF_LEN: ftest->code = BPF_LDX | BPF_IMM; ftest->k = sizeof(struct seccomp_data); continue; /* Explicitly include allowed calls. 
*/ - case BPF_S_RET_K: - case BPF_S_RET_A: - case BPF_S_ALU_ADD_K: - case BPF_S_ALU_ADD_X: - case BPF_S_ALU_SUB_K: - case BPF_S_ALU_SUB_X: - case BPF_S_ALU_MUL_K: - case BPF_S_ALU_MUL_X: - case BPF_S_ALU_DIV_X: - case BPF_S_ALU_AND_K: - case BPF_S_ALU_AND_X: - case BPF_S_ALU_OR_K: - case BPF_S_ALU_OR_X: - case BPF_S_ALU_XOR_K: - case BPF_S_ALU_XOR_X: - case BPF_S_ALU_LSH_K: - case BPF_S_ALU_LSH_X: - case BPF_S_ALU_RSH_K: - case BPF_S_ALU_RSH_X: - case BPF_S_ALU_NEG: - case BPF_S_LD_IMM: - case BPF_S_LDX_IMM: - case BPF_S_MISC_TAX: - case BPF_S_MISC_TXA: - case BPF_S_ALU_DIV_K: - case BPF_S_LD_MEM: - case BPF_S_LDX_MEM: - case BPF_S_ST: - case BPF_S_STX: - case BPF_S_JMP_JA: - case BPF_S_JMP_JEQ_K: - case BPF_S_JMP_JEQ_X: - case BPF_S_JMP_JGE_K: - case BPF_S_JMP_JGE_X: - case BPF_S_JMP_JGT_K: - case BPF_S_JMP_JGT_X: - case BPF_S_JMP_JSET_K: - case BPF_S_JMP_JSET_X: - sk_decode_filter(ftest, ftest); + case BPF_RET | BPF_K: + case BPF_RET | BPF_A: + case BPF_ALU | BPF_ADD | BPF_K: + case BPF_ALU | BPF_ADD | BPF_X: + case BPF_ALU | BPF_SUB | BPF_K: + case BPF_ALU | BPF_SUB | BPF_X: + case BPF_ALU | BPF_MUL | BPF_K: + case BPF_ALU | BPF_MUL | BPF_X: + case BPF_ALU | BPF_DIV | BPF_K: + case BPF_ALU | BPF_DIV | BPF_X: + case BPF_ALU | BPF_AND | BPF_K: + case BPF_ALU | BPF_AND | BPF_X: + case BPF_ALU | BPF_OR | BPF_K: + case BPF_ALU | BPF_OR | BPF_X: + case BPF_ALU | BPF_XOR | BPF_K: + case BPF_ALU | BPF_XOR | BPF_X: + case BPF_ALU | BPF_LSH | BPF_K: + case BPF_ALU | BPF_LSH | BPF_X: + case BPF_ALU | BPF_RSH | BPF_K: + case BPF_ALU | BPF_RSH | BPF_X: + case BPF_ALU | BPF_NEG: + case BPF_LD | BPF_IMM: + case BPF_LDX | BPF_IMM: + case BPF_MISC | BPF_TAX: + case BPF_MISC | BPF_TXA: + case BPF_LD | BPF_MEM: + case BPF_LDX | BPF_MEM: + case BPF_ST: + case BPF_STX: + case BPF_JMP | BPF_JA: + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JEQ | BPF_X: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JSET | BPF_K: + case BPF_JMP | BPF_JSET | BPF_X: continue; default: return -EINVAL; diff --git a/net/core/filter.c b/net/core/filter.c index 2c2d35d9d101..328aaf6ff4d1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -536,11 +536,13 @@ load_word: * Output: * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness */ + ptr = load_pointer((struct sk_buff *) ctx, off, 4, &tmp); if (likely(ptr != NULL)) { BPF_R0 = get_unaligned_be32(ptr); CONT; } + return 0; LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + K)) */ off = K; @@ -550,6 +552,7 @@ load_half: BPF_R0 = get_unaligned_be16(ptr); CONT; } + return 0; LD_ABS_B: /* BPF_R0 = *(u8 *) (ctx + K) */ off = K; @@ -559,6 +562,7 @@ load_byte: BPF_R0 = *(u8 *)ptr; CONT; } + return 0; LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + X + K)) */ off = K + X; @@ -1136,44 +1140,46 @@ err: */ static int check_load_and_stores(struct sock_filter *filter, int flen) { - u16 *masks, memvalid = 0; /* one bit per cell, 16 cells */ + u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */ int pc, ret = 0; BUILD_BUG_ON(BPF_MEMWORDS > 16); + masks = kmalloc(flen * sizeof(*masks), GFP_KERNEL); if (!masks) return -ENOMEM; + memset(masks, 0xff, flen * sizeof(*masks)); for (pc = 0; pc < flen; pc++) { memvalid &= masks[pc]; switch (filter[pc].code) { - case BPF_S_ST: - case BPF_S_STX: + case BPF_ST: + case BPF_STX: memvalid |= (1 << filter[pc].k); break; - case BPF_S_LD_MEM: - case BPF_S_LDX_MEM: + case BPF_LD | BPF_MEM: + case BPF_LDX | BPF_MEM: if (!(memvalid & 
(1 << filter[pc].k))) { ret = -EINVAL; goto error; } break; - case BPF_S_JMP_JA: - /* a jump must set masks on target */ + case BPF_JMP | BPF_JA: + /* A jump must set masks on target */ masks[pc + 1 + filter[pc].k] &= memvalid; memvalid = ~0; break; - case BPF_S_JMP_JEQ_K: - case BPF_S_JMP_JEQ_X: - case BPF_S_JMP_JGE_K: - case BPF_S_JMP_JGE_X: - case BPF_S_JMP_JGT_K: - case BPF_S_JMP_JGT_X: - case BPF_S_JMP_JSET_X: - case BPF_S_JMP_JSET_K: - /* a jump must set masks on targets */ + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JEQ | BPF_X: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JSET | BPF_K: + case BPF_JMP | BPF_JSET | BPF_X: + /* A jump must set masks on targets */ masks[pc + 1 + filter[pc].jt] &= memvalid; masks[pc + 1 + filter[pc].jf] &= memvalid; memvalid = ~0; @@ -1185,6 +1191,72 @@ error: return ret; } +static bool chk_code_allowed(u16 code_to_probe) +{ + static const bool codes[] = { + /* 32 bit ALU operations */ + [BPF_ALU | BPF_ADD | BPF_K] = true, + [BPF_ALU | BPF_ADD | BPF_X] = true, + [BPF_ALU | BPF_SUB | BPF_K] = true, + [BPF_ALU | BPF_SUB | BPF_X] = true, + [BPF_ALU | BPF_MUL | BPF_K] = true, + [BPF_ALU | BPF_MUL | BPF_X] = true, + [BPF_ALU | BPF_DIV | BPF_K] = true, + [BPF_ALU | BPF_DIV | BPF_X] = true, + [BPF_ALU | BPF_MOD | BPF_K] = true, + [BPF_ALU | BPF_MOD | BPF_X] = true, + [BPF_ALU | BPF_AND | BPF_K] = true, + [BPF_ALU | BPF_AND | BPF_X] = true, + [BPF_ALU | BPF_OR | BPF_K] = true, + [BPF_ALU | BPF_OR | BPF_X] = true, + [BPF_ALU | BPF_XOR | BPF_K] = true, + [BPF_ALU | BPF_XOR | BPF_X] = true, + [BPF_ALU | BPF_LSH | BPF_K] = true, + [BPF_ALU | BPF_LSH | BPF_X] = true, + [BPF_ALU | BPF_RSH | BPF_K] = true, + [BPF_ALU | BPF_RSH | BPF_X] = true, + [BPF_ALU | BPF_NEG] = true, + /* Load instructions */ + [BPF_LD | BPF_W | BPF_ABS] = true, + [BPF_LD | BPF_H | BPF_ABS] = true, + [BPF_LD | BPF_B | BPF_ABS] = true, + [BPF_LD | BPF_W | BPF_LEN] = true, + [BPF_LD | BPF_W | BPF_IND] = true, + [BPF_LD | BPF_H | BPF_IND] = true, + [BPF_LD | BPF_B | BPF_IND] = true, + [BPF_LD | BPF_IMM] = true, + [BPF_LD | BPF_MEM] = true, + [BPF_LDX | BPF_W | BPF_LEN] = true, + [BPF_LDX | BPF_B | BPF_MSH] = true, + [BPF_LDX | BPF_IMM] = true, + [BPF_LDX | BPF_MEM] = true, + /* Store instructions */ + [BPF_ST] = true, + [BPF_STX] = true, + /* Misc instructions */ + [BPF_MISC | BPF_TAX] = true, + [BPF_MISC | BPF_TXA] = true, + /* Return instructions */ + [BPF_RET | BPF_K] = true, + [BPF_RET | BPF_A] = true, + /* Jump instructions */ + [BPF_JMP | BPF_JA] = true, + [BPF_JMP | BPF_JEQ | BPF_K] = true, + [BPF_JMP | BPF_JEQ | BPF_X] = true, + [BPF_JMP | BPF_JGE | BPF_K] = true, + [BPF_JMP | BPF_JGE | BPF_X] = true, + [BPF_JMP | BPF_JGT | BPF_K] = true, + [BPF_JMP | BPF_JGT | BPF_X] = true, + [BPF_JMP | BPF_JSET | BPF_K] = true, + [BPF_JMP | BPF_JSET | BPF_X] = true, + }; + + if (code_to_probe >= ARRAY_SIZE(codes)) + return false; + + return codes[code_to_probe]; +} + /** * sk_chk_filter - verify socket filter code * @filter: filter to verify @@ -1201,154 +1273,76 @@ error: */ int sk_chk_filter(struct sock_filter *filter, unsigned int flen) { - /* - * Valid instructions are initialized to non-0. - * Invalid instructions are initialized to 0. 
- */ - static const u8 codes[] = { - [BPF_ALU|BPF_ADD|BPF_K] = BPF_S_ALU_ADD_K, - [BPF_ALU|BPF_ADD|BPF_X] = BPF_S_ALU_ADD_X, - [BPF_ALU|BPF_SUB|BPF_K] = BPF_S_ALU_SUB_K, - [BPF_ALU|BPF_SUB|BPF_X] = BPF_S_ALU_SUB_X, - [BPF_ALU|BPF_MUL|BPF_K] = BPF_S_ALU_MUL_K, - [BPF_ALU|BPF_MUL|BPF_X] = BPF_S_ALU_MUL_X, - [BPF_ALU|BPF_DIV|BPF_X] = BPF_S_ALU_DIV_X, - [BPF_ALU|BPF_MOD|BPF_K] = BPF_S_ALU_MOD_K, - [BPF_ALU|BPF_MOD|BPF_X] = BPF_S_ALU_MOD_X, - [BPF_ALU|BPF_AND|BPF_K] = BPF_S_ALU_AND_K, - [BPF_ALU|BPF_AND|BPF_X] = BPF_S_ALU_AND_X, - [BPF_ALU|BPF_OR|BPF_K] = BPF_S_ALU_OR_K, - [BPF_ALU|BPF_OR|BPF_X] = BPF_S_ALU_OR_X, - [BPF_ALU|BPF_XOR|BPF_K] = BPF_S_ALU_XOR_K, - [BPF_ALU|BPF_XOR|BPF_X] = BPF_S_ALU_XOR_X, - [BPF_ALU|BPF_LSH|BPF_K] = BPF_S_ALU_LSH_K, - [BPF_ALU|BPF_LSH|BPF_X] = BPF_S_ALU_LSH_X, - [BPF_ALU|BPF_RSH|BPF_K] = BPF_S_ALU_RSH_K, - [BPF_ALU|BPF_RSH|BPF_X] = BPF_S_ALU_RSH_X, - [BPF_ALU|BPF_NEG] = BPF_S_ALU_NEG, - [BPF_LD|BPF_W|BPF_ABS] = BPF_S_LD_W_ABS, - [BPF_LD|BPF_H|BPF_ABS] = BPF_S_LD_H_ABS, - [BPF_LD|BPF_B|BPF_ABS] = BPF_S_LD_B_ABS, - [BPF_LD|BPF_W|BPF_LEN] = BPF_S_LD_W_LEN, - [BPF_LD|BPF_W|BPF_IND] = BPF_S_LD_W_IND, - [BPF_LD|BPF_H|BPF_IND] = BPF_S_LD_H_IND, - [BPF_LD|BPF_B|BPF_IND] = BPF_S_LD_B_IND, - [BPF_LD|BPF_IMM] = BPF_S_LD_IMM, - [BPF_LDX|BPF_W|BPF_LEN] = BPF_S_LDX_W_LEN, - [BPF_LDX|BPF_B|BPF_MSH] = BPF_S_LDX_B_MSH, - [BPF_LDX|BPF_IMM] = BPF_S_LDX_IMM, - [BPF_MISC|BPF_TAX] = BPF_S_MISC_TAX, - [BPF_MISC|BPF_TXA] = BPF_S_MISC_TXA, - [BPF_RET|BPF_K] = BPF_S_RET_K, - [BPF_RET|BPF_A] = BPF_S_RET_A, - [BPF_ALU|BPF_DIV|BPF_K] = BPF_S_ALU_DIV_K, - [BPF_LD|BPF_MEM] = BPF_S_LD_MEM, - [BPF_LDX|BPF_MEM] = BPF_S_LDX_MEM, - [BPF_ST] = BPF_S_ST, - [BPF_STX] = BPF_S_STX, - [BPF_JMP|BPF_JA] = BPF_S_JMP_JA, - [BPF_JMP|BPF_JEQ|BPF_K] = BPF_S_JMP_JEQ_K, - [BPF_JMP|BPF_JEQ|BPF_X] = BPF_S_JMP_JEQ_X, - [BPF_JMP|BPF_JGE|BPF_K] = BPF_S_JMP_JGE_K, - [BPF_JMP|BPF_JGE|BPF_X] = BPF_S_JMP_JGE_X, - [BPF_JMP|BPF_JGT|BPF_K] = BPF_S_JMP_JGT_K, - [BPF_JMP|BPF_JGT|BPF_X] = BPF_S_JMP_JGT_X, - [BPF_JMP|BPF_JSET|BPF_K] = BPF_S_JMP_JSET_K, - [BPF_JMP|BPF_JSET|BPF_X] = BPF_S_JMP_JSET_X, - }; - int pc; bool anc_found; + int pc; if (flen == 0 || flen > BPF_MAXINSNS) return -EINVAL; - /* check the filter code now */ + /* Check the filter code now */ for (pc = 0; pc < flen; pc++) { struct sock_filter *ftest = &filter[pc]; - u16 code = ftest->code; - if (code >= ARRAY_SIZE(codes)) - return -EINVAL; - code = codes[code]; - if (!code) + /* May we actually operate on this code? */ + if (!chk_code_allowed(ftest->code)) return -EINVAL; + /* Some instructions need special checks */ - switch (code) { - case BPF_S_ALU_DIV_K: - case BPF_S_ALU_MOD_K: - /* check for division by zero */ + switch (ftest->code) { + case BPF_ALU | BPF_DIV | BPF_K: + case BPF_ALU | BPF_MOD | BPF_K: + /* Check for division by zero */ if (ftest->k == 0) return -EINVAL; break; - case BPF_S_LD_MEM: - case BPF_S_LDX_MEM: - case BPF_S_ST: - case BPF_S_STX: - /* check for invalid memory addresses */ + case BPF_LD | BPF_MEM: + case BPF_LDX | BPF_MEM: + case BPF_ST: + case BPF_STX: + /* Check for invalid memory addresses */ if (ftest->k >= BPF_MEMWORDS) return -EINVAL; break; - case BPF_S_JMP_JA: - /* - * Note, the large ftest->k might cause loops. + case BPF_JMP | BPF_JA: + /* Note, the large ftest->k might cause loops. * Compare this with conditional jumps below, * where offsets are limited. 
--ANK (981016) */ - if (ftest->k >= (unsigned int)(flen-pc-1)) + if (ftest->k >= (unsigned int)(flen - pc - 1)) return -EINVAL; break; - case BPF_S_JMP_JEQ_K: - case BPF_S_JMP_JEQ_X: - case BPF_S_JMP_JGE_K: - case BPF_S_JMP_JGE_X: - case BPF_S_JMP_JGT_K: - case BPF_S_JMP_JGT_X: - case BPF_S_JMP_JSET_X: - case BPF_S_JMP_JSET_K: - /* for conditionals both must be safe */ + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JEQ | BPF_X: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JSET | BPF_K: + case BPF_JMP | BPF_JSET | BPF_X: + /* Both conditionals must be safe */ if (pc + ftest->jt + 1 >= flen || pc + ftest->jf + 1 >= flen) return -EINVAL; break; - case BPF_S_LD_W_ABS: - case BPF_S_LD_H_ABS: - case BPF_S_LD_B_ABS: + case BPF_LD | BPF_W | BPF_ABS: + case BPF_LD | BPF_H | BPF_ABS: + case BPF_LD | BPF_B | BPF_ABS: anc_found = false; -#define ANCILLARY(CODE) case SKF_AD_OFF + SKF_AD_##CODE: \ - code = BPF_S_ANC_##CODE; \ - anc_found = true; \ - break - switch (ftest->k) { - ANCILLARY(PROTOCOL); - ANCILLARY(PKTTYPE); - ANCILLARY(IFINDEX); - ANCILLARY(NLATTR); - ANCILLARY(NLATTR_NEST); - ANCILLARY(MARK); - ANCILLARY(QUEUE); - ANCILLARY(HATYPE); - ANCILLARY(RXHASH); - ANCILLARY(CPU); - ANCILLARY(ALU_XOR_X); - ANCILLARY(VLAN_TAG); - ANCILLARY(VLAN_TAG_PRESENT); - ANCILLARY(PAY_OFFSET); - ANCILLARY(RANDOM); - } - - /* ancillary operation unknown or unsupported */ + if (bpf_anc_helper(ftest) & BPF_ANC) + anc_found = true; + /* Ancillary operation unknown or unsupported */ if (anc_found == false && ftest->k >= SKF_AD_OFF) return -EINVAL; } - ftest->code = code; } - /* last instruction must be a RET code */ + /* Last instruction must be a RET code */ switch (filter[flen - 1].code) { - case BPF_S_RET_K: - case BPF_S_RET_A: + case BPF_RET | BPF_K: + case BPF_RET | BPF_A: return check_load_and_stores(filter, flen); } + return -EINVAL; } EXPORT_SYMBOL(sk_chk_filter); @@ -1448,7 +1442,7 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp, { struct sock_filter *old_prog; struct sk_filter *old_fp; - int i, err, new_len, old_len = fp->len; + int err, new_len, old_len = fp->len; /* We are free to overwrite insns et al right here as it * won't be used at this point in time anymore internally @@ -1458,13 +1452,6 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp, BUILD_BUG_ON(sizeof(struct sock_filter) != sizeof(struct sock_filter_int)); - /* For now, we need to unfiddle BPF_S_* identifiers in place. - * This can sooner or later on be subject to removal, e.g. when - * JITs have been converted. - */ - for (i = 0; i < fp->len; i++) - sk_decode_filter(&fp->insns[i], &fp->insns[i]); - /* Conversion cannot happen on overlapping memory areas, * so we need to keep the user BPF around until the 2nd * pass. At this time, the user BPF is stored in fp->insns. 
@@ -1706,84 +1693,6 @@ int sk_detach_filter(struct sock *sk) } EXPORT_SYMBOL_GPL(sk_detach_filter); -void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to) -{ - static const u16 decodes[] = { - [BPF_S_ALU_ADD_K] = BPF_ALU|BPF_ADD|BPF_K, - [BPF_S_ALU_ADD_X] = BPF_ALU|BPF_ADD|BPF_X, - [BPF_S_ALU_SUB_K] = BPF_ALU|BPF_SUB|BPF_K, - [BPF_S_ALU_SUB_X] = BPF_ALU|BPF_SUB|BPF_X, - [BPF_S_ALU_MUL_K] = BPF_ALU|BPF_MUL|BPF_K, - [BPF_S_ALU_MUL_X] = BPF_ALU|BPF_MUL|BPF_X, - [BPF_S_ALU_DIV_X] = BPF_ALU|BPF_DIV|BPF_X, - [BPF_S_ALU_MOD_K] = BPF_ALU|BPF_MOD|BPF_K, - [BPF_S_ALU_MOD_X] = BPF_ALU|BPF_MOD|BPF_X, - [BPF_S_ALU_AND_K] = BPF_ALU|BPF_AND|BPF_K, - [BPF_S_ALU_AND_X] = BPF_ALU|BPF_AND|BPF_X, - [BPF_S_ALU_OR_K] = BPF_ALU|BPF_OR|BPF_K, - [BPF_S_ALU_OR_X] = BPF_ALU|BPF_OR|BPF_X, - [BPF_S_ALU_XOR_K] = BPF_ALU|BPF_XOR|BPF_K, - [BPF_S_ALU_XOR_X] = BPF_ALU|BPF_XOR|BPF_X, - [BPF_S_ALU_LSH_K] = BPF_ALU|BPF_LSH|BPF_K, - [BPF_S_ALU_LSH_X] = BPF_ALU|BPF_LSH|BPF_X, - [BPF_S_ALU_RSH_K] = BPF_ALU|BPF_RSH|BPF_K, - [BPF_S_ALU_RSH_X] = BPF_ALU|BPF_RSH|BPF_X, - [BPF_S_ALU_NEG] = BPF_ALU|BPF_NEG, - [BPF_S_LD_W_ABS] = BPF_LD|BPF_W|BPF_ABS, - [BPF_S_LD_H_ABS] = BPF_LD|BPF_H|BPF_ABS, - [BPF_S_LD_B_ABS] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_PROTOCOL] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_PKTTYPE] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_IFINDEX] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_NLATTR] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_NLATTR_NEST] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_MARK] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_QUEUE] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_HATYPE] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_RXHASH] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_CPU] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_ALU_XOR_X] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_VLAN_TAG] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_VLAN_TAG_PRESENT] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_PAY_OFFSET] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_RANDOM] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_LD_W_LEN] = BPF_LD|BPF_W|BPF_LEN, - [BPF_S_LD_W_IND] = BPF_LD|BPF_W|BPF_IND, - [BPF_S_LD_H_IND] = BPF_LD|BPF_H|BPF_IND, - [BPF_S_LD_B_IND] = BPF_LD|BPF_B|BPF_IND, - [BPF_S_LD_IMM] = BPF_LD|BPF_IMM, - [BPF_S_LDX_W_LEN] = BPF_LDX|BPF_W|BPF_LEN, - [BPF_S_LDX_B_MSH] = BPF_LDX|BPF_B|BPF_MSH, - [BPF_S_LDX_IMM] = BPF_LDX|BPF_IMM, - [BPF_S_MISC_TAX] = BPF_MISC|BPF_TAX, - [BPF_S_MISC_TXA] = BPF_MISC|BPF_TXA, - [BPF_S_RET_K] = BPF_RET|BPF_K, - [BPF_S_RET_A] = BPF_RET|BPF_A, - [BPF_S_ALU_DIV_K] = BPF_ALU|BPF_DIV|BPF_K, - [BPF_S_LD_MEM] = BPF_LD|BPF_MEM, - [BPF_S_LDX_MEM] = BPF_LDX|BPF_MEM, - [BPF_S_ST] = BPF_ST, - [BPF_S_STX] = BPF_STX, - [BPF_S_JMP_JA] = BPF_JMP|BPF_JA, - [BPF_S_JMP_JEQ_K] = BPF_JMP|BPF_JEQ|BPF_K, - [BPF_S_JMP_JEQ_X] = BPF_JMP|BPF_JEQ|BPF_X, - [BPF_S_JMP_JGE_K] = BPF_JMP|BPF_JGE|BPF_K, - [BPF_S_JMP_JGE_X] = BPF_JMP|BPF_JGE|BPF_X, - [BPF_S_JMP_JGT_K] = BPF_JMP|BPF_JGT|BPF_K, - [BPF_S_JMP_JGT_X] = BPF_JMP|BPF_JGT|BPF_X, - [BPF_S_JMP_JSET_K] = BPF_JMP|BPF_JSET|BPF_K, - [BPF_S_JMP_JSET_X] = BPF_JMP|BPF_JSET|BPF_X, - }; - u16 code; - - code = filt->code; - - to->code = decodes[code]; - to->jt = filt->jt; - to->jf = filt->jf; - to->k = filt->k; -} - int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, unsigned int len) { -- cgit From e3172181946fadd3bd0e4c362931094ba87e5718 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 2 Jun 2014 13:33:12 +0900 Subject: tracing: Print max callstack on stacktrace bug While I was playing with a feature of my own (e.g., something in the reclaim path), the kernel would easily oops.
I guessed that the reason had to do with stack overflow and wanted to prove it. I discovered the stack tracer, which proved to be very useful for me, but the kernel would oops before my user program could gather the information via "watch cat /sys/kernel/debug/tracing/stack_trace", so I couldn't get any output from it. What I needed was to have the stack tracer emit the kernel stack usage before the oops happens so I could find what was hogging the stack. This patch shows the callstack of the maximum stack usage right before an oops so we can find the culprit. The result is as follows. [ 1116.522206] init: lightdm main process (1246) terminated with status 1 [ 1119.922916] init: failsafe-x main process (1272) terminated with status 1 [ 3887.728131] kworker/u24:1 (6637) used greatest stack depth: 256 bytes left [ 6397.629227] cc1 (9554) used greatest stack depth: 128 bytes left [ 7174.467392] Depth Size Location (47 entries) [ 7174.467392] ----- ---- -------- [ 7174.467785] 0) 7248 256 get_page_from_freelist+0xa7/0x920 [ 7174.468506] 1) 6992 352 __alloc_pages_nodemask+0x1cd/0xb20 [ 7174.469224] 2) 6640 8 alloc_pages_current+0x10f/0x1f0 [ 7174.469413] 3) 6632 168 new_slab+0x2c5/0x370 [ 7174.469413] 4) 6464 8 __slab_alloc+0x3a9/0x501 [ 7174.469413] 5) 6456 80 __kmalloc+0x1cb/0x200 [ 7174.469413] 6) 6376 376 vring_add_indirect+0x36/0x200 [ 7174.469413] 7) 6000 144 virtqueue_add_sgs+0x2e2/0x320 [ 7174.469413] 8) 5856 288 __virtblk_add_req+0xda/0x1b0 [ 7174.469413] 9) 5568 96 virtio_queue_rq+0xd3/0x1d0 [ 7174.469413] 10) 5472 128 __blk_mq_run_hw_queue+0x1ef/0x440 [ 7174.469413] 11) 5344 16 blk_mq_run_hw_queue+0x35/0x40 [ 7174.469413] 12) 5328 96 blk_mq_insert_requests+0xdb/0x160 [ 7174.469413] 13) 5232 112 blk_mq_flush_plug_list+0x12b/0x140 [ 7174.469413] 14) 5120 112 blk_flush_plug_list+0xc7/0x220 [ 7174.469413] 15) 5008 64 io_schedule_timeout+0x88/0x100 [ 7174.469413] 16) 4944 128 mempool_alloc+0x145/0x170 [ 7174.469413] 17) 4816 96 bio_alloc_bioset+0x10b/0x1d0 [ 7174.469413] 18) 4720 48 get_swap_bio+0x30/0x90 [ 7174.469413] 19) 4672 160 __swap_writepage+0x150/0x230 [ 7174.469413] 20) 4512 32 swap_writepage+0x42/0x90 [ 7174.469413] 21) 4480 320 shrink_page_list+0x676/0xa80 [ 7174.469413] 22) 4160 208 shrink_inactive_list+0x262/0x4e0 [ 7174.469413] 23) 3952 304 shrink_lruvec+0x3e1/0x6a0 [ 7174.469413] 24) 3648 80 shrink_zone+0x3f/0x110 [ 7174.469413] 25) 3568 128 do_try_to_free_pages+0x156/0x4c0 [ 7174.469413] 26) 3440 208 try_to_free_pages+0xf7/0x1e0 [ 7174.469413] 27) 3232 352 __alloc_pages_nodemask+0x783/0xb20 [ 7174.469413] 28) 2880 8 alloc_pages_current+0x10f/0x1f0 [ 7174.469413] 29) 2872 200 __page_cache_alloc+0x13f/0x160 [ 7174.469413] 30) 2672 80 find_or_create_page+0x4c/0xb0 [ 7174.469413] 31) 2592 80 ext4_mb_load_buddy+0x1e9/0x370 [ 7174.469413] 32) 2512 176 ext4_mb_regular_allocator+0x1b7/0x460 [ 7174.469413] 33) 2336 128 ext4_mb_new_blocks+0x458/0x5f0 [ 7174.469413] 34) 2208 256 ext4_ext_map_blocks+0x70b/0x1010 [ 7174.469413] 35) 1952 160 ext4_map_blocks+0x325/0x530 [ 7174.469413] 36) 1792 384 ext4_writepages+0x6d1/0xce0 [ 7174.469413] 37) 1408 16 do_writepages+0x23/0x40 [ 7174.469413] 38) 1392 96 __writeback_single_inode+0x45/0x2e0 [ 7174.469413] 39) 1296 176 writeback_sb_inodes+0x2ad/0x500 [ 7174.469413] 40) 1120 80 __writeback_inodes_wb+0x9e/0xd0 [ 7174.469413] 41) 1040 160 wb_writeback+0x29b/0x350 [ 7174.469413] 42) 880 208 bdi_writeback_workfn+0x11c/0x480 [ 7174.469413] 43) 672 144 process_one_work+0x1d2/0x570 [ 7174.469413] 44) 528 112 worker_thread+0x116/0x370 [ 7174.469413] 45) 416 240
kthread+0xf3/0x110 [ 7174.469413] 46) 176 176 ret_from_fork+0x7c/0xb0 [ 7174.469413] ------------[ cut here ]------------ [ 7174.469413] kernel BUG at kernel/trace/trace_stack.c:174! [ 7174.469413] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC [ 7174.469413] Dumping ftrace buffer: [ 7174.469413] (ftrace buffer empty) [ 7174.469413] Modules linked in: [ 7174.469413] CPU: 0 PID: 440 Comm: kworker/u24:0 Not tainted 3.14.0+ #212 [ 7174.469413] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 7174.469413] Workqueue: writeback bdi_writeback_workfn (flush-253:0) [ 7174.469413] task: ffff880034170000 ti: ffff880029518000 task.ti: ffff880029518000 [ 7174.469413] RIP: 0010:[] [] stack_trace_call+0x2de/0x340 [ 7174.469413] RSP: 0000:ffff880029518290 EFLAGS: 00010046 [ 7174.469413] RAX: 0000000000000030 RBX: 000000000000002f RCX: 0000000000000000 [ 7174.469413] RDX: 0000000000000000 RSI: 000000000000002f RDI: ffffffff810b7159 [ 7174.469413] RBP: ffff8800295182f0 R08: ffffffffffffffff R09: 0000000000000000 [ 7174.469413] R10: 0000000000000001 R11: 0000000000000001 R12: ffffffff82768dfc [ 7174.469413] R13: 000000000000f2e8 R14: ffff8800295182b8 R15: 00000000000000f8 [ 7174.469413] FS: 0000000000000000(0000) GS:ffff880037c00000(0000) knlGS:0000000000000000 [ 7174.469413] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 7174.469413] CR2: 00002acd0b994000 CR3: 0000000001c0b000 CR4: 00000000000006f0 [ 7174.469413] Stack: [ 7174.469413] 0000000000000000 ffffffff8114fdb7 0000000000000087 0000000000001c50 [ 7174.469413] 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 7174.469413] 0000000000000002 ffff880034170000 ffff880034171028 0000000000000000 [ 7174.469413] Call Trace: [ 7174.469413] [] ? get_page_from_freelist+0xa7/0x920 [ 7174.469413] [] ftrace_call+0x5/0x2f [ 7174.469413] [] ? next_zones_zonelist+0x5/0x70 [ 7174.469413] [] ? __bfs+0x11a/0x270 [ 7174.469413] [] ? next_zones_zonelist+0x5/0x70 [ 7174.469413] [] ? get_page_from_freelist+0xa7/0x920 [ 7174.469413] [] ? alloc_pages_current+0x10f/0x1f0 [ 7174.469413] [] __alloc_pages_nodemask+0x1cd/0xb20 [ 7174.469413] [] ? check_irq_usage+0x96/0xe0 [ 7174.469413] [] ? ftrace_call+0x5/0x2f [ 7174.469413] [] alloc_pages_current+0x10f/0x1f0 [ 7174.469413] [] ? new_slab+0x2c5/0x370 [ 7174.469413] [] new_slab+0x2c5/0x370 [ 7174.469413] [] ? ftrace_call+0x5/0x2f [ 7174.469413] [] __slab_alloc+0x3a9/0x501 [ 7174.469413] [] ? __kmalloc+0x1cb/0x200 [ 7174.469413] [] ? vring_add_indirect+0x36/0x200 [ 7174.469413] [] ? vring_add_indirect+0x36/0x200 [ 7174.469413] [] ? vring_add_indirect+0x36/0x200 [ 7174.469413] [] __kmalloc+0x1cb/0x200 [ 7174.469413] [] ? vring_add_indirect+0x200/0x200 [ 7174.469413] [] vring_add_indirect+0x36/0x200 [ 7174.469413] [] virtqueue_add_sgs+0x2e2/0x320 [ 7174.469413] [] __virtblk_add_req+0xda/0x1b0 [ 7174.469413] [] virtio_queue_rq+0xd3/0x1d0 [ 7174.469413] [] __blk_mq_run_hw_queue+0x1ef/0x440 [ 7174.469413] [] blk_mq_run_hw_queue+0x35/0x40 [ 7174.469413] [] blk_mq_insert_requests+0xdb/0x160 [ 7174.469413] [] blk_mq_flush_plug_list+0x12b/0x140 [ 7174.469413] [] blk_flush_plug_list+0xc7/0x220 [ 7174.469413] [] ? _raw_spin_unlock_irqrestore+0x3f/0x70 [ 7174.469413] [] io_schedule_timeout+0x88/0x100 [ 7174.469413] [] ? io_schedule_timeout+0x5/0x100 [ 7174.469413] [] mempool_alloc+0x145/0x170 [ 7174.469413] [] ? __init_waitqueue_head+0x60/0x60 [ 7174.469413] [] bio_alloc_bioset+0x10b/0x1d0 [ 7174.469413] [] ? end_swap_bio_read+0xc0/0xc0 [ 7174.469413] [] ? 
end_swap_bio_read+0xc0/0xc0 [ 7174.469413] [] get_swap_bio+0x30/0x90 [ 7174.469413] [] ? end_swap_bio_read+0xc0/0xc0 [ 7174.469413] [] __swap_writepage+0x150/0x230 [ 7174.469413] [] ? do_raw_spin_unlock+0x5/0xa0 [ 7174.469413] [] ? end_swap_bio_read+0xc0/0xc0 [ 7174.469413] [] ? __swap_writepage+0x5/0x230 [ 7174.469413] [] swap_writepage+0x42/0x90 [ 7174.469413] [] shrink_page_list+0x676/0xa80 [ 7174.469413] [] ? ftrace_call+0x5/0x2f [ 7174.469413] [] shrink_inactive_list+0x262/0x4e0 [ 7174.469413] [] shrink_lruvec+0x3e1/0x6a0 [ 7174.469413] [] shrink_zone+0x3f/0x110 [ 7174.469413] [] ? ftrace_call+0x5/0x2f [ 7174.469413] [] do_try_to_free_pages+0x156/0x4c0 [ 7174.469413] [] try_to_free_pages+0xf7/0x1e0 [ 7174.469413] [] __alloc_pages_nodemask+0x783/0xb20 [ 7174.469413] [] alloc_pages_current+0x10f/0x1f0 [ 7174.469413] [] ? __page_cache_alloc+0x13f/0x160 [ 7174.469413] [] __page_cache_alloc+0x13f/0x160 [ 7174.469413] [] find_or_create_page+0x4c/0xb0 [ 7174.469413] [] ? find_get_page+0x5/0x130 [ 7174.469413] [] ext4_mb_load_buddy+0x1e9/0x370 [ 7174.469413] [] ext4_mb_regular_allocator+0x1b7/0x460 [ 7174.469413] [] ? ext4_mb_use_preallocated+0x40/0x360 [ 7174.469413] [] ? ftrace_call+0x5/0x2f [ 7174.469413] [] ext4_mb_new_blocks+0x458/0x5f0 [ 7174.469413] [] ext4_ext_map_blocks+0x70b/0x1010 [ 7174.469413] [] ext4_map_blocks+0x325/0x530 [ 7174.469413] [] ext4_writepages+0x6d1/0xce0 [ 7174.469413] [] ? ext4_journalled_write_end+0x330/0x330 [ 7174.469413] [] do_writepages+0x23/0x40 [ 7174.469413] [] __writeback_single_inode+0x45/0x2e0 [ 7174.469413] [] writeback_sb_inodes+0x2ad/0x500 [ 7174.469413] [] __writeback_inodes_wb+0x9e/0xd0 [ 7174.469413] [] wb_writeback+0x29b/0x350 [ 7174.469413] [] ? __local_bh_enable_ip+0x6d/0xd0 [ 7174.469413] [] bdi_writeback_workfn+0x11c/0x480 [ 7174.469413] [] ? process_one_work+0x170/0x570 [ 7174.469413] [] process_one_work+0x1d2/0x570 [ 7174.469413] [] ? process_one_work+0x170/0x570 [ 7174.469413] [] worker_thread+0x116/0x370 [ 7174.469413] [] ? manage_workers.isra.19+0x2e0/0x2e0 [ 7174.469413] [] kthread+0xf3/0x110 [ 7174.469413] [] ? flush_kthread_worker+0x150/0x150 [ 7174.469413] [] ret_from_fork+0x7c/0xb0 [ 7174.469413] [] ? 
flush_kthread_worker+0x150/0x150 [ 7174.469413] Code: c0 49 bc fc 8d 76 82 ff ff ff ff e8 44 5a 5b 00 31 f6 8b 05 95 2b b3 00 48 39 c6 7d 0e 4c 8b 04 f5 20 5f c5 81 49 83 f8 ff 75 11 <0f> 0b 48 63 05 71 5a 64 01 48 29 c3 e9 d0 fd ff ff 48 8d 5e 01 [ 7174.469413] RIP [] stack_trace_call+0x2de/0x340 [ 7174.469413] RSP [ 7174.469413] ---[ end trace c97d325b36b718f3 ]--- Link: http://lkml.kernel.org/p/1401683592-1651-1-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 5aa9a5b9b6e2..8a4e5cb66a4c 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -51,11 +51,33 @@ static DEFINE_MUTEX(stack_sysctl_mutex); int stack_tracer_enabled; static int last_stack_tracer_enabled; +static inline void print_max_stack(void) +{ + long i; + int size; + + pr_emerg(" Depth Size Location (%d entries)\n" + " ----- ---- --------\n", + max_stack_trace.nr_entries - 1); + + for (i = 0; i < max_stack_trace.nr_entries; i++) { + if (stack_dump_trace[i] == ULONG_MAX) + break; + if (i+1 == max_stack_trace.nr_entries || + stack_dump_trace[i+1] == ULONG_MAX) + size = stack_dump_index[i]; + else + size = stack_dump_index[i] - stack_dump_index[i+1]; + + pr_emerg("%3ld) %8d %5d %pS\n", i, stack_dump_index[i], + size, (void *)stack_dump_trace[i]); + } +} + static inline void check_stack(unsigned long ip, unsigned long *stack) { - unsigned long this_size, flags; - unsigned long *p, *top, *start; + unsigned long this_size, flags; unsigned long *p, *top, *start; static int tracer_frame; int frame_size = ACCESS_ONCE(tracer_frame); int i; @@ -149,8 +171,12 @@ check_stack(unsigned long ip, unsigned long *stack) i++; } - BUG_ON(current != &init_task && - *(end_of_stack(current)) != STACK_END_MAGIC); + if ((current != &init_task && + *(end_of_stack(current)) != STACK_END_MAGIC)) { + print_max_stack(); + BUG(); + } + out: arch_spin_unlock(&max_stack_lock); local_irq_restore(flags); -- cgit From c9482a5bdcc09be9096f40e858c5fe39c389cd52 Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Sat, 26 Apr 2014 15:40:28 +0800 Subject: kernfs: move the last knowledge of sysfs out from kernfs There is still one residue of sysfs remaining: the sb_magic SYSFS_MAGIC. However, this should be kernfs-user specific, so this patch moves it out. Kernfs users should specify their magic number when mounting.
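[Editorial note: the following sketch is an illustration only, not part of the patch. It shows how a kernfs user would now supply its own filesystem magic at mount time instead of kernfs stamping SYSFS_MAGIC unconditionally. The kernfs_mount_ns() signature is the one introduced by this patch; EXAMPLE_MAGIC, example_root and example_mount are hypothetical placeholders.]

	/* A hypothetical kernfs user; example_root would come from kernfs_create_root(). */
	#define EXAMPLE_MAGIC	0x4578414d	/* made-up value for illustration */
	static struct kernfs_root *example_root;

	static struct dentry *example_mount(struct file_system_type *fs_type,
					    int flags, const char *dev_name,
					    void *data)
	{
		bool new_sb;

		/* kernfs_fill_super() now stamps sb->s_magic with the value passed here */
		return kernfs_mount_ns(fs_type, flags, example_root,
				       EXAMPLE_MAGIC, &new_sb, NULL);
	}

[The sysfs and cgroup hunks below show the two in-tree users converted accordingly, passing SYSFS_MAGIC and CGROUP_SUPER_MAGIC respectively.]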
Signed-off-by: Jianyu Zhan Acked-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman Signed-off-by: Linus Torvalds --- fs/kernfs/mount.c | 11 ++++++----- fs/sysfs/mount.c | 4 +++- include/linux/kernfs.h | 13 ++++++++----- kernel/cgroup.c | 4 +++- 4 files changed, 20 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 6a5f04ac8704..95dcd1d558bb 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -62,7 +62,7 @@ struct kernfs_root *kernfs_root_from_sb(struct super_block *sb) return NULL; } -static int kernfs_fill_super(struct super_block *sb) +static int kernfs_fill_super(struct super_block *sb, unsigned long magic) { struct kernfs_super_info *info = kernfs_info(sb); struct inode *inode; @@ -70,7 +70,7 @@ static int kernfs_fill_super(struct super_block *sb) sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; - sb->s_magic = SYSFS_MAGIC; + sb->s_magic = magic; sb->s_op = &kernfs_sops; sb->s_time_gran = 1; @@ -131,6 +131,7 @@ const void *kernfs_super_ns(struct super_block *sb) * @fs_type: file_system_type of the fs being mounted * @flags: mount flags specified for the mount * @root: kernfs_root of the hierarchy being mounted + * @magic: file system specific magic number * @new_sb_created: tell the caller if we allocated a new superblock * @ns: optional namespace tag of the mount * @@ -142,8 +143,8 @@ const void *kernfs_super_ns(struct super_block *sb) * The return value can be passed to the vfs layer verbatim. */ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, - struct kernfs_root *root, bool *new_sb_created, - const void *ns) + struct kernfs_root *root, unsigned long magic, + bool *new_sb_created, const void *ns) { struct super_block *sb; struct kernfs_super_info *info; @@ -166,7 +167,7 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, *new_sb_created = !sb->s_root; if (!sb->s_root) { - error = kernfs_fill_super(sb); + error = kernfs_fill_super(sb, magic); if (error) { deactivate_locked_super(sb); return ERR_PTR(error); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 8794423f7efb..8a49486bf30c 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -13,6 +13,7 @@ #define DEBUG #include +#include #include #include #include @@ -38,7 +39,8 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, } ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); - root = kernfs_mount_ns(fs_type, flags, sysfs_root, &new_sb, ns); + root = kernfs_mount_ns(fs_type, flags, sysfs_root, + SYSFS_MAGIC, &new_sb, ns); if (IS_ERR(root) || !new_sb) kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); return root; diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index ca1be5c9136c..52bf5677db0b 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -297,8 +297,8 @@ void kernfs_notify(struct kernfs_node *kn); const void *kernfs_super_ns(struct super_block *sb); struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, - struct kernfs_root *root, bool *new_sb_created, - const void *ns); + struct kernfs_root *root, unsigned long magic, + bool *new_sb_created, const void *ns); void kernfs_kill_sb(struct super_block *sb); void kernfs_init(void); @@ -391,7 +391,8 @@ static inline const void *kernfs_super_ns(struct super_block *sb) static inline struct dentry * kernfs_mount_ns(struct file_system_type *fs_type, int flags, - struct kernfs_root *root, bool *new_sb_created, const void *ns) + struct kernfs_root *root, unsigned long magic, + bool 
*new_sb_created, const void *ns) { return ERR_PTR(-ENOSYS); } static inline void kernfs_kill_sb(struct super_block *sb) { } @@ -449,9 +450,11 @@ static inline int kernfs_rename(struct kernfs_node *kn, static inline struct dentry * kernfs_mount(struct file_system_type *fs_type, int flags, - struct kernfs_root *root, bool *new_sb_created) + struct kernfs_root *root, unsigned long magic, + bool *new_sb_created) { - return kernfs_mount_ns(fs_type, flags, root, new_sb_created, NULL); + return kernfs_mount_ns(fs_type, flags, root, + magic, new_sb_created, NULL); } #endif /* __LINUX_KERNFS_H */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3f1ca934a237..ceee0c54c6a4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -1604,7 +1605,8 @@ out_unlock: if (ret) return ERR_PTR(ret); - dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb); + dentry = kernfs_mount(fs_type, flags, root->kf_root, + CGROUP_SUPER_MAGIC, &new_sb); if (IS_ERR(dentry) || !new_sb) cgroup_put(&root->cgrp); return dentry; -- cgit From 198376cd8877a612cc6494201b815c36d1e40391 Mon Sep 17 00:00:00 2001 From: Yoshihiro YUNOMAE Date: Tue, 3 Jun 2014 13:28:03 +0900 Subject: tracing: Eliminate double free on failure of allocation on boot up If allocation of the max_buffer fails on boot up, the error path will free both per_cpu data structures from the buffers. With the new redesign of the code, those structures are freed if allocations failed. That is, the helper function that allocates the buffers will free the per cpu data on failure. No need to do it again. In fact, the second free will cause a bug as the code can not handle a double free. Link: http://lkml.kernel.org/p/20140603042803.27308.30956.stgit@yunodevel Signed-off-by: Yoshihiro YUNOMAE Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 626dbfde5d56..135af323608b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6671,10 +6671,6 @@ __init static int tracer_alloc_buffers(void) out_free_temp_buffer: ring_buffer_free(temp_buffer); out_free_cpumask: - free_percpu(global_trace.trace_buffer.data); -#ifdef CONFIG_TRACER_MAX_TRACE - free_percpu(global_trace.max_buffer.data); -#endif free_cpumask_var(global_trace.tracing_cpumask); out_free_buffer_mask: free_cpumask_var(tracing_buffer_mask); -- cgit From 195d28020147843dcf809730b2e9d876722371e6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 4 Jun 2014 13:36:29 -0400 Subject: tracing: Remove unused variable in trace_benchmark Somehow this unused variable warning sneaked past my warnings check (probably due to it depending on a new config). 
kernel/trace/trace_benchmark.c: In function 'trace_do_benchmark': kernel/trace/trace_benchmark.c:38:6: warning: unused variable 'seedsq' [-Wunused-variable] u64 seedsq; ^ Link: http://lkml.kernel.org/r/20140604160921.4f4e69c4@canb.auug.org.au Reported-by: Stephen Rothwell Signed-off-by: Steven Rostedt --- kernel/trace/trace_benchmark.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 7dc1c42dfee2..a10adc7095cd 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -35,7 +35,6 @@ static void trace_do_benchmark(void) u64 delta; s64 stddev; u64 seed; - u64 seedsq; u64 last_seed; unsigned int avg; unsigned int std = 0; -- cgit From 8fe6929cfd43c44834858a53e129ffdc7c166298 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 4 Jun 2014 16:05:36 -0700 Subject: kthread: fix return value of kthread_create() upon SIGKILL. Commit 786235eeba0e ("kthread: make kthread_create() killable") was meant to allow kthread_create() to abort as soon as it is killed by the OOM killer. But returning -ENOMEM is wrong if it is killed by SIGKILL from userspace. Change kthread_create() to return -EINTR upon SIGKILL. Signed-off-by: Tetsuo Handa Cc: Oleg Nesterov Acked-by: David Rientjes Cc: [3.13+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 9a130ec06f7a..c2390f41307b 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -262,7 +262,7 @@ static void create_kthread(struct kthread_create_info *create) * kthread_stop() has been called). The return value should be zero * or a negative error number; it will be passed to kthread_stop(). * - * Returns a task_struct or ERR_PTR(-ENOMEM). + * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR). */ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, @@ -298,7 +298,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), * that thread. */ if (xchg(&create->done, NULL)) - return ERR_PTR(-ENOMEM); + return ERR_PTR(-EINTR); /* * kthreadd (or new kernel thread) will call complete() * shortly. -- cgit From 52383431b37cdbec63944e953ffc2698a7ad9722 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:06:39 -0700 Subject: mm: get rid of __GFP_KMEMCG Currently, to allocate a page that should be charged to kmemcg (e.g. threadinfo), we pass the __GFP_KMEMCG flag to the page allocator. The page allocated is then to be freed by free_memcg_kmem_pages. Apart from looking asymmetrical, this also requires intrusion into the general allocation path. So let's introduce separate functions that will alloc/free pages charged to kmemcg. The new functions are called alloc_kmem_pages and free_kmem_pages. They should be used when the caller actually would like to use kmalloc, but has to fall back to the page allocator because the allocation is large. They only differ from alloc_pages and free_pages in that besides allocating or freeing pages they also charge them to the kmem resource counter of the current memory cgroup.
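[Editorial note: the following sketch is an illustration only, not part of the patch. It mirrors the thread_info conversion in kernel/fork.c below, showing how a former __GFP_KMEMCG caller moves to the new helpers; alloc_accounted_buf/free_accounted_buf are hypothetical names, while THREADINFO_GFP and THREAD_SIZE_ORDER are the values used in that file.]

	/* Hypothetical caller: the helpers charge/uncharge the pages to the
	 * current memcg's kmem counter internally, so no GFP flag is needed.
	 */
	static void *alloc_accounted_buf(int node)
	{
		/* was: alloc_pages_node(node, THREADINFO_GFP | __GFP_KMEMCG, THREAD_SIZE_ORDER) */
		struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
							  THREAD_SIZE_ORDER);

		return page ? page_address(page) : NULL;
	}

	static void free_accounted_buf(void *buf)
	{
		/* was: free_memcg_kmem_pages((unsigned long)buf, THREAD_SIZE_ORDER) */
		free_kmem_pages((unsigned long)buf, THREAD_SIZE_ORDER);
	}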
[sfr@canb.auug.org.au: export kmalloc_order() to modules] Signed-off-by: Vladimir Davydov Acked-by: Greg Thelen Cc: Johannes Weiner Acked-by: Michal Hocko Cc: Glauber Costa Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 10 +++++--- include/linux/memcontrol.h | 2 +- include/linux/slab.h | 11 +------- include/linux/thread_info.h | 2 -- include/trace/events/gfpflags.h | 1 - kernel/fork.c | 6 ++--- mm/memcontrol.c | 11 ++++---- mm/page_alloc.c | 56 +++++++++++++++++++++++++---------------- mm/slab_common.c | 13 ++++++++++ mm/slub.c | 6 ++--- 10 files changed, 68 insertions(+), 50 deletions(-) (limited to 'kernel') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 39b81dc7d01a..d382db71e300 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -31,7 +31,6 @@ struct vm_area_struct; #define ___GFP_HARDWALL 0x20000u #define ___GFP_THISNODE 0x40000u #define ___GFP_RECLAIMABLE 0x80000u -#define ___GFP_KMEMCG 0x100000u #define ___GFP_NOTRACK 0x200000u #define ___GFP_NO_KSWAPD 0x400000u #define ___GFP_OTHER_NODE 0x800000u @@ -91,7 +90,6 @@ struct vm_area_struct; #define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */ -#define __GFP_KMEMCG ((__force gfp_t)___GFP_KMEMCG) /* Allocation comes from a memcg-accounted resource */ #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */ /* @@ -353,6 +351,10 @@ extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, #define alloc_page_vma_node(gfp_mask, vma, addr, node) \ alloc_pages_vma(gfp_mask, 0, vma, addr, node) +extern struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order); +extern struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, + unsigned int order); + extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); @@ -372,8 +374,8 @@ extern void free_pages(unsigned long addr, unsigned int order); extern void free_hot_cold_page(struct page *page, int cold); extern void free_hot_cold_page_list(struct list_head *list, int cold); -extern void __free_memcg_kmem_pages(struct page *page, unsigned int order); -extern void free_memcg_kmem_pages(unsigned long addr, unsigned int order); +extern void __free_kmem_pages(struct page *page, unsigned int order); +extern void free_kmem_pages(unsigned long addr, unsigned int order); #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr), 0) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 96e5d2573eb0..5155d09e749d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -537,7 +537,7 @@ memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) * res_counter_charge_nofail, but we hope those allocations are rare, * and won't be worth the trouble. 
*/ - if (!(gfp & __GFP_KMEMCG) || (gfp & __GFP_NOFAIL)) + if (gfp & __GFP_NOFAIL) return true; if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) return true; diff --git a/include/linux/slab.h b/include/linux/slab.h index 307bfbe62387..a6aab2c0dfc5 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -369,16 +369,7 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s, #include #endif -static __always_inline void * -kmalloc_order(size_t size, gfp_t flags, unsigned int order) -{ - void *ret; - - flags |= (__GFP_COMP | __GFP_KMEMCG); - ret = (void *) __get_free_pages(flags, order); - kmemleak_alloc(ret, size, 1, flags); - return ret; -} +extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order); #ifdef CONFIG_TRACING extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order); diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index cb0cec94fda3..ff307b548ed3 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -61,8 +61,6 @@ extern long do_no_restart_syscall(struct restart_block *parm); # define THREADINFO_GFP (GFP_KERNEL | __GFP_NOTRACK) #endif -#define THREADINFO_GFP_ACCOUNTED (THREADINFO_GFP | __GFP_KMEMCG) - /* * flag set/clear/test wrappers * - pass TIF_xxxx constants to these functions diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h index 1eddbf1557f2..d6fd8e5b14b7 100644 --- a/include/trace/events/gfpflags.h +++ b/include/trace/events/gfpflags.h @@ -34,7 +34,6 @@ {(unsigned long)__GFP_HARDWALL, "GFP_HARDWALL"}, \ {(unsigned long)__GFP_THISNODE, "GFP_THISNODE"}, \ {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \ - {(unsigned long)__GFP_KMEMCG, "GFP_KMEMCG"}, \ {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \ {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \ {(unsigned long)__GFP_NO_KSWAPD, "GFP_NO_KSWAPD"}, \ diff --git a/kernel/fork.c b/kernel/fork.c index 54a8d26f612f..59e3dcc5b8f2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -150,15 +150,15 @@ void __weak arch_release_thread_info(struct thread_info *ti) static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, int node) { - struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED, - THREAD_SIZE_ORDER); + struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, + THREAD_SIZE_ORDER); return page ? page_address(page) : NULL; } static inline void free_thread_info(struct thread_info *ti) { - free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); + free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_info_cache; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 56a768b3d5a8..7bab1de50f48 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3540,11 +3540,12 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) /* * Disabling accounting is only relevant for some specific memcg * internal allocations. Therefore we would initially not have such - * check here, since direct calls to the page allocator that are marked - * with GFP_KMEMCG only happen outside memcg core. We are mostly - * concerned with cache allocations, and by having this test at - * memcg_kmem_get_cache, we are already able to relay the allocation to - * the root cache and bypass the memcg cache altogether. + * check here, since direct calls to the page allocator that are + * accounted to kmemcg (alloc_kmem_pages and friends) only happen + * outside memcg core. 
We are mostly concerned with cache allocations, + * and by having this test at memcg_kmem_get_cache, we are already able + * to relay the allocation to the root cache and bypass the memcg cache + * altogether. * * There is one exception, though: the SLUB allocator does not create * large order caches, but rather service large kmallocs directly from diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5dba2933c9c0..7cfdcd808f52 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2697,7 +2697,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int migratetype = allocflags_to_migratetype(gfp_mask); unsigned int cpuset_mems_cookie; int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; - struct mem_cgroup *memcg = NULL; gfp_mask &= gfp_allowed_mask; @@ -2716,13 +2715,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; - /* - * Will only have any effect when __GFP_KMEMCG is set. This is - * verified in the (always inline) callee - */ - if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) - return NULL; - retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); @@ -2782,8 +2774,6 @@ out: if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; - memcg_kmem_commit_charge(page, memcg, order); - return page; } EXPORT_SYMBOL(__alloc_pages_nodemask); @@ -2837,27 +2827,51 @@ void free_pages(unsigned long addr, unsigned int order) EXPORT_SYMBOL(free_pages); /* - * __free_memcg_kmem_pages and free_memcg_kmem_pages will free - * pages allocated with __GFP_KMEMCG. + * alloc_kmem_pages charges newly allocated pages to the kmem resource counter + * of the current memory cgroup. * - * Those pages are accounted to a particular memcg, embedded in the - * corresponding page_cgroup. To avoid adding a hit in the allocator to search - * for that information only to find out that it is NULL for users who have no - * interest in that whatsoever, we provide these functions. - * - * The caller knows better which flags it relies on. + * It should be used when the caller would like to use kmalloc, but since the + * allocation is large, it has to fall back to the page allocator. + */ +struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order) +{ + struct page *page; + struct mem_cgroup *memcg = NULL; + + if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) + return NULL; + page = alloc_pages(gfp_mask, order); + memcg_kmem_commit_charge(page, memcg, order); + return page; +} + +struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) +{ + struct page *page; + struct mem_cgroup *memcg = NULL; + + if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) + return NULL; + page = alloc_pages_node(nid, gfp_mask, order); + memcg_kmem_commit_charge(page, memcg, order); + return page; +} + +/* + * __free_kmem_pages and free_kmem_pages will free pages allocated with + * alloc_kmem_pages. 
*/ -void __free_memcg_kmem_pages(struct page *page, unsigned int order) +void __free_kmem_pages(struct page *page, unsigned int order) { memcg_kmem_uncharge_pages(page, order); __free_pages(page, order); } -void free_memcg_kmem_pages(unsigned long addr, unsigned int order) +void free_kmem_pages(unsigned long addr, unsigned int order) { if (addr != 0) { VM_BUG_ON(!virt_addr_valid((void *)addr)); - __free_memcg_kmem_pages(virt_to_page((void *)addr), order); + __free_kmem_pages(virt_to_page((void *)addr), order); } } diff --git a/mm/slab_common.c b/mm/slab_common.c index 06f0c6125632..1950c8f4d1a6 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -582,6 +582,19 @@ void __init create_kmalloc_caches(unsigned long flags) } #endif /* !CONFIG_SLOB */ +void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) +{ + void *ret; + struct page *page; + + flags |= __GFP_COMP; + page = alloc_kmem_pages(flags, order); + ret = page ? page_address(page) : NULL; + kmemleak_alloc(ret, size, 1, flags); + return ret; +} +EXPORT_SYMBOL(kmalloc_order); + #ifdef CONFIG_TRACING void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) { diff --git a/mm/slub.c b/mm/slub.c index fc9831851be6..ddb60795f373 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3311,8 +3311,8 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) struct page *page; void *ptr = NULL; - flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG; - page = alloc_pages_node(node, flags, get_order(size)); + flags |= __GFP_COMP | __GFP_NOTRACK; + page = alloc_kmem_pages_node(node, flags, get_order(size)); if (page) ptr = page_address(page); @@ -3381,7 +3381,7 @@ void kfree(const void *x) if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); kfree_hook(x); - __free_memcg_kmem_pages(page, compound_order(page)); + __free_kmem_pages(page, compound_order(page)); return; } slab_free(page->slab_cache, page, object, _RET_IP_); -- cgit From f98bafa06a28fdfdd5c49f820f4d6560f636fc46 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 4 Jun 2014 16:07:34 -0700 Subject: memcg: kill CONFIG_MM_OWNER CONFIG_MM_OWNER makes no sense. It is not user-selectable, it is only selected by CONFIG_MEMCG automatically. So we can kill this option in init/Kconfig and do s/CONFIG_MM_OWNER/CONFIG_MEMCG/ globally. Signed-off-by: Oleg Nesterov Acked-by: Michal Hocko Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 +- include/linux/sched.h | 4 ++-- init/Kconfig | 7 ------- kernel/exit.c | 4 ++-- kernel/fork.c | 4 ++-- 5 files changed, 7 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8967e20cbe57..de1627232af0 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -406,7 +406,7 @@ struct mm_struct { spinlock_t ioctx_lock; struct kioctx_table __rcu *ioctx_table; #endif -#ifdef CONFIG_MM_OWNER +#ifdef CONFIG_MEMCG /* * "owner" points to a task that is regarded as the canonical * user/owner of this mm. 
All of the following must be true in diff --git a/include/linux/sched.h b/include/linux/sched.h index 70f67e4e6156..2f2dd7d932a2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2967,7 +2967,7 @@ static inline void inc_syscw(struct task_struct *tsk) #define TASK_SIZE_OF(tsk) TASK_SIZE #endif -#ifdef CONFIG_MM_OWNER +#ifdef CONFIG_MEMCG extern void mm_update_next_owner(struct mm_struct *mm); extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); #else @@ -2978,7 +2978,7 @@ static inline void mm_update_next_owner(struct mm_struct *mm) static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p) { } -#endif /* CONFIG_MM_OWNER */ +#endif /* CONFIG_MEMCG */ static inline unsigned long task_rlimit(const struct task_struct *tsk, unsigned int limit) diff --git a/init/Kconfig b/init/Kconfig index 4a1822a1a680..0a2f09a80e90 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -933,7 +933,6 @@ config RESOURCE_COUNTERS config MEMCG bool "Memory Resource Controller for Control Groups" depends on RESOURCE_COUNTERS - select MM_OWNER select EVENTFD help Provides a memory resource controller that manages both anonymous @@ -951,9 +950,6 @@ config MEMCG disable memory resource controller and you can avoid overheads. (and lose benefits of memory resource controller) - This config option also selects MM_OWNER config option, which - could in turn add some fork/exit overhead. - config MEMCG_SWAP bool "Memory Resource Controller Swap Extension" depends on MEMCG && SWAP @@ -1179,9 +1175,6 @@ config SCHED_AUTOGROUP desktop applications. Task group autogeneration is currently based upon task session. -config MM_OWNER - bool - config SYSFS_DEPRECATED bool "Enable deprecated sysfs features to support old userspace tools" depends on SYSFS diff --git a/kernel/exit.c b/kernel/exit.c index 6ed6a1d552b5..da1b838de8a6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -352,7 +352,7 @@ int disallow_signal(int sig) EXPORT_SYMBOL(disallow_signal); -#ifdef CONFIG_MM_OWNER +#ifdef CONFIG_MEMCG /* * A task is exiting. If it owned this mm, find a new owner for the mm. */ @@ -434,7 +434,7 @@ assign_new_owner: task_unlock(c); put_task_struct(c); } -#endif /* CONFIG_MM_OWNER */ +#endif /* CONFIG_MEMCG */ /* * Turn us into a lazy TLB process if we diff --git a/kernel/fork.c b/kernel/fork.c index 59e3dcc5b8f2..0d53eb0dfb6f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1099,12 +1099,12 @@ static void rt_mutex_init_task(struct task_struct *p) #endif } -#ifdef CONFIG_MM_OWNER +#ifdef CONFIG_MEMCG void mm_init_owner(struct mm_struct *mm, struct task_struct *p) { mm->owner = p; } -#endif /* CONFIG_MM_OWNER */ +#endif /* CONFIG_MEMCG */ /* * Initialize POSIX timer handling for a single task. -- cgit From f87fb599ae4d2a152a93f9821b94f3158146d097 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 4 Jun 2014 16:07:52 -0700 Subject: memcg: mm_update_next_owner() should skip kthreads "Search through everything else" in mm_update_next_owner() can hit a kthread which adopted this "mm" via use_mm(); such a kthread should not be used as mm->owner. Add the PF_KTHREAD check. While at it, change this code to use for_each_process_thread() instead of the deprecated do_each_thread/while_each_thread.
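The hazard is easy to restate in isolation: a kernel thread that temporarily adopted a userspace mm via use_mm() satisfies "c->mm == mm", so the candidate filter needs both conditions. A minimal sketch of the resulting test (illustrative only; can_own_mm() is a hypothetical helper, while PF_KTHREAD and use_mm() are the real kernel interfaces):

	static bool can_own_mm(struct task_struct *c, struct mm_struct *mm)
	{
		/* a kthread that borrowed this mm must never become mm->owner */
		if (c->flags & PF_KTHREAD)
			return false;
		return c->mm == mm;
	}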
Signed-off-by: Oleg Nesterov Reviewed-by: Michal Hocko Cc: Balbir Singh Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Michal Hocko Cc: Peter Chiang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index da1b838de8a6..5ac3c19c245c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -395,14 +395,12 @@ retry: } /* - * Search through everything else. We should not get - * here often + * Search through everything else, we should not get here often. */ - do_each_thread(g, c) { - if (c->mm == mm) + for_each_process_thread(g, c) { + if (!(c->flags & PF_KTHREAD) && c->mm == mm) goto assign_new_owner; - } while_each_thread(g, c); - + } read_unlock(&tasklist_lock); /* * We found no owner yet mm_users > 1: this implies that we are -- cgit From 39af1765f1255b2bbadc3064e16270781abf24a1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 4 Jun 2014 16:07:54 -0700 Subject: memcg: optimize the "Search everything else" loop in mm_update_next_owner() for_each_process_thread() is sub-optimal. All threads share the same ->mm, so we can switch to the next process once we find a thread with ->mm != NULL and ->mm != mm. Signed-off-by: Oleg Nesterov Reviewed-by: Michal Hocko Cc: Balbir Singh Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Michal Hocko Cc: Peter Chiang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 5ac3c19c245c..750c2e594617 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -397,9 +397,15 @@ retry: /* * Search through everything else, we should not get here often. */ - for_each_process_thread(g, c) { - if (!(c->flags & PF_KTHREAD) && c->mm == mm) - goto assign_new_owner; + for_each_process(g) { + if (g->flags & PF_KTHREAD) + continue; + for_each_thread(g, c) { + if (c->mm == mm) + goto assign_new_owner; + if (c->mm) + break; + } } read_unlock(&tasklist_lock); /* * We found no owner yet mm_users > 1: this implies that we are -- cgit From 664eeddeef6539247691197c1ac124d4aa872ab6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:08 -0700 Subject: mm: page_alloc: use jump labels to avoid checking number_of_cpusets If cpusets are not in use then we still check a global variable on every page allocation. Use jump labels to avoid the overhead. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Johannes Weiner Cc: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. McKenney" Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 29 ++++++++++++++++++++++++++--- kernel/cpuset.c | 14 ++++---------- mm/page_alloc.c | 3 ++- 3 files changed, 32 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index b19d3dc2e651..ade2390ffe92 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -12,10 +12,31 @@ #include #include #include +#include #ifdef CONFIG_CPUSETS -extern int number_of_cpusets; /* How many cpusets are defined in system?
*/ +extern struct static_key cpusets_enabled_key; +static inline bool cpusets_enabled(void) +{ + return static_key_false(&cpusets_enabled_key); +} + +static inline int nr_cpusets(void) +{ + /* jump label reference count + the top-level cpuset */ + return static_key_count(&cpusets_enabled_key) + 1; +} + +static inline void cpuset_inc(void) +{ + static_key_slow_inc(&cpusets_enabled_key); +} + +static inline void cpuset_dec(void) +{ + static_key_slow_dec(&cpusets_enabled_key); +} extern int cpuset_init(void); extern void cpuset_init_smp(void); @@ -32,13 +53,13 @@ extern int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask); static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) { - return number_of_cpusets <= 1 || + return nr_cpusets() <= 1 || __cpuset_node_allowed_softwall(node, gfp_mask); } static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) { - return number_of_cpusets <= 1 || + return nr_cpusets() <= 1 || __cpuset_node_allowed_hardwall(node, gfp_mask); } @@ -124,6 +145,8 @@ static inline void set_mems_allowed(nodemask_t nodemask) #else /* !CONFIG_CPUSETS */ +static inline bool cpusets_enabled(void) { return false; } + static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3d54c418bd06..130017843899 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -61,12 +61,7 @@ #include #include -/* - * Tracks how many cpusets are currently defined in system. - * When there is only one cpuset (the root cpuset) we can - * short circuit some hooks. - */ -int number_of_cpusets __read_mostly; +struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE; /* See "Frequency meter" comments, below. */ @@ -611,7 +606,7 @@ static int generate_sched_domains(cpumask_var_t **domains, goto done; } - csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); + csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL); if (!csa) goto done; csn = 0; @@ -1888,7 +1883,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) if (is_spread_slab(parent)) set_bit(CS_SPREAD_SLAB, &cs->flags); - number_of_cpusets++; + cpuset_inc(); if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) goto out_unlock; @@ -1939,7 +1934,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) if (is_sched_load_balance(cs)) update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); - number_of_cpusets--; + cpuset_dec(); clear_bit(CS_ONLINE, &cs->flags); mutex_unlock(&cpuset_mutex); @@ -1992,7 +1987,6 @@ int __init cpuset_init(void) if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) BUG(); - number_of_cpusets = 1; return 0; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b4381eaee715..a2955e101715 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1921,7 +1921,8 @@ zonelist_scan: if (IS_ENABLED(CONFIG_NUMA) && zlc_active && !zlc_zone_worth_trying(zonelist, z, allowednodes)) continue; - if ((alloc_flags & ALLOC_CPUSET) && + if (cpusets_enabled() && + (alloc_flags & ALLOC_CPUSET) && !cpuset_zone_allowed_softwall(zone, gfp_mask)) continue; BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); -- cgit From f6187769dae48234f3877df3c4d99294cc2254fa Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:12 -0700 Subject: sys_sgetmask/sys_ssetmask: add CONFIG_SGETMASK_SYSCALL sys_sgetmask and sys_ssetmask are obsolete system calls no longer supported in libc. 
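For context, the two syscalls are thin legacy wrappers around the first word of the blocked-signal mask; their behavior is roughly the following (a paraphrase of the kernel/signal.c implementations, trimmed for illustration):

	SYSCALL_DEFINE0(sgetmask)
	{
		/* report the low 32 bits of the blocked mask */
		return current->blocked.sig[0];
	}

	SYSCALL_DEFINE1(ssetmask, int, newmask)
	{
		int old = current->blocked.sig[0];
		sigset_t newset;

		/* replace it, never allowing SIGKILL/SIGSTOP to be blocked */
		siginitset(&newset, newmask & ~(sigmask(SIGKILL) | sigmask(SIGSTOP)));
		set_current_blocked(&newset);
		return old;
	}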
This patch replaces the architecture-related __ARCH_WANT_SYS_SGETMASK with an expert-mode configuration option. That option is enabled by default for those architectures. Signed-off-by: Fabian Frederick Cc: Steven Miao Cc: Mikael Starvik Cc: Jesper Nilsson Cc: David Howells Cc: Geert Uytterhoeven Cc: Michal Simek Cc: Ralf Baechle Cc: Koichi Yasutake Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "David S. Miller" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Greg Ungerer Cc: Heiko Carstens Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/blackfin/include/asm/unistd.h | 1 - arch/cris/include/asm/unistd.h | 1 - arch/frv/include/asm/unistd.h | 1 - arch/m68k/include/asm/unistd.h | 1 - arch/microblaze/include/asm/unistd.h | 1 - arch/mips/include/asm/unistd.h | 1 - arch/mn10300/include/asm/unistd.h | 1 - arch/parisc/include/asm/unistd.h | 1 - arch/powerpc/include/asm/unistd.h | 1 - arch/sh/include/asm/unistd.h | 1 - arch/sparc/include/asm/unistd.h | 1 - arch/x86/include/asm/unistd.h | 1 - init/Kconfig | 10 ++++++++++ kernel/signal.c | 4 ++-- kernel/sys_ni.c | 2 ++ 15 files changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/arch/blackfin/include/asm/unistd.h b/arch/blackfin/include/asm/unistd.h index c35414bdf7bd..c8c8ff9eff61 100644 --- a/arch/blackfin/include/asm/unistd.h +++ b/arch/blackfin/include/asm/unistd.h @@ -12,7 +12,6 @@ #define __ARCH_WANT_SYS_ALARM #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_FADVISE64 #define __ARCH_WANT_SYS_GETPGRP diff --git a/arch/cris/include/asm/unistd.h b/arch/cris/include/asm/unistd.h index 5cc7d1991e48..0f40fed1ba25 100644 --- a/arch/cris/include/asm/unistd.h +++ b/arch/cris/include/asm/unistd.h @@ -15,7 +15,6 @@ #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_UTIME diff --git a/arch/frv/include/asm/unistd.h b/arch/frv/include/asm/unistd.h index 70ec7293dce7..17b5df8fc28a 100644 --- a/arch/frv/include/asm/unistd.h +++ b/arch/frv/include/asm/unistd.h @@ -13,7 +13,6 @@ /* #define __ARCH_WANT_SYS_GETHOSTNAME */ #define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE -/* #define __ARCH_WANT_SYS_SGETMASK */ /* #define __ARCH_WANT_SYS_SIGNAL */ #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_UTIME diff --git a/arch/m68k/include/asm/unistd.h b/arch/m68k/include/asm/unistd.h index 33afa56ad47a..1fcdd344c7ad 100644 --- a/arch/m68k/include/asm/unistd.h +++ b/arch/m68k/include/asm/unistd.h @@ -13,7 +13,6 @@ #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_UTIME diff --git a/arch/microblaze/include/asm/unistd.h b/arch/microblaze/include/asm/unistd.h index b14232b6878f..fd56a8f66489 100644 --- a/arch/microblaze/include/asm/unistd.h +++ b/arch/microblaze/include/asm/unistd.h @@ -19,7 +19,6 @@ #define __ARCH_WANT_SYS_ALARM #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_UTIME diff --git a/arch/mips/include/asm/unistd.h b/arch/mips/include/asm/unistd.h index 413d6c612bec..e55813029d5a 100644 ---
a/arch/mips/include/asm/unistd.h +++ b/arch/mips/include/asm/unistd.h @@ -29,7 +29,6 @@ #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_UTIME #define __ARCH_WANT_SYS_WAITPID #define __ARCH_WANT_SYS_SOCKETCALL diff --git a/arch/mn10300/include/asm/unistd.h b/arch/mn10300/include/asm/unistd.h index 9d4e2d1ef90e..0522468f488b 100644 --- a/arch/mn10300/include/asm/unistd.h +++ b/arch/mn10300/include/asm/unistd.h @@ -26,7 +26,6 @@ #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_UTIME diff --git a/arch/parisc/include/asm/unistd.h b/arch/parisc/include/asm/unistd.h index 74d835820ee7..5f4c68daa261 100644 --- a/arch/parisc/include/asm/unistd.h +++ b/arch/parisc/include/asm/unistd.h @@ -145,7 +145,6 @@ type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5) \ #define __ARCH_WANT_SYS_ALARM #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_COMPAT_SYS_TIME diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index 9b892bbd9d84..5ce5552ab9f5 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -29,7 +29,6 @@ #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_UTIME diff --git a/arch/sh/include/asm/unistd.h b/arch/sh/include/asm/unistd.h index e77816c4b9bc..126fe8340b22 100644 --- a/arch/sh/include/asm/unistd.h +++ b/arch/sh/include/asm/unistd.h @@ -11,7 +11,6 @@ # define __ARCH_WANT_SYS_GETHOSTNAME # define __ARCH_WANT_SYS_IPC # define __ARCH_WANT_SYS_PAUSE -# define __ARCH_WANT_SYS_SGETMASK # define __ARCH_WANT_SYS_SIGNAL # define __ARCH_WANT_SYS_TIME # define __ARCH_WANT_SYS_UTIME diff --git a/arch/sparc/include/asm/unistd.h b/arch/sparc/include/asm/unistd.h index dfa53fdd5cbc..0aac1e8f2968 100644 --- a/arch/sparc/include/asm/unistd.h +++ b/arch/sparc/include/asm/unistd.h @@ -25,7 +25,6 @@ #define __ARCH_WANT_SYS_ALARM #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_UTIME diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index 3f556c6a0157..2b19caa4081c 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -41,7 +41,6 @@ # define __ARCH_WANT_SYS_OLD_GETRLIMIT # define __ARCH_WANT_SYS_OLD_UNAME # define __ARCH_WANT_SYS_PAUSE -# define __ARCH_WANT_SYS_SGETMASK # define __ARCH_WANT_SYS_SIGNAL # define __ARCH_WANT_SYS_SIGPENDING # define __ARCH_WANT_SYS_SIGPROCMASK diff --git a/init/Kconfig b/init/Kconfig index ce034ad4a162..9d76b99af1b9 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1313,6 +1313,16 @@ config UID16 help This enables the legacy 16-bit UID syscall wrappers. 
+config SGETMASK_SYSCALL + bool "sgetmask/ssetmask syscalls support" if EXPERT + def_bool PARISC || MN10300 || BLACKFIN || M68K || PPC || MIPS || X86 || SPARC || CRIS || MICROBLAZE || SUPERH + ---help--- + sys_sgetmask and sys_ssetmask are obsolete system calls + no longer supported in libc but still enabled by default in some + architectures. + + If unsure, leave the default option here. + config SYSFS_SYSCALL bool "Sysfs syscall support" if EXPERT default y diff --git a/kernel/signal.c b/kernel/signal.c index 6ea13c09ae56..6e600aaa2af4 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3496,7 +3496,7 @@ COMPAT_SYSCALL_DEFINE3(sigaction, int, sig, } #endif -#ifdef __ARCH_WANT_SYS_SGETMASK +#ifdef CONFIG_SGETMASK_SYSCALL /* * For backwards compatibility. Functionality superseded by sigprocmask. @@ -3517,7 +3517,7 @@ SYSCALL_DEFINE1(ssetmask, int, newmask) return old; } -#endif /* __ARCH_WANT_SGETMASK */ +#endif /* CONFIG_SGETMASK_SYSCALL */ #ifdef __ARCH_WANT_SYS_SIGNAL /* diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index bc8d1b74a6b9..36441b51b5df 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -135,6 +135,8 @@ cond_syscall(sys_setresgid16); cond_syscall(sys_setresuid16); cond_syscall(sys_setreuid16); cond_syscall(sys_setuid16); +cond_syscall(sys_sgetmask); +cond_syscall(sys_ssetmask); cond_syscall(sys_vm86old); cond_syscall(sys_vm86); cond_syscall(sys_ipc); -- cgit From 84117da5b79ffb4077bb05d64c86dfa4d746115c Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:17 -0700 Subject: kernel/cpu.c: convert printk to pr_foo() no level printk converted to pr_warn (if err) no level printk converted to pr_info (disabling non-boot cpus) Other printk converted to respective level. Signed-off-by: Fabian Frederick Cc: "Rafael J. 
Wysocki" Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 247979a1b815..acf791c55b71 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -283,8 +283,7 @@ static inline void check_for_tasks(int cpu) task_cputime(p, &utime, &stime); if (task_cpu(p) == cpu && p->state == TASK_RUNNING && (utime || stime)) - printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " - "(state = %ld, flags = %x)\n", + pr_warn("Task %s (pid = %d) is on cpu %d (state = %ld, flags = %x)\n", p->comm, task_pid_nr(p), cpu, p->state, p->flags); } @@ -336,8 +335,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) if (err) { nr_calls--; __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); - printk("%s: attempt to take down CPU %u failed\n", - __func__, cpu); + pr_warn("%s: attempt to take down CPU %u failed\n", + __func__, cpu); goto out_release; } @@ -444,8 +443,8 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); if (ret) { nr_calls--; - printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n", - __func__, cpu); + pr_warn("%s: attempt to bring up CPU %u failed\n", + __func__, cpu); goto out_notify; } @@ -475,11 +474,10 @@ int cpu_up(unsigned int cpu) int err = 0; if (!cpu_possible(cpu)) { - printk(KERN_ERR "can't online cpu %d because it is not " - "configured as may-hotadd at boot time\n", cpu); + pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n", + cpu); #if defined(CONFIG_IA64) - printk(KERN_ERR "please check additional_cpus= boot " - "parameter\n"); + pr_err("please check additional_cpus= boot parameter\n"); #endif return -EINVAL; } @@ -518,7 +516,7 @@ int disable_nonboot_cpus(void) */ cpumask_clear(frozen_cpus); - printk("Disabling non-boot CPUs ...\n"); + pr_info("Disabling non-boot CPUs ...\n"); for_each_online_cpu(cpu) { if (cpu == first_cpu) continue; @@ -526,8 +524,7 @@ int disable_nonboot_cpus(void) if (!error) cpumask_set_cpu(cpu, frozen_cpus); else { - printk(KERN_ERR "Error taking CPU%d down: %d\n", - cpu, error); + pr_err("Error taking CPU%d down: %d\n", cpu, error); break; } } @@ -537,7 +534,7 @@ int disable_nonboot_cpus(void) /* Make sure the CPUs won't be enabled by someone else */ cpu_hotplug_disabled = 1; } else { - printk(KERN_ERR "Non-boot CPUs are not disabled\n"); + pr_err("Non-boot CPUs are not disabled\n"); } cpu_maps_update_done(); return error; @@ -561,17 +558,17 @@ void __ref enable_nonboot_cpus(void) if (cpumask_empty(frozen_cpus)) goto out; - printk(KERN_INFO "Enabling non-boot CPUs ...\n"); + pr_info("Enabling non-boot CPUs ...\n"); arch_enable_nonboot_cpus_begin(); for_each_cpu(cpu, frozen_cpus) { error = _cpu_up(cpu, 1); if (!error) { - printk(KERN_INFO "CPU%d is up\n", cpu); + pr_info("CPU%d is up\n", cpu); continue; } - printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); + pr_warn("Error taking CPU%d up: %d\n", cpu, error); } arch_enable_nonboot_cpus_end(); -- cgit From 462b29b8564c489e0aa3f5a3a505fd2776af5e55 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:18 -0700 Subject: kernel/backtracetest.c: replace no level printk by pr_info() Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/backtracetest.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 
9 deletions(-) (limited to 'kernel') diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c index a5e026bc45c4..1323360d90e3 100644 --- a/kernel/backtracetest.c +++ b/kernel/backtracetest.c @@ -19,8 +19,8 @@ static void backtrace_test_normal(void) { - printk("Testing a backtrace from process context.\n"); - printk("The following trace is a kernel self test and not a bug!\n"); + pr_info("Testing a backtrace from process context.\n"); + pr_info("The following trace is a kernel self test and not a bug!\n"); dump_stack(); } @@ -37,8 +37,8 @@ static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0); static void backtrace_test_irq(void) { - printk("Testing a backtrace from irq context.\n"); - printk("The following trace is a kernel self test and not a bug!\n"); + pr_info("Testing a backtrace from irq context.\n"); + pr_info("The following trace is a kernel self test and not a bug!\n"); init_completion(&backtrace_work); tasklet_schedule(&backtrace_tasklet); @@ -51,8 +51,8 @@ static void backtrace_test_saved(void) struct stack_trace trace; unsigned long entries[8]; - printk("Testing a saved backtrace.\n"); - printk("The following trace is a kernel self test and not a bug!\n"); + pr_info("Testing a saved backtrace.\n"); + pr_info("The following trace is a kernel self test and not a bug!\n"); trace.nr_entries = 0; trace.max_entries = ARRAY_SIZE(entries); @@ -65,19 +65,19 @@ static void backtrace_test_saved(void) #else static void backtrace_test_saved(void) { - printk("Saved backtrace test skipped.\n"); + pr_info("Saved backtrace test skipped.\n"); } #endif static int backtrace_regression_test(void) { - printk("====[ backtrace testing ]===========\n"); + pr_info("====[ backtrace testing ]===========\n"); backtrace_test_normal(); backtrace_test_irq(); backtrace_test_saved(); - printk("====[ end of backtrace testing ]====\n"); + pr_info("====[ end of backtrace testing ]====\n"); return 0; } -- cgit From a6c8c6902c53e620e607e83f520e9ae424e2a424 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:19 -0700 Subject: kernel/capability.c: code clean-up - EXPORT_SYMBOL - typo: unexpectidly->unexpectedly - function prototype over 80 characters Signed-off-by: Fabian Frederick Cc: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/capability.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index a8d63df0c322..84b2bbf443e7 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -24,7 +24,6 @@ */ const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; - EXPORT_SYMBOL(__cap_empty_set); int file_caps_enabled = 1; @@ -189,7 +188,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr) * * An alternative would be to return an error here * (-ERANGE), but that causes legacy applications to - * unexpectidly fail; the capget/modify/capset aborts + * unexpectedly fail; the capget/modify/capset aborts * before modification is attempted and the application * fails. */ @@ -395,7 +394,8 @@ EXPORT_SYMBOL(ns_capable); * This does not set PF_SUPERPRIV because the caller may not * actually be privileged. 
*/ -bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) +bool file_ns_capable(const struct file *file, struct user_namespace *ns, + int cap) { if (WARN_ON_ONCE(!cap_valid(cap))) return false; -- cgit From b9e5db6d2bbe4416cd1c30c2d1891ef39d6bd0b7 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:20 -0700 Subject: kernel/exec_domain.c: code clean-up Fix checkpatch warnings about EXPORT_SYMBOL and return() Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exec_domain.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index 0dbeae374225..83d4382f5699 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -37,7 +37,7 @@ static unsigned long ident_map[32] = { struct exec_domain default_exec_domain = { .name = "Linux", /* name */ .handler = default_handler, /* lcall7 causes a seg fault. */ - .pers_low = 0, /* PER_LINUX personality. */ + .pers_low = 0, /* PER_LINUX personality. */ .pers_high = 0, /* PER_LINUX personality. */ .signal_map = ident_map, /* Identity map signals. */ .signal_invmap = ident_map, /* - both ways. */ @@ -83,7 +83,7 @@ lookup_exec_domain(unsigned int personality) ep = &default_exec_domain; out: read_unlock(&exec_domains_lock); - return (ep); + return ep; } int @@ -110,8 +110,9 @@ register_exec_domain(struct exec_domain *ep) out: write_unlock(&exec_domains_lock); - return (err); + return err; } +EXPORT_SYMBOL(register_exec_domain); int unregister_exec_domain(struct exec_domain *ep) @@ -133,6 +134,7 @@ unregister: write_unlock(&exec_domains_lock); return 0; } +EXPORT_SYMBOL(unregister_exec_domain); int __set_personality(unsigned int personality) { @@ -144,6 +146,7 @@ int __set_personality(unsigned int personality) return 0; } +EXPORT_SYMBOL(__set_personality); #ifdef CONFIG_PROC_FS static int execdomains_proc_show(struct seq_file *m, void *v) @@ -188,8 +191,3 @@ SYSCALL_DEFINE1(personality, unsigned int, personality) return old; } - - -EXPORT_SYMBOL(register_exec_domain); -EXPORT_SYMBOL(unregister_exec_domain); -EXPORT_SYMBOL(__set_personality); -- cgit From eaa1809b900c460a020bff1f4030f4f6a237b2b2 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:21 -0700 Subject: kernel/latencytop.c: convert seq_printf to seq_puts This patch also fixes one function declaration over 80 characters. 
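The seq_printf to seq_puts conversion is the usual micro-cleanup: with a constant format string and no arguments, seq_printf() only adds pointless format parsing. A minimal before/after for illustration:

	seq_printf(m, "\n");	/* parses the format string at runtime */
	seq_puts(m, "\n");	/* plain string copy, identical output */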
Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/latencytop.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/latencytop.c b/kernel/latencytop.c index a462b317f9a0..a02812743a7e 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -88,7 +88,8 @@ static void clear_global_latency_tracing(void) } static void __sched -account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat) +account_global_scheduler_latency(struct task_struct *tsk, + struct latency_record *lat) { int firstnonnull = MAXLR + 1; int i; @@ -255,7 +256,7 @@ static int lstats_show(struct seq_file *m, void *v) break; seq_printf(m, " %ps", (void *)bt); } - seq_printf(m, "\n"); + seq_puts(m, "\n"); } } return 0; -- cgit From cf25004069d3ccd6aae607d8175bdff67c1dd319 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:22 -0700 Subject: kernel/stop_machine.c: kernel-doc warning fix Signed-off-by: Fabian Frederick Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/stop_machine.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 01fbae5b97b7..695f0c6cd169 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -307,6 +307,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * * @cpu: cpu to stop * @fn: function to execute * @arg: argument to @fn + * @work_buf: pointer to cpu_stop_work structure * * Similar to stop_one_cpu() but doesn't wait for completion. The * caller is responsible for ensuring @work_buf is currently unused -- cgit From cac92ba74f19fd58a28976f753f9327f27cf1669 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:23 -0700 Subject: kernel/tracepoint.c: kernel-doc fixes Signed-off-by: Fabian Frederick Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/tracepoint.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 6620e5837ce2..33cbd8c203f8 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -239,6 +239,7 @@ static int tracepoint_remove_func(struct tracepoint *tp, * tracepoint_probe_register - Connect a probe to a tracepoint * @tp: tracepoint * @probe: probe handler + * @data: tracepoint data * * Returns 0 if ok, error value on error. * Note: if @tp is within a module, the caller is responsible for @@ -264,6 +265,7 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register); * tracepoint_probe_unregister - Disconnect a probe from a tracepoint * @tp: tracepoint * @probe: probe function pointer + * @data: tracepoint data * * Returns 0 if ok, error value on error. 
*/ -- cgit From 6c5a53c67057bddf7f8e26c93a8e045215f61539 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:24 -0700 Subject: kernel/res_counter.c: replace simple_strtoull by kstrtoull [akpm@linux-foundation.org: don't overwrite kstrtoull()'s errno] Signed-off-by: Fabian Frederick Cc: Michal Hocko Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/res_counter.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/res_counter.c b/kernel/res_counter.c index 51dbac6a3633..e791130f85a7 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -186,8 +186,11 @@ int res_counter_memparse_write_strategy(const char *buf, /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */ if (*buf == '-') { - res = simple_strtoull(buf + 1, &end, 10); - if (res != 1 || *end != '\0') + int rc = kstrtoull(buf + 1, 10, &res); + + if (rc) + return rc; + if (res != 1) return -EINVAL; *resp = RES_COUNTER_MAX; return 0; -- cgit From 616feab753972b9751308f3cd2a68fc57eae8edb Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:25 -0700 Subject: kernel/reboot.c: convert simple_strtoul to kstrtoint Replace obsolete function. kstrtoint is used as reboot_cpu is an integer. Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/reboot.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/reboot.c b/kernel/reboot.c index 662c83fc16b7..a3a9e240fcdb 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -388,15 +388,22 @@ static int __init reboot_setup(char *str) break; case 's': - if (isdigit(*(str+1))) - reboot_cpu = simple_strtoul(str+1, NULL, 0); - else if (str[1] == 'm' && str[2] == 'p' && - isdigit(*(str+3))) - reboot_cpu = simple_strtoul(str+3, NULL, 0); - else + { + int rc; + + if (isdigit(*(str+1))) { + rc = kstrtoint(str+1, 0, &reboot_cpu); + if (rc) + return rc; + } else if (str[1] == 'm' && str[2] == 'p' && + isdigit(*(str+3))) { + rc = kstrtoint(str+3, 0, &reboot_cpu); + if (rc) + return rc; + } else reboot_mode = REBOOT_SOFT; break; - + } case 'g': reboot_mode = REBOOT_GPIO; break; -- cgit From 95583e4ab5745218373add88ffddb70faff2d0c8 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:26 -0700 Subject: kernel/utsname_sysctl.c: replace obsolete __initcall by device_initcall Also fixes checkpatch warnings on proc_dostring function parameters Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/utsname_sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 4f69f9a5e221..6fbe811c7ad1 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -51,7 +51,7 @@ static int proc_do_uts_string(ctl_table *table, int write, int r; memcpy(&uts_table, table, sizeof(uts_table)); uts_table.data = get_uts(table, write); - r = proc_dostring(&uts_table,write,buffer,lenp, ppos); + r = proc_dostring(&uts_table, write, buffer, lenp, ppos); put_uts(table, write, uts_table.data); if (write) @@ -135,4 +135,4 @@ static int __init utsname_sysctl_init(void) return 0; } -__initcall(utsname_sysctl_init); +device_initcall(utsname_sysctl_init); -- cgit From b51dbec68c8732caac2495f558659556523e8322 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:26 -0700 Subject: kernel/hung_task.c: convert 
simple_strtoul to kstrtouint sysctl_hung_task_panic has been changed to unsigned int. Use kstrtouint instead of the obsolete simple_strtoul. Signed-off-by: Fabian Frederick Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hung_task.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 06bb1417b063..06db12434d72 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -52,8 +52,10 @@ unsigned int __read_mostly sysctl_hung_task_panic = static int __init hung_task_panic_setup(char *str) { - sysctl_hung_task_panic = simple_strtoul(str, NULL, 0); + int rc = kstrtouint(str, 0, &sysctl_hung_task_panic); + if (rc) + return rc; return 1; } __setup("hung_task_panic=", hung_task_panic_setup); -- cgit From b300a4ea665f7fa44f015616ac1874deca891c5e Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 4 Jun 2014 16:11:27 -0700 Subject: kernel/user.c: drop unused field 'files' from user_struct Nobody seems to have used it for a long time. Let's drop it. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 - kernel/user.c | 1 - 2 files changed, 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 2f2dd7d932a2..611676fd4c2c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -745,7 +745,6 @@ static inline int signal_group_exit(const struct signal_struct *sig) struct user_struct { atomic_t __count; /* reference count */ atomic_t processes; /* How many processes does this user have? */ - atomic_t files; /* How many open files does this user have? */ atomic_t sigpending; /* How many pending signals does this user have? */ #ifdef CONFIG_INOTIFY_USER atomic_t inotify_watches; /* How many inotify watches does this user have? */ diff --git a/kernel/user.c b/kernel/user.c index 294fc6a94168..4efa39350e44 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -87,7 +87,6 @@ static DEFINE_SPINLOCK(uidhash_lock); struct user_struct root_user = { .__count = ATOMIC_INIT(1), .processes = ATOMIC_INIT(1), - .files = ATOMIC_INIT(0), .sigpending = ATOMIC_INIT(0), .locked_shm = 0, .uid = GLOBAL_ROOT_UID, -- cgit From 0a581694ab7a5bc083d710df8a552a6a055b005f Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 4 Jun 2014 16:11:28 -0700 Subject: printk: split code for making free space in the log buffer The check for free space in the log buffer always passes when "first_seq" and "next_seq" are equal. In theory, it might cause writing outside of the log buffer. Fortunately, the current usage looks safe because the used "text" and "dict" buffers are quite limited. See the second patch for more details. Anyway, it is better to be on the safe side and add a check. An easy solution is done in the 2nd patch and it is improved in the 4th patch. The 5th patch fixes the computation of the printed message length. The 1st and 3rd patches just do some code refactoring to make the other patches easier. This patch (of 5): Some fixes will be needed in the check for free space. They will be easier if the code is moved outside of the quite long log_store() function. This patch does not change the existing behavior.
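The rule being factored out can be stated on its own: records are contiguous, so when next_idx is ahead of first_idx the usable space is the larger of the tail behind next_idx and the head before first_idx, and when the used region wraps it is simply the gap between the two indexes. A standalone sketch of the same computation (simplified types, not the kernel code):

	/* free bytes available for one contiguous record */
	static u32 ring_free(u32 first_idx, u32 next_idx, u32 buf_len)
	{
		if (next_idx > first_idx)	/* used area does not wrap */
			return max(buf_len - next_idx, first_idx);
		return first_idx - next_idx;	/* used area wraps around */
	}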
Signed-off-by: Petr Mladek Cc: Jan Kara Cc: Jiri Kosina Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 44 +++++++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 221229cf0190..99b7a2d87b6a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -297,6 +297,34 @@ static u32 log_next(u32 idx) return idx + msg->len; } +/* check whether there is enough free space for the given message */ +static int logbuf_has_space(u32 msg_size) +{ + u32 free; + + if (log_next_idx > log_first_idx) + free = max(log_buf_len - log_next_idx, log_first_idx); + else + free = log_first_idx - log_next_idx; + + /* + * We need space also for an empty header that signalizes wrapping + * of the buffer. + */ + return free >= msg_size + sizeof(struct printk_log); +} + +static void log_make_free_space(u32 msg_size) +{ + while (log_first_seq < log_next_seq) { + if (logbuf_has_space(msg_size)) + return; + /* drop old messages until we have enough continuous space */ + log_first_idx = log_next(log_first_idx); + log_first_seq++; + } +} + /* insert record into the buffer, discard old ones, update heads */ static void log_store(int facility, int level, enum log_flags flags, u64 ts_nsec, @@ -311,21 +339,7 @@ static void log_store(int facility, int level, pad_len = (-size) & (LOG_ALIGN - 1); size += pad_len; - while (log_first_seq < log_next_seq) { - u32 free; - - if (log_next_idx > log_first_idx) - free = max(log_buf_len - log_next_idx, log_first_idx); - else - free = log_first_idx - log_next_idx; - - if (free >= size + sizeof(struct printk_log)) - break; - - /* drop old messages until we have enough contiuous space */ - log_first_idx = log_next(log_first_idx); - log_first_seq++; - } + log_make_free_space(size); if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) { /* -- cgit From f40e4b9f70d48eb08f443642283fdd9d05b27c6d Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 4 Jun 2014 16:11:30 -0700 Subject: printk: ignore too long messages There was no check for too long messages. The check for free space always passed when first_seq and next_seq were equal. Enough free space was not guaranteed, though. log_store() might be called to store messages up to 64kB + 64kB + 16B. This is the sum of the maximal text_len and dict_len values plus the size of the structure printk_log. On the other hand, the minimal size for the main log buffer currently is 4kB and it is enforced only by Kconfig. The good news is that the usage looks safe right now. log_store() is called only from vprintk_emit() and cont_flush(). Here the "text" part is always passed via a static buffer and the length is limited to LOG_LINE_MAX which is 1024. The "dict" part is NULL in most cases. The only exception is when vprintk_emit() is called from printk_emit() and dev_vprintk_emit(). But printk_emit() is currently used only in devkmsg_writev() and here "dict" is NULL as well. In dev_vprintk_emit(), "dict" is limited by the static buffer "hdr" of size 128 bytes. It means that the current maximal printed text is 1024B + 128B + 16B and it always fits the log buffer. But it is only a matter of time when someone calls printk_emit() with unsafe parameters, especially the "dict" one. This patch adds a check for the free space when the buffer is empty. It reuses the already existing logbuf_has_space() function but it has to add an extra parameter.
It defines whether the buffer is empty. Note that the same values of "first_idx" and "next_idx" might also mean that the buffer is full. If the buffer is empty, we must respect the current position of the indexes. We cannot reset them to the beginning of the buffer. Otherwise, the functions reading the buffer would get confused. The question is what to do when the message is too long. This patch uses the easiest solution and just ignores the problematic message. Let's do something better in a followup patch. Signed-off-by: Petr Mladek Cc: Jan Kara Cc: Jiri Kosina Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 99b7a2d87b6a..8fbbab1771eb 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -297,12 +297,20 @@ static u32 log_next(u32 idx) return idx + msg->len; } -/* check whether there is enough free space for the given message */ -static int logbuf_has_space(u32 msg_size) +/* + * Check whether there is enough free space for the given message. + * + * The same values of first_idx and next_idx mean that the buffer + * is either empty or full. + * + * If the buffer is empty, we must respect the position of the indexes. + * They cannot be reset to the beginning of the buffer. + */ +static int logbuf_has_space(u32 msg_size, bool empty) { u32 free; - if (log_next_idx > log_first_idx) + if (log_next_idx > log_first_idx || empty) free = max(log_buf_len - log_next_idx, log_first_idx); else free = log_first_idx - log_next_idx; @@ -314,15 +322,21 @@ static int logbuf_has_space(u32 msg_size) return free >= msg_size + sizeof(struct printk_log); } -static void log_make_free_space(u32 msg_size) +static int log_make_free_space(u32 msg_size) { while (log_first_seq < log_next_seq) { - if (logbuf_has_space(msg_size)) - return; + if (logbuf_has_space(msg_size, false)) + return 0; /* drop old messages until we have enough continuous space */ log_first_idx = log_next(log_first_idx); log_first_seq++; } + + /* sequence numbers are equal, so the log buffer is empty */ + if (logbuf_has_space(msg_size, true)) + return 0; + + return -ENOMEM; } /* insert record into the buffer, discard old ones, update heads */ @@ -339,7 +353,9 @@ static void log_store(int facility, int level, pad_len = (-size) & (LOG_ALIGN - 1); size += pad_len; - log_make_free_space(size); + /* if message does not fit empty log buffer, ignore it */ + if (log_make_free_space(size)) + return; if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) { /* -- cgit From 85c87043023b7e5535f975bbee12a4f5399df520 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 4 Jun 2014 16:11:31 -0700 Subject: printk: split message size computation We will want to recompute the message size when shrinking too long messages. Let's put the code into a separate function. The side effect of setting "pad_len" is not nice but it is worth it to remove the code duplication. Note that I will probably have one more usage for this function when handling messages in a safe way in NMI context. This patch does not change the existing behavior.
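The "(-size) & (LOG_ALIGN - 1)" idiom computes the align-up padding in a single step for a power-of-two LOG_ALIGN. A worked instance (assuming LOG_ALIGN == 8; the kernel derives the real value from the struct alignment):

	u32 size = 21;
	u32 pad = (-size) & (LOG_ALIGN - 1);	/* (-21) & 7 == 3 */
	size += pad;				/* 24, the next multiple of 8 */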
Signed-off-by: Petr Mladek Cc: Jan Kara Cc: Jiri Kosina Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 8fbbab1771eb..9f088ed8404b 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -339,6 +339,18 @@ static int log_make_free_space(u32 msg_size) return -ENOMEM; } +/* compute the message size including the padding bytes */ +static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len) +{ + u32 size; + + size = sizeof(struct printk_log) + text_len + dict_len; + *pad_len = (-size) & (LOG_ALIGN - 1); + size += *pad_len; + + return size; +} + /* insert record into the buffer, discard old ones, update heads */ static void log_store(int facility, int level, enum log_flags flags, u64 ts_nsec, @@ -349,9 +361,7 @@ static void log_store(int facility, int level, u32 size, pad_len; /* number of '\0' padding bytes to next message */ - size = sizeof(struct printk_log) + text_len + dict_len; - pad_len = (-size) & (LOG_ALIGN - 1); - size += pad_len; + size = msg_used_size(text_len, dict_len, &pad_len); /* if message does not fit empty log buffer, ignore it */ if (log_make_free_space(size)) -- cgit From 55bd53a4eb3dd18be8744f8b4d026068fc801a62 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 4 Jun 2014 16:11:32 -0700 Subject: printk: shrink too long messages We might want to print at least part of too long messages and add some warning for debugging purposes. The question is how long the shrunken message should be. If we use the whole buffer, it might get rotated too soon. Let's try to use only 1/4 of the buffer for now. Also shrink the whole dictionary. We do not want to parse it or break it in the middle of some pair of values. It would not cause any real harm but still. Signed-off-by: Petr Mladek Cc: Jan Kara Cc: Jiri Kosina Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 42 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9f088ed8404b..7131dd4d0e3a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -351,6 +351,32 @@ static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len) return size; } +/* + * Define how much of the log buffer we could take at maximum. The value + * must be greater than two. Note that only half of the buffer is available + * when the index points to the middle. + */ +#define MAX_LOG_TAKE_PART 4 +static const char trunc_msg[] = "<truncated>"; + +static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len, + u16 *dict_len, u32 *pad_len) +{ + /* + * The message should not take the whole buffer. Otherwise, it might + * get removed too soon.
+ */ + u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART; + if (*text_len > max_text_len) + *text_len = max_text_len; + /* enable the warning message */ + *trunc_msg_len = strlen(trunc_msg); + /* disable the "dict" completely */ + *dict_len = 0; + /* compute the size again, count also the warning message */ + return msg_used_size(*text_len + *trunc_msg_len, 0, pad_len); +} + /* insert record into the buffer, discard old ones, update heads */ static void log_store(int facility, int level, enum log_flags flags, u64 ts_nsec, @@ -359,13 +385,19 @@ static void log_store(int facility, int level, { struct printk_log *msg; u32 size, pad_len; + u16 trunc_msg_len = 0; /* number of '\0' padding bytes to next message */ size = msg_used_size(text_len, dict_len, &pad_len); - /* if message does not fit empty log buffer, ignore it */ - if (log_make_free_space(size)) - return; + if (log_make_free_space(size)) { + /* truncate the message if it is too long for empty buffer */ + size = truncate_msg(&text_len, &trunc_msg_len, + &dict_len, &pad_len); + /* survive when the log buffer is too small for trunc_msg */ + if (log_make_free_space(size)) + return; + } if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) { /* @@ -381,6 +413,10 @@ static void log_store(int facility, int level, msg = (struct printk_log *)(log_buf + log_next_idx); memcpy(log_text(msg), text, text_len); msg->text_len = text_len; + if (trunc_msg_len) { + memcpy(log_text(msg) + text_len, trunc_msg, trunc_msg_len); + msg->text_len += trunc_msg_len; + } memcpy(log_dict(msg), dict, dict_len); msg->dict_len = dict_len; msg->facility = facility; -- cgit From 034633ccb24d675850f99bf85c1c5880c831e4b6 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 4 Jun 2014 16:11:33 -0700 Subject: printk: return really stored message length I wonder if anyone uses the printk return value, but it is there and should be counted correctly. This patch modifies log_store() to return the number of really stored bytes from the 'text' part. Also it handles the return value in vprintk_emit(). Note that log_store() is used also in cont_flush() but we could ignore the return value there. The function works with characters that were already counted earlier. In addition, the store could never fail here because the length of the printed text is limited by the "cont" buffer and "dict" is NULL.
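Seen from the caller, the new contract looks roughly like this (a sketch of the accounting, not the exact vprintk_emit() hunk below):

	printed_len += log_store(facility, level, lflags, 0,
				 dict, dictlen, text, text_len);
	/*
	 * == text_len : the text was stored verbatim
	 * == 0        : the message was dropped, no space even after
	 *               truncation
	 * otherwise   : the text was shrunk; the count includes the
	 *               appended "<truncated>" marker
	 */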
Signed-off-by: Petr Mladek Cc: Jan Kara Cc: Jiri Kosina Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 7131dd4d0e3a..7476a53bc378 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -378,10 +378,10 @@ static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len, } /* insert record into the buffer, discard old ones, update heads */ -static void log_store(int facility, int level, - enum log_flags flags, u64 ts_nsec, - const char *dict, u16 dict_len, - const char *text, u16 text_len) +static int log_store(int facility, int level, + enum log_flags flags, u64 ts_nsec, + const char *dict, u16 dict_len, + const char *text, u16 text_len) { struct printk_log *msg; u32 size, pad_len; @@ -396,7 +396,7 @@ static void log_store(int facility, int level, &dict_len, &pad_len); /* survive when the log buffer is too small for trunc_msg */ if (log_make_free_space(size)) - return; + return 0; } if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) { @@ -432,6 +432,8 @@ static void log_store(int facility, int level, /* insert message */ log_next_idx += msg->len; log_next_seq++; + + return msg->text_len; } #ifdef CONFIG_SECURITY_DMESG_RESTRICT @@ -1606,10 +1608,10 @@ asmlinkage int vprintk_emit(int facility, int level, "BUG: recent printk recursion!"; recursion_bug = 0; - printed_len += strlen(recursion_msg); + text_len = strlen(recursion_msg); /* emit KERN_CRIT message */ - log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, - NULL, 0, recursion_msg, printed_len); + printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, + NULL, 0, recursion_msg, text_len); } /* @@ -1662,9 +1664,12 @@ asmlinkage int vprintk_emit(int facility, int level, cont_flush(LOG_NEWLINE); /* buffer line if possible, otherwise store it right away */ - if (!cont_add(facility, level, text, text_len)) - log_store(facility, level, lflags | LOG_CONT, 0, - dict, dictlen, text, text_len); + if (cont_add(facility, level, text, text_len)) + printed_len += text_len; + else + printed_len += log_store(facility, level, + lflags | LOG_CONT, 0, + dict, dictlen, text, text_len); } else { bool stored = false; @@ -1683,11 +1688,12 @@ asmlinkage int vprintk_emit(int facility, int level, cont_flush(LOG_NEWLINE); } - if (!stored) - log_store(facility, level, lflags, 0, - dict, dictlen, text, text_len); + if (stored) + printed_len += text_len; + else + printed_len += log_store(facility, level, lflags, 0, + dict, dictlen, text, text_len); } - printed_len += text_len; /* * Try to acquire and then immediately release the console semaphore. -- cgit From ca1d432ad8a527fabc5c7ceed8526e3a28de121c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 4 Jun 2014 16:11:34 -0700 Subject: printk: remove outdated comment Comment about interesting interlocking between lockbuf_lock and console_sem is outdated. It was added in 2002 by commit a880f45a48be during conversion of console_lock to console_sem + lockbuf_lock. At that time release_console_sem() (today's equivalent is console_unlock()) was indeed using lockbuf_lock to avoid races between trylock on console_sem in printk() and unlock of console_sem. However these days the interlocking is gone and the races are avoided by rechecking logbuf state after releasing console_sem. 
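The pattern that replaced the old interlocking is a plain re-check loop: console_unlock() releases console_sem first and only then looks at the logbuf state again, retrying if new messages slipped in meanwhile. Schematically (hypothetical helper names, not the kernel functions):

	for (;;) {
		flush_records();		/* print everything under console_sem */
		up(&console_sem);
		if (!records_pending())		/* recheck logbuf state */
			break;
		if (!console_trylock())		/* a racing printk will flush */
			break;
	}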
Signed-off-by: Jan Kara
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/printk/printk.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)
(limited to 'kernel')
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 7476a53bc378..5bc54478c963 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -206,8 +206,7 @@ struct printk_log {
 };
 /*
- * The logbuf_lock protects kmsg buffer, indices, counters. It is also
- * used in interesting ways to provide interlocking in console_unlock();
+ * The logbuf_lock protects kmsg buffer, indices, counters.
 */
 static DEFINE_RAW_SPINLOCK(logbuf_lock);
-- cgit
From 608873cacb9d0d2811586fcc79a38b64eabd6d32 Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Wed, 4 Jun 2014 16:11:35 -0700
Subject: printk: release logbuf_lock before calling console_trylock_for_printk()

There's no reason to hold logbuf_lock when entering
console_trylock_for_printk(). The first thing this function does is to
call down_trylock(console_sem), and if that fails it immediately unlocks
logbuf_lock. So logbuf_lock isn't needed for that branch. When
down_trylock() succeeds, the rest of console_trylock() is OK without
logbuf_lock (it is called without it from other places), and the only
remaining thing in console_trylock_for_printk() is the can_use_console()
call. For that call console_sem is enough (it iterates all consoles and
checks the CON_ANYTIME flag).

So we drop logbuf_lock before entering console_trylock_for_printk(),
which simplifies the code.

[akpm@linux-foundation.org: fix have_callable_console() comment]
Signed-off-by: Jan Kara
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/printk/printk.c | 54 ++++++++++++++++++++------------------------------
 1 file changed, 21 insertions(+), 33 deletions(-)
(limited to 'kernel')
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 5bc54478c963..6e1b21a8a497 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -249,9 +249,6 @@ static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
 static char *log_buf = __log_buf;
 static u32 log_buf_len = __LOG_BUF_LEN;
-/* cpu currently holding logbuf_lock */
-static volatile unsigned int logbuf_cpu = UINT_MAX;
 /* human readable text of the record */
 static char *log_text(const struct printk_log *msg)
 {
@@ -1380,7 +1377,10 @@ static void zap_locks(void)
 sema_init(&console_sem, 1);
 }
-/* Check if we have any console registered that can be called early in boot. */
+/*
+ * Check if we have any console that is capable of printing while cpu is
+ * booting or shutting down. Requires console_sem.
+ */
 static int have_callable_console(void)
 {
 struct console *con;
@@ -1410,36 +1410,22 @@ static inline int can_use_console(unsigned int cpu)
 * messages from a 'printk'. Return true (and with the
 * console_lock held, and 'console_locked' set) if it
 * is successful, false otherwise.
- *
- * This gets called with the 'logbuf_lock' spinlock held and
- * interrupts disabled. It should return with 'lockbuf_lock'
- * released but interrupts still disabled.
 */
 static int console_trylock_for_printk(unsigned int cpu)
- __releases(&logbuf_lock)
 {
- int retval = 0, wake = 0;
-
- if (console_trylock()) {
- retval = 1;
-
- /*
- * If we can't use the console, we need to release
- * the console semaphore by hand to avoid flushing
- * the buffer. We need to hold the console semaphore
- * in order to do this test safely.
- */ - if (!can_use_console(cpu)) { - console_locked = 0; - wake = 1; - retval = 0; - } - } - logbuf_cpu = UINT_MAX; - raw_spin_unlock(&logbuf_lock); - if (wake) + if (!console_trylock()) + return 0; + /* + * If we can't use the console, we need to release the console + * semaphore by hand to avoid flushing the buffer. We need to hold the + * console semaphore in order to do this test safely. + */ + if (!can_use_console(cpu)) { + console_locked = 0; up(&console_sem); - return retval; + return 0; + } + return 1; } int printk_delay_msec __read_mostly; @@ -1572,6 +1558,9 @@ asmlinkage int vprintk_emit(int facility, int level, unsigned long flags; int this_cpu; int printed_len = 0; + /* cpu currently holding logbuf_lock in this function */ + static volatile unsigned int logbuf_cpu = UINT_MAX; + boot_delay_msec(level); printk_delay(); @@ -1694,13 +1683,12 @@ asmlinkage int vprintk_emit(int facility, int level, dict, dictlen, text, text_len); } + logbuf_cpu = UINT_MAX; + raw_spin_unlock(&logbuf_lock); /* * Try to acquire and then immediately release the console semaphore. * The release will print out buffers and wake up /dev/kmsg and syslog() * users. - * - * The console_trylock_for_printk() function will release 'logbuf_lock' - * regardless of whether it actually gets the console semaphore or not. */ if (console_trylock_for_printk(this_cpu)) console_unlock(); -- cgit From bd8d7cf5b8410fe98eba06a9aaa90efe88815d8a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 4 Jun 2014 16:11:36 -0700 Subject: printk: fix lockdep instrumentation of console_sem Printk calls mutex_acquire() / mutex_release() by hand to instrument lockdep about console_sem. However in some corner cases the instrumentation is missing. Fix the problem by creating helper functions for locking / unlocking console_sem which take care of lockdep instrumentation as well. Signed-off-by: Jan Kara Reported-by: Fabio Estevam Reported-by: Andy Shevchenko Tested-by: Fabio Estevam Tested-By: Valdis Kletnieks Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 46 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 6e1b21a8a497..5ba37f813723 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -90,6 +90,29 @@ static struct lockdep_map console_lock_dep_map = { }; #endif +/* + * Helper macros to handle lockdep when locking/unlocking console_sem. We use + * macros instead of functions so that _RET_IP_ contains useful information. + */ +#define down_console_sem() do { \ + down(&console_sem);\ + mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);\ +} while (0) + +static int __down_trylock_console_sem(unsigned long ip) +{ + if (down_trylock(&console_sem)) + return 1; + mutex_acquire(&console_lock_dep_map, 0, 1, ip); + return 0; +} +#define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_) + +#define up_console_sem() do { \ + mutex_release(&console_lock_dep_map, 1, _RET_IP_);\ + up(&console_sem);\ +} while (0) + /* * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. 
It's @@ -1422,7 +1445,7 @@ static int console_trylock_for_printk(unsigned int cpu) */ if (!can_use_console(cpu)) { console_locked = 0; - up(&console_sem); + up_console_sem(); return 0; } return 1; @@ -1951,16 +1974,14 @@ void suspend_console(void) printk("Suspending console(s) (use no_console_suspend to debug)\n"); console_lock(); console_suspended = 1; - up(&console_sem); - mutex_release(&console_lock_dep_map, 1, _RET_IP_); + up_console_sem(); } void resume_console(void) { if (!console_suspend_enabled) return; - down(&console_sem); - mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_); + down_console_sem(); console_suspended = 0; console_unlock(); } @@ -2002,12 +2023,11 @@ void console_lock(void) { might_sleep(); - down(&console_sem); + down_console_sem(); if (console_suspended) return; console_locked = 1; console_may_schedule = 1; - mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_); } EXPORT_SYMBOL(console_lock); @@ -2021,15 +2041,14 @@ EXPORT_SYMBOL(console_lock); */ int console_trylock(void) { - if (down_trylock(&console_sem)) + if (down_trylock_console_sem()) return 0; if (console_suspended) { - up(&console_sem); + up_console_sem(); return 0; } console_locked = 1; console_may_schedule = 0; - mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_); return 1; } EXPORT_SYMBOL(console_trylock); @@ -2091,7 +2110,7 @@ void console_unlock(void) bool retry; if (console_suspended) { - up(&console_sem); + up_console_sem(); return; } @@ -2153,7 +2172,6 @@ skip: local_irq_restore(flags); } console_locked = 0; - mutex_release(&console_lock_dep_map, 1, _RET_IP_); /* Release the exclusive_console once it is used */ if (unlikely(exclusive_console)) @@ -2161,7 +2179,7 @@ skip: raw_spin_unlock(&logbuf_lock); - up(&console_sem); + up_console_sem(); /* * Someone could have filled up the buffer again, so re-check if there's @@ -2206,7 +2224,7 @@ void console_unblank(void) * oops_in_progress is set to 1.. */ if (oops_in_progress) { - if (down_trylock(&console_sem) != 0) + if (down_trylock_console_sem() != 0) return; } else console_lock(); -- cgit From 939f04bec1a4ef6ba4370b0f34b01decc844b1b1 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 4 Jun 2014 16:11:37 -0700 Subject: printk: enable interrupts before calling console_trylock_for_printk() We need interrupts disabled when calling console_trylock_for_printk() only so that cpu id we pass to can_use_console() remains valid (for other things console_sem provides all the exclusion we need and deadlocks on console_sem due to interrupts are impossible because we use down_trylock()). However if we are rescheduled, we are guaranteed to run on an online cpu so we can easily just get the cpu id in can_use_console(). We can lose a bit of performance when we enable interrupts in vprintk_emit() and then disable them again in console_unlock() but OTOH it can somewhat reduce interrupt latency caused by console_unlock() especially since later in the patch series we will want to spin on console_sem in console_trylock_for_printk(). Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5ba37f813723..4e22230f1f6c 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1418,10 +1418,9 @@ static int have_callable_console(void) /* * Can we actually use the console at this time on this cpu? 
 *
- * Console drivers may assume that per-cpu resources have
- * been allocated. So unless they're explicitly marked as
- * being able to cope (CON_ANYTIME) don't call them until
- * this CPU is officially up.
+ * Console drivers may assume that per-cpu resources have been allocated. So
+ * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't
+ * call them until this CPU is officially up.
 */
 static inline int can_use_console(unsigned int cpu)
 {
@@ -1434,8 +1433,10 @@ static inline int can_use_console(unsigned int cpu)
 * console_lock held, and 'console_locked' set) if it
 * is successful, false otherwise.
 */
-static int console_trylock_for_printk(unsigned int cpu)
+static int console_trylock_for_printk(void)
 {
+ unsigned int cpu = smp_processor_id();
+
 if (!console_trylock())
 return 0;
 /*
@@ -1605,7 +1606,8 @@ asmlinkage int vprintk_emit(int facility, int level,
 */
 if (!oops_in_progress && !lockdep_recursing(current)) {
 recursion_bug = 1;
- goto out_restore_irqs;
+ local_irq_restore(flags);
+ return 0;
 }
 zap_locks();
 }
@@ -1708,17 +1710,22 @@ asmlinkage int vprintk_emit(int facility, int level,
 logbuf_cpu = UINT_MAX;
 raw_spin_unlock(&logbuf_lock);
+ lockdep_on();
+ local_irq_restore(flags);
+
+ /*
+ * Disable preemption to avoid being preempted while holding
+ * console_sem which would prevent anyone from printing to console
+ */
+ preempt_disable();
 /*
 * Try to acquire and then immediately release the console semaphore.
 * The release will print out buffers and wake up /dev/kmsg and syslog()
 * users.
 */
- if (console_trylock_for_printk(this_cpu))
+ if (console_trylock_for_printk())
 console_unlock();
-
- lockdep_on();
-out_restore_irqs:
- local_irq_restore(flags);
+ preempt_enable();
 return printed_len;
 }
-- cgit
From 458df9fd4815b47809875d57f42e16401674b621 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Wed, 4 Jun 2014 16:11:38 -0700
Subject: printk: remove separate printk_sched buffers and use printk buf instead

To prevent deadlocks with doing a printk inside the scheduler,
printk_sched() was created. The issue is that printk has a console_sem
that it can grab and release. The release does a wake up if there's a
task pending on the sem, and this wake up grabs the rq lock that is held
in the scheduler. This leads to a possible deadlock if the wake up uses
the same rq as the one whose rq lock is already held.

What printk_sched() does is to save the printk write in a per cpu buffer
and set the PRINTK_PENDING_SCHED flag. On a timer tick, if this flag is
set, the printk() is done against the buffer.

There are a couple of issues with this approach.

1) If two printk_sched()s are called before the tick, the second one
will overwrite the first one.

2) The temporary buffer is 512 bytes and is per cpu. This is quite a bit
of space wasted for something that is seldom used.

In order to remove this, printk_sched() can use the printk buffer
instead, and delay the console_trylock()/console_unlock() to the queued
work.

Because printk_sched() would then be taking the logbuf_lock, the
logbuf_lock must not be held while doing anything that may call into the
scheduler functions, which includes wake ups. Unfortunately, printk()
also has a console_sem that it uses, and on release, the
up(&console_sem) may do a wake up of any pending waiters. This must be
avoided while holding the logbuf_lock.
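The resulting flow can be sketched in plain C; GCC atomic builtins stand
in for the per-cpu pending word and a direct call stands in for the
irq_work (the names mirror the patch, but this is a simplified userspace
model, not the kernel code):

#include <stdio.h>

#define PRINTK_PENDING_WAKEUP	0x01
#define PRINTK_PENDING_OUTPUT	0x02

static int printk_pending;

static int console_trylock(void) { return 1; }	/* always succeeds in this model */
static void console_unlock(void) { puts("flushing printk buffer to console"); }

/* Runs later, from a safe context; irq_work does this in the kernel. */
static void wake_up_klogd_work_func(void)
{
	int pending = __atomic_exchange_n(&printk_pending, 0, __ATOMIC_SEQ_CST);

	if (pending & PRINTK_PENDING_OUTPUT) {
		/* If trylock fails, someone else is already printing. */
		if (console_trylock())
			console_unlock();
	}
}

/* The scheduler-safe path: only store the text and set a pending bit. */
static void printk_sched_model(const char *msg)
{
	printf("stored into ring buffer: %s\n", msg);	/* log_store() stand-in */
	__atomic_or_fetch(&printk_pending, PRINTK_PENDING_OUTPUT, __ATOMIC_SEQ_CST);
	/* irq_work_queue() would now schedule wake_up_klogd_work_func() */
}

int main(void)
{
	printk_sched_model("[sched_delayed] example message");
	wake_up_klogd_work_func();	/* the deferred console flush */
	return 0;
}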
Signed-off-by: Steven Rostedt Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 47 +++++++++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 4e22230f1f6c..247b0c1fadfc 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -68,6 +68,9 @@ int console_printk[4] = { DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ }; +/* Deferred messaged from sched code are marked by this special level */ +#define SCHED_MESSAGE_LOGLEVEL -2 + /* * Low level drivers may need that to know if they can schedule in * their unblank() callback or not. So let's export it. @@ -229,7 +232,9 @@ struct printk_log { }; /* - * The logbuf_lock protects kmsg buffer, indices, counters. + * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken + * within the scheduler's rq lock. It must be released before calling + * console_unlock() or anything else that might wake up a process. */ static DEFINE_RAW_SPINLOCK(logbuf_lock); @@ -1577,14 +1582,19 @@ asmlinkage int vprintk_emit(int facility, int level, static int recursion_bug; static char textbuf[LOG_LINE_MAX]; char *text = textbuf; - size_t text_len; + size_t text_len = 0; enum log_flags lflags = 0; unsigned long flags; int this_cpu; int printed_len = 0; + bool in_sched = false; /* cpu currently holding logbuf_lock in this function */ static volatile unsigned int logbuf_cpu = UINT_MAX; + if (level == SCHED_MESSAGE_LOGLEVEL) { + level = -1; + in_sched = true; + } boot_delay_msec(level); printk_delay(); @@ -1631,7 +1641,12 @@ asmlinkage int vprintk_emit(int facility, int level, * The printf needs to come first; we need the syslog * prefix which might be passed-in as a parameter. */ - text_len = vscnprintf(text, sizeof(textbuf), fmt, args); + if (in_sched) + text_len = scnprintf(text, sizeof(textbuf), + KERN_WARNING "[sched_delayed] "); + + text_len += vscnprintf(text + text_len, + sizeof(textbuf) - text_len, fmt, args); /* mark and strip a trailing newline */ if (text_len && text[text_len-1] == '\n') { @@ -1713,6 +1728,10 @@ asmlinkage int vprintk_emit(int facility, int level, lockdep_on(); local_irq_restore(flags); + /* If called from the scheduler, we can not call up(). */ + if (in_sched) + return printed_len; + /* * Disable preemption to avoid being preempted while holding * console_sem which would prevent anyone from printing to console @@ -2532,21 +2551,19 @@ late_initcall(printk_late_init); /* * Delayed printk version, for scheduler-internal messages: */ -#define PRINTK_BUF_SIZE 512 - #define PRINTK_PENDING_WAKEUP 0x01 -#define PRINTK_PENDING_SCHED 0x02 +#define PRINTK_PENDING_OUTPUT 0x02 static DEFINE_PER_CPU(int, printk_pending); -static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); static void wake_up_klogd_work_func(struct irq_work *irq_work) { int pending = __this_cpu_xchg(printk_pending, 0); - if (pending & PRINTK_PENDING_SCHED) { - char *buf = __get_cpu_var(printk_sched_buf); - pr_warn("[sched_delayed] %s", buf); + if (pending & PRINTK_PENDING_OUTPUT) { + /* If trylock fails, someone else is doing the printing */ + if (console_trylock()) + console_unlock(); } if (pending & PRINTK_PENDING_WAKEUP) @@ -2570,21 +2587,15 @@ void wake_up_klogd(void) int printk_sched(const char *fmt, ...) 
{
- unsigned long flags;
 va_list args;
- char *buf;
 int r;
- local_irq_save(flags);
- buf = __get_cpu_var(printk_sched_buf);
-
 va_start(args, fmt);
- r = vsnprintf(buf, PRINTK_BUF_SIZE, fmt, args);
+ r = vprintk_emit(0, SCHED_MESSAGE_LOGLEVEL, NULL, 0, fmt, args);
 va_end(args);
- __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED);
+ __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
 irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
- local_irq_restore(flags);
 return r;
 }
-- cgit
From 81954606265ab8f04b41154bd00576013affcf5b Mon Sep 17 00:00:00 2001
From: John Stultz
Date: Wed, 4 Jun 2014 16:11:39 -0700
Subject: printk: disable preemption for printk_sched

An earlier change in -mm (printk: remove separate printk_sched
buffers...) removed the printk_sched() irqsave/restore lines, since that
was safe for the current users. Since we may be expanding the usage of
printk_sched(), disable preemption for this function to make it more
generally safe to call.

Signed-off-by: John Stultz
Reviewed-by: Jan Kara
Cc: Peter Zijlstra
Cc: Jiri Bohac
Cc: Thomas Gleixner
Cc: Ingo Molnar
Cc: Steven Rostedt
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/printk/printk.c | 2 ++
 1 file changed, 2 insertions(+)
(limited to 'kernel')
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 247b0c1fadfc..dc2b8bd9bc1e 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2590,12 +2590,14 @@ int printk_sched(const char *fmt, ...)
 va_list args;
 int r;
+ preempt_disable();
 va_start(args, fmt);
 r = vprintk_emit(0, SCHED_MESSAGE_LOGLEVEL, NULL, 0, fmt, args);
 va_end(args);
 __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
 irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
+ preempt_enable();
 return r;
 }
-- cgit
From aac74dc495456412c4130a1167ce4beb6c1f0b38 Mon Sep 17 00:00:00 2001
From: John Stultz
Date: Wed, 4 Jun 2014 16:11:40 -0700
Subject: printk: rename printk_sched to printk_deferred

After learning we'll need some sort of deferred printk functionality in
the timekeeping core, Peter suggested we rename the printk_sched
function so it can be reused by the subsystems that need it.

This only changes the function name. No logic changes.

Signed-off-by: John Stultz
Reviewed-by: Steven Rostedt
Cc: Jan Kara
Cc: Peter Zijlstra
Cc: Jiri Bohac
Cc: Thomas Gleixner
Cc: Ingo Molnar
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/printk.h | 6 +++---
 kernel/printk/printk.c | 2 +-
 kernel/sched/core.c | 2 +-
 kernel/sched/deadline.c | 2 +-
 kernel/sched/rt.c | 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)
(limited to 'kernel')
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 8752f7595b27..7847301e2837 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -128,9 +128,9 @@ asmlinkage __printf(1, 2) __cold
 int printk(const char *fmt, ...);
 /*
- * Special printk facility for scheduler use only, _DO_NOT_USE_ !
+ * Special printk facility for scheduler/timekeeping use only, _DO_NOT_USE_ !
 */
-__printf(1, 2) __cold int printk_sched(const char *fmt, ...);
+__printf(1, 2) __cold int printk_deferred(const char *fmt, ...);
 /*
 * Please don't use printk_ratelimit(), because it shares ratelimiting state
@@ -165,7 +165,7 @@ int printk(const char *s, ...)
 return 0;
 }
 static inline __printf(1, 2) __cold
-int printk_sched(const char *s, ...)
+int printk_deferred(const char *s, ...)
{
 return 0;
 }
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index dc2b8bd9bc1e..35d9db251903 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2585,7 +2585,7 @@ void wake_up_klogd(void)
 preempt_enable();
 }
-int printk_sched(const char *fmt, ...)
+int printk_deferred(const char *fmt, ...)
 {
 va_list args;
 int r;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 913c6d6cc2c1..caf03e89a068 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1367,7 +1367,7 @@ out:
 * leave kernel.
 */
 if (p->mm && printk_ratelimit()) {
- printk_sched("process %d (%s) no longer affine to cpu%d\n",
+ printk_deferred("process %d (%s) no longer affine to cpu%d\n",
 task_pid_nr(p), p->comm, cpu);
 }
 }
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index f9ca7d19781a..d17e1c48a79d 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -352,7 +352,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
 if (!lag_once) {
 lag_once = true;
- printk_sched("sched: DL replenish lagged to much\n");
+ printk_deferred("sched: DL replenish lagged to much\n");
 }
 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
 dl_se->runtime = pi_se->dl_runtime;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 0ebfd7a29472..5d7667b37c21 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -896,7 +896,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
 if (!once) {
 once = true;
- printk_sched("sched: RT throttling activated\n");
+ printk_deferred("sched: RT throttling activated\n");
 }
 } else {
 /*
-- cgit
From c224815dac9c739b79050d3cc67443ff500bc478 Mon Sep 17 00:00:00 2001
From: John Stultz
Date: Wed, 4 Jun 2014 16:11:41 -0700
Subject: printk: Add printk_deferred_once

Two of the three printk_deferred() uses are really printk_once() style
uses, so add a printk_deferred_once macro to simplify those call sites.

Signed-off-by: John Stultz
Reviewed-by: Steven Rostedt
Reviewed-by: Jan Kara
Cc: Peter Zijlstra
Cc: Jiri Bohac
Cc: Thomas Gleixner
Cc: Ingo Molnar
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/printk.h | 11 +++++++++++
 kernel/sched/deadline.c | 7 +------
 kernel/sched/rt.c | 8 +-------
 3 files changed, 13 insertions(+), 13 deletions(-)
(limited to 'kernel')
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 7847301e2837..f086d6c99dbc 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -266,9 +266,20 @@ extern asmlinkage void dump_stack(void) __cold;
 printk(fmt, ##__VA_ARGS__); \
 } \
 })
+#define printk_deferred_once(fmt, ...) \
+({ \
+ static bool __print_once __read_mostly; \
+ \
+ if (!__print_once) { \
+ __print_once = true; \
+ printk_deferred(fmt, ##__VA_ARGS__); \
+ } \
+})
 #else
 #define printk_once(fmt, ...) \
 no_printk(fmt, ##__VA_ARGS__)
+#define printk_deferred_once(fmt, ...) \
+ no_printk(fmt, ##__VA_ARGS__)
 #endif
 #define pr_emerg_once(fmt, ...) \
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index d17e1c48a79d..e1574fca03b5 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -348,12 +348,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
 * entity.
 */
 if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
- static bool lag_once = false;
-
- if (!lag_once) {
- lag_once = true;
- printk_deferred("sched: DL replenish lagged to much\n");
- }
+ printk_deferred_once("sched: DL replenish lagged to much\n");
 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
 dl_se->runtime = pi_se->dl_runtime;
 }
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 5d7667b37c21..b3512f1afce9 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -890,14 +890,8 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
 * but accrue some time due to boosting.
 */
 if (likely(rt_b->rt_runtime)) {
- static bool once = false;
-
 rt_rq->rt_throttled = 1;
-
- if (!once) {
- once = true;
- printk_deferred("sched: RT throttling activated\n");
- }
+ printk_deferred_once("sched: RT throttling activated\n");
 } else {
 /*
 * In case we did anyway, make it go away,
-- cgit
From 6d9bcb621b0b0a20604cbdb298c4487e44dd0da2 Mon Sep 17 00:00:00 2001
From: John Stultz
Date: Wed, 4 Jun 2014 16:11:43 -0700
Subject: timekeeping: use printk_deferred when holding timekeeping seqlock

Jiri Bohac pointed out that there are rare but potential deadlock
possibilities when calling printk while holding the timekeeping seqlock.

This is due to printk() triggering a console_sem wakeup, which can cause
scheduling code to trigger hrtimers which may try to read the time.

Specifically, as Jiri pointed out, that path is:
printk
 vprintk_emit
  console_unlock
   up(&console_sem)
    __up
     wake_up_process
      try_to_wake_up
       ttwu_do_activate
        ttwu_activate
         activate_task
          enqueue_task
           enqueue_task_fair
            hrtick_update
             hrtick_start_fair
              hrtick_start_fair
               get_time
                ktime_get
                 --> endless loop on read_seqcount_retry(&timekeeper_seq, ...)

This patch tries to avoid this issue by using printk_deferred()
(previously named printk_sched()), which defers printing via
irq_work_queue().

Signed-off-by: John Stultz
Reported-by: Jiri Bohac
Reviewed-by: Steven Rostedt
Cc: Jan Kara
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: Ingo Molnar
Cc: Steven Rostedt
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/time/ntp.c | 15 +++++++++------
 kernel/time/timekeeping.c | 7 ++++---
 2 files changed, 13 insertions(+), 9 deletions(-)
(limited to 'kernel')
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 419a52cecd20..5b0ac4de3822 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -786,8 +786,9 @@ static long hardpps_update_freq(struct pps_normtime freq_norm)
 time_status |= STA_PPSERROR;
 pps_errcnt++;
 pps_dec_freq_interval();
- pr_err("hardpps: PPSERROR: interval too long - %ld s\n",
- freq_norm.sec);
+ printk_deferred(KERN_ERR
+ "hardpps: PPSERROR: interval too long - %ld s\n",
+ freq_norm.sec);
 return 0;
 }
@@ -800,7 +801,8 @@ static long hardpps_update_freq(struct pps_normtime freq_norm)
 delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT);
 pps_freq = ftemp;
 if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) {
- pr_warning("hardpps: PPSWANDER: change=%ld\n", delta);
+ printk_deferred(KERN_WARNING
+ "hardpps: PPSWANDER: change=%ld\n", delta);
 time_status |= STA_PPSWANDER;
 pps_stbcnt++;
 pps_dec_freq_interval();
@@ -844,8 +846,9 @@ static void hardpps_update_phase(long error)
 * the time offset is updated.
*/ if (jitter > (pps_jitter << PPS_POPCORN)) { - pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", - jitter, (pps_jitter << PPS_POPCORN)); + printk_deferred(KERN_WARNING + "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", + jitter, (pps_jitter << PPS_POPCORN)); time_status |= STA_PPSJITTER; pps_jitcnt++; } else if (time_status & STA_PPSTIME) { @@ -902,7 +905,7 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) time_status |= STA_PPSJITTER; /* restart the frequency calibration interval */ pps_fbase = *raw_ts; - pr_err("hardpps: PPSJITTER: bad pulse\n"); + printk_deferred(KERN_ERR "hardpps: PPSJITTER: bad pulse\n"); return; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f7df8ea21707..32d8d6aaedb8 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -852,8 +852,9 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, struct timespec *delta) { if (!timespec_valid_strict(delta)) { - printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " - "sleep delta value!\n"); + printk_deferred(KERN_WARNING + "__timekeeping_inject_sleeptime: Invalid " + "sleep delta value!\n"); return; } tk_xtime_add(tk, delta); @@ -1157,7 +1158,7 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) if (unlikely(tk->clock->maxadj && (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { - printk_once(KERN_WARNING + printk_deferred_once(KERN_WARNING "Adjusting %s more than 11%% (%ld vs %ld)\n", tk->clock->name, (long)tk->mult + adj, (long)tk->clock->mult + tk->clock->maxadj); -- cgit From 84b5ec8a9df86f3dcaaaf912715db35e4852d1da Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 4 Jun 2014 16:11:45 -0700 Subject: printk: report dropping of messages from logbuf If the log ring buffer becomes full, we silently overwrite old messages with new data. console_unlock will detect this case and fast-forward the console_* pointers to skip over the corrupted data, but nothing will be reported to the user. This patch hijacks the first valid log message after detecting that we dropped messages and prefixes it with a note detailing how many messages were dropped. For long (~1000 char) messages, this will result in some truncation of the real message, but given that we're dropping things anyway, that doesn't seem to be the end of the world. 
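A standalone sketch of the prefixing scheme (the buffer size, the
dropped count and the message text are made up for illustration; in the
kernel the second step is msg_print_text() filling the remaining space):

#include <stdio.h>

int main(void)
{
	char text[64];
	unsigned int dropped = 42;
	int len = 0;

	if (dropped)
		len = sprintf(text, "** %u printk messages dropped ** ", dropped);

	/* msg_print_text() stand-in: it may only use the space that is left */
	len += snprintf(text + len, sizeof(text) - len,
			"first valid message, possibly truncated");

	printf("%s\n", text);
	return 0;
}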
Signed-off-by: Will Deacon Acked-by: Peter Zijlstra Cc: Kay Sievers Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 35d9db251903..923c5d4e4202 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2157,10 +2157,15 @@ again: } if (console_seq < log_first_seq) { + len = sprintf(text, "** %u printk messages dropped ** ", + (unsigned)(log_first_seq - console_seq)); + /* messages are gone, move to first one */ console_seq = log_first_seq; console_idx = log_first_idx; console_prev = 0; + } else { + len = 0; } skip: if (console_seq == log_next_seq) @@ -2185,8 +2190,8 @@ skip: } level = msg->level; - len = msg_print_text(msg, console_prev, false, - text, sizeof(text)); + len += msg_print_text(msg, console_prev, false, + text + len, sizeof(text) - len); console_idx = log_next(console_idx); console_seq++; console_prev = msg->flags; -- cgit From a8fe19ebfbfd90ec17c02284717238b02efb9580 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 4 Jun 2014 16:11:46 -0700 Subject: kernel/printk: use symbolic defines for console loglevels ... instead of naked numbers. Stuff in sysrq.c used to set it to 8 which is supposed to mean above default level so set it to DEBUG instead as we're terminating/killing all tasks and we want to be verbose there. Also, correct the check in x86_64_start_kernel which should be >= as we're clearly issuing the string there for all debug levels, not only the magical 10. Signed-off-by: Borislav Petkov Acked-by: Kees Cook Acked-by: Randy Dunlap Cc: Joe Perches Cc: Valdis Kletnieks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/head64.c | 2 +- arch/x86/platform/uv/uv_nmi.c | 2 +- drivers/nubus/nubus.c | 18 +++++++++--------- drivers/tty/sysrq.c | 8 ++++---- include/linux/printk.h | 15 +++++++++++++-- init/main.c | 4 ++-- kernel/debug/kdb/kdb_bt.c | 2 +- kernel/debug/kdb/kdb_io.c | 2 +- kernel/debug/kdb/kdb_main.c | 2 +- kernel/printk/printk.c | 13 +++---------- 10 files changed, 36 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 068054f4bf20..eda1a865641e 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -172,7 +172,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) */ load_ucode_bsp(); - if (console_loglevel == 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) early_printk("Kernel alive\n"); clear_page(init_level4_pgt); diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index be27da60dc8f..c89c93320c12 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -85,7 +85,7 @@ static cpumask_var_t uv_nmi_cpu_mask; * Default is all stack dumps go to the console and buffer. * Lower level to send to log buffer only. 
*/ -static int uv_nmi_loglevel = 7; +static int uv_nmi_loglevel = CONSOLE_LOGLEVEL_DEFAULT; module_param_named(dump_loglevel, uv_nmi_loglevel, int, 0644); /* diff --git a/drivers/nubus/nubus.c b/drivers/nubus/nubus.c index 43926cd25ae8..5066a7ef7b6c 100644 --- a/drivers/nubus/nubus.c +++ b/drivers/nubus/nubus.c @@ -473,7 +473,7 @@ static struct nubus_dev* __init if (slot == 0 && (unsigned long)dir.base % 2) dir.base += 1; - if (console_loglevel >= 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) printk(KERN_DEBUG "nubus_get_functional_resource: parent is 0x%p, dir is 0x%p\n", parent->base, dir.base); @@ -568,7 +568,7 @@ static int __init nubus_get_vidnames(struct nubus_board* board, printk(KERN_INFO " video modes supported:\n"); nubus_get_subdir(parent, &dir); - if (console_loglevel >= 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) printk(KERN_DEBUG "nubus_get_vidnames: parent is 0x%p, dir is 0x%p\n", parent->base, dir.base); @@ -629,7 +629,7 @@ static int __init nubus_get_vendorinfo(struct nubus_board* board, printk(KERN_INFO " vendor info:\n"); nubus_get_subdir(parent, &dir); - if (console_loglevel >= 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) printk(KERN_DEBUG "nubus_get_vendorinfo: parent is 0x%p, dir is 0x%p\n", parent->base, dir.base); @@ -654,7 +654,7 @@ static int __init nubus_get_board_resource(struct nubus_board* board, int slot, struct nubus_dirent ent; nubus_get_subdir(parent, &dir); - if (console_loglevel >= 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) printk(KERN_DEBUG "nubus_get_board_resource: parent is 0x%p, dir is 0x%p\n", parent->base, dir.base); @@ -753,19 +753,19 @@ static void __init nubus_find_rom_dir(struct nubus_board* board) if (nubus_readdir(&dir, &ent) == -1) goto badrom; - if (console_loglevel >= 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) printk(KERN_INFO "nubus_get_rom_dir: entry %02x %06x\n", ent.type, ent.data); /* This one takes us to where we want to go. */ if (nubus_readdir(&dir, &ent) == -1) goto badrom; - if (console_loglevel >= 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) printk(KERN_DEBUG "nubus_get_rom_dir: entry %02x %06x\n", ent.type, ent.data); nubus_get_subdir(&ent, &dir); /* Resource ID 01, also an "Unknown Macintosh" */ if (nubus_readdir(&dir, &ent) == -1) goto badrom; - if (console_loglevel >= 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) printk(KERN_DEBUG "nubus_get_rom_dir: entry %02x %06x\n", ent.type, ent.data); /* FIXME: the first one is *not* always the right one. We @@ -780,7 +780,7 @@ static void __init nubus_find_rom_dir(struct nubus_board* board) path to that address... */ if (nubus_readdir(&dir, &ent) == -1) goto badrom; - if (console_loglevel >= 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) printk(KERN_DEBUG "nubus_get_rom_dir: entry %02x %06x\n", ent.type, ent.data); /* Bwahahahaha... 
*/ @@ -816,7 +816,7 @@ static struct nubus_board* __init nubus_add_board(int slot, int bytelanes) board->fblock = rp; /* Dump the format block for debugging purposes */ - if (console_loglevel >= 10) { + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) { int i; printk(KERN_DEBUG "Slot %X, format block at 0x%p\n", slot, rp); diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index ce396ecdf412..b767a64e49d9 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -88,7 +88,7 @@ static void sysrq_handle_loglevel(int key) int i; i = key - '0'; - console_loglevel = 7; + console_loglevel = CONSOLE_LOGLEVEL_DEFAULT; printk("Loglevel set to %d\n", i); console_loglevel = i; } @@ -343,7 +343,7 @@ static void send_sig_all(int sig) static void sysrq_handle_term(int key) { send_sig_all(SIGTERM); - console_loglevel = 8; + console_loglevel = CONSOLE_LOGLEVEL_DEBUG; } static struct sysrq_key_op sysrq_term_op = { .handler = sysrq_handle_term, @@ -387,7 +387,7 @@ static struct sysrq_key_op sysrq_thaw_op = { static void sysrq_handle_kill(int key) { send_sig_all(SIGKILL); - console_loglevel = 8; + console_loglevel = CONSOLE_LOGLEVEL_DEBUG; } static struct sysrq_key_op sysrq_kill_op = { .handler = sysrq_handle_kill, @@ -520,7 +520,7 @@ void __handle_sysrq(int key, bool check_mask) * routing in the consumers of /proc/kmsg. */ orig_log_level = console_loglevel; - console_loglevel = 7; + console_loglevel = CONSOLE_LOGLEVEL_DEFAULT; printk(KERN_INFO "SysRq : "); op_p = __sysrq_get_key_op(key); diff --git a/include/linux/printk.h b/include/linux/printk.h index 37f3a6589c1c..319ff7e53efb 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -30,6 +30,17 @@ static inline const char *printk_skip_level(const char *buffer) return buffer; } +/* printk's without a loglevel use this.. */ +#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL + +/* We show everything that is MORE important than this.. 
*/ +#define CONSOLE_LOGLEVEL_SILENT 0 /* Mum's the word */ +#define CONSOLE_LOGLEVEL_MIN 1 /* Minimum loglevel we let people use */ +#define CONSOLE_LOGLEVEL_QUIET 4 /* Shhh ..., when booted with "quiet" */ +#define CONSOLE_LOGLEVEL_DEFAULT 7 /* anything MORE serious than KERN_DEBUG */ +#define CONSOLE_LOGLEVEL_DEBUG 10 /* issue debug messages */ +#define CONSOLE_LOGLEVEL_MOTORMOUTH 15 /* You can't shut this one up */ + extern int console_printk[]; #define console_loglevel (console_printk[0]) @@ -39,13 +50,13 @@ extern int console_printk[]; static inline void console_silent(void) { - console_loglevel = 0; + console_loglevel = CONSOLE_LOGLEVEL_SILENT; } static inline void console_verbose(void) { if (console_loglevel) - console_loglevel = 15; + console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; } struct va_format { diff --git a/init/main.c b/init/main.c index e08c0b2065a1..04fab8d74c89 100644 --- a/init/main.c +++ b/init/main.c @@ -203,13 +203,13 @@ EXPORT_SYMBOL(loops_per_jiffy); static int __init debug_kernel(char *str) { - console_loglevel = 10; + console_loglevel = CONSOLE_LOGLEVEL_DEBUG; return 0; } static int __init quiet_kernel(char *str) { - console_loglevel = 4; + console_loglevel = CONSOLE_LOGLEVEL_QUIET; return 0; } diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index b03e0e814e43..fe15fff5df53 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -21,7 +21,7 @@ static void kdb_show_stack(struct task_struct *p, void *addr) { int old_lvl = console_loglevel; - console_loglevel = 15; + console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; kdb_trap_printk++; kdb_set_current_task(p); if (addr) { diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 14ff4849262c..7c70812caea5 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -710,7 +710,7 @@ kdb_printit: } if (logging) { saved_loglevel = console_loglevel; - console_loglevel = 0; + console_loglevel = CONSOLE_LOGLEVEL_SILENT; printk(KERN_INFO "%s", kdb_buffer); } diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 0b097c8a1e50..2f7c760305ca 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1091,7 +1091,7 @@ static int kdb_reboot(int argc, const char **argv) static void kdb_dumpregs(struct pt_regs *regs) { int old_lvl = console_loglevel; - console_loglevel = 15; + console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; kdb_trap_printk++; show_regs(regs); kdb_trap_printk--; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 923c5d4e4202..ea2d5f6962ed 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -54,18 +54,11 @@ #include "console_cmdline.h" #include "braille.h" -/* printk's without a loglevel use this.. */ -#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL - -/* We show everything that is MORE important than this.. 
*/ -#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ -#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */ - int console_printk[4] = { - DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */ + CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ - MINIMUM_CONSOLE_LOGLEVEL, /* minimum_console_loglevel */ - DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ + CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ + CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ }; /* Deferred messaged from sched code are marked by this special level */ -- cgit From 6516a466193fe7f72644d65467fb9905139228c3 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:12:02 -0700 Subject: kernel/compat.c: use sizeof() instead of sizeof Fix 4 checkpatch warnings WARNING: sizeof *tv should be sizeof(*tv) Signed-off-by: Fabian Frederick Cc: "H. Peter Anvin" Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/compat.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index e40b0430b562..633394f442f8 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -157,7 +157,7 @@ static int __compat_put_timespec(const struct timespec *ts, struct compat_timesp int compat_get_timeval(struct timeval *tv, const void __user *utv) { if (COMPAT_USE_64BIT_TIME) - return copy_from_user(tv, utv, sizeof *tv) ? -EFAULT : 0; + return copy_from_user(tv, utv, sizeof(*tv)) ? -EFAULT : 0; else return __compat_get_timeval(tv, utv); } @@ -166,7 +166,7 @@ EXPORT_SYMBOL_GPL(compat_get_timeval); int compat_put_timeval(const struct timeval *tv, void __user *utv) { if (COMPAT_USE_64BIT_TIME) - return copy_to_user(utv, tv, sizeof *tv) ? -EFAULT : 0; + return copy_to_user(utv, tv, sizeof(*tv)) ? -EFAULT : 0; else return __compat_put_timeval(tv, utv); } @@ -175,7 +175,7 @@ EXPORT_SYMBOL_GPL(compat_put_timeval); int compat_get_timespec(struct timespec *ts, const void __user *uts) { if (COMPAT_USE_64BIT_TIME) - return copy_from_user(ts, uts, sizeof *ts) ? -EFAULT : 0; + return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; else return __compat_get_timespec(ts, uts); } @@ -184,7 +184,7 @@ EXPORT_SYMBOL_GPL(compat_get_timespec); int compat_put_timespec(const struct timespec *ts, void __user *uts) { if (COMPAT_USE_64BIT_TIME) - return copy_to_user(uts, ts, sizeof *ts) ? -EFAULT : 0; + return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; else return __compat_put_timespec(ts, uts); } -- cgit From 1f779fb28aa07350d72976d304591d216ca86f0e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 4 Jun 2014 16:48:15 +0800 Subject: cgroup: don't destroy the default root The default root is allocated and initialized at boot phase, so we shouldn't destroy the default root when it's umounted, otherwise it will lead to disaster. Just try mount and then umount the default root, and the kernel will crash immediately. v2: - No need to check for CSS_NO_REF in cgroup_get/put(). (Tejun) - Better call cgroup_put() for the default root in kill_sb(). (Tejun) - Add a comment. 
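The fixed decision can be modelled with a couple of hypothetical types
(an illustrative sketch of the intent, not the kernel code):

#include <stdbool.h>
#include <stdio.h>

struct cgroup_root_model {
	bool is_default_root;
	bool has_online_children;
};

static void cgroup_put_model(struct cgroup_root_model *root)
{
	(void)root;
	puts("dropped the mount reference only");
}

static void percpu_ref_kill_model(struct cgroup_root_model *root)
{
	(void)root;
	puts("started destroying the root");
}

/* The fixed decision: the boot-time default root is never torn down. */
static void cgroup_kill_sb_model(struct cgroup_root_model *root)
{
	if (root->has_online_children || root->is_default_root)
		cgroup_put_model(root);
	else
		percpu_ref_kill_model(root);
}

int main(void)
{
	struct cgroup_root_model dfl = { .is_default_root = true };

	cgroup_kill_sb_model(&dfl);	/* must only put, never kill */
	return 0;
}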
Signed-off-by: Li Zefan
Signed-off-by: Tejun Heo
---
 kernel/cgroup.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)
(limited to 'kernel')
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a5f75ac4e793..3f46165829a4 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1780,8 +1780,11 @@ static void cgroup_kill_sb(struct super_block *sb)
 * If @root doesn't have any mounts or children, start killing it.
 * This prevents new mounts by disabling percpu_ref_tryget_live().
 * cgroup_mount() may wait for @root's release.
+ *
+ * And don't kill the default root.
 */
- if (css_has_online_children(&root->cgrp.self))
+ if (css_has_online_children(&root->cgrp.self) ||
+ root == &cgrp_dfl_root)
 cgroup_put(&root->cgrp);
 else
 percpu_ref_kill(&root->cgrp.self.refcnt);
-- cgit
From 4fc828e24cd9c385d3a44e1b499ec7fc70239d8a Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso
Date: Fri, 2 May 2014 11:24:15 -0700
Subject: locking/rwsem: Support optimistic spinning

We have reached the point where our mutexes are quite fine tuned for a
number of situations. This includes the use of heuristics and optimistic
spinning, based on MCS locking techniques.

Exclusive ownership of a read-write semaphore is, conceptually, just
about the same as a mutex, making them close cousins. To this end we
need to make them both perform similarly, and right now, rwsems are
simply not up to it. This was discovered by both reverting commit
4fc3f1d6 (mm/rmap, migration: Make rmap_walk_anon() and
try_to_unmap_anon() more scalable) and, similarly, converting some other
mutexes (i.e. i_mmap_mutex) to rwsems. This creates a situation where
users have to choose between an rwsem and a mutex taking into account
this important performance difference.

Specifically, the biggest difference between the two locks is that when
we fail to acquire a mutex in the fastpath, optimistic spinning comes
into play and we can avoid a large amount of unnecessary sleeping and
the overhead of moving tasks in and out of the wait queue. Rwsems do not
have such logic.

This patch, based on the work from Tim Chen and I, adds support for
write-side optimistic spinning when the lock is contended. It also
includes support for the recently added cancelable MCS locking for
adaptive spinning. Note that this is only applicable to the xadd method,
and the spinlock rwsem variant remains intact.

Allowing optimistic spinning before putting the writer on the wait queue
reduces wait queue contention and provides a greater chance for the
rwsem to be acquired. With these changes, rwsem is on par with mutex.
The performance benefits can be seen on a number of workloads. For
instance, on an 8 socket, 80 core 64bit Westmere box, aim7 shows the
following improvements in throughput:

+--------------+---------------------+-----------------+
| Workload     | throughput-increase | number of users |
+--------------+---------------------+-----------------+
| alltests     | 20%                 | >1000           |
| custom       | 27%, 60%            | 10-100, >1000   |
| high_systime | 36%, 30%            | >100, >1000     |
| shared       | 58%, 29%            | 10-100, >1000   |
+--------------+---------------------+-----------------+

There was also improvement on smaller systems, such as a quad-core
x86-64 laptop running a 30Gb PostgreSQL (pgbench) workload, with up to
+60% in throughput for over 50 clients. Additionally, benefits were also
noticed in exim (mail server) workloads. Furthermore, no performance
regressions have been seen at all.
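A condensed userspace model of the write-side spin follows (GCC atomic
builtins replace cmpxchg on sem->count, the spin is bounded, and the
types are hypothetical; the kernel version additionally integrates the
OSQ and the need_resched()/RT-task checks):

#include <sched.h>
#include <stdbool.h>

struct task { bool on_cpu; };

struct rwsem_model {
	long count;		/* 0 == unlocked, nonzero == write-locked */
	struct task *owner;	/* writer currently holding the lock, if any */
};

/* Lock-steal attempt, loosely mirroring rwsem_try_write_lock_unqueued(). */
static bool try_write_lock_unqueued(struct rwsem_model *sem)
{
	long expected = 0;

	return __atomic_compare_exchange_n(&sem->count, &expected, 1, false,
					   __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
}

/* Spin while the owner is on a cpu; give up once spinning cannot help. */
static bool optimistic_spin(struct rwsem_model *sem)
{
	for (int spins = 0; spins < 1000; spins++) {	/* bounded for the model */
		struct task *owner;

		if (try_write_lock_unqueued(sem))
			return true;	/* lock stolen, the wait queue is skipped */

		owner = __atomic_load_n(&sem->owner, __ATOMIC_ACQUIRE);
		if (owner && !owner->on_cpu)
			break;		/* owner sleeps, spinning is pointless */

		sched_yield();		/* cpu_relax()/OSQ backoff stand-in */
	}
	return false;			/* caller falls back to the sleeping slowpath */
}

int main(void)
{
	struct rwsem_model sem = { 0 };

	return optimistic_spin(&sem) ? 0 : 1;	/* uncontended: steal succeeds */
}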
Based-on-work-from: Tim Chen Signed-off-by: Davidlohr Bueso [peterz: rej fixup due to comment patches, sched/rt.h header] Signed-off-by: Peter Zijlstra Cc: Alex Shi Cc: Andi Kleen Cc: Michel Lespinasse Cc: Rik van Riel Cc: Peter Hurley Cc: "Paul E.McKenney" Cc: Jason Low Cc: Aswin Chandramouleeswaran Cc: Andrew Morton Cc: Linus Torvalds Cc: "Scott J Norton" Cc: Andrea Arcangeli Cc: Chris Mason Cc: Josef Bacik Link: http://lkml.kernel.org/r/1399055055.6275.15.camel@buesod1.americas.hpqcorp.net Signed-off-by: Ingo Molnar --- include/linux/rwsem.h | 25 ++++- kernel/locking/rwsem-xadd.c | 225 ++++++++++++++++++++++++++++++++++++++------ kernel/locking/rwsem.c | 31 +++++- 3 files changed, 248 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 03f3b05e8ec1..3e108f154cb6 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -16,6 +16,7 @@ #include +struct optimistic_spin_queue; struct rw_semaphore; #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK @@ -23,9 +24,17 @@ struct rw_semaphore; #else /* All arch specific implementations share the same struct */ struct rw_semaphore { - long count; - raw_spinlock_t wait_lock; - struct list_head wait_list; + long count; + raw_spinlock_t wait_lock; + struct list_head wait_list; +#ifdef CONFIG_SMP + /* + * Write owner. Used as a speculative check to see + * if the owner is running on the cpu. + */ + struct task_struct *owner; + struct optimistic_spin_queue *osq; /* spinner MCS lock */ +#endif #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif @@ -55,11 +64,21 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem) # define __RWSEM_DEP_MAP_INIT(lockname) #endif +#ifdef CONFIG_SMP +#define __RWSEM_INITIALIZER(name) \ + { RWSEM_UNLOCKED_VALUE, \ + __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \ + LIST_HEAD_INIT((name).wait_list), \ + NULL, /* owner */ \ + NULL /* mcs lock */ \ + __RWSEM_DEP_MAP_INIT(name) } +#else #define __RWSEM_INITIALIZER(name) \ { RWSEM_UNLOCKED_VALUE, \ __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \ LIST_HEAD_INIT((name).wait_list) \ __RWSEM_DEP_MAP_INIT(name) } +#endif #define DECLARE_RWSEM(name) \ struct rw_semaphore name = __RWSEM_INITIALIZER(name) diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index b4219ff87b8c..4a75278142cd 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -5,11 +5,17 @@ * * Writer lock-stealing by Alex Shi * and Michel Lespinasse + * + * Optimistic spinning by Tim Chen + * and Davidlohr Bueso . Based on mutexes. */ #include #include #include #include +#include + +#include "mcs_spinlock.h" /* * Guide to the rw_semaphore's count field for common values. 
@@ -76,6 +82,10 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, sem->count = RWSEM_UNLOCKED_VALUE; raw_spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); +#ifdef CONFIG_SMP + sem->owner = NULL; + sem->osq = NULL; +#endif } EXPORT_SYMBOL(__init_rwsem); @@ -190,7 +200,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) } /* - * wait for the read lock to be granted + * Wait for the read lock to be granted */ __visible struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) @@ -237,64 +247,221 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) return sem; } +static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) +{ + if (!(count & RWSEM_ACTIVE_MASK)) { + /* try acquiring the write lock */ + if (sem->count == RWSEM_WAITING_BIAS && + cmpxchg(&sem->count, RWSEM_WAITING_BIAS, + RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { + if (!list_is_singular(&sem->wait_list)) + rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); + return true; + } + } + return false; +} + +#ifdef CONFIG_SMP /* - * wait until we successfully acquire the write lock + * Try to acquire write lock before the writer has been put on wait queue. + */ +static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) +{ + long old, count = ACCESS_ONCE(sem->count); + + while (true) { + if (!(count == 0 || count == RWSEM_WAITING_BIAS)) + return false; + + old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); + if (old == count) + return true; + + count = old; + } +} + +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) +{ + struct task_struct *owner; + bool on_cpu = true; + + if (need_resched()) + return 0; + + rcu_read_lock(); + owner = ACCESS_ONCE(sem->owner); + if (owner) + on_cpu = owner->on_cpu; + rcu_read_unlock(); + + /* + * If sem->owner is not set, the rwsem owner may have + * just acquired it and not set the owner yet or the rwsem + * has been released. + */ + return on_cpu; +} + +static inline bool owner_running(struct rw_semaphore *sem, + struct task_struct *owner) +{ + if (sem->owner != owner) + return false; + + /* + * Ensure we emit the owner->on_cpu, dereference _after_ checking + * sem->owner still matches owner, if that fails, owner might + * point to free()d memory, if it still matches, the rcu_read_lock() + * ensures the memory stays valid. + */ + barrier(); + + return owner->on_cpu; +} + +static noinline +bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) +{ + rcu_read_lock(); + while (owner_running(sem, owner)) { + if (need_resched()) + break; + + arch_mutex_cpu_relax(); + } + rcu_read_unlock(); + + /* + * We break out the loop above on need_resched() or when the + * owner changed, which is a sign for heavy contention. Return + * success only when sem->owner is NULL. 
+ */ + return sem->owner == NULL; +} + +static bool rwsem_optimistic_spin(struct rw_semaphore *sem) +{ + struct task_struct *owner; + bool taken = false; + + preempt_disable(); + + /* sem->wait_lock should not be held when doing optimistic spinning */ + if (!rwsem_can_spin_on_owner(sem)) + goto done; + + if (!osq_lock(&sem->osq)) + goto done; + + while (true) { + owner = ACCESS_ONCE(sem->owner); + if (owner && !rwsem_spin_on_owner(sem, owner)) + break; + + /* wait_lock will be acquired if write_lock is obtained */ + if (rwsem_try_write_lock_unqueued(sem)) { + taken = true; + break; + } + + /* + * When there's no owner, we might have preempted between the + * owner acquiring the lock and setting the owner field. If + * we're an RT task that will live-lock because we won't let + * the owner complete. + */ + if (!owner && (need_resched() || rt_task(current))) + break; + + /* + * The cpu_relax() call is a compiler barrier which forces + * everything in this loop to be re-loaded. We don't need + * memory barriers as we'll eventually observe the right + * values at the cost of a few extra spins. + */ + arch_mutex_cpu_relax(); + } + osq_unlock(&sem->osq); +done: + preempt_enable(); + return taken; +} + +#else +static bool rwsem_optimistic_spin(struct rw_semaphore *sem) +{ + return false; +} +#endif + +/* + * Wait until we successfully acquire the write lock */ __visible struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) { - long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS; + long count; + bool waiting = true; /* any queued threads before us */ struct rwsem_waiter waiter; - struct task_struct *tsk = current; - /* set up my own style of waitqueue */ - waiter.task = tsk; + /* undo write bias from down_write operation, stop active locking */ + count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem); + + /* do optimistic spinning and steal lock if possible */ + if (rwsem_optimistic_spin(sem)) + return sem; + + /* + * Optimistic spinning failed, proceed to the slowpath + * and block until we can acquire the sem. + */ + waiter.task = current; waiter.type = RWSEM_WAITING_FOR_WRITE; raw_spin_lock_irq(&sem->wait_lock); + + /* account for this before adding a new element to the list */ if (list_empty(&sem->wait_list)) - adjustment += RWSEM_WAITING_BIAS; + waiting = false; + list_add_tail(&waiter.list, &sem->wait_list); /* we're now waiting on the lock, but no longer actively locking */ - count = rwsem_atomic_update(adjustment, sem); + if (waiting) { + count = ACCESS_ONCE(sem->count); - /* If there were already threads queued before us and there are no - * active writers, the lock must be read owned; so we try to wake - * any read locks that were queued ahead of us. */ - if (count > RWSEM_WAITING_BIAS && - adjustment == -RWSEM_ACTIVE_WRITE_BIAS) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); + /* + * If there were already threads queued before us and there are no + * active writers, the lock must be read owned; so we try to wake + * any read locks that were queued ahead of us. + */ + if (count > RWSEM_WAITING_BIAS) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); + + } else + count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); /* wait until we successfully acquire the lock */ - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); while (true) { - if (!(count & RWSEM_ACTIVE_MASK)) { - /* Try acquiring the write lock. 
*/ - count = RWSEM_ACTIVE_WRITE_BIAS; - if (!list_is_singular(&sem->wait_list)) - count += RWSEM_WAITING_BIAS; - - if (sem->count == RWSEM_WAITING_BIAS && - cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == - RWSEM_WAITING_BIAS) - break; - } - + if (rwsem_try_write_lock(count, sem)) + break; raw_spin_unlock_irq(&sem->wait_lock); /* Block until there are no active lockers. */ do { schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); } while ((count = sem->count) & RWSEM_ACTIVE_MASK); raw_spin_lock_irq(&sem->wait_lock); } + __set_current_state(TASK_RUNNING); list_del(&waiter.list); raw_spin_unlock_irq(&sem->wait_lock); - tsk->state = TASK_RUNNING; return sem; } diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index cfff1435bdfb..42f806de49d4 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -12,6 +12,27 @@ #include +#if defined(CONFIG_SMP) && defined(CONFIG_RWSEM_XCHGADD_ALGORITHM) +static inline void rwsem_set_owner(struct rw_semaphore *sem) +{ + sem->owner = current; +} + +static inline void rwsem_clear_owner(struct rw_semaphore *sem) +{ + sem->owner = NULL; +} + +#else +static inline void rwsem_set_owner(struct rw_semaphore *sem) +{ +} + +static inline void rwsem_clear_owner(struct rw_semaphore *sem) +{ +} +#endif + /* * lock for reading */ @@ -48,6 +69,7 @@ void __sched down_write(struct rw_semaphore *sem) rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); + rwsem_set_owner(sem); } EXPORT_SYMBOL(down_write); @@ -59,8 +81,11 @@ int down_write_trylock(struct rw_semaphore *sem) { int ret = __down_write_trylock(sem); - if (ret == 1) + if (ret == 1) { rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); + rwsem_set_owner(sem); + } + return ret; } @@ -85,6 +110,7 @@ void up_write(struct rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); + rwsem_clear_owner(sem); __up_write(sem); } @@ -99,6 +125,7 @@ void downgrade_write(struct rw_semaphore *sem) * lockdep: a downgraded write will live on as a write * dependency. */ + rwsem_clear_owner(sem); __downgrade_write(sem); } @@ -122,6 +149,7 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); + rwsem_set_owner(sem); } EXPORT_SYMBOL(_down_write_nest_lock); @@ -141,6 +169,7 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); + rwsem_set_owner(sem); } EXPORT_SYMBOL(down_write_nested); -- cgit From 0cc3d01164aba483edd8232aa5c781136843c367 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 4 Jun 2014 20:19:48 +0200 Subject: locking/rwsem: Fix checkpatch.pl warnings WARNING: line over 80 characters #205: FILE: kernel/locking/rwsem-xadd.c:275: + old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); WARNING: line over 80 characters #376: FILE: kernel/locking/rwsem-xadd.c:434: + * If there were already threads queued before us and there are no WARNING: line over 80 characters #377: FILE: kernel/locking/rwsem-xadd.c:435: + * active writers, the lock must be read owned; so we try to wake total: 0 errors, 3 warnings, 417 lines checked Signed-off-by: Andrew Morton Signed-off-by: Peter Zijlstra Cc: "H. 
Peter Anvin" Cc: Davidlohr Bueso Cc: Tim Chen Cc: Linus Torvalds Link: http://lkml.kernel.org/n/tip-pn6pslaplw031lykweojsn8c@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 4a75278142cd..dacc32142fcc 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -433,9 +433,9 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) count = ACCESS_ONCE(sem->count); /* - * If there were already threads queued before us and there are no - * active writers, the lock must be read owned; so we try to wake - * any read locks that were queued ahead of us. + * If there were already threads queued before us and there are + * no active writers, the lock must be read owned; so we try to + * wake any read locks that were queued ahead of us. */ if (count > RWSEM_WAITING_BIAS) sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); -- cgit From e9dd685ce81815811fb4da72e6ab10a694ac8468 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 27 May 2014 17:02:04 -0400 Subject: sched/numa: Fix use of spin_{un}lock_irq() when interrupts are disabled As Peter Zijlstra told me, we have the following path: do_exit() exit_itimers() itimer_delete() spin_lock_irqsave(&timer->it_lock, &flags); timer_delete_hook(timer); kc->timer_del(timer) := posix_cpu_timer_del() put_task_struct() __put_task_struct() task_numa_free() spin_lock(&grp->lock); Which means that task_numa_free() can be called with interrupts disabled, which means that we should not be using spin_lock_irq() but spin_lock_irqsave() instead. Otherwise we are enabling interrupts while holding an interrupt unsafe lock! Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra Cc: Thomas Gleixner Cc: Mike Galbraith Cc: Eric Dumazet Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140527182541.GH11096@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0fdb96de81a5..b4768c069392 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1707,18 +1707,19 @@ no_join: void task_numa_free(struct task_struct *p) { struct numa_group *grp = p->numa_group; - int i; void *numa_faults = p->numa_faults_memory; + unsigned long flags; + int i; if (grp) { - spin_lock_irq(&grp->lock); + spin_lock_irqsave(&grp->lock, flags); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) grp->faults[i] -= p->numa_faults_memory[i]; grp->total_faults -= p->total_numa_faults; list_del(&p->numa_entry); grp->nr_tasks--; - spin_unlock_irq(&grp->lock); + spin_unlock_irqrestore(&grp->lock, flags); rcu_assign_pointer(p->numa_group, NULL); put_numa_group(grp); } -- cgit From b14ed2c273f8ab872ae4e6735fe5ab09cb14b8c3 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Mon, 2 Jun 2014 22:38:34 +0200 Subject: sched: Fix sched_policy < 0 comparison attr.sched_policy is u32, therefore a comparison against < 0 is never true. Fix this by casting sched_policy to int. This issue was reported by coverity CID 1219934. 
Fixes: dbdb22754fde ("sched: Disallow sched_attr::sched_policy < 0") Signed-off-by: Richard Weinberger Signed-off-by: Peter Zijlstra Cc: Michael Kerrisk Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1401741514-7045-1-git-send-email-richard@nod.at Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 86f3890c3d08..540542b93f9f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3685,7 +3685,7 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, if (retval) return retval; - if (attr.sched_policy < 0) + if ((int)attr.sched_policy < 0) return -EINVAL; rcu_read_lock(); -- cgit From 0f397f2c90ce68821ee864c2c53baafe78de765d Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 20 May 2014 13:33:42 +0400 Subject: sched/dl: Fix race in dl_task_timer() A throttled task is still on the rq, and it may be moved to another cpu if the user is playing with sched_setaffinity(). Therefore, the unlocked task_rq() access is racy. Juri Lelli reports that he hit this race when dl_bandwidth_enabled() was not set. Another issue, pointed out by Peter Zijlstra: "Now I suppose the problem can still actually happen when you change the root domain and trigger a effective affinity change that way". To fix this, we do the same as __task_rq_lock() does. We do not use __task_rq_lock() itself, because it has a useful lockdep check, which is not correct in the case of dl_task_timer(). We do not need pi_lock locked here. This case is an exception (PeterZ): "The only reason we don't strictly need ->pi_lock now is because we're guaranteed to have p->state == TASK_RUNNING here and are thus free of ttwu races". Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra Cc: # v3.14+ Cc: Linus Torvalds Link: http://lkml.kernel.org/r/3056991400578422@web14g.yandex.ru Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 800e99b99075..14bc348ba3b4 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -513,9 +513,17 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) struct sched_dl_entity, dl_timer); struct task_struct *p = dl_task_of(dl_se); - struct rq *rq = task_rq(p); + struct rq *rq; +again: + rq = task_rq(p); raw_spin_lock(&rq->lock); + if (rq != task_rq(p)) { + /* Task was moved, retrying. */ + raw_spin_unlock(&rq->lock); + goto again; + } + /* * We need to take care of a possible races here. In fact, the * task might have changed its scheduling policy to something -- cgit From 09dc4ab03936df5c5aa711d27c81283c6d09f495 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 19 May 2014 15:10:09 +0400 Subject: sched/fair: Fix tg_set_cfs_bandwidth() deadlock on rq->lock tg_set_cfs_bandwidth() sets cfs_b->timer_active to 0 to force the period timer restart. This is not safe, because it can lead to a deadlock, described in commit 927b54fccbf0: "__start_cfs_bandwidth calls hrtimer_cancel while holding rq->lock, waiting for the hrtimer to finish. However, if sched_cfs_period_timer runs for another loop iteration, the hrtimer can attempt to take rq->lock, resulting in deadlock." Three CPUs must be involved:

CPU0                          CPU1                          CPU2
                              take rq->lock                 period timer fired
...                           take cfs_b lock
...                           ...                           tg_set_cfs_bandwidth()
throttle_cfs_rq()             release cfs_b lock            take cfs_b lock
...                           distribute_cfs_runtime()      timer_active = 0
take cfs_b->lock              wait for rq->lock             ...
__start_cfs_bandwidth()
{wait for timer callback
 break if timer_active == 1}

So, CPU0 and CPU1 are deadlocked. Instead of resetting cfs_b->timer_active, tg_set_cfs_bandwidth can wait for period timer callbacks (ignoring cfs_b->timer_active) and restart the timer explicitly. Signed-off-by: Roman Gushchin Reviewed-by: Ben Segall Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/87wqdi9g8e.wl\%klamm@yandex-team.ru Cc: pjt@google.com Cc: chris.j.arges@canonical.com Cc: gregkh@linuxfoundation.org Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 +-- kernel/sched/fair.c | 8 ++++---- kernel/sched/sched.h | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 540542b93f9f..f3f48e7a2dda 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7751,8 +7751,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) /* restart the period timer (if active) to handle new period expiry */ if (runtime_enabled && cfs_b->timer_active) { /* force a reprogram */ - cfs_b->timer_active = 0; - __start_cfs_bandwidth(cfs_b); + __start_cfs_bandwidth(cfs_b, true); } raw_spin_unlock_irq(&cfs_b->lock); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b4768c069392..8cbe2d2c16de 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3130,7 +3130,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) */ if (!cfs_b->timer_active) { __refill_cfs_bandwidth_runtime(cfs_b); - __start_cfs_bandwidth(cfs_b); + __start_cfs_bandwidth(cfs_b, false); } if (cfs_b->runtime > 0) { @@ -3309,7 +3309,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) raw_spin_lock(&cfs_b->lock); list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); if (!cfs_b->timer_active) - __start_cfs_bandwidth(cfs_b); + __start_cfs_bandwidth(cfs_b, false); raw_spin_unlock(&cfs_b->lock); } @@ -3691,7 +3691,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) } /* requires cfs_b->lock, may release to reprogram timer */ -void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force) { /* * The timer may be active because we're trying to set a new bandwidth @@ -3706,7 +3706,7 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) cpu_relax(); raw_spin_lock(&cfs_b->lock); /* if someone else restarted the timer then we're done */ - if (cfs_b->timer_active) + if (!force && cfs_b->timer_active) return; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 456e492a3dca..369b4d663c42 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -278,7 +278,7 @@ extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); -extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); +extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force); extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); extern void free_rt_sched_group(struct task_group *tg); -- cgit From 51f2176d74ace4c3f58579a605ef5a9720befb00 Mon Sep 17 00:00:00 2001 From: Ben Segall Date: Mon, 19 May 2014 15:49:45 -0700 Subject: sched/fair: Fix unlocked reads of some cfs_b->quota/period sched_cfs_period_timer() reads cfs_b->period without locks before calling do_sched_cfs_period_timer(), and
similarly unthrottle_offline_cfs_rqs() would read cfs_b->period without the right lock. Thus a simultaneous change of bandwidth could cause corruption on any platform where ktime_t or u64 writes/reads are not atomic. Extend cfs_b->lock from do_sched_cfs_period_timer() to include the read of cfs_b->period to solve that issue; unthrottle_offline_cfs_rqs() can just use 1 rather than the exact quota, much like distribute_cfs_runtime() does. There is also an unlocked read of cfs_b->runtime_expires, but a race there would only delay runtime expiry by a tick. Still, the comparison should just be != anyway, which clarifies even that problem. Signed-off-by: Ben Segall Tested-by: Roman Gushchin [peterz: Fix compile warn] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140519224945.20303.93530.stgit@sword-of-the-dawn.mtv.corp.google.com Cc: pjt@google.com Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c9617b73bcc0..b71d8c39f1fd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3224,10 +3224,12 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) * has not truly expired. * * Fortunately we can check determine whether this the case by checking - * whether the global deadline has advanced. + * whether the global deadline has advanced. It is valid to compare + * cfs_b->runtime_expires without any locks since we only care about + * exact equality, so a partial write will still work. */ - if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) { + if (cfs_rq->runtime_expires != cfs_b->runtime_expires) { /* extend local deadline, drift is bounded above by 2 ticks */ cfs_rq->runtime_expires += TICK_NSEC; } else { @@ -3456,21 +3458,21 @@ next: static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) { u64 runtime, runtime_expires; - int idle = 1, throttled; + int throttled; - raw_spin_lock(&cfs_b->lock); /* no need to continue the timer with no bandwidth constraint */ if (cfs_b->quota == RUNTIME_INF) - goto out_unlock; + goto out_deactivate; throttled = !list_empty(&cfs_b->throttled_cfs_rq); - /* idle depends on !throttled (for the case of a large deficit) */ - idle = cfs_b->idle && !throttled; cfs_b->nr_periods += overrun; - /* if we're going inactive then everything else can be deferred */ - if (idle) - goto out_unlock; + /* + * idle depends on !throttled (for the case of a large deficit), and if + * we're going inactive then everything else can be deferred + */ + if (cfs_b->idle && !throttled) + goto out_deactivate; /* * if we have relooped after returning idle once, we need to update our @@ -3484,7 +3486,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) if (!throttled) { /* mark as potentially idle for the upcoming period */ cfs_b->idle = 1; - goto out_unlock; + return 0; } /* account preceding periods in which throttling occurred */ @@ -3524,12 +3526,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) * timer to remain active while there are any throttled entities.) 
*/ cfs_b->idle = 0; -out_unlock: - if (idle) - cfs_b->timer_active = 0; - raw_spin_unlock(&cfs_b->lock); - return idle; + return 0; + +out_deactivate: + cfs_b->timer_active = 0; + return 1; } /* a cfs_rq won't donate quota below this amount */ @@ -3706,6 +3708,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) int overrun; int idle = 0; + raw_spin_lock(&cfs_b->lock); for (;;) { now = hrtimer_cb_get_time(timer); overrun = hrtimer_forward(timer, now, cfs_b->period); @@ -3715,6 +3718,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) idle = do_sched_cfs_period_timer(cfs_b, overrun); } + raw_spin_unlock(&cfs_b->lock); return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; } @@ -3774,8 +3778,6 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) struct cfs_rq *cfs_rq; for_each_leaf_cfs_rq(rq, cfs_rq) { - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - if (!cfs_rq->runtime_enabled) continue; @@ -3783,7 +3785,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) * clock_task is not advancing so we just need to make sure * there's some valid quota amount */ - cfs_rq->runtime_remaining = cfs_b->quota; + cfs_rq->runtime_remaining = 1; if (cfs_rq_throttled(cfs_rq)) unthrottle_cfs_rq(cfs_rq); } -- cgit From ed61bbc69c773465782476c7e5869fa5607fa73a Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Tue, 20 May 2014 14:39:27 -0700 Subject: sched/balancing: Reduce the rate of needless idle load balancing The current no_hz idle load balancer does load balancing for *all* idle cpus, even though the next load balance for a particular idle cpu could still be a while in the future. This introduces a much higher load balancing rate than necessary. The patch changes the behavior to do idle load balancing on behalf of an idle cpu only when it is due for load balancing. On SGI's systems with over 3000 cores, the cpu responsible for idle balancing got overwhelmed with idle balancing, which introduced a lot of OS noise to workloads. This patch fixes the issue. Signed-off-by: Tim Chen Acked-by: Russ Anderson Reviewed-by: Rik van Riel Reviewed-by: Jason Low Signed-off-by: Peter Zijlstra Cc: Andrew Morton Cc: Len Brown Cc: Dimitri Sivanich Cc: Hedi Berriche Cc: Andi Kleen Cc: Michel Lespinasse Cc: Peter Hurley Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1400621967.2970.280.camel@schen9-DESK Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b71d8c39f1fd..7a0c000b6005 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7193,12 +7193,17 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) rq = cpu_rq(balance_cpu); - raw_spin_lock_irq(&rq->lock); - update_rq_clock(rq); - update_idle_cpu_load(rq); - raw_spin_unlock_irq(&rq->lock); - - rebalance_domains(rq, CPU_IDLE); + /* + * If time for next balance is due, + * do the balance.
+ */ + if (time_after_eq(jiffies, rq->next_balance)) { + raw_spin_lock_irq(&rq->lock); + update_rq_clock(rq); + update_idle_cpu_load(rq); + raw_spin_unlock_irq(&rq->lock); + rebalance_domains(rq, CPU_IDLE); + } if (time_after(this_rq->next_balance, rq->next_balance)) this_rq->next_balance = rq->next_balance; -- cgit From 2538d960d0c74cdc639f05723e04a67aed1efdf9 Mon Sep 17 00:00:00 2001 From: Manuel Schölling Date: Thu, 22 May 2014 19:45:23 +0200 Subject: sched/fair: Use time_after() in record_wakee() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To be future-proof and for better readability the time comparisons are modified to use time_after() instead of plain, error-prone math. Signed-off-by: Manuel Schölling Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1400780723-24626-1-git-send-email-manuel.schoelling@gmx.de Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7a0c000b6005..c63dde984956 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4066,7 +4066,7 @@ static void record_wakee(struct task_struct *p) * about the boundary, really active task won't care * about the loss. */ - if (jiffies > current->wakee_flip_decay_ts + HZ) { + if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) { current->wakee_flips >>= 1; current->wakee_flip_decay_ts = jiffies; } -- cgit From fa93384f40deeb294fd29f2fdcadbd0ebc2dedf1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 23 May 2014 13:20:42 +0300 Subject: sched: Fix signedness bug in yield_to() yield_to() is supposed to return -ESRCH if there is no task to yield to, but because the type is bool that is the same as returning true. The only place I see which cares is kvm_vcpu_on_spin(). 
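For illustration, a minimal user-space sketch (hypothetical names, not code from the patch or from KVM) of why a bool return type swallows the error: C converts any nonzero value returned as bool to 1 (true), so -ESRCH becomes indistinguishable from success, whereas an int return preserves the error code.

#include <stdbool.h>
#include <stdio.h>

#define ESRCH 3			/* errno value for "no such process" */

static bool yield_to_as_bool(void)
{
	return -ESRCH;		/* silently converted to true (1) */
}

static int yield_to_as_int(void)
{
	return -ESRCH;		/* error code survives intact */
}

int main(void)
{
	if (yield_to_as_bool())
		printf("bool version: caller sees 'boosted', -ESRCH is lost\n");

	if (yield_to_as_int() == -ESRCH)
		printf("int version: caller can detect -ESRCH\n");

	return 0;
}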
Signed-off-by: Dan Carpenter Reviewed-by: Raghavendra Signed-off-by: Peter Zijlstra Cc: Gleb Natapov Cc: Linus Torvalds Cc: Paolo Bonzini Cc: kvm@vger.kernel.org Link: http://lkml.kernel.org/r/20140523102042.GA7267@mwanda Signed-off-by: Ingo Molnar --- include/linux/kvm_host.h | 2 +- include/linux/sched.h | 2 +- kernel/sched/core.c | 2 +- virt/kvm/kvm_main.c | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7d21cf9f4380..3c4bcf146159 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -584,7 +584,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn); void kvm_vcpu_block(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu); -bool kvm_vcpu_yield_to(struct kvm_vcpu *target); +int kvm_vcpu_yield_to(struct kvm_vcpu *target); void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu); void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); diff --git a/include/linux/sched.h b/include/linux/sched.h index 0f91d00efd87..6790c3b42072 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2180,7 +2180,7 @@ static inline void sched_autogroup_fork(struct signal_struct *sig) { } static inline void sched_autogroup_exit(struct signal_struct *sig) { } #endif -extern bool yield_to(struct task_struct *p, bool preempt); +extern int yield_to(struct task_struct *p, bool preempt); extern void set_user_nice(struct task_struct *p, long nice); extern int task_prio(const struct task_struct *p); /** diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 321d800e4baa..afcc84234a3e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4195,7 +4195,7 @@ EXPORT_SYMBOL(yield); * false (0) if we failed to boost the target. * -ESRCH if there's no task to yield to. */ -bool __sched yield_to(struct task_struct *p, bool preempt) +int __sched yield_to(struct task_struct *p, bool preempt) { struct task_struct *curr = current; struct rq *rq, *p_rq; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 56baae8c2f56..86d1c457458d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1708,11 +1708,11 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) EXPORT_SYMBOL_GPL(kvm_vcpu_kick); #endif /* !CONFIG_S390 */ -bool kvm_vcpu_yield_to(struct kvm_vcpu *target) +int kvm_vcpu_yield_to(struct kvm_vcpu *target) { struct pid *pid; struct task_struct *task = NULL; - bool ret = false; + int ret = 0; rcu_read_lock(); pid = rcu_dereference(target->pid); -- cgit From 5ef20ca181ec592e4684a45f4d5f1385f6055534 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 26 May 2014 18:19:34 -0400 Subject: sched/fair: Remove "power" from 'struct numa_stats' It is better not to think about compute capacity as being equivalent to "CPU power". The upcoming "power aware" scheduler work may create confusion with the notion of energy consumption if "power" is used too liberally. To make things explicit and not create more confusion with the existing "capacity" member, let's rename things as follows: power -> compute_capacity capacity -> task_capacity Note: none of those fields are actually used outside update_numa_stats(). Signed-off-by: Nicolas Pitre Signed-off-by: Peter Zijlstra Cc: Vincent Guittot Cc: Daniel Lezcano Cc: Morten Rasmussen Cc: "Rafael J. 
Wysocki" Cc: linaro-kernel@lists.linaro.org Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/n/tip-2e2ndymj5gyshyjq8am79f20@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c63dde984956..1cfe5a25086d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1026,10 +1026,10 @@ struct numa_stats { unsigned long load; /* Total compute capacity of CPUs on a node */ - unsigned long power; + unsigned long compute_capacity; /* Approximate capacity in terms of runnable tasks on a node */ - unsigned long capacity; + unsigned long task_capacity; int has_capacity; }; @@ -1046,7 +1046,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid) ns->nr_running += rq->nr_running; ns->load += weighted_cpuload(cpu); - ns->power += power_of(cpu); + ns->compute_capacity += power_of(cpu); cpus++; } @@ -1062,9 +1062,10 @@ static void update_numa_stats(struct numa_stats *ns, int nid) if (!cpus) return; - ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power; - ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE); - ns->has_capacity = (ns->nr_running < ns->capacity); + ns->load = (ns->load * SCHED_POWER_SCALE) / ns->compute_capacity; + ns->task_capacity = + DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_POWER_SCALE); + ns->has_capacity = (ns->nr_running < ns->task_capacity); } struct task_numa_env { -- cgit From 1b6a7495d343fcfe22ff3a8285544bb8e40f1920 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 26 May 2014 18:19:35 -0400 Subject: sched/fair: Change "has_capacity" to "has_free_capacity" The capacity of a CPU/group should be some intrinsic value that doesn't change with task placement. It is like a container which capacity is stable regardless of the amount of liquid in it (its "utilization")... unless the container itself is crushed that is, but that's another story. Therefore let's rename "has_capacity" to "has_free_capacity" in order to better convey the wanted meaning. Signed-off-by: Nicolas Pitre Signed-off-by: Peter Zijlstra Cc: Vincent Guittot Cc: Daniel Lezcano Cc: Morten Rasmussen Cc: "Rafael J. Wysocki" Cc: linaro-kernel@lists.linaro.org Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/n/tip-djzkk027jm0e8x8jxy70opzh@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1cfe5a25086d..8993dfa2e82b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1030,7 +1030,7 @@ struct numa_stats { /* Approximate capacity in terms of runnable tasks on a node */ unsigned long task_capacity; - int has_capacity; + int has_free_capacity; }; /* @@ -1056,8 +1056,8 @@ static void update_numa_stats(struct numa_stats *ns, int nid) * the @ns structure is NULL'ed and task_numa_compare() will * not find this node attractive. * - * We'll either bail at !has_capacity, or we'll detect a huge imbalance - * and bail there. + * We'll either bail at !has_free_capacity, or we'll detect a huge + * imbalance and bail there. 
*/ if (!cpus) return; @@ -1065,7 +1065,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid) ns->load = (ns->load * SCHED_POWER_SCALE) / ns->compute_capacity; ns->task_capacity = DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_POWER_SCALE); - ns->has_capacity = (ns->nr_running < ns->task_capacity); + ns->has_free_capacity = (ns->nr_running < ns->task_capacity); } struct task_numa_env { @@ -1196,8 +1196,8 @@ static void task_numa_compare(struct task_numa_env *env, if (!cur) { /* Is there capacity at our destination? */ - if (env->src_stats.has_capacity && - !env->dst_stats.has_capacity) + if (env->src_stats.has_free_capacity && + !env->dst_stats.has_free_capacity) goto unlock; goto balance; @@ -1302,8 +1302,8 @@ static int task_numa_migrate(struct task_struct *p) groupimp = group_weight(p, env.dst_nid) - groupweight; update_numa_stats(&env.dst_stats, env.dst_nid); - /* If the preferred nid has capacity, try to use it. */ - if (env.dst_stats.has_capacity) + /* If the preferred nid has free capacity, try to use it. */ + if (env.dst_stats.has_free_capacity) task_numa_find_cpu(&env, taskimp, groupimp); /* No space available on the preferred nid. Look elsewhere. */ @@ -5538,7 +5538,7 @@ struct sg_lb_stats { unsigned int idle_cpus; unsigned int group_weight; int group_imb; /* Is there an imbalance in the group ? */ - int group_has_capacity; /* Is there extra capacity in the group? */ + int group_has_free_capacity; #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -5905,7 +5905,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_capacity = sg_capacity(env, group); if (sgs->group_capacity > sgs->sum_nr_running) - sgs->group_has_capacity = 1; + sgs->group_has_free_capacity = 1; } /** @@ -6029,7 +6029,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd * with a large weight task outweighs the tasks on the system). */ if (prefer_sibling && sds->local && - sds->local_stat.group_has_capacity) + sds->local_stat.group_has_free_capacity) sgs->group_capacity = min(sgs->group_capacity, 1U); if (update_sd_pick_busiest(env, sds, sg, sgs)) { @@ -6289,8 +6289,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) goto force_balance; /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ - if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity && - !busiest->group_has_capacity) + if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity && + !busiest->group_has_free_capacity) goto force_balance; /* -- cgit From 0fedc6c8e34f4ce0b37b1f25c3619b4a8faa244c Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 26 May 2014 18:19:36 -0400 Subject: sched/fair: Disambiguate existing/remaining "capacity" usage We have "power" (which should actually become "capacity") and "capacity" which is a scaled down "capacity factor" in terms of unitary tasks. Let's use "capacity_factor" to make room for proper usage of "capacity" later. Signed-off-by: Nicolas Pitre Signed-off-by: Peter Zijlstra Cc: Vincent Guittot Cc: Daniel Lezcano Cc: Morten Rasmussen Cc: "Rafael J. 
Wysocki" Cc: linaro-kernel@lists.linaro.org Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/n/tip-gk1co8sqdev3763opqm6ovml@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8993dfa2e82b..e401e446e87c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5534,7 +5534,7 @@ struct sg_lb_stats { unsigned long load_per_task; unsigned long group_power; unsigned int sum_nr_running; /* Nr tasks running in the group */ - unsigned int group_capacity; + unsigned int group_capacity_factor; unsigned int idle_cpus; unsigned int group_weight; int group_imb; /* Is there an imbalance in the group ? */ @@ -5829,15 +5829,15 @@ static inline int sg_imbalanced(struct sched_group *group) } /* - * Compute the group capacity. + * Compute the group capacity factor. * * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by * first dividing out the smt factor and computing the actual number of cores * and limit power unit capacity with that. */ -static inline int sg_capacity(struct lb_env *env, struct sched_group *group) +static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group) { - unsigned int capacity, smt, cpus; + unsigned int capacity_factor, smt, cpus; unsigned int power, power_orig; power = group->sgp->power; @@ -5846,13 +5846,13 @@ static inline int sg_capacity(struct lb_env *env, struct sched_group *group) /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */ smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig); - capacity = cpus / smt; /* cores */ + capacity_factor = cpus / smt; /* cores */ - capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE)); - if (!capacity) - capacity = fix_small_capacity(env->sd, group); + capacity_factor = min_t(unsigned, capacity_factor, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE)); + if (!capacity_factor) + capacity_factor = fix_small_capacity(env->sd, group); - return capacity; + return capacity_factor; } /** @@ -5902,9 +5902,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_weight = group->group_weight; sgs->group_imb = sg_imbalanced(group); - sgs->group_capacity = sg_capacity(env, group); + sgs->group_capacity_factor = sg_capacity_factor(env, group); - if (sgs->group_capacity > sgs->sum_nr_running) + if (sgs->group_capacity_factor > sgs->sum_nr_running) sgs->group_has_free_capacity = 1; } @@ -5929,7 +5929,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (sgs->avg_load <= sds->busiest_stat.avg_load) return false; - if (sgs->sum_nr_running > sgs->group_capacity) + if (sgs->sum_nr_running > sgs->group_capacity_factor) return true; if (sgs->group_imb) @@ -6020,17 +6020,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd /* * In case the child domain prefers tasks go to siblings - * first, lower the sg capacity to one so that we'll try + * first, lower the sg capacity factor to one so that we'll try * and move all the excess tasks away. We lower the capacity * of a group only if the local group has the capacity to fit - * these excess tasks, i.e. nr_running < group_capacity. The + * these excess tasks, i.e. nr_running < group_capacity_factor. 
The * extra check prevents the case where you always pull from the * heaviest group when it is already under-utilized (possible * with a large weight task outweighs the tasks on the system). */ if (prefer_sibling && sds->local && sds->local_stat.group_has_free_capacity) - sgs->group_capacity = min(sgs->group_capacity, 1U); + sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; @@ -6204,7 +6204,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * have to drop below capacity to reach cpu-load equilibrium. */ load_above_capacity = - (busiest->sum_nr_running - busiest->group_capacity); + (busiest->sum_nr_running - busiest->group_capacity_factor); load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); load_above_capacity /= busiest->group_power; @@ -6348,7 +6348,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, int i; for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { - unsigned long power, capacity, wl; + unsigned long power, capacity_factor, wl; enum fbq_type rt; rq = cpu_rq(i); @@ -6377,9 +6377,9 @@ static struct rq *find_busiest_queue(struct lb_env *env, continue; power = power_of(i); - capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); - if (!capacity) - capacity = fix_small_capacity(env->sd, group); + capacity_factor = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); + if (!capacity_factor) + capacity_factor = fix_small_capacity(env->sd, group); wl = weighted_cpuload(i); @@ -6387,7 +6387,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, * When comparing with imbalance, use weighted_cpuload() * which is not scaled with the cpu power. */ - if (capacity && rq->nr_running == 1 && wl > env->imbalance) + if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance) continue; /* -- cgit From 63b2ca30bdb3dbf60bc7ac5f46713c0d32308261 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 26 May 2014 18:19:37 -0400 Subject: sched: Let 'struct sched_group_power' care about CPU capacity It is better not to think about compute capacity as being equivalent to "CPU power". The upcoming "power aware" scheduler work may create confusion with the notion of energy consumption if "power" is used too liberally. Since struct sched_group_power is really about compute capacity of sched groups, let's rename it to struct sched_group_capacity. Similarly sgp becomes sgc. Related variables and functions dealing with groups are also adjusted accordingly. Signed-off-by: Nicolas Pitre Signed-off-by: Peter Zijlstra Cc: Vincent Guittot Cc: Daniel Lezcano Cc: Morten Rasmussen Cc: "Rafael J. 
Wysocki" Cc: linaro-kernel@lists.linaro.org Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/n/tip-5yeix833vvgf2uyj5o36hpu9@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- kernel/sched/core.c | 81 +++++++++++++++---------------- kernel/sched/fair.c | 131 +++++++++++++++++++++++++------------------------- kernel/sched/sched.h | 16 +++--- 4 files changed, 115 insertions(+), 115 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6790c3b42072..a96f03598c61 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1013,7 +1013,7 @@ typedef const int (*sched_domain_flags_f)(void); struct sd_data { struct sched_domain **__percpu sd; struct sched_group **__percpu sg; - struct sched_group_power **__percpu sgp; + struct sched_group_capacity **__percpu sgc; }; struct sched_domain_topology_level { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index afcc84234a3e..2e1fb0902200 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5221,14 +5221,13 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, } /* - * Even though we initialize ->power to something semi-sane, - * we leave power_orig unset. This allows us to detect if + * Even though we initialize ->capacity to something semi-sane, + * we leave capacity_orig unset. This allows us to detect if * domain iteration is still funny without causing /0 traps. */ - if (!group->sgp->power_orig) { + if (!group->sgc->capacity_orig) { printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: domain->cpu_power not " - "set\n"); + printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n"); break; } @@ -5250,9 +5249,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); printk(KERN_CONT " %s", str); - if (group->sgp->power != SCHED_POWER_SCALE) { - printk(KERN_CONT " (cpu_power = %d)", - group->sgp->power); + if (group->sgc->capacity != SCHED_POWER_SCALE) { + printk(KERN_CONT " (cpu_capacity = %d)", + group->sgc->capacity); } group = group->next; @@ -5466,7 +5465,7 @@ static struct root_domain *alloc_rootdomain(void) return rd; } -static void free_sched_groups(struct sched_group *sg, int free_sgp) +static void free_sched_groups(struct sched_group *sg, int free_sgc) { struct sched_group *tmp, *first; @@ -5477,8 +5476,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgp) do { tmp = sg->next; - if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) - kfree(sg->sgp); + if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) + kfree(sg->sgc); kfree(sg); sg = tmp; @@ -5496,7 +5495,7 @@ static void free_sched_domain(struct rcu_head *rcu) if (sd->flags & SD_OVERLAP) { free_sched_groups(sd->groups, 1); } else if (atomic_dec_and_test(&sd->groups->ref)) { - kfree(sd->groups->sgp); + kfree(sd->groups->sgc); kfree(sd->groups); } kfree(sd); @@ -5707,17 +5706,17 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) cpumask_or(covered, covered, sg_span); - sg->sgp = *per_cpu_ptr(sdd->sgp, i); - if (atomic_inc_return(&sg->sgp->ref) == 1) + sg->sgc = *per_cpu_ptr(sdd->sgc, i); + if (atomic_inc_return(&sg->sgc->ref) == 1) build_group_mask(sd, sg); /* - * Initialize sgp->power such that even if we mess up the + * Initialize sgc->capacity such that even if we mess up the * domains and no possible iteration will get us here, we won't * die on a /0 trap. 
*/ - sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); - sg->sgp->power_orig = sg->sgp->power; + sg->sgc->capacity = SCHED_POWER_SCALE * cpumask_weight(sg_span); + sg->sgc->capacity_orig = sg->sgc->capacity; /* * Make sure the first group of this domain contains the @@ -5755,8 +5754,8 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) if (sg) { *sg = *per_cpu_ptr(sdd->sg, cpu); - (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); - atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ + (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); + atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */ } return cpu; @@ -5819,16 +5818,16 @@ build_sched_groups(struct sched_domain *sd, int cpu) } /* - * Initialize sched groups cpu_power. + * Initialize sched groups cpu_capacity. * - * cpu_power indicates the capacity of sched group, which is used while + * cpu_capacity indicates the capacity of sched group, which is used while * distributing the load between different sched groups in a sched domain. - * Typically cpu_power for all the groups in a sched domain will be same unless - * there are asymmetries in the topology. If there are asymmetries, group - * having more cpu_power will pickup more load compared to the group having - * less cpu_power. + * Typically cpu_capacity for all the groups in a sched domain will be same + * unless there are asymmetries in the topology. If there are asymmetries, + * group having more cpu_capacity will pickup more load compared to the + * group having less cpu_capacity. */ -static void init_sched_groups_power(int cpu, struct sched_domain *sd) +static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) { struct sched_group *sg = sd->groups; @@ -5842,8 +5841,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) if (cpu != group_balance_cpu(sg)) return; - update_group_power(sd, cpu); - atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); + update_group_capacity(sd, cpu); + atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight); } /* @@ -5934,8 +5933,8 @@ static void claim_allocations(int cpu, struct sched_domain *sd) if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) *per_cpu_ptr(sdd->sg, cpu) = NULL; - if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) - *per_cpu_ptr(sdd->sgp, cpu) = NULL; + if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) + *per_cpu_ptr(sdd->sgc, cpu) = NULL; } #ifdef CONFIG_NUMA @@ -6337,14 +6336,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sdd->sg) return -ENOMEM; - sdd->sgp = alloc_percpu(struct sched_group_power *); - if (!sdd->sgp) + sdd->sgc = alloc_percpu(struct sched_group_capacity *); + if (!sdd->sgc) return -ENOMEM; for_each_cpu(j, cpu_map) { struct sched_domain *sd; struct sched_group *sg; - struct sched_group_power *sgp; + struct sched_group_capacity *sgc; sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); @@ -6362,12 +6361,12 @@ static int __sdt_alloc(const struct cpumask *cpu_map) *per_cpu_ptr(sdd->sg, j) = sg; - sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(), + sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); - if (!sgp) + if (!sgc) return -ENOMEM; - *per_cpu_ptr(sdd->sgp, j) = sgp; + *per_cpu_ptr(sdd->sgc, j) = sgc; } } @@ -6394,15 +6393,15 @@ static void __sdt_free(const struct cpumask *cpu_map) if (sdd->sg) kfree(*per_cpu_ptr(sdd->sg, j)); - if (sdd->sgp) - kfree(*per_cpu_ptr(sdd->sgp, j)); + if (sdd->sgc) + 
kfree(*per_cpu_ptr(sdd->sgc, j)); } free_percpu(sdd->sd); sdd->sd = NULL; free_percpu(sdd->sg); sdd->sg = NULL; - free_percpu(sdd->sgp); - sdd->sgp = NULL; + free_percpu(sdd->sgc); + sdd->sgc = NULL; } } @@ -6479,7 +6478,7 @@ static int build_sched_domains(const struct cpumask *cpu_map, for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { claim_allocations(i, sd); - init_sched_groups_power(i, sd); + init_sched_groups_capacity(i, sd); } } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e401e446e87c..36bd4d23fca8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4369,8 +4369,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, avg_load += load; } - /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power; + /* Adjust by relative CPU capacity of the group */ + avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgc->capacity; if (local_group) { this_load = avg_load; @@ -5532,7 +5532,7 @@ struct sg_lb_stats { unsigned long group_load; /* Total load over the CPUs of the group */ unsigned long sum_weighted_load; /* Weighted load of group's tasks */ unsigned long load_per_task; - unsigned long group_power; + unsigned long group_capacity; unsigned int sum_nr_running; /* Nr tasks running in the group */ unsigned int group_capacity_factor; unsigned int idle_cpus; @@ -5553,7 +5553,7 @@ struct sd_lb_stats { struct sched_group *busiest; /* Busiest group in this sd */ struct sched_group *local; /* Local group in this sd */ unsigned long total_load; /* Total load of all groups in sd */ - unsigned long total_pwr; /* Total power of all groups in sd */ + unsigned long total_capacity; /* Total capacity of all groups in sd */ unsigned long avg_load; /* Average load across all groups in sd */ struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ @@ -5572,7 +5572,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) .busiest = NULL, .local = NULL, .total_load = 0UL, - .total_pwr = 0UL, + .total_capacity = 0UL, .busiest_stat = { .avg_load = 0UL, }, @@ -5681,7 +5681,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) power >>= SCHED_POWER_SHIFT; } - sdg->sgp->power_orig = power; + sdg->sgc->capacity_orig = power; if (sched_feat(ARCH_POWER)) power *= arch_scale_freq_power(sd, cpu); @@ -5697,26 +5697,26 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) power = 1; cpu_rq(cpu)->cpu_power = power; - sdg->sgp->power = power; + sdg->sgc->capacity = power; } -void update_group_power(struct sched_domain *sd, int cpu) +void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long power, power_orig; + unsigned long capacity, capacity_orig; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); interval = clamp(interval, 1UL, max_load_balance_interval); - sdg->sgp->next_update = jiffies + interval; + sdg->sgc->next_update = jiffies + interval; if (!child) { update_cpu_power(sd, cpu); return; } - power_orig = power = 0; + capacity_orig = capacity = 0; if (child->flags & SD_OVERLAP) { /* @@ -5725,31 +5725,31 @@ void update_group_power(struct sched_domain *sd, int cpu) */ for_each_cpu(cpu, sched_group_cpus(sdg)) { - struct sched_group_power *sgp; + struct sched_group_capacity *sgc; struct rq *rq = cpu_rq(cpu); /* - * build_sched_domains() -> init_sched_groups_power() + * build_sched_domains() -> init_sched_groups_capacity() * gets here before we've 
attached the domains to the * runqueues. * * Use power_of(), which is set irrespective of domains * in update_cpu_power(). * - * This avoids power/power_orig from being 0 and + * This avoids capacity/capacity_orig from being 0 and * causing divide-by-zero issues on boot. * - * Runtime updates will correct power_orig. + * Runtime updates will correct capacity_orig. */ if (unlikely(!rq->sd)) { - power_orig += power_of(cpu); - power += power_of(cpu); + capacity_orig += power_of(cpu); + capacity += power_of(cpu); continue; } - sgp = rq->sd->groups->sgp; - power_orig += sgp->power_orig; - power += sgp->power; + sgc = rq->sd->groups->sgc; + capacity_orig += sgc->capacity_orig; + capacity += sgc->capacity; } } else { /* @@ -5759,14 +5759,14 @@ void update_group_power(struct sched_domain *sd, int cpu) group = child->groups; do { - power_orig += group->sgp->power_orig; - power += group->sgp->power; + capacity_orig += group->sgc->capacity_orig; + capacity += group->sgc->capacity; group = group->next; } while (group != child->groups); } - sdg->sgp->power_orig = power_orig; - sdg->sgp->power = power; + sdg->sgc->capacity_orig = capacity_orig; + sdg->sgc->capacity = capacity; } /* @@ -5786,9 +5786,9 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) return 0; /* - * If ~90% of the cpu_power is still there, we're good. + * If ~90% of the cpu_capacity is still there, we're good. */ - if (group->sgp->power * 32 > group->sgp->power_orig * 29) + if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29) return 1; return 0; @@ -5825,7 +5825,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) static inline int sg_imbalanced(struct sched_group *group) { - return group->sgp->imbalance; + return group->sgc->imbalance; } /* @@ -5833,22 +5833,23 @@ static inline int sg_imbalanced(struct sched_group *group) * * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by * first dividing out the smt factor and computing the actual number of cores - * and limit power unit capacity with that. + * and limit unit capacity with that. 
*/ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group) { unsigned int capacity_factor, smt, cpus; - unsigned int power, power_orig; + unsigned int capacity, capacity_orig; - power = group->sgp->power; - power_orig = group->sgp->power_orig; + capacity = group->sgc->capacity; + capacity_orig = group->sgc->capacity_orig; cpus = group->group_weight; - /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */ - smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig); + /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */ + smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, capacity_orig); capacity_factor = cpus / smt; /* cores */ - capacity_factor = min_t(unsigned, capacity_factor, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE)); + capacity_factor = min_t(unsigned, + capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_POWER_SCALE)); if (!capacity_factor) capacity_factor = fix_small_capacity(env->sd, group); @@ -5892,9 +5893,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->idle_cpus++; } - /* Adjust by relative CPU power of the group */ - sgs->group_power = group->sgp->power; - sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; + /* Adjust by relative CPU capacity of the group */ + sgs->group_capacity = group->sgc->capacity; + sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_capacity; if (sgs->sum_nr_running) sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; @@ -6009,8 +6010,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd sgs = &sds->local_stat; if (env->idle != CPU_NEWLY_IDLE || - time_after_eq(jiffies, sg->sgp->next_update)) - update_group_power(env->sd, env->dst_cpu); + time_after_eq(jiffies, sg->sgc->next_update)) + update_group_capacity(env->sd, env->dst_cpu); } update_sg_lb_stats(env, sg, load_idx, local_group, sgs); @@ -6040,7 +6041,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd next_group: /* Now, start updating sd_lb_stats */ sds->total_load += sgs->group_load; - sds->total_pwr += sgs->group_power; + sds->total_capacity += sgs->group_capacity; sg = sg->next; } while (sg != env->sd->groups); @@ -6087,7 +6088,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) return 0; env->imbalance = DIV_ROUND_CLOSEST( - sds->busiest_stat.avg_load * sds->busiest_stat.group_power, + sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity, SCHED_POWER_SCALE); return 1; @@ -6103,7 +6104,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) static inline void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) { - unsigned long tmp, pwr_now = 0, pwr_move = 0; + unsigned long tmp, capa_now = 0, capa_move = 0; unsigned int imbn = 2; unsigned long scaled_busy_load_per_task; struct sg_lb_stats *local, *busiest; @@ -6118,7 +6119,7 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) scaled_busy_load_per_task = (busiest->load_per_task * SCHED_POWER_SCALE) / - busiest->group_power; + busiest->group_capacity; if (busiest->avg_load + scaled_busy_load_per_task >= local->avg_load + (scaled_busy_load_per_task * imbn)) { @@ -6132,34 +6133,34 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) * moving them. 
*/ - pwr_now += busiest->group_power * + capa_now += busiest->group_capacity * min(busiest->load_per_task, busiest->avg_load); - pwr_now += local->group_power * + capa_now += local->group_capacity * min(local->load_per_task, local->avg_load); - pwr_now /= SCHED_POWER_SCALE; + capa_now /= SCHED_POWER_SCALE; /* Amount of load we'd subtract */ if (busiest->avg_load > scaled_busy_load_per_task) { - pwr_move += busiest->group_power * + capa_move += busiest->group_capacity * min(busiest->load_per_task, busiest->avg_load - scaled_busy_load_per_task); } /* Amount of load we'd add */ - if (busiest->avg_load * busiest->group_power < + if (busiest->avg_load * busiest->group_capacity < busiest->load_per_task * SCHED_POWER_SCALE) { - tmp = (busiest->avg_load * busiest->group_power) / - local->group_power; + tmp = (busiest->avg_load * busiest->group_capacity) / + local->group_capacity; } else { tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / - local->group_power; + local->group_capacity; } - pwr_move += local->group_power * + capa_move += local->group_capacity * min(local->load_per_task, local->avg_load + tmp); - pwr_move /= SCHED_POWER_SCALE; + capa_move /= SCHED_POWER_SCALE; /* Move if we gain throughput */ - if (pwr_move > pwr_now) + if (capa_move > capa_now) env->imbalance = busiest->load_per_task; } @@ -6207,7 +6208,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s (busiest->sum_nr_running - busiest->group_capacity_factor); load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); - load_above_capacity /= busiest->group_power; + load_above_capacity /= busiest->group_capacity; } /* @@ -6222,8 +6223,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /* How much load to actually move to equalise the imbalance */ env->imbalance = min( - max_pull * busiest->group_power, - (sds->avg_load - local->avg_load) * local->group_power + max_pull * busiest->group_capacity, + (sds->avg_load - local->avg_load) * local->group_capacity ) / SCHED_POWER_SCALE; /* @@ -6278,7 +6279,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (!sds.busiest || busiest->sum_nr_running == 0) goto out_balanced; - sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; + sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_capacity; /* * If the busiest group is imbalanced the below checks don't @@ -6611,7 +6612,7 @@ more_balance: * We failed to reach balance because of affinity. */ if (sd_parent) { - int *group_imbalance = &sd_parent->groups->sgp->imbalance; + int *group_imbalance = &sd_parent->groups->sgc->imbalance; if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { *group_imbalance = 1; @@ -6998,7 +6999,7 @@ static inline void set_cpu_sd_state_busy(void) goto unlock; sd->nohz_idle = 0; - atomic_inc(&sd->groups->sgp->nr_busy_cpus); + atomic_inc(&sd->groups->sgc->nr_busy_cpus); unlock: rcu_read_unlock(); } @@ -7015,7 +7016,7 @@ void set_cpu_sd_state_idle(void) goto unlock; sd->nohz_idle = 1; - atomic_dec(&sd->groups->sgp->nr_busy_cpus); + atomic_dec(&sd->groups->sgc->nr_busy_cpus); unlock: rcu_read_unlock(); } @@ -7219,7 +7220,7 @@ end: * of an idle cpu is the system. * - This rq has more than one task. * - At any scheduler domain level, this cpu's scheduler group has multiple - * busy cpu's exceeding the group's power. + * busy cpu's exceeding the group's capacity. * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler * domain span are idle. 
*/ @@ -7227,7 +7228,7 @@ static inline int nohz_kick_needed(struct rq *rq) { unsigned long now = jiffies; struct sched_domain *sd; - struct sched_group_power *sgp; + struct sched_group_capacity *sgc; int nr_busy, cpu = rq->cpu; if (unlikely(rq->idle_balance)) @@ -7257,8 +7258,8 @@ static inline int nohz_kick_needed(struct rq *rq) sd = rcu_dereference(per_cpu(sd_busy, cpu)); if (sd) { - sgp = sd->groups->sgp; - nr_busy = atomic_read(&sgp->nr_busy_cpus); + sgc = sd->groups->sgc; + nr_busy = atomic_read(&sgc->nr_busy_cpus); if (nr_busy > 1) goto need_kick_unlock; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 600e2291a75c..a5b957d53c92 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -728,15 +728,15 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa); DECLARE_PER_CPU(struct sched_domain *, sd_busy); DECLARE_PER_CPU(struct sched_domain *, sd_asym); -struct sched_group_power { +struct sched_group_capacity { atomic_t ref; /* - * CPU power of this group, SCHED_LOAD_SCALE being max power for a - * single CPU. + * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity + * for a single CPU. */ - unsigned int power, power_orig; + unsigned int capacity, capacity_orig; unsigned long next_update; - int imbalance; /* XXX unrelated to power but shared group state */ + int imbalance; /* XXX unrelated to capacity but shared group state */ /* * Number of busy cpus in this group. */ @@ -750,7 +750,7 @@ struct sched_group { atomic_t ref; unsigned int group_weight; - struct sched_group_power *sgp; + struct sched_group_capacity *sgc; /* * The CPUs this group covers. @@ -773,7 +773,7 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg) */ static inline struct cpumask *sched_group_mask(struct sched_group *sg) { - return to_cpumask(sg->sgp->cpumask); + return to_cpumask(sg->sgc->cpumask); } /** @@ -1167,7 +1167,7 @@ extern const struct sched_class idle_sched_class; #ifdef CONFIG_SMP -extern void update_group_power(struct sched_domain *sd, int cpu); +extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); -- cgit From ced549fa5fc1fdaf7fac93894dc673092eb3dc20 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 26 May 2014 18:19:38 -0400 Subject: sched: Remove remaining dubious usage of "power" It is better not to think about compute capacity as being equivalent to "CPU power". The upcoming "power aware" scheduler work may create confusion with the notion of energy consumption if "power" is used too liberally. This is the remaining "power" -> "capacity" rename for local symbols. Those symbols visible to the rest of the kernel are not included yet. Signed-off-by: Nicolas Pitre Signed-off-by: Peter Zijlstra Cc: Vincent Guittot Cc: Daniel Lezcano Cc: Morten Rasmussen Cc: "Rafael J. 
Wysocki" Cc: linaro-kernel@lists.linaro.org Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/n/tip-yyyhohzhkwnaotr3lx8zd5aa@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 +-- kernel/sched/fair.c | 102 +++++++++++++++++++++++++-------------------------- kernel/sched/sched.h | 2 +- 3 files changed, 55 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2e1fb0902200..07bc78a50329 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5764,7 +5764,7 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) /* * build_sched_groups will build a circular linked list of the groups * covered by the given span, and will set each group's ->cpumask correctly, - * and ->cpu_power to 0. + * and ->cpu_capacity to 0. * * Assumes the sched_domain tree is fully constructed */ @@ -6471,7 +6471,7 @@ static int build_sched_domains(const struct cpumask *cpu_map, } } - /* Calculate CPU power for physical packages and nodes */ + /* Calculate CPU capacity for physical packages and nodes */ for (i = nr_cpumask_bits-1; i >= 0; i--) { if (!cpumask_test_cpu(i, cpu_map)) continue; @@ -6921,7 +6921,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; - rq->cpu_power = SCHED_POWER_SCALE; + rq->cpu_capacity = SCHED_POWER_SCALE; rq->post_schedule = 0; rq->active_balance = 0; rq->next_balance = jiffies; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 36bd4d23fca8..58684f684fa8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1017,7 +1017,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, static unsigned long weighted_cpuload(const int cpu); static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); -static unsigned long power_of(int cpu); +static unsigned long capacity_of(int cpu); static long effective_load(struct task_group *tg, int cpu, long wl, long wg); /* Cached statistics for all CPUs within a node */ @@ -1046,7 +1046,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid) ns->nr_running += rq->nr_running; ns->load += weighted_cpuload(cpu); - ns->compute_capacity += power_of(cpu); + ns->compute_capacity += capacity_of(cpu); cpus++; } @@ -1214,7 +1214,7 @@ balance: orig_dst_load = env->dst_stats.load; orig_src_load = env->src_stats.load; - /* XXX missing power terms */ + /* XXX missing capacity terms */ load = task_h_load(env->p); dst_load = orig_dst_load + load; src_load = orig_src_load - load; @@ -4043,9 +4043,9 @@ static unsigned long target_load(int cpu, int type) return max(rq->cpu_load[type-1], total); } -static unsigned long power_of(int cpu) +static unsigned long capacity_of(int cpu) { - return cpu_rq(cpu)->cpu_power; + return cpu_rq(cpu)->cpu_capacity; } static unsigned long cpu_avg_load_per_task(int cpu) @@ -4288,12 +4288,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) s64 this_eff_load, prev_eff_load; this_eff_load = 100; - this_eff_load *= power_of(prev_cpu); + this_eff_load *= capacity_of(prev_cpu); this_eff_load *= this_load + effective_load(tg, this_cpu, weight, weight); prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; - prev_eff_load *= power_of(this_cpu); + prev_eff_load *= capacity_of(this_cpu); prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); balanced = this_eff_load <= prev_eff_load; @@ -4950,14 +4950,14 @@ static bool yield_to_task_fair(struct rq *rq, 
struct task_struct *p, bool preemp * * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) * - * P_i is the cpu power (or compute capacity) of cpu i, typically it is the + * C_i is the compute capacity of cpu i, typically it is the * fraction of 'recent' time available for SCHED_OTHER task execution. But it * can also include other factors [XXX]. * * To achieve this balance we define a measure of imbalance which follows * directly from (1): * - * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4) + * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4) * * We them move tasks around to minimize the imbalance. In the continuous * function space it is obvious this converges, in the discrete case we get @@ -5607,17 +5607,17 @@ static inline int get_sd_load_idx(struct sched_domain *sd, return load_idx; } -static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) +static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu) { return SCHED_POWER_SCALE; } unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) { - return default_scale_freq_power(sd, cpu); + return default_scale_capacity(sd, cpu); } -static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) +static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu) { unsigned long weight = sd->span_weight; unsigned long smt_gain = sd->smt_gain; @@ -5629,10 +5629,10 @@ static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) { - return default_scale_smt_power(sd, cpu); + return default_scale_smt_capacity(sd, cpu); } -static unsigned long scale_rt_power(int cpu) +static unsigned long scale_rt_capacity(int cpu) { struct rq *rq = cpu_rq(cpu); u64 total, available, age_stamp, avg; @@ -5652,7 +5652,7 @@ static unsigned long scale_rt_power(int cpu) total = sched_avg_period() + delta; if (unlikely(total < avg)) { - /* Ensures that power won't end up being negative */ + /* Ensures that capacity won't end up being negative */ available = 0; } else { available = total - avg; @@ -5666,38 +5666,38 @@ static unsigned long scale_rt_power(int cpu) return div_u64(available, total); } -static void update_cpu_power(struct sched_domain *sd, int cpu) +static void update_cpu_capacity(struct sched_domain *sd, int cpu) { unsigned long weight = sd->span_weight; - unsigned long power = SCHED_POWER_SCALE; + unsigned long capacity = SCHED_POWER_SCALE; struct sched_group *sdg = sd->groups; if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { if (sched_feat(ARCH_POWER)) - power *= arch_scale_smt_power(sd, cpu); + capacity *= arch_scale_smt_power(sd, cpu); else - power *= default_scale_smt_power(sd, cpu); + capacity *= default_scale_smt_capacity(sd, cpu); - power >>= SCHED_POWER_SHIFT; + capacity >>= SCHED_POWER_SHIFT; } - sdg->sgc->capacity_orig = power; + sdg->sgc->capacity_orig = capacity; if (sched_feat(ARCH_POWER)) - power *= arch_scale_freq_power(sd, cpu); + capacity *= arch_scale_freq_power(sd, cpu); else - power *= default_scale_freq_power(sd, cpu); + capacity *= default_scale_capacity(sd, cpu); - power >>= SCHED_POWER_SHIFT; + capacity >>= SCHED_POWER_SHIFT; - power *= scale_rt_power(cpu); - power >>= SCHED_POWER_SHIFT; + capacity *= scale_rt_capacity(cpu); + capacity >>= SCHED_POWER_SHIFT; - if (!power) - power = 1; + if (!capacity) + capacity = 1; - cpu_rq(cpu)->cpu_power = power; - sdg->sgc->capacity = power; + cpu_rq(cpu)->cpu_capacity 
= capacity; + sdg->sgc->capacity = capacity; } void update_group_capacity(struct sched_domain *sd, int cpu) @@ -5712,7 +5712,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) sdg->sgc->next_update = jiffies + interval; if (!child) { - update_cpu_power(sd, cpu); + update_cpu_capacity(sd, cpu); return; } @@ -5733,8 +5733,8 @@ void update_group_capacity(struct sched_domain *sd, int cpu) * gets here before we've attached the domains to the * runqueues. * - * Use power_of(), which is set irrespective of domains - * in update_cpu_power(). + * Use capacity_of(), which is set irrespective of domains + * in update_cpu_capacity(). * * This avoids capacity/capacity_orig from being 0 and * causing divide-by-zero issues on boot. @@ -5742,8 +5742,8 @@ void update_group_capacity(struct sched_domain *sd, int cpu) * Runtime updates will correct capacity_orig. */ if (unlikely(!rq->sd)) { - capacity_orig += power_of(cpu); - capacity += power_of(cpu); + capacity_orig += capacity_of(cpu); + capacity += capacity_of(cpu); continue; } @@ -5831,7 +5831,7 @@ static inline int sg_imbalanced(struct sched_group *group) /* * Compute the group capacity factor. * - * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by + * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by * first dividing out the smt factor and computing the actual number of cores * and limit unit capacity with that. */ @@ -6129,7 +6129,7 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) /* * OK, we don't have enough imbalance to justify moving tasks, - * however we may be able to increase total CPU power used by + * however we may be able to increase total CPU capacity used by * moving them. */ @@ -6190,7 +6190,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /* * In the presence of smp nice balancing, certain scenarios can have * max load less than avg load(as we skip the groups at or below - * its cpu_power, while calculating max_load..) + * its cpu_capacity, while calculating max_load..) */ if (busiest->avg_load <= sds->avg_load || local->avg_load >= sds->avg_load) { @@ -6345,11 +6345,11 @@ static struct rq *find_busiest_queue(struct lb_env *env, struct sched_group *group) { struct rq *busiest = NULL, *rq; - unsigned long busiest_load = 0, busiest_power = 1; + unsigned long busiest_load = 0, busiest_capacity = 1; int i; for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { - unsigned long power, capacity_factor, wl; + unsigned long capacity, capacity_factor, wl; enum fbq_type rt; rq = cpu_rq(i); @@ -6377,8 +6377,8 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (rt > env->fbq_type) continue; - power = power_of(i); - capacity_factor = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); + capacity = capacity_of(i); + capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_POWER_SCALE); if (!capacity_factor) capacity_factor = fix_small_capacity(env->sd, group); @@ -6386,25 +6386,25 @@ static struct rq *find_busiest_queue(struct lb_env *env, /* * When comparing with imbalance, use weighted_cpuload() - * which is not scaled with the cpu power. + * which is not scaled with the cpu capacity. */ if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance) continue; /* * For the load comparisons with the other cpu's, consider - * the weighted_cpuload() scaled with the cpu power, so that - * the load can be moved away from the cpu that is potentially - * running at a lower capacity. 
+ * the weighted_cpuload() scaled with the cpu capacity, so + * that the load can be moved away from the cpu that is + * potentially running at a lower capacity. * - * Thus we're looking for max(wl_i / power_i), crosswise + * Thus we're looking for max(wl_i / capacity_i), crosswise * multiplication to rid ourselves of the division works out - * to: wl_i * power_j > wl_j * power_i; where j is our - * previous maximum. + * to: wl_i * capacity_j > wl_j * capacity_i; where j is + * our previous maximum. */ - if (wl * busiest_power > busiest_load * power) { + if (wl * busiest_capacity > busiest_load * capacity) { busiest_load = wl; - busiest_power = power; + busiest_capacity = capacity; busiest = rq; } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a5b957d53c92..956b8ca24893 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -567,7 +567,7 @@ struct rq { struct root_domain *rd; struct sched_domain *sd; - unsigned long cpu_power; + unsigned long cpu_capacity; unsigned char idle_balance; /* For active balancing */ -- cgit From ca8ce3d0b144c318a5a9ce99649053e9029061ea Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 26 May 2014 18:19:39 -0400 Subject: sched: Final power vs. capacity cleanups It is better not to think about compute capacity as being equivalent to "CPU power". The upcoming "power aware" scheduler work may create confusion with the notion of energy consumption if "power" is used too liberally. This contains the architecture visible changes. Incidentally, only ARM takes advantage of the available pow^H^H^Hcapacity scaling hooks and therefore those changes outside kernel/sched/ are confined to one ARM specific file. The default arch_scale_smt_power() hook is not overridden by anyone. Replacements are as follows: arch_scale_freq_power --> arch_scale_freq_capacity arch_scale_smt_power --> arch_scale_smt_capacity SCHED_POWER_SCALE --> SCHED_CAPACITY_SCALE SCHED_POWER_SHIFT --> SCHED_CAPACITY_SHIFT The local usage of "power" in arch/arm/kernel/topology.c is also changed to "capacity" as appropriate. Signed-off-by: Nicolas Pitre Signed-off-by: Peter Zijlstra Cc: Vincent Guittot Cc: Daniel Lezcano Cc: Morten Rasmussen Cc: "Rafael J. Wysocki" Cc: linaro-kernel@lists.linaro.org Cc: Arnd Bergmann Cc: Dietmar Eggemann Cc: Grant Likely Cc: Linus Torvalds Cc: Mark Brown Cc: Rob Herring Cc: Russell King Cc: Sudeep KarkadaNagesha Cc: Vincent Guittot Cc: devicetree@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/n/tip-48zba9qbznvglwelgq2cfygh@git.kernel.org Signed-off-by: Ingo Molnar --- arch/arm/kernel/topology.c | 54 +++++++++++++++++++++--------------------- include/linux/sched.h | 6 ++--- kernel/sched/core.c | 6 ++--- kernel/sched/fair.c | 59 +++++++++++++++++++++++----------------------- 4 files changed, 63 insertions(+), 62 deletions(-) (limited to 'kernel') diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index 71e1fec6d31a..d42a7db22236 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -26,30 +26,30 @@ #include /* - * cpu power scale management + * cpu capacity scale management */ /* - * cpu power table + * cpu capacity table * This per cpu data structure describes the relative capacity of each core. * On a heteregenous system, cores don't have the same computation capacity - * and we reflect that difference in the cpu_power field so the scheduler can - * take this difference into account during load balance. 
A per cpu structure - * is preferred because each CPU updates its own cpu_power field during the - * load balance except for idle cores. One idle core is selected to run the - * rebalance_domains for all idle cores and the cpu_power can be updated - * during this sequence. + * and we reflect that difference in the cpu_capacity field so the scheduler + * can take this difference into account during load balance. A per cpu + * structure is preferred because each CPU updates its own cpu_capacity field + * during the load balance except for idle cores. One idle core is selected + * to run the rebalance_domains for all idle cores and the cpu_capacity can be + * updated during this sequence. */ static DEFINE_PER_CPU(unsigned long, cpu_scale); -unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu) +unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) { return per_cpu(cpu_scale, cpu); } -static void set_power_scale(unsigned int cpu, unsigned long power) +static void set_capacity_scale(unsigned int cpu, unsigned long capacity) { - per_cpu(cpu_scale, cpu) = power; + per_cpu(cpu_scale, cpu) = capacity; } #ifdef CONFIG_OF @@ -62,11 +62,11 @@ struct cpu_efficiency { * Table of relative efficiency of each processors * The efficiency value must fit in 20bit and the final * cpu_scale value must be in the range - * 0 < cpu_scale < 3*SCHED_POWER_SCALE/2 + * 0 < cpu_scale < 3*SCHED_CAPACITY_SCALE/2 * in order to return at most 1 when DIV_ROUND_CLOSEST * is used to compute the capacity of a CPU. * Processors that are not defined in the table, - * use the default SCHED_POWER_SCALE value for cpu_scale. + * use the default SCHED_CAPACITY_SCALE value for cpu_scale. */ static const struct cpu_efficiency table_efficiency[] = { {"arm,cortex-a15", 3891}, @@ -83,9 +83,9 @@ static unsigned long middle_capacity = 1; * Iterate all CPUs' descriptor in DT and compute the efficiency * (as per table_efficiency). Also calculate a middle efficiency * as close as possible to (max{eff_i} - min{eff_i}) / 2 - * This is later used to scale the cpu_power field such that an - * 'average' CPU is of middle power. Also see the comments near - * table_efficiency[] and update_cpu_power(). + * This is later used to scale the cpu_capacity field such that an + * 'average' CPU is of middle capacity. Also see the comments near + * table_efficiency[] and update_cpu_capacity(). */ static void __init parse_dt_topology(void) { @@ -141,15 +141,15 @@ static void __init parse_dt_topology(void) * cpu_scale because all CPUs have the same capacity. Otherwise, we * compute a middle_capacity factor that will ensure that the capacity * of an 'average' CPU of the system will be as close as possible to - * SCHED_POWER_SCALE, which is the default value, but with the + * SCHED_CAPACITY_SCALE, which is the default value, but with the * constraint explained near table_efficiency[]. */ if (4*max_capacity < (3*(max_capacity + min_capacity))) middle_capacity = (min_capacity + max_capacity) - >> (SCHED_POWER_SHIFT+1); + >> (SCHED_CAPACITY_SHIFT+1); else middle_capacity = ((max_capacity / 3) - >> (SCHED_POWER_SHIFT-1)) + 1; + >> (SCHED_CAPACITY_SHIFT-1)) + 1; } @@ -158,20 +158,20 @@ static void __init parse_dt_topology(void) * boot. The update of all CPUs is in O(n^2) for heteregeneous system but the * function returns directly for SMP system. 
*/ -static void update_cpu_power(unsigned int cpu) +static void update_cpu_capacity(unsigned int cpu) { if (!cpu_capacity(cpu)) return; - set_power_scale(cpu, cpu_capacity(cpu) / middle_capacity); + set_capacity_scale(cpu, cpu_capacity(cpu) / middle_capacity); - printk(KERN_INFO "CPU%u: update cpu_power %lu\n", - cpu, arch_scale_freq_power(NULL, cpu)); + printk(KERN_INFO "CPU%u: update cpu_capacity %lu\n", + cpu, arch_scale_freq_capacity(NULL, cpu)); } #else static inline void parse_dt_topology(void) {} -static inline void update_cpu_power(unsigned int cpuid) {} +static inline void update_cpu_capacity(unsigned int cpuid) {} #endif /* @@ -267,7 +267,7 @@ void store_cpu_topology(unsigned int cpuid) update_siblings_masks(cpuid); - update_cpu_power(cpuid); + update_cpu_capacity(cpuid); printk(KERN_INFO "CPU%u: thread %d, cpu %d, socket %d, mpidr %x\n", cpuid, cpu_topology[cpuid].thread_id, @@ -297,7 +297,7 @@ void __init init_cpu_topology(void) { unsigned int cpu; - /* init core mask and power*/ + /* init core mask and capacity */ for_each_possible_cpu(cpu) { struct cputopo_arm *cpu_topo = &(cpu_topology[cpu]); @@ -307,7 +307,7 @@ void __init init_cpu_topology(void) cpumask_clear(&cpu_topo->core_sibling); cpumask_clear(&cpu_topo->thread_sibling); - set_power_scale(cpu, SCHED_POWER_SCALE); + set_capacity_scale(cpu, SCHED_CAPACITY_SCALE); } smp_wmb(); diff --git a/include/linux/sched.h b/include/linux/sched.h index a96f03598c61..322110affe63 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -854,10 +854,10 @@ enum cpu_idle_type { }; /* - * Increase resolution of cpu_power calculations + * Increase resolution of cpu_capacity calculations */ -#define SCHED_POWER_SHIFT 10 -#define SCHED_POWER_SCALE (1L << SCHED_POWER_SHIFT) +#define SCHED_CAPACITY_SHIFT 10 +#define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) /* * sched-domains (multiprocessor balancing) declarations: diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 07bc78a50329..7ba4f5413a10 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5249,7 +5249,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); printk(KERN_CONT " %s", str); - if (group->sgc->capacity != SCHED_POWER_SCALE) { + if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { printk(KERN_CONT " (cpu_capacity = %d)", group->sgc->capacity); } @@ -5715,7 +5715,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) * domains and no possible iteration will get us here, we won't * die on a /0 trap. 
*/ - sg->sgc->capacity = SCHED_POWER_SCALE * cpumask_weight(sg_span); + sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); sg->sgc->capacity_orig = sg->sgc->capacity; /* @@ -6921,7 +6921,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; - rq->cpu_capacity = SCHED_POWER_SCALE; + rq->cpu_capacity = SCHED_CAPACITY_SCALE; rq->post_schedule = 0; rq->active_balance = 0; rq->next_balance = jiffies; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 58684f684fa8..dc7d6527a282 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1062,9 +1062,9 @@ static void update_numa_stats(struct numa_stats *ns, int nid) if (!cpus) return; - ns->load = (ns->load * SCHED_POWER_SCALE) / ns->compute_capacity; + ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity; ns->task_capacity = - DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_POWER_SCALE); + DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); ns->has_free_capacity = (ns->nr_running < ns->task_capacity); } @@ -4370,7 +4370,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, } /* Adjust by relative CPU capacity of the group */ - avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgc->capacity; + avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity; if (local_group) { this_load = avg_load; @@ -5609,10 +5609,10 @@ static inline int get_sd_load_idx(struct sched_domain *sd, static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu) { - return SCHED_POWER_SCALE; + return SCHED_CAPACITY_SCALE; } -unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) +unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu) { return default_scale_capacity(sd, cpu); } @@ -5627,7 +5627,7 @@ static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu return smt_gain; } -unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) +unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu) { return default_scale_smt_capacity(sd, cpu); } @@ -5658,10 +5658,10 @@ static unsigned long scale_rt_capacity(int cpu) available = total - avg; } - if (unlikely((s64)total < SCHED_POWER_SCALE)) - total = SCHED_POWER_SCALE; + if (unlikely((s64)total < SCHED_CAPACITY_SCALE)) + total = SCHED_CAPACITY_SCALE; - total >>= SCHED_POWER_SHIFT; + total >>= SCHED_CAPACITY_SHIFT; return div_u64(available, total); } @@ -5669,29 +5669,29 @@ static unsigned long scale_rt_capacity(int cpu) static void update_cpu_capacity(struct sched_domain *sd, int cpu) { unsigned long weight = sd->span_weight; - unsigned long capacity = SCHED_POWER_SCALE; + unsigned long capacity = SCHED_CAPACITY_SCALE; struct sched_group *sdg = sd->groups; if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { if (sched_feat(ARCH_POWER)) - capacity *= arch_scale_smt_power(sd, cpu); + capacity *= arch_scale_smt_capacity(sd, cpu); else capacity *= default_scale_smt_capacity(sd, cpu); - capacity >>= SCHED_POWER_SHIFT; + capacity >>= SCHED_CAPACITY_SHIFT; } sdg->sgc->capacity_orig = capacity; if (sched_feat(ARCH_POWER)) - capacity *= arch_scale_freq_power(sd, cpu); + capacity *= arch_scale_freq_capacity(sd, cpu); else capacity *= default_scale_capacity(sd, cpu); - capacity >>= SCHED_POWER_SHIFT; + capacity >>= SCHED_CAPACITY_SHIFT; capacity *= scale_rt_capacity(cpu); - capacity >>= SCHED_POWER_SHIFT; + capacity >>= SCHED_CAPACITY_SHIFT; if (!capacity) capacity = 1; @@ -5780,7 +5780,7 @@ static inline int 
fix_small_capacity(struct sched_domain *sd, struct sched_group *group) { /* - * Only siblings can have significantly less than SCHED_POWER_SCALE + * Only siblings can have significantly less than SCHED_CAPACITY_SCALE */ if (!(sd->flags & SD_SHARE_CPUPOWER)) return 0; @@ -5845,11 +5845,11 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro cpus = group->group_weight; /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */ - smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, capacity_orig); + smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig); capacity_factor = cpus / smt; /* cores */ capacity_factor = min_t(unsigned, - capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_POWER_SCALE)); + capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE)); if (!capacity_factor) capacity_factor = fix_small_capacity(env->sd, group); @@ -5895,7 +5895,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, /* Adjust by relative CPU capacity of the group */ sgs->group_capacity = group->sgc->capacity; - sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_capacity; + sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; if (sgs->sum_nr_running) sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; @@ -6089,7 +6089,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) env->imbalance = DIV_ROUND_CLOSEST( sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity, - SCHED_POWER_SCALE); + SCHED_CAPACITY_SCALE); return 1; } @@ -6118,7 +6118,7 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) imbn = 1; scaled_busy_load_per_task = - (busiest->load_per_task * SCHED_POWER_SCALE) / + (busiest->load_per_task * SCHED_CAPACITY_SCALE) / busiest->group_capacity; if (busiest->avg_load + scaled_busy_load_per_task >= @@ -6137,7 +6137,7 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) min(busiest->load_per_task, busiest->avg_load); capa_now += local->group_capacity * min(local->load_per_task, local->avg_load); - capa_now /= SCHED_POWER_SCALE; + capa_now /= SCHED_CAPACITY_SCALE; /* Amount of load we'd subtract */ if (busiest->avg_load > scaled_busy_load_per_task) { @@ -6148,16 +6148,16 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) /* Amount of load we'd add */ if (busiest->avg_load * busiest->group_capacity < - busiest->load_per_task * SCHED_POWER_SCALE) { + busiest->load_per_task * SCHED_CAPACITY_SCALE) { tmp = (busiest->avg_load * busiest->group_capacity) / local->group_capacity; } else { - tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / + tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) / local->group_capacity; } capa_move += local->group_capacity * min(local->load_per_task, local->avg_load + tmp); - capa_move /= SCHED_POWER_SCALE; + capa_move /= SCHED_CAPACITY_SCALE; /* Move if we gain throughput */ if (capa_move > capa_now) @@ -6207,7 +6207,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s load_above_capacity = (busiest->sum_nr_running - busiest->group_capacity_factor); - load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); + load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE); load_above_capacity /= busiest->group_capacity; } @@ -6225,7 +6225,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s env->imbalance = min( max_pull * busiest->group_capacity, (sds->avg_load - local->avg_load) * local->group_capacity - 
) / SCHED_POWER_SCALE; + ) / SCHED_CAPACITY_SCALE; /* * if *imbalance is less than the average load per runnable task @@ -6279,7 +6279,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (!sds.busiest || busiest->sum_nr_running == 0) goto out_balanced; - sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_capacity; + sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load) + / sds.total_capacity; /* * If the busiest group is imbalanced the below checks don't @@ -6378,7 +6379,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, continue; capacity = capacity_of(i); - capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_POWER_SCALE); + capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE); if (!capacity_factor) capacity_factor = fix_small_capacity(env->sd, group); -- cgit From 5d4dfddd4f02b028d6ddaaa04d75d3b0cad1c9ae Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 27 May 2014 13:50:41 -0400 Subject: sched: Rename capacity related flags It is better not to think about compute capacity as being equivalent to "CPU power". The upcoming "power aware" scheduler work may create confusion with the notion of energy consumption if "power" is used too liberally. Let's rename the following feature flags since they do relate to capacity: SD_SHARE_CPUPOWER -> SD_SHARE_CPUCAPACITY ARCH_POWER -> ARCH_CAPACITY NONTASK_POWER -> NONTASK_CAPACITY Signed-off-by: Nicolas Pitre Signed-off-by: Peter Zijlstra Cc: Vincent Guittot Cc: Daniel Lezcano Cc: Morten Rasmussen Cc: "Rafael J. Wysocki" Cc: linaro-kernel@lists.linaro.org Cc: Andy Fleming Cc: Anton Blanchard Cc: Benjamin Herrenschmidt Cc: Grant Likely Cc: Linus Torvalds Cc: Michael Ellerman Cc: Paul Gortmaker Cc: Paul Mackerras Cc: Preeti U Murthy Cc: Rob Herring Cc: Srivatsa S. 
Bhat Cc: Toshi Kani Cc: Vasant Hegde Cc: Vincent Guittot Cc: devicetree@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/n/tip-e93lpnxb87owfievqatey6b5@git.kernel.org Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/smp.c | 2 +- include/linux/sched.h | 4 ++-- kernel/sched/core.c | 14 +++++++------- kernel/sched/fair.c | 8 ++++---- kernel/sched/features.h | 8 ++++---- 5 files changed, 18 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 10ffffef0414..c51d16379cba 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -770,7 +770,7 @@ int setup_profiling_timer(unsigned int multiplier) /* cpumask of CPUs with asymetric SMT dependancy */ static const int powerpc_smt_flags(void) { - int flags = SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES; + int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES; if (cpu_has_feature(CPU_FTR_ASYM_SMT)) { printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n"); diff --git a/include/linux/sched.h b/include/linux/sched.h index 322110affe63..ce93768a3312 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -869,7 +869,7 @@ enum cpu_idle_type { #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ -#define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */ +#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu power */ #define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */ #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ @@ -881,7 +881,7 @@ enum cpu_idle_type { #ifdef CONFIG_SCHED_SMT static inline const int cpu_smt_flags(void) { - return SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES; + return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES; } #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7ba4f5413a10..5976ca579d3e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -872,7 +872,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) rq->clock_task += delta; #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) - if ((irq_delta + steal) && sched_feat(NONTASK_POWER)) + if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) sched_rt_avg_update(rq, irq_delta + steal); #endif } @@ -5309,7 +5309,7 @@ static int sd_degenerate(struct sched_domain *sd) SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | SD_BALANCE_EXEC | - SD_SHARE_CPUPOWER | + SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN)) { if (sd->groups != sd->groups->next) @@ -5340,7 +5340,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | SD_BALANCE_EXEC | - SD_SHARE_CPUPOWER | + SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_PREFER_SIBLING | SD_SHARE_POWERDOMAIN); @@ -5947,7 +5947,7 @@ static int sched_domains_curr_level; /* * SD_flags allowed in topology descriptions. 
* - * SD_SHARE_CPUPOWER - describes SMT topologies + * SD_SHARE_CPUCAPACITY - describes SMT topologies * SD_SHARE_PKG_RESOURCES - describes shared caches * SD_NUMA - describes NUMA topologies * SD_SHARE_POWERDOMAIN - describes shared power domain @@ -5956,7 +5956,7 @@ static int sched_domains_curr_level; * SD_ASYM_PACKING - describes SMT quirks */ #define TOPOLOGY_SD_FLAGS \ - (SD_SHARE_CPUPOWER | \ + (SD_SHARE_CPUCAPACITY | \ SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ SD_ASYM_PACKING | \ @@ -6002,7 +6002,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) | 1*SD_BALANCE_FORK | 0*SD_BALANCE_WAKE | 1*SD_WAKE_AFFINE - | 0*SD_SHARE_CPUPOWER + | 0*SD_SHARE_CPUCAPACITY | 0*SD_SHARE_PKG_RESOURCES | 0*SD_SERIALIZE | 0*SD_PREFER_SIBLING @@ -6024,7 +6024,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) * Convert topological properties into behaviour. */ - if (sd->flags & SD_SHARE_CPUPOWER) { + if (sd->flags & SD_SHARE_CPUCAPACITY) { sd->imbalance_pct = 110; sd->smt_gain = 1178; /* ~15% */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dc7d6527a282..d3c731222199 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5672,8 +5672,8 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) unsigned long capacity = SCHED_CAPACITY_SCALE; struct sched_group *sdg = sd->groups; - if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { - if (sched_feat(ARCH_POWER)) + if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) { + if (sched_feat(ARCH_CAPACITY)) capacity *= arch_scale_smt_capacity(sd, cpu); else capacity *= default_scale_smt_capacity(sd, cpu); @@ -5683,7 +5683,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) sdg->sgc->capacity_orig = capacity; - if (sched_feat(ARCH_POWER)) + if (sched_feat(ARCH_CAPACITY)) capacity *= arch_scale_freq_capacity(sd, cpu); else capacity *= default_scale_capacity(sd, cpu); @@ -5782,7 +5782,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) /* * Only siblings can have significantly less than SCHED_CAPACITY_SCALE */ - if (!(sd->flags & SD_SHARE_CPUPOWER)) + if (!(sd->flags & SD_SHARE_CPUCAPACITY)) return 0; /* diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 5716929a2e3a..90284d117fe6 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -37,18 +37,18 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true) SCHED_FEAT(WAKEUP_PREEMPTION, true) /* - * Use arch dependent cpu power functions + * Use arch dependent cpu capacity functions */ -SCHED_FEAT(ARCH_POWER, true) +SCHED_FEAT(ARCH_CAPACITY, true) SCHED_FEAT(HRTICK, false) SCHED_FEAT(DOUBLE_TICK, false) SCHED_FEAT(LB_BIAS, true) /* - * Decrement CPU power based on time not spent running tasks + * Decrement CPU capacity based on time not spent running tasks */ -SCHED_FEAT(NONTASK_POWER, true) +SCHED_FEAT(NONTASK_CAPACITY, true) /* * Queue remote wakeups on the target CPU and process them -- cgit From 0b07939cbfdd05bed0c5ec01b8b25493e6ecd34c Mon Sep 17 00:00:00 2001 From: Giedrius Rekasius Date: Sun, 25 May 2014 15:23:31 +0100 Subject: sched: Remove redundant assignment to "rt_rq" in update_curr_rt(...) Variable "rt_rq" is used only in block "for_each_sched_rt_entity" so the value assigned to it at the beginning of the update_curr_rt(...) gets overwritten without ever being read. Remove redundant assignment and move variable declaration to the block in which it is being used. 
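A minimal userspace sketch of the dead-store pattern being removed (rt_rq_stub, sum_runtime and the field names are invented for illustration, not kernel code): the cursor variable is declared in the loop that actually uses it, so there is no assignment at function entry that every path overwrites before reading.

#include <stdio.h>

struct rt_rq_stub {
	int runtime;
	struct rt_rq_stub *parent;
};

static int sum_runtime(struct rt_rq_stub *leaf)
{
	int sum = 0;

	/* Same shape as the patched loop: declaring the cursor here,
	 * rather than assigning it once at the top of the function,
	 * removes the dead store and keeps the variable out of the
	 * enclosing scope. */
	for (struct rt_rq_stub *rt_rq = leaf; rt_rq; rt_rq = rt_rq->parent)
		sum += rt_rq->runtime;

	return sum;
}

int main(void)
{
	struct rt_rq_stub root = { .runtime = 5, .parent = NULL };
	struct rt_rq_stub leaf = { .runtime = 3, .parent = &root };

	printf("%d\n", sum_runtime(&leaf));	/* prints 8 */
	return 0;
}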
Signed-off-by: Giedrius Rekasius Signed-off-by: Peter Zijlstra Cc: kernel-janitors@vger.kernel.org Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1401027811-30066-1-git-send-email-giedrius.rekasius@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 0ebfd7a29472..43406db306af 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -924,7 +924,6 @@ static void update_curr_rt(struct rq *rq) { struct task_struct *curr = rq->curr; struct sched_rt_entity *rt_se = &curr->rt; - struct rt_rq *rt_rq = rt_rq_of_se(rt_se); u64 delta_exec; if (curr->sched_class != &rt_sched_class) @@ -949,7 +948,7 @@ static void update_curr_rt(struct rq *rq) return; for_each_sched_rt_entity(rt_se) { - rt_rq = rt_rq_of_se(rt_se); + struct rt_rq *rt_rq = rt_rq_of_se(rt_se); if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { raw_spin_lock(&rt_rq->rt_runtime_lock); -- cgit From dfc68f29ae67f2a6e799b44e6a4eb3417dffbfcd Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 4 Jun 2014 10:31:15 -0700 Subject: sched, trace: Add a tracepoint for IPI-less remote wakeups Remote wakeups of polling CPUs are a valuable performance improvement; add a tracepoint to make it much easier to verify that they're working. Signed-off-by: Andy Lutomirski Signed-off-by: Peter Zijlstra Cc: nicolas.pitre@linaro.org Cc: daniel.lezcano@linaro.org Cc: umgwanakikbuti@gmail.com Cc: David Ahern Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Mel Gorman Cc: Oleg Nesterov Cc: Steven Rostedt Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/16205aee116772aa686814f9b13bccb562108047.1401902905.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- include/trace/events/sched.h | 20 ++++++++++++++++++++ kernel/sched/core.c | 4 ++++ 2 files changed, 24 insertions(+) (limited to 'kernel') diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 67e1bbf83695..0a68d5ae584e 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -530,6 +530,26 @@ TRACE_EVENT(sched_swap_numa, __entry->dst_pid, __entry->dst_tgid, __entry->dst_ngid, __entry->dst_cpu, __entry->dst_nid) ); + +/* + * Tracepoint for waking a polling cpu without an IPI. 
+ */ +TRACE_EVENT(sched_wake_idle_without_ipi, + + TP_PROTO(int cpu), + + TP_ARGS(cpu), + + TP_STRUCT__entry( + __field( int, cpu ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + ), + + TP_printk("cpu=%d", __entry->cpu) +); #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5976ca579d3e..e4c0ddd3db8e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -564,6 +564,8 @@ void resched_task(struct task_struct *p) if (set_nr_and_not_polling(p)) smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); } void resched_cpu(int cpu) @@ -647,6 +649,8 @@ static void wake_up_idle_cpu(int cpu) smp_mb(); if (!tsk_is_polling(rq->idle)) smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); } static bool wake_up_full_nohz_cpu(int cpu) -- cgit From 82c65d60d64401aedc1006d6572469bbfdf148de Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 4 Jun 2014 10:31:16 -0700 Subject: sched/idle: Clear polling before descheduling the idle thread Currently, the only real guarantee provided by the polling bit is that, if you hold rq->lock and the polling bit is set, then you can set need_resched to force a reschedule. The only reason the lock is needed is that the idle thread might not be running at all when setting its need_resched bit, and rq->lock keeps it pinned. This is easy to fix: just clear the polling bit before scheduling. Now the idle thread's polling bit is only ever set when rq->curr == rq->idle. Signed-off-by: Andy Lutomirski Signed-off-by: Peter Zijlstra Cc: nicolas.pitre@linaro.org Cc: daniel.lezcano@linaro.org Cc: umgwanakikbuti@gmail.com Cc: Rafael J. Wysocki Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/b2059fcb4c613d520cb503b6fad6e47033c7c203.1401902905.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- kernel/sched/idle.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 25b9423abce9..fe4b24bf33ca 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -67,6 +67,10 @@ void __weak arch_cpu_idle(void) * cpuidle_idle_call - the main idle function * * NOTE: no locks or semaphores should be used here + * + * On archs that support TIF_POLLING_NRFLAG, is called with polling + * set, and it returns with polling set. If it ever stops polling, it + * must clear the polling bit. */ static void cpuidle_idle_call(void) { @@ -175,10 +179,22 @@ exit_idle: /* * Generic idle loop implementation + * + * Called with polling cleared. */ static void cpu_idle_loop(void) { while (1) { + /* + * If the arch has a polling bit, we maintain an invariant: + * + * Our polling bit is clear if we're not scheduled (i.e. if + * rq->curr != rq->idle). This means that, if rq->idle has + * the polling bit set, then setting need_resched is + * guaranteed to cause the cpu to reschedule. + */ + + __current_set_polling(); tick_nohz_idle_enter(); while (!need_resched()) { @@ -218,6 +234,15 @@ static void cpu_idle_loop(void) */ preempt_set_need_resched(); tick_nohz_idle_exit(); + __current_clr_polling(); + + /* + * We promise to reschedule if need_resched is set while + * polling is set. That means that clearing polling + * needs to be visible before rescheduling. 
+ */ + smp_mb__after_atomic(); + schedule_preempt_disabled(); } } @@ -239,7 +264,6 @@ void cpu_startup_entry(enum cpuhp_state state) */ boot_init_stack_canary(); #endif - __current_set_polling(); arch_cpu_idle_prepare(); cpu_idle_loop(); } -- cgit From 67b9ca70c3030e832999e8d1cdba2984c7bb5bfc Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 4 Jun 2014 10:31:17 -0700 Subject: sched/idle: Simplify wake_up_idle_cpu() Now that rq->idle's polling bit is a reliable indication that the cpu is polling, use it. Signed-off-by: Andy Lutomirski Signed-off-by: Peter Zijlstra Cc: Rafael J. Wysocki Cc: nicolas.pitre@linaro.org Cc: daniel.lezcano@linaro.org Cc: umgwanakikbuti@gmail.com Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/922f00761445a830ebb23d058e2ae53956ce2d73.1401902905.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e4c0ddd3db8e..6afbfeefddd6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -628,26 +628,7 @@ static void wake_up_idle_cpu(int cpu) if (cpu == smp_processor_id()) return; - /* - * This is safe, as this function is called with the timer - * wheel base lock of (cpu) held. When the CPU is on the way - * to idle and has not yet set rq->curr to idle then it will - * be serialized on the timer wheel base lock and take the new - * timer into account automatically. - */ - if (rq->curr != rq->idle) - return; - - /* - * We can set TIF_RESCHED on the idle task of the other CPU - * lockless. The worst case is that the other CPU runs the - * idle task through an additional NOOP schedule() - */ - set_tsk_need_resched(rq->idle); - - /* NEED_RESCHED must be visible before we test polling */ - smp_mb(); - if (!tsk_is_polling(rq->idle)) + if (set_nr_and_not_polling(rq->idle)) smp_send_reschedule(cpu); else trace_sched_wake_idle_without_ipi(cpu); -- cgit From e3baac47f0e82c4be632f4f97215bb93bf16b342 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 4 Jun 2014 10:31:18 -0700 Subject: sched/idle: Optimize try-to-wake-up IPI [ This series reduces the number of IPIs on Andy's workload by something like 99%. It's down from many hundreds per second to very few. The basic idea behind this series is to make TIF_POLLING_NRFLAG be a reliable indication that the idle task is polling. Once that's done, the rest is reasonably straightforward. ] When enqueueing tasks on remote LLC domains, we send an IPI to do the work 'locally' and avoid bouncing all the cachelines over. However, when the remote CPU is idle (and polling, say x86 mwait), we don't need to send an IPI, we can simply kick the TIF word to wake it up and have the 'idle' loop do the work. So when _TIF_POLLING_NRFLAG is set, but _TIF_NEED_RESCHED is not (yet) set, set _TIF_NEED_RESCHED and avoid sending the IPI. Much-requested-by: Andy Lutomirski Signed-off-by: Peter Zijlstra [Edited by Andy Lutomirski, but this is mostly Peter Zijlstra's code.] Signed-off-by: Andy Lutomirski Cc: nicolas.pitre@linaro.org Cc: daniel.lezcano@linaro.org Cc: Mike Galbraith Cc: umgwanakikbuti@gmail.com Cc: Rafael J. 
Wysocki Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/ce06f8b02e7e337be63e97597fc4b248d3aa6f9b.1401902905.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++------ kernel/sched/idle.c | 10 +++++++--- kernel/sched/sched.h | 6 ++++++ 3 files changed, 61 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6afbfeefddd6..60d4e05d64dd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -519,7 +519,7 @@ static inline void init_hrtick(void) __old; \ }) -#ifdef TIF_POLLING_NRFLAG +#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) /* * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, * this avoids any races wrt polling state changes and thereby avoids @@ -530,12 +530,44 @@ static bool set_nr_and_not_polling(struct task_struct *p) struct thread_info *ti = task_thread_info(p); return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); } + +/* + * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. + * + * If this returns true, then the idle task promises to call + * sched_ttwu_pending() and reschedule soon. + */ +static bool set_nr_if_polling(struct task_struct *p) +{ + struct thread_info *ti = task_thread_info(p); + typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags); + + for (;;) { + if (!(val & _TIF_POLLING_NRFLAG)) + return false; + if (val & _TIF_NEED_RESCHED) + return true; + old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); + if (old == val) + break; + val = old; + } + return true; +} + #else static bool set_nr_and_not_polling(struct task_struct *p) { set_tsk_need_resched(p); return true; } + +#ifdef CONFIG_SMP +static bool set_nr_if_polling(struct task_struct *p) +{ + return false; +} +#endif #endif /* @@ -1490,13 +1522,17 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) } #ifdef CONFIG_SMP -static void sched_ttwu_pending(void) +void sched_ttwu_pending(void) { struct rq *rq = this_rq(); struct llist_node *llist = llist_del_all(&rq->wake_list); struct task_struct *p; + unsigned long flags; - raw_spin_lock(&rq->lock); + if (!llist) + return; + + raw_spin_lock_irqsave(&rq->lock, flags); while (llist) { p = llist_entry(llist, struct task_struct, wake_entry); @@ -1504,7 +1540,7 @@ static void sched_ttwu_pending(void) ttwu_do_activate(rq, p, 0); } - raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&rq->lock, flags); } void scheduler_ipi(void) @@ -1550,8 +1586,14 @@ void scheduler_ipi(void) static void ttwu_queue_remote(struct task_struct *p, int cpu) { - if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) - smp_send_reschedule(cpu); + struct rq *rq = cpu_rq(cpu); + + if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { + if (!set_nr_if_polling(rq->idle)) + smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); + } } bool cpus_share_cache(int this_cpu, int that_cpu) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index fe4b24bf33ca..cf009fb0bc25 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -12,6 +12,8 @@ #include +#include "sched.h" + static int __read_mostly cpu_idle_force_poll; void cpu_idle_poll_ctrl(bool enable) @@ -237,12 +239,14 @@ static void cpu_idle_loop(void) __current_clr_polling(); /* - * We promise to reschedule if need_resched is set while - * polling is set. That means that clearing polling - * needs to be visible before rescheduling. 
+ * We promise to call sched_ttwu_pending and reschedule + * if need_resched is set while polling is set. That + * means that clearing polling needs to be visible + * before doing these things. */ smp_mb__after_atomic(); + sched_ttwu_pending(); schedule_preempt_disabled(); } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 956b8ca24893..2f8636199b83 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -670,6 +670,8 @@ extern int migrate_swap(struct task_struct *, struct task_struct *); #ifdef CONFIG_SMP +extern void sched_ttwu_pending(void); + #define rcu_dereference_check_sched_domain(p) \ rcu_dereference_check((p), \ lockdep_is_held(&sched_domains_mutex)) @@ -787,6 +789,10 @@ static inline unsigned int group_first_cpu(struct sched_group *group) extern int group_balance_cpu(struct sched_group *sg); +#else + +static inline void sched_ttwu_pending(void) { } + #endif /* CONFIG_SMP */ #include "stats.h" -- cgit From ebf905fc7a6e7c99c53b5afc888d8f950da90aff Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 29 May 2014 19:00:24 +0200 Subject: perf: Fix use after free in perf_remove_from_context() While that mutex should guard the elements, it doesn't guard against the use-after-free that's from list_for_each_entry_rcu(). __perf_event_exit_task() can actually free the event. And because list addition/deletion is guarded by both ctx->mutex and ctx->lock, holding ctx->mutex is sufficient for reading the list, so we don't actually need the rcu list iteration. Fixes: 3a497f48637e ("perf: Simplify perf_event_exit_task_context()") Reported-by: Sasha Levin Tested-by: Sasha Levin Signed-off-by: Peter Zijlstra Cc: Dave Jones Cc: acme@ghostprotocols.net Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140529170024.GA2315@laptop.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index ed50b0943213..a62d142ad498 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7431,7 +7431,7 @@ __perf_event_exit_task(struct perf_event *child_event, static void perf_event_exit_task_context(struct task_struct *child, int ctxn) { - struct perf_event *child_event; + struct perf_event *child_event, *next; struct perf_event_context *child_ctx; unsigned long flags; @@ -7485,7 +7485,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) */ mutex_lock(&child_ctx->mutex); - list_for_each_entry_rcu(child_event, &child_ctx->event_list, event_entry) + list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry) __perf_event_exit_task(child_event, child_ctx, child); mutex_unlock(&child_ctx->mutex); -- cgit From 53b25335dd60981ad608da7890420898a34469a6 Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Fri, 16 May 2014 17:12:12 -0400 Subject: perf: Disable sampled events if no PMU interrupt Add common code to generate -ENOTSUPP at event creation time if an architecture attempts to create a sampled event and PERF_PMU_NO_INTERRUPT is set. This adds a new pmu->capabilities flag. Initially we only support PERF_PMU_NO_INTERRUPT (to indicate a PMU has no support for generating hardware interrupts) but there are other capabilities that can be added later. 
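As a usage sketch, a driver for a counter block that cannot raise an overflow interrupt would advertise the flag once in its struct pmu; every mydrv_* symbol below is hypothetical, and only the .capabilities line reflects what this patch introduces:

#include <linux/errno.h>
#include <linux/perf_event.h>

/* Hypothetical stubs so the sketch is self-contained; a real driver
 * validates the event and programs hardware in these callbacks. */
static int mydrv_event_init(struct perf_event *event)
{
	if (event->attr.type != event->pmu->type)
		return -ENOENT;		/* not one of ours */
	return 0;
}
static int  mydrv_add(struct perf_event *event, int flags)	{ return 0; }
static void mydrv_del(struct perf_event *event, int flags)	{ }
static void mydrv_start(struct perf_event *event, int flags)	{ }
static void mydrv_stop(struct perf_event *event, int flags)	{ }
static void mydrv_read(struct perf_event *event)		{ }

static struct pmu mydrv_pmu = {
	/* No overflow interrupt: the core now rejects any sampling
	 * event (attr.sample_period != 0) on this PMU with -ENOTSUPP
	 * at perf_event_open() time. */
	.capabilities	= PERF_PMU_CAP_NO_INTERRUPT,
	.event_init	= mydrv_event_init,
	.add		= mydrv_add,
	.del		= mydrv_del,
	.start		= mydrv_start,
	.stop		= mydrv_stop,
	.read		= mydrv_read,
};

Registered as usual with perf_pmu_register(&mydrv_pmu, "mydrv", -1), counting reads keep working unchanged; only sampled use is refused up front instead of silently never firing.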
Signed-off-by: Vince Weaver Acked-by: Will Deacon [peterz: rename to PERF_PMU_CAP_* and moved the pmu::capabilities word into a hole] Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1405161708060.11099@vincent-weaver-1.umelst.maine.edu Signed-off-by: Ingo Molnar Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 10 ++++++++++ kernel/events/core.c | 7 +++++++ 2 files changed, 17 insertions(+) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index af6dcf1d9e47..267c8f37012c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -166,6 +166,11 @@ struct perf_event; */ #define PERF_EVENT_TXN 0x1 +/** + * pmu::capabilities flags + */ +#define PERF_PMU_CAP_NO_INTERRUPT 0x01 + /** * struct pmu - generic performance monitoring unit */ @@ -178,6 +183,11 @@ struct pmu { const char *name; int type; + /* + * various common per-pmu feature flags + */ + int capabilities; + int * __percpu pmu_disable_count; struct perf_cpu_context * __percpu pmu_cpu_context; int task_ctx_nr; diff --git a/kernel/events/core.c b/kernel/events/core.c index a62d142ad498..e9ef0c6646af 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7120,6 +7120,13 @@ SYSCALL_DEFINE5(perf_event_open, } } + if (is_sampling_event(event)) { + if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { + err = -ENOTSUPP; + goto err_alloc; + } + } + account_event(event); /* -- cgit From 41ccba029e9492dd0fc1bf5c23b72c6322a6dfe9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 19 May 2014 20:40:54 +0200 Subject: uprobes: Shift ->readpage check from __copy_insn() to uprobe_register() copy_insn() fails with -EIO if ->readpage == NULL, but this error is not propagated unless uprobe_register() path finds ->mm which already mmaps this file. In this case (say) "perf record" does not actually install the probe, but the user can't know about this. Move this check into uprobe_register() so that this problem can be detected earlier and reported to user. Note: this is still not perfect, - copy_insn() and arch_uprobe_analyze_insn() should be called by uprobe_register() but this is not simple, we need vm_file for read_mapping_page() (although perhaps we can pass NULL), and we need ->mm for is_64bit_mm() (although this logic is broken anyway). - uprobe_register() should be called by create_trace_uprobe(), not by probe_event_enable(), so that an error can be detected at "perf probe -x" time. This also needs more changes in the core uprobe code, uprobe register/unregister interface was poorly designed from the very beginning. Reported-by: Denys Vlasenko Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra Cc: Hugh Dickins Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140519184054.GA6750@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 3b02c72938a8..c56b13e3b5e1 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -536,9 +536,6 @@ static int __copy_insn(struct address_space *mapping, struct file *filp, void *insn, int nbytes, loff_t offset) { struct page *page; - - if (!mapping->a_ops->readpage) - return -EIO; /* * Ensure that the page that has the original instruction is * populated and in page-cache. 
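From the consumer side, a hedged sketch of what the uprobe_register() hunk below buys; probe_text, my_consumer and my_handler are invented names, while the uprobe_register() and handler signatures are the kernel's:

#include <linux/fs.h>
#include <linux/printk.h>
#include <linux/uprobes.h>

static int my_handler(struct uprobe_consumer *self, struct pt_regs *regs)
{
	return 0;	/* keep the probe installed */
}

static struct uprobe_consumer my_consumer = {
	.handler = my_handler,
};

static int probe_text(struct inode *inode, loff_t offset)
{
	int err = uprobe_register(inode, offset, &my_consumer);

	/* Previously an inode whose a_ops lacked ->readpage could
	 * "register" successfully and only fail later in copy_insn(),
	 * invisibly to the user; now -EIO is reported right here. */
	if (err)
		pr_warn("uprobe_register: %d\n", err);
	return err;
}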
@@ -879,6 +876,9 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * if (!uc->handler && !uc->ret_handler) return -EINVAL; + /* copy_insn()->read_mapping_page() needs ->readpage() */ + if (!inode->i_mapping->a_ops->readpage) + return -EIO; /* Racy, just to catch the obvious mistakes */ if (offset > i_size_read(inode)) return -EINVAL; -- cgit From 40814f6805a524130a5ec313871147f7aa9b5318 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 19 May 2014 20:41:36 +0200 Subject: uprobes: Teach copy_insn() to support tmpfs tmpfs is widely used but as Denys reports shmem_aops doesn't have ->readpage() and thus you can't probe a binary on this filesystem. As Hugh suggested we can use shmem_read_mapping_page() in this case, just we need to check shmem_mapping() if ->readpage == NULL. Reported-by: Denys Vlasenko Suggested-by: Hugh Dickins Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140519184136.GB6750@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c56b13e3b5e1..6bfb6717f878 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -36,6 +36,7 @@ #include "../../mm/internal.h" /* munlock_vma_page */ #include #include +#include #include @@ -537,10 +538,14 @@ static int __copy_insn(struct address_space *mapping, struct file *filp, { struct page *page; /* - * Ensure that the page that has the original instruction is - * populated and in page-cache. + * Ensure that the page that has the original instruction is populated + * and in page-cache. If ->readpage == NULL it must be shmem_mapping(), + * see uprobe_register(). */ - page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp); + if (mapping->a_ops->readpage) + page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp); + else + page = shmem_read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT); if (IS_ERR(page)) return PTR_ERR(page); @@ -876,8 +881,8 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * if (!uc->handler && !uc->ret_handler) return -EINVAL; - /* copy_insn()->read_mapping_page() needs ->readpage() */ - if (!inode->i_mapping->a_ops->readpage) + /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */ + if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping)) return -EIO; /* Racy, just to catch the obvious mistakes */ if (offset > i_size_read(inode)) -- cgit From f602d0632755be4f7eddfdd6c0af13216790177b Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Tue, 13 May 2014 16:40:05 -0400 Subject: sched/deadline: Delete extraneous extern for to_ratio() There was a prototype for it added to kernel/sched/sched.h at the same time the extern was added, so the extern in the C file was never really ever needed. See commit 332ac17ef5bfcff4766dfdfd3b4cdf10b8f8f155 ("sched/deadline: Add bandwidth management for SCHED_DEADLINE tasks") for details. 
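The cost of such a duplicate is mostly maintenance risk, sketched below with illustrative file names (ratio.h/ratio.c): if the .c file ever stops including the header, the stale local extern still compiles on its own, and the two declarations can drift apart unnoticed until link time.

/* ratio.h -- the single prototype all users should see */
#include <linux/types.h>

unsigned long to_ratio(u64 period, u64 runtime);

/* ratio.c */
#include "ratio.h"

/* Redundant: the #include above already declares to_ratio(), so this
 * local extern adds nothing, which is exactly what the hunk below
 * deletes from kernel/sched/deadline.c. */
extern unsigned long to_ratio(u64 period, u64 runtime);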
Signed-off-by: Paul Gortmaker Cc: Peter Zijlstra Cc: Dario Faggioli Link: http://lkml.kernel.org/r/1400013605-18717-1-git-send-email-paul.gortmaker@windriver.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f9ca7d19781a..0d6b17057188 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -57,8 +57,6 @@ void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) dl_b->dl_runtime = runtime; } -extern unsigned long to_ratio(u64 period, u64 runtime); - void init_dl_bw(struct dl_bw *dl_b) { raw_spin_lock_init(&dl_b->lock); -- cgit From c731ae1d0f02a300697a8b1564780ad28a6c2013 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 5 Jun 2014 17:16:30 +0800 Subject: cgroup: disallow disabled controllers on the default hierarchy After booting with cgroup_disable=memory, I still saw memcg files in the default hierarchy, and I can write to them, though it won't take effect. # dmesg ... Disabling memory control group subsystem ... # mount -t cgroup -o __DEVEL__sane_behavior xxx /cgroup # ls /cgroup ... memory.failcnt memory.move_charge_at_immigrate memory.force_empty memory.numa_stat memory.limit_in_bytes memory.oom_control ... # cat /cgroup/memory.usage_in_bytes 0 tj: Minor comment update. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3f46165829a4..3edcc8ae83b5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3069,6 +3069,9 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { int ret; + if (ss->disabled) + return 0; + if (!cfts || cfts[0].name[0] == '\0') return 0; @@ -4678,8 +4681,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) BUG_ON(online_css(css)); - cgrp_dfl_root.subsys_mask |= 1 << ss->id; - mutex_unlock(&cgroup_mutex); } @@ -4758,11 +4759,14 @@ int __init cgroup_init(void) &cgrp_dfl_root.cgrp.e_csets[ssid]); /* - * cftype registration needs kmalloc and can't be done - * during early_init. Register base cftypes separately. + * Setting dfl_root subsys_mask needs to consider the + * disabled flag and cftype registration needs kmalloc, + * both of which aren't available during early_init. */ - if (ss->base_cftypes) + if (!ss->disabled) { + cgrp_dfl_root.subsys_mask |= 1 << ss->id; WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); + } } cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); -- cgit From 939c7a4f04fcd2162109744e8bf88194948a6e65 Mon Sep 17 00:00:00 2001 From: Yoshihiro YUNOMAE Date: Thu, 5 Jun 2014 10:24:27 +0900 Subject: tracing: Introduce saved_cmdlines_size file Introduce saved_cmdlines_size file for changing the number of saved pid-comms. saved_cmdlines currently stores 128 command names using SAVED_CMDLINES, but 'no-existing processes' names are often lost in saved_cmdlines when we read the trace data. So, by introducing saved_cmdlines_size file, we can now change the 128 command names saved to something much larger if needed. When we write a value to saved_cmdlines_size, the number of the value will be stored in pid-comm list: # echo 1024 > /sys/kernel/debug/tracing/saved_cmdlines_size Here, 1024 command names can be stored. The default number is 128 and the maximum number is PID_MAX_DEFAULT (=32768 if CONFIG_BASE_SMALL is not set). 
So, if we want to avoid losing any command names, we need to set 32768 to saved_cmdlines_size. We can read the maximum number of the list: # cat /sys/kernel/debug/tracing/saved_cmdlines_size 128 Link: http://lkml.kernel.org/p/20140605012427.22115.16173.stgit@yunodevel Signed-off-by: Yoshihiro YUNOMAE Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 178 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 154 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 135af323608b..e29edee1542a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1285,22 +1285,71 @@ void tracing_reset_all_online_cpus(void) } } -#define SAVED_CMDLINES 128 +#define SAVED_CMDLINES_DEFAULT 128 #define NO_CMDLINE_MAP UINT_MAX -static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; -static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; -static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; -static int cmdline_idx; static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; +struct saved_cmdlines_buffer { + unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; + unsigned *map_cmdline_to_pid; + unsigned cmdline_num; + int cmdline_idx; + char *saved_cmdlines; +}; +static struct saved_cmdlines_buffer *savedcmd; /* temporary disable recording */ static atomic_t trace_record_cmdline_disabled __read_mostly; -static void trace_init_cmdlines(void) +static inline char *get_saved_cmdlines(int idx) +{ + return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN]; +} + +static inline void set_cmdline(int idx, const char *cmdline) { - memset(&map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(map_pid_to_cmdline)); - memset(&map_cmdline_to_pid, NO_CMDLINE_MAP, sizeof(map_cmdline_to_pid)); - cmdline_idx = 0; + memcpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); +} + +static int allocate_cmdlines_buffer(unsigned int val, + struct saved_cmdlines_buffer *s) +{ + s->map_cmdline_to_pid = kmalloc(val * sizeof(*s->map_cmdline_to_pid), + GFP_KERNEL); + if (!s->map_cmdline_to_pid) + return -ENOMEM; + + s->saved_cmdlines = kmalloc(val * TASK_COMM_LEN, GFP_KERNEL); + if (!s->saved_cmdlines) { + kfree(s->map_cmdline_to_pid); + return -ENOMEM; + } + + s->cmdline_idx = 0; + s->cmdline_num = val; + memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP, + sizeof(s->map_pid_to_cmdline)); + memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP, + val * sizeof(*s->map_cmdline_to_pid)); + + return 0; +} + +static int trace_create_savedcmd(void) +{ + int ret; + + savedcmd = kmalloc(sizeof(struct saved_cmdlines_buffer), GFP_KERNEL); + if (!savedcmd) + return -ENOMEM; + + ret = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT, savedcmd); + if (ret < 0) { + kfree(savedcmd); + savedcmd = NULL; + return -ENOMEM; + } + + return 0; } int is_tracing_stopped(void) @@ -1457,9 +1506,9 @@ static int trace_save_cmdline(struct task_struct *tsk) if (!arch_spin_trylock(&trace_cmdline_lock)) return 0; - idx = map_pid_to_cmdline[tsk->pid]; + idx = savedcmd->map_pid_to_cmdline[tsk->pid]; if (idx == NO_CMDLINE_MAP) { - idx = (cmdline_idx + 1) % SAVED_CMDLINES; + idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num; /* * Check whether the cmdline buffer at idx has a pid @@ -1467,17 +1516,17 @@ static int trace_save_cmdline(struct task_struct *tsk) * need to clear the map_pid_to_cmdline. Otherwise we * would read the new comm for the old pid. 
*/ - pid = map_cmdline_to_pid[idx]; + pid = savedcmd->map_cmdline_to_pid[idx]; if (pid != NO_CMDLINE_MAP) - map_pid_to_cmdline[pid] = NO_CMDLINE_MAP; + savedcmd->map_pid_to_cmdline[pid] = NO_CMDLINE_MAP; - map_cmdline_to_pid[idx] = tsk->pid; - map_pid_to_cmdline[tsk->pid] = idx; + savedcmd->map_cmdline_to_pid[idx] = tsk->pid; + savedcmd->map_pid_to_cmdline[tsk->pid] = idx; - cmdline_idx = idx; + savedcmd->cmdline_idx = idx; } - memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); + set_cmdline(idx, tsk->comm); arch_spin_unlock(&trace_cmdline_lock); @@ -1503,9 +1552,9 @@ static void __trace_find_cmdline(int pid, char comm[]) return; } - map = map_pid_to_cmdline[pid]; + map = savedcmd->map_pid_to_cmdline[pid]; if (map != NO_CMDLINE_MAP) - strcpy(comm, saved_cmdlines[map]); + strcpy(comm, get_saved_cmdlines(map)); else strcpy(comm, "<...>"); } @@ -3593,6 +3642,7 @@ static const char readme_msg[] = " trace_options\t\t- Set format or modify how tracing happens\n" "\t\t\t Disable an option by adding a suffix 'no' to the\n" "\t\t\t option name\n" + " saved_cmdlines_size\t- echo command number in here to store comm-pid list\n" #ifdef CONFIG_DYNAMIC_FTRACE "\n available_filter_functions - list of functions that can be filtered on\n" " set_ftrace_filter\t- echo function name in here to only trace these\n" @@ -3715,7 +3765,8 @@ static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos) (*pos)++; - for (; ptr < &map_cmdline_to_pid[SAVED_CMDLINES]; ptr++) { + for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num]; + ptr++) { if (*ptr == -1 || *ptr == NO_CMDLINE_MAP) continue; @@ -3733,7 +3784,7 @@ static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos) preempt_disable(); arch_spin_lock(&trace_cmdline_lock); - v = &map_cmdline_to_pid[0]; + v = &savedcmd->map_cmdline_to_pid[0]; while (l <= *pos) { v = saved_cmdlines_next(m, v, &l); if (!v) @@ -3781,6 +3832,79 @@ static const struct file_operations tracing_saved_cmdlines_fops = { .release = seq_release, }; +static ssize_t +tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + int r; + + arch_spin_lock(&trace_cmdline_lock); + r = sprintf(buf, "%u\n", savedcmd->cmdline_num); + arch_spin_unlock(&trace_cmdline_lock); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) +{ + kfree(s->saved_cmdlines); + kfree(s->map_cmdline_to_pid); + kfree(s); +} + +static int tracing_resize_saved_cmdlines(unsigned int val) +{ + struct saved_cmdlines_buffer *s, *savedcmd_temp; + + s = kmalloc(sizeof(struct saved_cmdlines_buffer), GFP_KERNEL); + if (!s) + return -ENOMEM; + + if (allocate_cmdlines_buffer(val, s) < 0) { + kfree(s); + return -ENOMEM; + } + + arch_spin_lock(&trace_cmdline_lock); + savedcmd_temp = savedcmd; + savedcmd = s; + arch_spin_unlock(&trace_cmdline_lock); + free_saved_cmdlines_buffer(savedcmd_temp); + + return 0; +} + +static ssize_t +tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + /* must have at least 1 entry or less than PID_MAX_DEFAULT */ + if (!val || val > PID_MAX_DEFAULT) + return -EINVAL; + + ret = tracing_resize_saved_cmdlines((unsigned int)val); + if (ret < 0) + return ret; + + *ppos += cnt; + + return cnt; +} + +static const struct file_operations tracing_saved_cmdlines_size_fops = { + 
.open = tracing_open_generic, + .read = tracing_saved_cmdlines_size_read, + .write = tracing_saved_cmdlines_size_write, +}; + static ssize_t tracing_set_trace_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -6375,6 +6499,9 @@ static __init int tracer_init_debugfs(void) trace_create_file("saved_cmdlines", 0444, d_tracer, NULL, &tracing_saved_cmdlines_fops); + trace_create_file("saved_cmdlines_size", 0644, d_tracer, + NULL, &tracing_saved_cmdlines_size_fops); + #ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, &tracing_dyn_info_fops); @@ -6611,18 +6738,19 @@ __init static int tracer_alloc_buffers(void) if (!temp_buffer) goto out_free_cpumask; + if (trace_create_savedcmd() < 0) + goto out_free_temp_buffer; + /* TODO: make the number of buffers hot pluggable with CPUS */ if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) { printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); WARN_ON(1); - goto out_free_temp_buffer; + goto out_free_savedcmd; } if (global_trace.buffer_disabled) tracing_off(); - trace_init_cmdlines(); - if (trace_boot_clock) { ret = tracing_set_clock(&global_trace, trace_boot_clock); if (ret < 0) @@ -6668,6 +6796,8 @@ __init static int tracer_alloc_buffers(void) return 0; +out_free_savedcmd: + free_saved_cmdlines_buffer(savedcmd); out_free_temp_buffer: ring_buffer_free(temp_buffer); out_free_cpumask: -- cgit From e9c243a5a6de0be8e584c604d353412584b592f8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Jun 2014 12:27:06 +0000 Subject: futex-prevent-requeue-pi-on-same-futex.patch futex: Forbid uaddr == uaddr2 in futex_requeue(..., requeue_pi=1) If uaddr == uaddr2, then we have broken the rule of only requeueing from a non-pi futex to a pi futex with this call. If we attempt this, then dangling pointers may be left for rt_waiter resulting in an exploitable condition. This change brings futex_requeue() in line with futex_wait_requeue_pi() which performs the same check as per commit 6f7b0a2a5c0f ("futex: Forbid uaddr == uaddr2 in futex_wait_requeue_pi()") [ tglx: Compare the resulting keys as well, as uaddrs might be different depending on the mapping ] Fixes CVE-2014-3153. Reported-by: Pinkie Pie Signed-off-by: Will Drewry Signed-off-by: Kees Cook Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner Reviewed-by: Darren Hart Signed-off-by: Linus Torvalds --- kernel/futex.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 81dbe773ce4c..663ea2b84a38 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1441,6 +1441,13 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, struct futex_q *this, *next; if (requeue_pi) { + /* + * Requeue PI only works on two distinct uaddrs. This + * check is only valid for private futexes. See below. + */ + if (uaddr1 == uaddr2) + return -EINVAL; + /* * requeue_pi requires a pi_state, try to allocate it now * without any locks in case it fails. @@ -1479,6 +1486,15 @@ retry: if (unlikely(ret != 0)) goto out_put_key1; + /* + * The check above which compares uaddrs is not sufficient for + * shared futexes. 
We need to compare the keys: + */ + if (requeue_pi && match_futex(&key1, &key2)) { + ret = -EINVAL; + goto out_put_keys; + } + hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); @@ -2525,6 +2541,15 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (ret) goto out_key2; + /* + * The check above which compares uaddrs is not sufficient for + * shared futexes. We need to compare the keys: + */ + if (match_futex(&q.key, &key2)) { + ret = -EINVAL; + goto out_put_keys; + } + /* Queue the futex_q, drop the hb lock, wait for wakeup. */ futex_wait_queue_me(hb, &q, to); -- cgit From b3eaa9fc5cd0a4d74b18f6b8dc617aeaf1873270 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Jun 2014 12:27:06 +0000 Subject: futex: Validate atomic acquisition in futex_lock_pi_atomic() We need to protect the atomic acquisition in the kernel against rogue user space which sets the user space futex to 0, so the kernel-side acquisition succeeds while there is existing state in the kernel associated with the real owner. Verify whether the futex has waiters associated with kernel state. If it has, return -EINVAL. The state is corrupted already, so no point in cleaning it up. Subsequent calls will fail as well. Not our problem. [ tglx: Use futex_top_waiter() and explain why we do not need to try restoring the already corrupted user space state. ] Signed-off-by: Darren Hart Cc: Kees Cook Cc: Will Drewry Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner Signed-off-by: Linus Torvalds --- kernel/futex.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 663ea2b84a38..520e7b23bf3c 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -910,10 +910,18 @@ retry: return -EDEADLK; /* - * Surprise - we got the lock. Just return to userspace: + * Surprise - we got the lock, but we do not trust user space at all. */ - if (unlikely(!curval)) - return 1; + if (unlikely(!curval)) { + /* + * We verify whether there is kernel state for this + * futex. If not, we can safely assume, that the 0 -> + * TID transition is correct. If state exists, we do + * not bother to fixup the user space state as it was + * corrupted already. + */ + return futex_top_waiter(hb, key) ? -EINVAL : 1; + } uval = curval; -- cgit From 13fbca4c6ecd96ec1a1cfa2e4f2ce191fe928a5e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Jun 2014 12:27:07 +0000 Subject: futex: Always cleanup owner tid in unlock_pi If the owner died bit is set at futex_unlock_pi, we currently do not clean up the user space futex, so the owner TID of the current owner (the unlocker) persists. That is observably inconsistent state, especially when ownership of the pi state has been transferred. Clean it up unconditionally.
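For reference, the PI futex word packs all of this state into a single 32-bit value; the masks below are from the uapi futex header:

    #define FUTEX_WAITERS    0x80000000  /* kernel-side waiter state exists */
    #define FUTEX_OWNER_DIED 0x40000000  /* robust owner exited without unlocking */
    #define FUTEX_TID_MASK   0x3fffffff  /* TID of the current owner */

So the reworked fast path in the patch below, !(uval & ~FUTEX_TID_MASK), attempts the TID -> 0 transition only when neither flag bit is set, and the slow path now always rewrites the whole word, owner died bit included.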
Signed-off-by: Thomas Gleixner Cc: Kees Cook Cc: Will Drewry Cc: Darren Hart Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- kernel/futex.c | 40 ++++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 520e7b23bf3c..e1cb1baa23fb 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1052,6 +1052,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) struct task_struct *new_owner; struct futex_pi_state *pi_state = this->pi_state; u32 uninitialized_var(curval), newval; + int ret = 0; if (!pi_state) return -EINVAL; @@ -1075,23 +1076,19 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) new_owner = this->task; /* - * We pass it to the next owner. (The WAITERS bit is always - * kept enabled while there is PI state around. We must also - * preserve the owner died bit.) + * We pass it to the next owner. The WAITERS bit is always + * kept enabled while there is PI state around. We cleanup the + * owner died bit, because we are the owner. */ - if (!(uval & FUTEX_OWNER_DIED)) { - int ret = 0; - - newval = FUTEX_WAITERS | task_pid_vnr(new_owner); + newval = FUTEX_WAITERS | task_pid_vnr(new_owner); - if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) - ret = -EFAULT; - else if (curval != uval) - ret = -EINVAL; - if (ret) { - raw_spin_unlock(&pi_state->pi_mutex.wait_lock); - return ret; - } + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) + ret = -EFAULT; + else if (curval != uval) + ret = -EINVAL; + if (ret) { + raw_spin_unlock(&pi_state->pi_mutex.wait_lock); + return ret; } raw_spin_lock_irq(&pi_state->owner->pi_lock); @@ -2351,9 +2348,10 @@ retry: /* * To avoid races, try to do the TID -> 0 atomic transition * again. If it succeeds then we can return without waking - * anyone else up: + * anyone else up. We only try this if neither the waiters nor + * the owner died bit are set. */ - if (!(uval & FUTEX_OWNER_DIED) && + if (!(uval & ~FUTEX_TID_MASK) && cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0)) goto pi_faulted; /* @@ -2383,11 +2381,9 @@ retry: /* * No waiters - kernel unlocks the futex: */ - if (!(uval & FUTEX_OWNER_DIED)) { - ret = unlock_futex_pi(uaddr, uval); - if (ret == -EFAULT) - goto pi_faulted; - } + ret = unlock_futex_pi(uaddr, uval); + if (ret == -EFAULT) + goto pi_faulted; out_unlock: spin_unlock(&hb->lock); -- cgit From 54a217887a7b658e2650c3feff22756ab80c7339 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Jun 2014 12:27:08 +0000 Subject: futex: Make lookup_pi_state more robust The current implementation of lookup_pi_state has ambiguous handling of the TID value 0 in the user space futex. We can get into the kernel even if the TID value is 0, because either there is a stale waiters bit or the owner died bit is set or we are called from the requeue_pi path or from user space just for fun. The current code avoids an explicit sanity check for pid = 0 in the case that kernel-internal state (waiters) is found for the user space address. This can lead to state leakage and worse under some circumstances. Handle the cases explicitly: Waiter | pi_state | pi->owner | uTID | uODIED | ?
[1] NULL | --- | --- | 0 | 0/1 | Valid [2] NULL | --- | --- | >0 | 0/1 | Valid [3] Found | NULL | -- | Any | 0/1 | Invalid [4] Found | Found | NULL | 0 | 1 | Valid [5] Found | Found | NULL | >0 | 1 | Invalid [6] Found | Found | task | 0 | 1 | Valid [7] Found | Found | NULL | Any | 0 | Invalid [8] Found | Found | task | ==taskTID | 0/1 | Valid [9] Found | Found | task | 0 | 0 | Invalid [10] Found | Found | task | !=taskTID | 0/1 | Invalid [1] Indicates that the kernel can acquire the futex atomically. We came came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. [2] Valid, if TID does not belong to a kernel thread. If no matching thread is found then it indicates that the owner TID has died. [3] Invalid. The waiter is queued on a non PI futex [4] Valid state after exit_robust_list(), which sets the user space value to FUTEX_WAITERS | FUTEX_OWNER_DIED. [5] The user space value got manipulated between exit_robust_list() and exit_pi_state_list() [6] Valid state after exit_pi_state_list() which sets the new owner in the pi_state but cannot access the user space value. [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set. [8] Owner and user space value match [9] There is no transient state which sets the user space TID to 0 except exit_robust_list(), but this is indicated by the FUTEX_OWNER_DIED bit. See [4] [10] There is no transient state which leaves owner and user space TID out of sync. Signed-off-by: Thomas Gleixner Cc: Kees Cook Cc: Will Drewry Cc: Darren Hart Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- kernel/futex.c | 134 +++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 106 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index e1cb1baa23fb..de938d20df19 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -743,10 +743,58 @@ void exit_pi_state_list(struct task_struct *curr) raw_spin_unlock_irq(&curr->pi_lock); } +/* + * We need to check the following states: + * + * Waiter | pi_state | pi->owner | uTID | uODIED | ? + * + * [1] NULL | --- | --- | 0 | 0/1 | Valid + * [2] NULL | --- | --- | >0 | 0/1 | Valid + * + * [3] Found | NULL | -- | Any | 0/1 | Invalid + * + * [4] Found | Found | NULL | 0 | 1 | Valid + * [5] Found | Found | NULL | >0 | 1 | Invalid + * + * [6] Found | Found | task | 0 | 1 | Valid + * + * [7] Found | Found | NULL | Any | 0 | Invalid + * + * [8] Found | Found | task | ==taskTID | 0/1 | Valid + * [9] Found | Found | task | 0 | 0 | Invalid + * [10] Found | Found | task | !=taskTID | 0/1 | Invalid + * + * [1] Indicates that the kernel can acquire the futex atomically. We + * came came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. + * + * [2] Valid, if TID does not belong to a kernel thread. If no matching + * thread is found then it indicates that the owner TID has died. + * + * [3] Invalid. The waiter is queued on a non PI futex + * + * [4] Valid state after exit_robust_list(), which sets the user space + * value to FUTEX_WAITERS | FUTEX_OWNER_DIED. + * + * [5] The user space value got manipulated between exit_robust_list() + * and exit_pi_state_list() + * + * [6] Valid state after exit_pi_state_list() which sets the new owner in + * the pi_state but cannot access the user space value. + * + * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set. 
+ * + * [8] Owner and user space value match + * + * [9] There is no transient state which sets the user space TID to 0 + * except exit_robust_list(), but this is indicated by the + * FUTEX_OWNER_DIED bit. See [4] + * + * [10] There is no transient state which leaves owner and user space + * TID out of sync. + */ static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, - union futex_key *key, struct futex_pi_state **ps, - struct task_struct *task) + union futex_key *key, struct futex_pi_state **ps) { struct futex_pi_state *pi_state = NULL; struct futex_q *this, *next; @@ -756,12 +804,13 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, plist_for_each_entry_safe(this, next, &hb->chain, list) { if (match_futex(&this->key, key)) { /* - * Another waiter already exists - bump up - * the refcount and return its pi_state: + * Sanity check the waiter before increasing + * the refcount and attaching to it. */ pi_state = this->pi_state; /* - * Userspace might have messed up non-PI and PI futexes + * Userspace might have messed up non-PI and + * PI futexes [3] */ if (unlikely(!pi_state)) return -EINVAL; @@ -769,44 +818,70 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, WARN_ON(!atomic_read(&pi_state->refcount)); /* - * When pi_state->owner is NULL then the owner died - * and another waiter is on the fly. pi_state->owner - * is fixed up by the task which acquires - * pi_state->rt_mutex. - * - * We do not check for pid == 0 which can happen when - * the owner died and robust_list_exit() cleared the - * TID. + * Handle the owner died case: */ - if (pid && pi_state->owner) { + if (uval & FUTEX_OWNER_DIED) { /* - * Bail out if user space manipulated the - * futex value. + * exit_pi_state_list sets owner to NULL and + * wakes the topmost waiter. The task which + * acquires the pi_state->rt_mutex will fixup + * owner. */ - if (pid != task_pid_vnr(pi_state->owner)) + if (!pi_state->owner) { + /* + * No pi state owner, but the user + * space TID is not 0. Inconsistent + * state. [5] + */ + if (pid) + return -EINVAL; + /* + * Take a ref on the state and + * return. [4] + */ + goto out_state; + } + + /* + * If TID is 0, then either the dying owner + * has not yet executed exit_pi_state_list() + * or some waiter acquired the rtmutex in the + * pi state, but did not yet fixup the TID in + * user space. + * + * Take a ref on the state and return. [6] + */ + if (!pid) + goto out_state; + } else { + /* + * If the owner died bit is not set, + * then the pi_state must have an + * owner. [7] + */ + if (!pi_state->owner) return -EINVAL; } /* - * Protect against a corrupted uval. If uval - * is 0x80000000 then pid is 0 and the waiter - * bit is set. So the deadlock check in the - * calling code has failed and we did not fall - * into the check above due to !pid. + * Bail out if user space manipulated the + * futex value. If pi state exists then the + * owner TID must be the same as the user + * space TID. [9/10] */ - if (task && pi_state->owner == task) - return -EDEADLK; + if (pid != task_pid_vnr(pi_state->owner)) + return -EINVAL; + out_state: atomic_inc(&pi_state->refcount); *ps = pi_state; - return 0; } } /* * We are the first waiter - try to look up the real owner and attach - * the new pi_state to it, but bail out when TID = 0 + * the new pi_state to it, but bail out when TID = 0 [1] */ if (!pid) return -ESRCH; @@ -839,6 +914,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, return ret; } + /* + * No existing pi state. First waiter. 
[2] + */ pi_state = alloc_pi_state(); /* @@ -959,7 +1037,7 @@ retry: * We dont have the lock. Look up the PI state (or create it if * we are the first waiter): */ - ret = lookup_pi_state(uval, hb, key, ps, task); + ret = lookup_pi_state(uval, hb, key, ps); if (unlikely(ret)) { switch (ret) { @@ -1565,7 +1643,7 @@ retry_private: * rereading and handing potential crap to * lookup_pi_state. */ - ret = lookup_pi_state(ret, hb2, &key2, &pi_state, NULL); + ret = lookup_pi_state(ret, hb2, &key2, &pi_state); } switch (ret) { -- cgit From 72e2fe38eac2dbf258d4295d75f78b123dd5b823 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 5 Jun 2014 20:35:30 -0400 Subject: tracing: Convert stddev into u64 in tracepoint benchmark I've been told that do_div() expects an unsigned 64 bit number, and is undefined if a signed is used. This gave a warning on the MIPS build. I'm not sure if a signed 64 bit dividend is really an issue or not, but the calculation this is used for is standard deviation, and that isn't going to be negative. We can just convert it to unsigned and be safe. Reported-by: David Daney Signed-off-by: Steven Rostedt --- kernel/trace/trace_benchmark.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index a10adc7095cd..8bd3365a65b2 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -33,7 +33,7 @@ static void trace_do_benchmark(void) u64 start; u64 stop; u64 delta; - s64 stddev; + u64 stddev; u64 seed; u64 last_seed; unsigned int avg; -- cgit From 34839f5a69989c0ee48386a788fba37eb75910f7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 5 Jun 2014 23:34:02 -0400 Subject: tracing: Only calculate stats of tracepoint benchmarks for 2^32 times When calculating the average and standard deviation, it is required that the count be less than UINT_MAX, otherwise the do_div() will get undefined results. After 2^32 counts of data, the average and standard deviation should pretty much be set anyway. Signed-off-by: Steven Rostedt --- kernel/trace/trace_benchmark.c | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 8bd3365a65b2..40a14cbcf8e0 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -16,7 +16,10 @@ static u64 bm_last; static u64 bm_max; static u64 bm_min; static u64 bm_first; -static s64 bm_cnt; +static u64 bm_cnt; +static u64 bm_stddev; +static unsigned int bm_avg; +static unsigned int bm_std; /* * This gets called in a loop recording the time it took to write @@ -66,22 +69,35 @@ static void trace_do_benchmark(void) bm_last = delta; - bm_total += delta; - bm_totalsq += delta * delta; - if (delta > bm_max) bm_max = delta; if (!bm_min || delta < bm_min) bm_min = delta; + /* + * When bm_cnt is greater than UINT_MAX, it breaks the statistics + * accounting. Freeze the statistics when that happens. + * We should have enough data for the avg and stddev anyway. 
+ */ + if (bm_cnt > UINT_MAX) { + scnprintf(bm_str, BENCHMARK_EVENT_STRLEN, + "last=%llu first=%llu max=%llu min=%llu ** avg=%u std=%d std^2=%lld", + bm_last, bm_first, bm_max, bm_min, bm_avg, bm_std, bm_stddev); + return; + } + + bm_total += delta; + bm_totalsq += delta * delta; + + if (bm_cnt > 1) { /* * Apply Welford's method to calculate standard deviation: * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) */ stddev = (u64)bm_cnt * bm_totalsq - bm_total * bm_total; - do_div(stddev, bm_cnt); - do_div(stddev, bm_cnt - 1); + do_div(stddev, (u32)bm_cnt); + do_div(stddev, (u32)bm_cnt - 1); } else stddev = 0; @@ -119,6 +135,10 @@ static void trace_do_benchmark(void) scnprintf(bm_str, BENCHMARK_EVENT_STRLEN, "last=%llu first=%llu max=%llu min=%llu avg=%u std=%d std^2=%lld", bm_last, bm_first, bm_max, bm_min, avg, std, stddev); + + bm_std = std; + bm_avg = avg; + bm_stddev = stddev; } static int benchmark_event_kthread(void *arg) @@ -170,6 +190,9 @@ void trace_benchmark_unreg(void) bm_max = 0; bm_min = 0; bm_cnt = 0; - /* bm_first doesn't need to be reset but reset it anyway */ + /* These don't need to be reset but reset them anyway */ bm_first = 0; + bm_std = 0; + bm_avg = 0; + bm_stddev = 0; } -- cgit From e041e328c4b41e1db79bfe5ba9992c2ed771ad19 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 21 May 2014 17:32:19 +0200 Subject: perf: Fix perf_event_comm() vs. exec() assumption perf_event_comm() assumes that set_task_comm() is only called on exec(), and in particular that its only called on current. Neither are true, as Dave reported a WARN triggered by set_task_comm() being called on !current. Separate the exec() hook from the comm hook. Reported-by: Dave Jones Signed-off-by: Peter Zijlstra Cc: Alexander Viro Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: linux-fsdevel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20140521153219.GH5226@laptop.programming.kicks-ass.net [ Build fix. ] Signed-off-by: Ingo Molnar --- fs/exec.c | 1 + include/linux/perf_event.h | 4 +++- kernel/events/core.c | 28 ++++++++++++++++------------ 3 files changed, 20 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index 238b7aa26f68..a038a41a3677 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1110,6 +1110,7 @@ void setup_new_exec(struct linux_binprm * bprm) else set_dumpable(current->mm, suid_dumpable); + perf_event_exec(); set_task_comm(current, kbasename(bprm->filename)); /* Set the new mm task size. 
We have to do that late because it may diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 3ef6ea12806a..9b5cd1992a88 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -695,6 +695,7 @@ extern struct perf_guest_info_callbacks *perf_guest_cbs; extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); +extern void perf_event_exec(void); extern void perf_event_comm(struct task_struct *tsk); extern void perf_event_fork(struct task_struct *tsk); @@ -772,7 +773,7 @@ extern void perf_event_enable(struct perf_event *event); extern void perf_event_disable(struct perf_event *event); extern int __perf_event_disable(void *info); extern void perf_event_task_tick(void); -#else +#else /* !CONFIG_PERF_EVENTS: */ static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { } @@ -802,6 +803,7 @@ static inline int perf_unregister_guest_info_callbacks (struct perf_guest_info_callbacks *callbacks) { return 0; } static inline void perf_event_mmap(struct vm_area_struct *vma) { } +static inline void perf_event_exec(void) { } static inline void perf_event_comm(struct task_struct *tsk) { } static inline void perf_event_fork(struct task_struct *tsk) { } static inline void perf_event_init(void) { } diff --git a/kernel/events/core.c b/kernel/events/core.c index 440eefc67397..647698f91988 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2970,6 +2970,22 @@ out: local_irq_restore(flags); } +void perf_event_exec(void) +{ + struct perf_event_context *ctx; + int ctxn; + + rcu_read_lock(); + for_each_task_context_nr(ctxn) { + ctx = current->perf_event_ctxp[ctxn]; + if (!ctx) + continue; + + perf_event_enable_on_exec(ctx); + } + rcu_read_unlock(); +} + /* * Cross CPU call to read the hardware event */ @@ -5057,18 +5073,6 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) void perf_event_comm(struct task_struct *task) { struct perf_comm_event comm_event; - struct perf_event_context *ctx; - int ctxn; - - rcu_read_lock(); - for_each_task_context_nr(ctxn) { - ctx = task->perf_event_ctxp[ctxn]; - if (!ctx) - continue; - - perf_event_enable_on_exec(ctx); - } - rcu_read_unlock(); if (!atomic_read(&nr_comm_events)) return; -- cgit From 82b897782d10fcc4930c9d4a15b175348fdd2871 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 28 May 2014 11:45:04 +0300 Subject: perf: Differentiate exec() and non-exec() comm events perf tools like 'perf report' can aggregate samples by comm strings, which generally works. However, there are other potential use-cases. For example, to pair up 'calls' with 'returns' accurately (from branch events like Intel BTS) it is necessary to identify whether the process has exec'd. Although a comm event is generated when an 'exec' happens it is also generated whenever the comm string is changed on a whim (e.g. by prctl PR_SET_NAME). This patch adds a flag to the comm event to differentiate one case from the other. In order to determine whether the kernel supports the new flag, a selection bit named 'exec' is added to struct perf_event_attr. The bit does nothing but will cause perf_event_open() to fail if the bit is set on kernels that do not have it defined. 
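User space can therefore feature-test the flag at runtime. A minimal sketch of such a probe (assumptions: <linux/perf_event.h>, <sys/syscall.h> and <unistd.h> are included, PERF_COUNT_SW_DUMMY is used purely as a convenient no-op event, and error handling is elided):

    struct perf_event_attr attr = {
            .type      = PERF_TYPE_SOFTWARE,
            .config    = PERF_COUNT_SW_DUMMY,
            .size      = sizeof(attr),
            .comm      = 1,
            .comm_exec = 1,     /* rejected with EINVAL by older kernels */
    };
    int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
    int has_comm_exec = (fd >= 0);
    if (fd >= 0)
            close(fd);

If the open fails, a tool can simply clear comm_exec and retry, treating every comm event as potentially non-exec.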
Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/537D9EBE.7030806@intel.com Cc: Paul Mackerras Cc: Dave Jones Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Alexander Viro Cc: Linus Torvalds Cc: linux-fsdevel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- fs/exec.c | 6 +++--- include/linux/perf_event.h | 4 ++-- include/linux/sched.h | 6 +++++- include/uapi/linux/perf_event.h | 9 +++++++-- kernel/events/core.c | 4 ++-- 5 files changed, 19 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index a038a41a3677..a3d33fe592d6 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1046,13 +1046,13 @@ EXPORT_SYMBOL_GPL(get_task_comm); * so that a new one can be started */ -void set_task_comm(struct task_struct *tsk, const char *buf) +void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec) { task_lock(tsk); trace_task_rename(tsk, buf); strlcpy(tsk->comm, buf, sizeof(tsk->comm)); task_unlock(tsk); - perf_event_comm(tsk); + perf_event_comm(tsk, exec); } int flush_old_exec(struct linux_binprm * bprm) @@ -1111,7 +1111,7 @@ void setup_new_exec(struct linux_binprm * bprm) set_dumpable(current->mm, suid_dumpable); perf_event_exec(); - set_task_comm(current, kbasename(bprm->filename)); + __set_task_comm(current, kbasename(bprm->filename), true); /* Set the new mm task size. We have to do that late because it may * depend on TIF_32BIT which is only updated in flush_thread() on diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b4c1d4685bf0..707617a8c0f6 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -707,7 +707,7 @@ extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks * extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); extern void perf_event_exec(void); -extern void perf_event_comm(struct task_struct *tsk); +extern void perf_event_comm(struct task_struct *tsk, bool exec); extern void perf_event_fork(struct task_struct *tsk); /* Callchains */ @@ -815,7 +815,7 @@ static inline int perf_unregister_guest_info_callbacks static inline void perf_event_mmap(struct vm_area_struct *vma) { } static inline void perf_event_exec(void) { } -static inline void perf_event_comm(struct task_struct *tsk) { } +static inline void perf_event_comm(struct task_struct *tsk, bool exec) { } static inline void perf_event_fork(struct task_struct *tsk) { } static inline void perf_event_init(void) { } static inline int perf_swevent_get_recursion_context(void) { return -1; } diff --git a/include/linux/sched.h b/include/linux/sched.h index 221b2bde3723..ad86e1d7dbc2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2379,7 +2379,11 @@ extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, i struct task_struct *fork_idle(int); extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); -extern void set_task_comm(struct task_struct *tsk, const char *from); +extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec); +static inline void set_task_comm(struct task_struct *tsk, const char *from) +{ + __set_task_comm(tsk, from, false); +} extern char *get_task_comm(char *to, struct task_struct *tsk); #ifdef CONFIG_SMP diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index d9cd853818ad..5312fae47218 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ 
-302,8 +302,8 @@ struct perf_event_attr { exclude_callchain_kernel : 1, /* exclude kernel callchains */ exclude_callchain_user : 1, /* exclude user callchains */ mmap2 : 1, /* include mmap with inode data */ - - __reserved_1 : 40; + comm_exec : 1, /* flag comm events that are due to an exec */ + __reserved_1 : 39; union { __u32 wakeup_events; /* wakeup every n events */ @@ -502,7 +502,12 @@ struct perf_event_mmap_page { #define PERF_RECORD_MISC_GUEST_KERNEL (4 << 0) #define PERF_RECORD_MISC_GUEST_USER (5 << 0) +/* + * PERF_RECORD_MISC_MMAP_DATA and PERF_RECORD_MISC_COMM_EXEC are used on + * different events so can reuse the same bit position. + */ #define PERF_RECORD_MISC_MMAP_DATA (1 << 13) +#define PERF_RECORD_MISC_COMM_EXEC (1 << 13) /* * Indicates that the content of PERF_SAMPLE_IP points to * the actual instruction that triggered the event. See also diff --git a/kernel/events/core.c b/kernel/events/core.c index 8fac2056d51e..7da5e561e89a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5090,7 +5090,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) NULL); } -void perf_event_comm(struct task_struct *task) +void perf_event_comm(struct task_struct *task, bool exec) { struct perf_comm_event comm_event; @@ -5104,7 +5104,7 @@ void perf_event_comm(struct task_struct *task) .event_id = { .header = { .type = PERF_RECORD_COMM, - .misc = 0, + .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0, /* .size */ }, /* .pid */ -- cgit From 70af2f8a4f48d6cebdf92d533d3aef37853ce6de Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 3 Feb 2014 13:18:49 +0100 Subject: locking/rwlocks: Introduce 'qrwlocks' - fair, queued rwlocks This rwlock uses the arch_spin_lock_t as a waitqueue, and assuming the arch_spin_lock_t is a fair lock (ticket,mcs etc..) the resulting rwlock is a fair lock. It fits in the same 8 bytes as the regular rwlock_t by folding the reader and writer count into a single integer, using the remaining 4 bytes for the arch_spinlock_t. Architectures that can single-copy adress bytes can optimize queue_write_unlock() with a 0 write to the LSB (the write count). 
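A sketch of what such an arch override might look like, assuming single-copy atomic byte stores and a little-endian layout where the writer byte is the LSB (the generic header's #ifndef guard, visible below, is what makes this opt-in):

    static inline void queue_write_unlock(struct qrwlock *lock)
    {
            /* clear _QW_LOCKED (0xff) by storing 0 to the writer byte */
            smp_store_release((u8 *)&lock->cnts, 0);
    }
    #define queue_write_unlock queue_write_unlock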
Performance as measured by Davidlohr Bueso (rwlock_t -> qrwlock_t): +--------------+-------------+---------------+ | Workload | #users | delta | +--------------+-------------+---------------+ | alltests | > 1400 | -4.83% | | custom | 0-100,> 100 | +1.43%,-1.57% | | high_systime | > 1000 | -2.61 | | shared | all | +0.32 | +--------------+-------------+---------------+ http://www.stgolabs.net/qrwlock-stuff/aim7-results-vs-rwsem_optsin/ Signed-off-by: Waiman Long [peterz: near complete rewrite] Signed-off-by: Peter Zijlstra Cc: Arnd Bergmann Cc: Linus Torvalds Cc: "Paul E.McKenney" Cc: linux-arch@vger.kernel.org Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/n/tip-gac1nnl3wvs2ij87zv2xkdzq@git.kernel.org Signed-off-by: Ingo Molnar --- include/asm-generic/qrwlock.h | 166 ++++++++++++++++++++++++++++++++++++ include/asm-generic/qrwlock_types.h | 21 +++++ kernel/Kconfig.locks | 7 ++ kernel/locking/Makefile | 1 + kernel/locking/qrwlock.c | 133 +++++++++++++++++++++++++++++ 5 files changed, 328 insertions(+) create mode 100644 include/asm-generic/qrwlock.h create mode 100644 include/asm-generic/qrwlock_types.h create mode 100644 kernel/locking/qrwlock.c (limited to 'kernel') diff --git a/include/asm-generic/qrwlock.h b/include/asm-generic/qrwlock.h new file mode 100644 index 000000000000..6383d54bf983 --- /dev/null +++ b/include/asm-generic/qrwlock.h @@ -0,0 +1,166 @@ +/* + * Queue read/write lock + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P. + * + * Authors: Waiman Long + */ +#ifndef __ASM_GENERIC_QRWLOCK_H +#define __ASM_GENERIC_QRWLOCK_H + +#include +#include +#include + +#include + +/* + * Writer states & reader shift and bias + */ +#define _QW_WAITING 1 /* A writer is waiting */ +#define _QW_LOCKED 0xff /* A writer holds the lock */ +#define _QW_WMASK 0xff /* Writer mask */ +#define _QR_SHIFT 8 /* Reader count shift */ +#define _QR_BIAS (1U << _QR_SHIFT) + +/* + * External function declarations + */ +extern void queue_read_lock_slowpath(struct qrwlock *lock); +extern void queue_write_lock_slowpath(struct qrwlock *lock); + +/** + * queue_read_can_lock- would read_trylock() succeed? + * @lock: Pointer to queue rwlock structure + */ +static inline int queue_read_can_lock(struct qrwlock *lock) +{ + return !(atomic_read(&lock->cnts) & _QW_WMASK); +} + +/** + * queue_write_can_lock- would write_trylock() succeed? 
+ * @lock: Pointer to queue rwlock structure + */ +static inline int queue_write_can_lock(struct qrwlock *lock) +{ + return !atomic_read(&lock->cnts); +} + +/** + * queue_read_trylock - try to acquire read lock of a queue rwlock + * @lock : Pointer to queue rwlock structure + * Return: 1 if lock acquired, 0 if failed + */ +static inline int queue_read_trylock(struct qrwlock *lock) +{ + u32 cnts; + + cnts = atomic_read(&lock->cnts); + if (likely(!(cnts & _QW_WMASK))) { + cnts = (u32)atomic_add_return(_QR_BIAS, &lock->cnts); + if (likely(!(cnts & _QW_WMASK))) + return 1; + atomic_sub(_QR_BIAS, &lock->cnts); + } + return 0; +} + +/** + * queue_write_trylock - try to acquire write lock of a queue rwlock + * @lock : Pointer to queue rwlock structure + * Return: 1 if lock acquired, 0 if failed + */ +static inline int queue_write_trylock(struct qrwlock *lock) +{ + u32 cnts; + + cnts = atomic_read(&lock->cnts); + if (unlikely(cnts)) + return 0; + + return likely(atomic_cmpxchg(&lock->cnts, + cnts, cnts | _QW_LOCKED) == cnts); +} +/** + * queue_read_lock - acquire read lock of a queue rwlock + * @lock: Pointer to queue rwlock structure + */ +static inline void queue_read_lock(struct qrwlock *lock) +{ + u32 cnts; + + cnts = atomic_add_return(_QR_BIAS, &lock->cnts); + if (likely(!(cnts & _QW_WMASK))) + return; + + /* The slowpath will decrement the reader count, if necessary. */ + queue_read_lock_slowpath(lock); +} + +/** + * queue_write_lock - acquire write lock of a queue rwlock + * @lock : Pointer to queue rwlock structure + */ +static inline void queue_write_lock(struct qrwlock *lock) +{ + /* Optimize for the unfair lock case where the fair flag is 0. */ + if (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0) + return; + + queue_write_lock_slowpath(lock); +} + +/** + * queue_read_unlock - release read lock of a queue rwlock + * @lock : Pointer to queue rwlock structure + */ +static inline void queue_read_unlock(struct qrwlock *lock) +{ + /* + * Atomically decrement the reader count + */ + smp_mb__before_atomic(); + atomic_sub(_QR_BIAS, &lock->cnts); +} + +#ifndef queue_write_unlock +/** + * queue_write_unlock - release write lock of a queue rwlock + * @lock : Pointer to queue rwlock structure + */ +static inline void queue_write_unlock(struct qrwlock *lock) +{ + /* + * If the writer field is atomic, it can be cleared directly. + * Otherwise, an atomic subtraction will be used to clear it. + */ + smp_mb__before_atomic(); + atomic_sub(_QW_LOCKED, &lock->cnts); +} +#endif + +/* + * Remapping rwlock architecture specific functions to the corresponding + * queue rwlock functions. 
+ */ +#define arch_read_can_lock(l) queue_read_can_lock(l) +#define arch_write_can_lock(l) queue_write_can_lock(l) +#define arch_read_lock(l) queue_read_lock(l) +#define arch_write_lock(l) queue_write_lock(l) +#define arch_read_trylock(l) queue_read_trylock(l) +#define arch_write_trylock(l) queue_write_trylock(l) +#define arch_read_unlock(l) queue_read_unlock(l) +#define arch_write_unlock(l) queue_write_unlock(l) + +#endif /* __ASM_GENERIC_QRWLOCK_H */ diff --git a/include/asm-generic/qrwlock_types.h b/include/asm-generic/qrwlock_types.h new file mode 100644 index 000000000000..4d76f24df518 --- /dev/null +++ b/include/asm-generic/qrwlock_types.h @@ -0,0 +1,21 @@ +#ifndef __ASM_GENERIC_QRWLOCK_TYPES_H +#define __ASM_GENERIC_QRWLOCK_TYPES_H + +#include +#include + +/* + * The queue read/write lock data structure + */ + +typedef struct qrwlock { + atomic_t cnts; + arch_spinlock_t lock; +} arch_rwlock_t; + +#define __ARCH_RW_LOCK_UNLOCKED { \ + .cnts = ATOMIC_INIT(0), \ + .lock = __ARCH_SPIN_LOCK_UNLOCKED, \ +} + +#endif /* __ASM_GENERIC_QRWLOCK_TYPES_H */ diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index d2b32ac27a39..35536d9c0964 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -223,3 +223,10 @@ endif config MUTEX_SPIN_ON_OWNER def_bool y depends on SMP && !DEBUG_MUTEXES + +config ARCH_USE_QUEUE_RWLOCK + bool + +config QUEUE_RWLOCK + def_bool y if ARCH_USE_QUEUE_RWLOCK + depends on SMP diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index b8bdcd4785b7..8541bfdfd232 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -24,4 +24,5 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o +obj-$(CONFIG_QUEUE_RWLOCK) += qrwlock.o obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c new file mode 100644 index 000000000000..fb5b8ac411a5 --- /dev/null +++ b/kernel/locking/qrwlock.c @@ -0,0 +1,133 @@ +/* + * Queue read/write lock + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P. + * + * Authors: Waiman Long + */ +#include +#include +#include +#include +#include +#include +#include + +/** + * rspin_until_writer_unlock - inc reader count & spin until writer is gone + * @lock : Pointer to queue rwlock structure + * @writer: Current queue rwlock writer status byte + * + * In interrupt context or at the head of the queue, the reader will just + * increment the reader count & wait until the writer releases the lock. 
+ */ +static __always_inline void +rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts) +{ + while ((cnts & _QW_WMASK) == _QW_LOCKED) { + arch_mutex_cpu_relax(); + cnts = smp_load_acquire((u32 *)&lock->cnts); + } +} + +/** + * queue_read_lock_slowpath - acquire read lock of a queue rwlock + * @lock: Pointer to queue rwlock structure + */ +void queue_read_lock_slowpath(struct qrwlock *lock) +{ + u32 cnts; + + /* + * Readers come here when they cannot get the lock without waiting + */ + if (unlikely(in_interrupt())) { + /* + * Readers in interrupt context will spin until the lock is + * available without waiting in the queue. + */ + cnts = smp_load_acquire((u32 *)&lock->cnts); + rspin_until_writer_unlock(lock, cnts); + return; + } + atomic_sub(_QR_BIAS, &lock->cnts); + + /* + * Put the reader into the wait queue + */ + arch_spin_lock(&lock->lock); + + /* + * At the head of the wait queue now, wait until the writer state + * goes to 0 and then try to increment the reader count and get + * the lock. It is possible that an incoming writer may steal the + * lock in the interim, so it is necessary to check the writer byte + * to make sure that the write lock isn't taken. + */ + while (atomic_read(&lock->cnts) & _QW_WMASK) + arch_mutex_cpu_relax(); + + cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS; + rspin_until_writer_unlock(lock, cnts); + + /* + * Signal the next one in queue to become queue head + */ + arch_spin_unlock(&lock->lock); +} +EXPORT_SYMBOL(queue_read_lock_slowpath); + +/** + * queue_write_lock_slowpath - acquire write lock of a queue rwlock + * @lock : Pointer to queue rwlock structure + */ +void queue_write_lock_slowpath(struct qrwlock *lock) +{ + u32 cnts; + + /* Put the writer into the wait queue */ + arch_spin_lock(&lock->lock); + + /* Try to acquire the lock directly if no reader is present */ + if (!atomic_read(&lock->cnts) && + (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0)) + goto unlock; + + /* + * Set the waiting flag to notify readers that a writer is pending, + * or wait for a previous writer to go away. + */ + for (;;) { + cnts = atomic_read(&lock->cnts); + if (!(cnts & _QW_WMASK) && + (atomic_cmpxchg(&lock->cnts, cnts, + cnts | _QW_WAITING) == cnts)) + break; + + arch_mutex_cpu_relax(); + } + + /* When no more readers, set the locked flag */ + for (;;) { + cnts = atomic_read(&lock->cnts); + if ((cnts == _QW_WAITING) && + (atomic_cmpxchg(&lock->cnts, _QW_WAITING, + _QW_LOCKED) == _QW_WAITING)) + break; + + arch_mutex_cpu_relax(); + } +unlock: + arch_spin_unlock(&lock->lock); +} +EXPORT_SYMBOL(queue_write_lock_slowpath); -- cgit From dc81e5e3abb9f98a3cb6f269c0bee595b2c1235d Mon Sep 17 00:00:00 2001 From: Yoshihiro YUNOMAE Date: Fri, 6 Jun 2014 07:35:17 +0900 Subject: tracing: Return error if ftrace_trace_arrays list is empty ftrace_trace_arrays links global_trace.list. However, global_trace is not added to ftrace_trace_arrays if trace_alloc_buffers() failed. As the result, ftrace_trace_arrays becomes an empty list. If ftrace_trace_arrays is an empty list, current top_trace_array() returns an invalid pointer. As the result, the kernel can induce memory corruption or panic. Current implementation does not check whether ftrace_trace_arrays is empty list or not. So, in this patch, if ftrace_trace_arrays is empty list, top_trace_array() returns NULL. Moreover, this patch makes all functions calling top_trace_array() handle it appropriately. 
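To see why an empty list was dangerous here: list_entry() is just container_of(), so on an empty list it hands back the list head itself reinterpreted as a trace_array. An illustration of the failure mode (not code from the patch):

    /* with an empty list, ftrace_trace_arrays.prev points back at the head */
    tr = list_entry(ftrace_trace_arrays.prev, struct trace_array, list);
    /* tr now points into the static list head rather than a real
     * trace_array, so tr->flags and every later dereference read
     * unrelated memory */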
Link: http://lkml.kernel.org/p/20140605223517.32311.99233.stgit@yunodevel Signed-off-by: Yoshihiro YUNOMAE Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 3 +++ kernel/trace/trace_events.c | 13 +++++++++++++ 2 files changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 217207ad60b3..9e82551dd566 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -252,6 +252,9 @@ static inline struct trace_array *top_trace_array(void) { struct trace_array *tr; + if (list_empty(ftrace_trace_arrays.prev)) + return NULL; + tr = list_entry(ftrace_trace_arrays.prev, typeof(*tr), list); WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL)); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 3ddfd8f62c05..f99e0b3bca8c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -574,6 +574,9 @@ int trace_set_clr_event(const char *system, const char *event, int set) { struct trace_array *tr = top_trace_array(); + if (!tr) + return -ENODEV; + return __ftrace_set_clr_event(tr, NULL, system, event, set); } EXPORT_SYMBOL_GPL(trace_set_clr_event); @@ -2065,6 +2068,9 @@ event_enable_func(struct ftrace_hash *hash, bool enable; int ret; + if (!tr) + return -ENODEV; + /* hash funcs only work with set_ftrace_filter */ if (!enabled || !param) return -EINVAL; @@ -2396,6 +2402,9 @@ static __init int event_trace_enable(void) char *token; int ret; + if (!tr) + return -ENODEV; + for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) { call = *iter; @@ -2442,6 +2451,8 @@ static __init int event_trace_init(void) int ret; tr = top_trace_array(); + if (!tr) + return -ENODEV; d_tracer = tracing_init_dentry(); if (!d_tracer) @@ -2535,6 +2546,8 @@ static __init void event_trace_self_tests(void) int ret; tr = top_trace_array(); + if (!tr) + return; pr_info("Running tests on trace events:\n"); -- cgit From 748ec3a20eb44fce19af3cef04f8db8a8e7aead3 Mon Sep 17 00:00:00 2001 From: Yoshihiro YUNOMAE Date: Fri, 6 Jun 2014 07:35:20 +0900 Subject: tracing/kprobes: Avoid self tests if tracing is disabled on boot up If tracing is disabled on boot up, the kernel should not execute tracing self tests. The kernel should check whether tracing is disabled before executing any of the self tests. Link: http://lkml.kernel.org/p/20140605223520.32311.56097.stgit@yunodevel Acked-by: Masami Hiramatsu Signed-off-by: Yoshihiro YUNOMAE Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 903ae28962be..ef2fba1f46b5 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1377,6 +1377,9 @@ static __init int kprobe_trace_self_tests_init(void) struct trace_kprobe *tk; struct ftrace_event_file *file; + if (tracing_is_disabled()) + return -ENODEV; + target = kprobe_trace_selftest_target; pr_info("Testing kprobe tracing: "); -- cgit From 23aaa3c18e33fe048671b419781b5e44175efafe Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 6 Jun 2014 00:01:46 -0400 Subject: tracing: Fix leak of ring buffer data when new instance creation fails Yoshihiro Yunomae reported that the ring buffer data for a trace instance does not get properly cleaned up when instance creation fails. He proposed a patch that manually cleaned the data up and added a bunch of labels.
The labels are not needed because the trace array is allocated with kzalloc(), which initializes it to 0, and all kfree()s can take a NULL pointer and will ignore it. Add a new helper function, free_trace_buffers(), that can also handle NULL buffers, to free the buffers that were allocated by allocate_trace_buffers(). Link: http://lkml.kernel.org/r/20140605223522.32311.31664.stgit@yunodevel Reported-by: Yoshihiro YUNOMAE Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 22 ++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e29edee1542a..26cfff38e2ab 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6232,6 +6232,25 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) return 0; } +static void free_trace_buffers(struct trace_array *tr) +{ + if (!tr) + return; + + if (tr->trace_buffer.buffer) { + ring_buffer_free(tr->trace_buffer.buffer); + tr->trace_buffer.buffer = NULL; + free_percpu(tr->trace_buffer.data); + } + +#ifdef CONFIG_TRACER_MAX_TRACE + if (tr->max_buffer.buffer) { + ring_buffer_free(tr->max_buffer.buffer); + tr->max_buffer.buffer = NULL; + } +#endif +} + static int new_instance_create(const char *name) { struct trace_array *tr; @@ -6290,8 +6309,7 @@ static int new_instance_create(const char *name) return 0; out_free_tr: - if (tr->trace_buffer.buffer) - ring_buffer_free(tr->trace_buffer.buffer); + free_trace_buffers(tr); free_cpumask_var(tr->tracing_cpumask); kfree(tr->name); kfree(tr); -- cgit From bb3632c6101b2fad07e6246721466b984b1e0e9d Mon Sep 17 00:00:00 2001 From: Todd E Brandt Date: Fri, 6 Jun 2014 05:40:17 -0700 Subject: PM / sleep: trace events for suspend/resume Adds trace events that give finer resolution into suspend/resume. These events are graphed in the timelines generated by the analyze_suspend.py script. They represent large areas of time consumed that are typical of suspend and resume. The event is triggered by calling the function "trace_suspend_resume" with three arguments: a string (the name of the event to be displayed in the timeline), an integer (a case-specific number, such as the power state or CPU number), and a boolean (where true denotes the start of the timeline event, and false denotes the end). The suspend_resume trace event reproduces the data that the machine_suspend trace event did, so the latter has been removed. Signed-off-by: Todd Brandt Acked-by: Steven Rostedt Signed-off-by: Rafael J.
Wysocki --- drivers/acpi/sleep.c | 3 +++ drivers/base/power/main.c | 16 ++++++++++++++++ drivers/base/syscore.c | 5 +++++ include/trace/events/power.h | 42 +++++++++++++++++++++++++----------------- kernel/cpu.c | 5 +++++ kernel/power/hibernate.c | 3 +++ kernel/power/process.c | 3 +++ kernel/power/suspend.c | 14 ++++++++++++-- 8 files changed, 72 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index c11e3795431b..b3e3cc73ba79 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "internal.h" #include "sleep.h" @@ -501,6 +502,7 @@ static int acpi_suspend_enter(suspend_state_t pm_state) ACPI_FLUSH_CPU_CACHE(); + trace_suspend_resume(TPS("acpi_suspend"), acpi_state, true); switch (acpi_state) { case ACPI_STATE_S1: barrier(); @@ -516,6 +518,7 @@ static int acpi_suspend_enter(suspend_state_t pm_state) pr_info(PREFIX "Low-level resume complete\n"); break; } + trace_suspend_resume(TPS("acpi_suspend"), acpi_state, false); /* This violates the spec but is required for bug compatibility. */ acpi_write_bit_register(ACPI_BITREG_SCI_ENABLE, 1); diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 343ffad59377..de3fe4fe350a 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -545,6 +545,7 @@ static void dpm_resume_noirq(pm_message_t state) struct device *dev; ktime_t starttime = ktime_get(); + trace_suspend_resume(TPS("dpm_resume_noirq"), state.event, true); mutex_lock(&dpm_list_mtx); pm_transition = state; @@ -587,6 +588,7 @@ static void dpm_resume_noirq(pm_message_t state) dpm_show_time(starttime, state, "noirq"); resume_device_irqs(); cpuidle_resume(); + trace_suspend_resume(TPS("dpm_resume_noirq"), state.event, false); } /** @@ -664,6 +666,7 @@ static void dpm_resume_early(pm_message_t state) struct device *dev; ktime_t starttime = ktime_get(); + trace_suspend_resume(TPS("dpm_resume_early"), state.event, true); mutex_lock(&dpm_list_mtx); pm_transition = state; @@ -703,6 +706,7 @@ static void dpm_resume_early(pm_message_t state) mutex_unlock(&dpm_list_mtx); async_synchronize_full(); dpm_show_time(starttime, state, "early"); + trace_suspend_resume(TPS("dpm_resume_early"), state.event, false); } /** @@ -834,6 +838,7 @@ void dpm_resume(pm_message_t state) struct device *dev; ktime_t starttime = ktime_get(); + trace_suspend_resume(TPS("dpm_resume"), state.event, true); might_sleep(); mutex_lock(&dpm_list_mtx); @@ -875,6 +880,7 @@ void dpm_resume(pm_message_t state) dpm_show_time(starttime, state, NULL); cpufreq_resume(); + trace_suspend_resume(TPS("dpm_resume"), state.event, false); } /** @@ -932,6 +938,7 @@ void dpm_complete(pm_message_t state) { struct list_head list; + trace_suspend_resume(TPS("dpm_complete"), state.event, true); might_sleep(); INIT_LIST_HEAD(&list); @@ -951,6 +958,7 @@ void dpm_complete(pm_message_t state) } list_splice(&list, &dpm_list); mutex_unlock(&dpm_list_mtx); + trace_suspend_resume(TPS("dpm_complete"), state.event, false); } /** @@ -1086,6 +1094,7 @@ static int dpm_suspend_noirq(pm_message_t state) ktime_t starttime = ktime_get(); int error = 0; + trace_suspend_resume(TPS("dpm_suspend_noirq"), state.event, true); cpuidle_pause(); suspend_device_irqs(); mutex_lock(&dpm_list_mtx); @@ -1126,6 +1135,7 @@ static int dpm_suspend_noirq(pm_message_t state) } else { dpm_show_time(starttime, state, "noirq"); } + trace_suspend_resume(TPS("dpm_suspend_noirq"), state.event, false); return error; } @@ -1222,6 +1232,7 
@@ static int dpm_suspend_late(pm_message_t state) ktime_t starttime = ktime_get(); int error = 0; + trace_suspend_resume(TPS("dpm_suspend_late"), state.event, true); mutex_lock(&dpm_list_mtx); pm_transition = state; async_error = 0; @@ -1257,6 +1268,7 @@ static int dpm_suspend_late(pm_message_t state) } else { dpm_show_time(starttime, state, "late"); } + trace_suspend_resume(TPS("dpm_suspend_late"), state.event, false); return error; } @@ -1461,6 +1473,7 @@ int dpm_suspend(pm_message_t state) ktime_t starttime = ktime_get(); int error = 0; + trace_suspend_resume(TPS("dpm_suspend"), state.event, true); might_sleep(); cpufreq_suspend(); @@ -1498,6 +1511,7 @@ int dpm_suspend(pm_message_t state) dpm_save_failed_step(SUSPEND_SUSPEND); } else dpm_show_time(starttime, state, NULL); + trace_suspend_resume(TPS("dpm_suspend"), state.event, false); return error; } @@ -1582,6 +1596,7 @@ int dpm_prepare(pm_message_t state) { int error = 0; + trace_suspend_resume(TPS("dpm_prepare"), state.event, true); might_sleep(); mutex_lock(&dpm_list_mtx); @@ -1612,6 +1627,7 @@ int dpm_prepare(pm_message_t state) put_device(dev); } mutex_unlock(&dpm_list_mtx); + trace_suspend_resume(TPS("dpm_prepare"), state.event, false); return error; } diff --git a/drivers/base/syscore.c b/drivers/base/syscore.c index e8d11b6630ee..dbb8350ea8dc 100644 --- a/drivers/base/syscore.c +++ b/drivers/base/syscore.c @@ -10,6 +10,7 @@ #include #include #include +#include static LIST_HEAD(syscore_ops_list); static DEFINE_MUTEX(syscore_ops_lock); @@ -49,6 +50,7 @@ int syscore_suspend(void) struct syscore_ops *ops; int ret = 0; + trace_suspend_resume(TPS("syscore_suspend"), 0, true); pr_debug("Checking wakeup interrupts\n"); /* Return error code if there are any wakeup interrupts pending. */ @@ -70,6 +72,7 @@ int syscore_suspend(void) "Interrupts enabled after %pF\n", ops->suspend); } + trace_suspend_resume(TPS("syscore_suspend"), 0, false); return 0; err_out: @@ -92,6 +95,7 @@ void syscore_resume(void) { struct syscore_ops *ops; + trace_suspend_resume(TPS("syscore_resume"), 0, true); WARN_ONCE(!irqs_disabled(), "Interrupts enabled before system core resume.\n"); @@ -103,6 +107,7 @@ void syscore_resume(void) WARN_ONCE(!irqs_disabled(), "Interrupts enabled after %pF\n", ops->resume); } + trace_suspend_resume(TPS("syscore_resume"), 0, false); } EXPORT_SYMBOL_GPL(syscore_resume); #endif /* CONFIG_PM_SLEEP */ diff --git a/include/trace/events/power.h b/include/trace/events/power.h index 9a7e08d61258..f88c8573e66c 100644 --- a/include/trace/events/power.h +++ b/include/trace/events/power.h @@ -7,6 +7,9 @@ #include #include #include +#include + +#define TPS(x) tracepoint_string(x) DECLARE_EVENT_CLASS(cpu, @@ -97,23 +100,6 @@ DEFINE_EVENT(cpu, cpu_frequency, TP_ARGS(frequency, cpu_id) ); -TRACE_EVENT(machine_suspend, - - TP_PROTO(unsigned int state), - - TP_ARGS(state), - - TP_STRUCT__entry( - __field( u32, state ) - ), - - TP_fast_assign( - __entry->state = state; - ), - - TP_printk("state=%lu", (unsigned long)__entry->state) -); - TRACE_EVENT(device_pm_report_time, TP_PROTO(struct device *dev, const char *pm_ops, s64 ops_time, @@ -151,6 +137,28 @@ TRACE_EVENT(device_pm_report_time, __entry->ops_time, __entry->error) ); +TRACE_EVENT(suspend_resume, + + TP_PROTO(const char *action, int val, bool start), + + TP_ARGS(action, val, start), + + TP_STRUCT__entry( + __field(const char *, action) + __field(int, val) + __field(bool, start) + ), + + TP_fast_assign( + __entry->action = action; + __entry->val = val; + __entry->start = start; + ), + + 
TP_printk("%s[%u] %s", __entry->action, (unsigned int)__entry->val, + (__entry->start)?"begin":"end") +); + DECLARE_EVENT_CLASS(wakeup_source, TP_PROTO(const char *name, unsigned int state), diff --git a/kernel/cpu.c b/kernel/cpu.c index a9e710eef0e2..76deba01322a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "smpboot.h" @@ -522,7 +523,9 @@ int disable_nonboot_cpus(void) for_each_online_cpu(cpu) { if (cpu == first_cpu) continue; + trace_suspend_resume(TPS("CPU_OFF"), cpu, true); error = _cpu_down(cpu, 1); + trace_suspend_resume(TPS("CPU_OFF"), cpu, false); if (!error) cpumask_set_cpu(cpu, frozen_cpus); else { @@ -566,7 +569,9 @@ void __ref enable_nonboot_cpus(void) arch_enable_nonboot_cpus_begin(); for_each_cpu(cpu, frozen_cpus) { + trace_suspend_resume(TPS("CPU_ON"), cpu, true); error = _cpu_up(cpu, 1); + trace_suspend_resume(TPS("CPU_ON"), cpu, false); if (!error) { printk(KERN_INFO "CPU%d is up\n", cpu); continue; diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index df88d55dc436..49e0a20fd010 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "power.h" @@ -292,7 +293,9 @@ static int create_image(int platform_mode) in_suspend = 1; save_processor_state(); + trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true); error = swsusp_arch_suspend(); + trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false); if (error) printk(KERN_ERR "PM: Error %d creating hibernation image\n", error); diff --git a/kernel/power/process.c b/kernel/power/process.c index 06ec8869dbf1..0ca8d83e2369 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -17,6 +17,7 @@ #include #include #include +#include /* * Timeout for stopping processes @@ -175,6 +176,7 @@ void thaw_processes(void) struct task_struct *g, *p; struct task_struct *curr = current; + trace_suspend_resume(TPS("thaw_processes"), 0, true); if (pm_freezing) atomic_dec(&system_freezing_cnt); pm_freezing = false; @@ -201,6 +203,7 @@ void thaw_processes(void) schedule(); printk("done.\n"); + trace_suspend_resume(TPS("thaw_processes"), 0, false); } void thaw_kernel_threads(void) diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 963e6d0f050b..4dd8822f732a 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -177,7 +177,9 @@ static int suspend_prepare(suspend_state_t state) if (error) goto Finish; + trace_suspend_resume(TPS("freeze_processes"), 0, true); error = suspend_freeze_processes(); + trace_suspend_resume(TPS("freeze_processes"), 0, false); if (!error) return 0; @@ -240,7 +242,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) * all the devices are suspended. 
*/ if (state == PM_SUSPEND_FREEZE) { + trace_suspend_resume(TPS("machine_suspend"), state, true); freeze_enter(); + trace_suspend_resume(TPS("machine_suspend"), state, false); goto Platform_wake; } @@ -256,7 +260,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) if (!error) { *wakeup = pm_wakeup_pending(); if (!(suspend_test(TEST_CORE) || *wakeup)) { + trace_suspend_resume(TPS("machine_suspend"), + state, true); error = suspend_ops->enter(state); + trace_suspend_resume(TPS("machine_suspend"), + state, false); events_check_enabled = false; } syscore_resume(); @@ -294,7 +302,6 @@ int suspend_devices_and_enter(suspend_state_t state) if (need_suspend_ops(state) && !suspend_ops) return -ENOSYS; - trace_machine_suspend(state); if (need_suspend_ops(state) && suspend_ops->begin) { error = suspend_ops->begin(state); if (error) @@ -331,7 +338,6 @@ int suspend_devices_and_enter(suspend_state_t state) else if (state == PM_SUSPEND_FREEZE && freeze_ops->end) freeze_ops->end(); - trace_machine_suspend(PWR_EVENT_EXIT); return error; Recover_platform: @@ -365,6 +371,7 @@ static int enter_state(suspend_state_t state) { int error; + trace_suspend_resume(TPS("suspend_enter"), state, true); if (state == PM_SUSPEND_FREEZE) { #ifdef CONFIG_PM_DEBUG if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) { @@ -382,9 +389,11 @@ static int enter_state(suspend_state_t state) if (state == PM_SUSPEND_FREEZE) freeze_begin(); + trace_suspend_resume(TPS("sync_filesystems"), 0, true); printk(KERN_INFO "PM: Syncing filesystems ... "); sys_sync(); printk("done.\n"); + trace_suspend_resume(TPS("sync_filesystems"), 0, false); pr_debug("PM: Preparing system for %s sleep\n", pm_states[state].label); error = suspend_prepare(state); @@ -394,6 +403,7 @@ static int enter_state(suspend_state_t state) if (suspend_test(TEST_FREEZER)) goto Finish; + trace_suspend_resume(TPS("suspend_enter"), state, false); pr_debug("PM: Entering %s sleep\n", pm_states[state].label); pm_restrict_gfp_mask(); error = suspend_devices_and_enter(state); -- cgit From 4e52365f279564cef0ddd41db5237f0471381093 Mon Sep 17 00:00:00 2001 From: Matthew Dempsky Date: Fri, 6 Jun 2014 14:36:42 -0700 Subject: ptrace: fix fork event messages across pid namespaces When tracing a process in another pid namespace, it's important for fork event messages to contain the child's pid as seen from the tracer's pid namespace, not the parent's. Otherwise, the tracer won't be able to correlate the fork event with later SIGTRAP signals it receives from the child. We still risk a race condition if a ptracer from a different pid namespace attaches after we compute the pid_t value. However, sending a bogus fork event message in this unlikely scenario is still a vast improvement over the status quo where we always send bogus fork event messages to debuggers in a different pid namespace than the forking process. Signed-off-by: Matthew Dempsky Acked-by: Oleg Nesterov Cc: Kees Cook Cc: Julien Tinnes Cc: Roland McGrath Cc: Jan Kratochvil Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ptrace.h | 32 ++++++++++++++++++++++++++++++++ kernel/fork.c | 10 +++++++--- 2 files changed, 39 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 07d0df6bf768..077904c8b70d 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -5,6 +5,7 @@ #include /* For struct task_struct. */ #include /* for IS_ERR_VALUE */ #include /* For BUG_ON. */ +#include /* For task_active_pid_ns. 
*/ #include /* @@ -128,6 +129,37 @@ static inline void ptrace_event(int event, unsigned long message) } } +/** + * ptrace_event_pid - possibly stop for a ptrace event notification + * @event: %PTRACE_EVENT_* value to report + * @pid: process identifier for %PTRACE_GETEVENTMSG to return + * + * Check whether @event is enabled and, if so, report @event and @pid + * to the ptrace parent. @pid is reported as the pid_t seen from the + * ptrace parent's pid namespace. + * + * Called without locks. + */ +static inline void ptrace_event_pid(int event, struct pid *pid) +{ + /* + * FIXME: There's a potential race if a ptracer in a different pid + * namespace than parent attaches between computing message below and + * when we acquire tasklist_lock in ptrace_stop(). If this happens, + * the ptracer will get a bogus pid from PTRACE_GETEVENTMSG. + */ + unsigned long message = 0; + struct pid_namespace *ns; + + rcu_read_lock(); + ns = task_active_pid_ns(rcu_dereference(current->parent)); + if (ns) + message = pid_nr_ns(pid, ns); + rcu_read_unlock(); + + ptrace_event(event, message); +} + /** * ptrace_init_task - initialize ptrace state for a new child * @child: new child task diff --git a/kernel/fork.c b/kernel/fork.c index 0d53eb0dfb6f..d2799d1fc952 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1606,10 +1606,12 @@ long do_fork(unsigned long clone_flags, */ if (!IS_ERR(p)) { struct completion vfork; + struct pid *pid; trace_sched_process_fork(current, p); - nr = task_pid_vnr(p); + pid = get_task_pid(p, PIDTYPE_PID); + nr = pid_vnr(pid); if (clone_flags & CLONE_PARENT_SETTID) put_user(nr, parent_tidptr); @@ -1624,12 +1626,14 @@ long do_fork(unsigned long clone_flags, /* forking complete and child started to run, tell ptracer */ if (unlikely(trace)) - ptrace_event(trace, nr); + ptrace_event_pid(trace, pid); if (clone_flags & CLONE_VFORK) { if (!wait_for_vfork_done(p, &vfork)) - ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); + ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid); } + + put_pid(pid); } else { nr = PTR_ERR(p); } -- cgit From 650226bd95817d727c9a388752d72ef292c4a668 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Jun 2014 14:36:44 -0700 Subject: ptrace: task_clear_jobctl_trapping()->wake_up_bit() needs mb() __wake_up_bit() checks waitqueue_active() and thus the caller needs mb(), as wake_up_bit() documents; fix task_clear_jobctl_trapping(). Signed-off-by: Oleg Nesterov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 6e600aaa2af4..9b5453ee5f5e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -277,6 +277,7 @@ void task_clear_jobctl_trapping(struct task_struct *task) { if (unlikely(task->jobctl & JOBCTL_TRAPPING)) { task->jobctl &= ~JOBCTL_TRAPPING; + smp_mb(); /* advised by wake_up_bit() */ wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT); } } -- cgit From 6114041aa7e79a28cee554450fc4a67442327c7f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Jun 2014 14:36:46 -0700 Subject: signals: s/siginitset/sigemptyset/ in do_sigtimedwait() Cosmetic, but siginitset(0) looks a bit strange; sigemptyset() is what do_sigtimedwait() needs.
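For reference, a condensed sketch of why the two calls are interchangeable here, paraphrased from the helpers in include/linux/signal.h (not the verbatim source):

	/* siginitset(&set, 0): word 0 takes the mask (here 0), the rest are zeroed */
	set->sig[0] = 0;
	memset(&set->sig[1], 0, sizeof(long) * (_NSIG_WORDS - 1));

	/* sigemptyset(&set): every word is zeroed */
	memset(set, 0, sizeof(sigset_t));

With a mask of 0 both end up writing the same bytes; sigemptyset() simply states the intent directly.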
Signed-off-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Al Viro Cc: David Woodhouse Cc: Frederic Weisbecker Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: Mathieu Desnoyers Cc: Richard Weinberger Cc: Steven Rostedt Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 9b5453ee5f5e..251ca9c9d666 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2855,7 +2855,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info, spin_lock_irq(&tsk->sighand->siglock); __set_task_blocked(tsk, &tsk->real_blocked); - siginitset(&tsk->real_blocked, 0); + sigemptyset(&tsk->real_blocked); sig = dequeue_signal(tsk, &mask, info); } spin_unlock_irq(&tsk->sighand->siglock); -- cgit From 9490592f2762320ac220f50d4459d9ea7bb797c8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Jun 2014 14:36:48 -0700 Subject: signals: kill rm_from_queue(), change prepare_signal() to use for_each_thread() rm_from_queue() doesn't make sense. The only caller, prepare_signal(), can use rm_from_queue_full() with the same effect. While at it, change prepare_signal() to use for_each_thread() instead of do/while_each_thread. Signed-off-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Al Viro Cc: David Woodhouse Cc: Frederic Weisbecker Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: Mathieu Desnoyers Cc: Richard Weinberger Cc: Steven Rostedt Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 43 ++++++++++--------------------------------- 1 file changed, 10 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 251ca9c9d666..19316dc3eb0b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -728,29 +728,6 @@ static int rm_from_queue_full(sigset_t *mask, struct sigpending *s) } return 1; } -/* - * Remove signals in mask from the pending set and queue. - * Returns 1 if any signals were found. - * - * All callers must be holding the siglock. - */ -static int rm_from_queue(unsigned long mask, struct sigpending *s) -{ - struct sigqueue *q, *n; - - if (!sigtestsetmask(&s->signal, mask)) - return 0; - - sigdelsetmask(&s->signal, mask); - list_for_each_entry_safe(q, n, &s->list, list) { - if (q->info.si_signo < SIGRTMIN && - (mask & sigmask(q->info.si_signo))) { - list_del_init(&q->list); - __sigqueue_free(q); - } - } - return 1; -} static inline int is_si_special(const struct siginfo *info) { @@ -862,6 +839,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) { struct signal_struct *signal = p->signal; struct task_struct *t; + sigset_t flush; if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) { if (signal->flags & SIGNAL_GROUP_COREDUMP) @@ -873,26 +851,25 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) /* * This is a stop signal. Remove SIGCONT from all queues. */ - rm_from_queue(sigmask(SIGCONT), &signal->shared_pending); - t = p; - do { - rm_from_queue(sigmask(SIGCONT), &t->pending); - } while_each_thread(p, t); + siginitset(&flush, sigmask(SIGCONT)); + rm_from_queue_full(&flush, &signal->shared_pending); + for_each_thread(p, t) + rm_from_queue_full(&flush, &t->pending); } else if (sig == SIGCONT) { unsigned int why; /* * Remove all stop signals from all queues, wake all threads. 
*/ - rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); - t = p; - do { + siginitset(&flush, SIG_KERNEL_STOP_MASK); + rm_from_queue_full(&flush, &signal->shared_pending); + for_each_thread(p, t) { + rm_from_queue_full(&flush, &t->pending); task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); - rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); if (likely(!(t->ptrace & PT_SEIZED))) wake_up_state(t, __TASK_STOPPED); else ptrace_trap_notify(t); - } while_each_thread(p, t); + } /* * Notify the parent with CLD_CONTINUED if we were stopped. -- cgit From c09c144139f227de13330111adcafa1659ebd008 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Jun 2014 14:36:50 -0700 Subject: signals: rename rm_from_queue_full() to flush_sigqueue_mask() "rm_from_queue_full" looks ugly and misleading, especially now that rm_from_queue() has gone away. Rename it to flush_sigqueue_mask(), this matches flush_sigqueue() we already have. Also remove the obsolete comment which explains the difference with rm_from_queue() we already killed. Signed-off-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Al Viro Cc: David Woodhouse Cc: Frederic Weisbecker Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: Mathieu Desnoyers Cc: Richard Weinberger Cc: Steven Rostedt Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 19316dc3eb0b..c088011f873c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -706,11 +706,8 @@ void signal_wake_up_state(struct task_struct *t, unsigned int state) * Returns 1 if any signals were found. * * All callers must be holding the siglock. - * - * This version takes a sigset mask and looks at all signals, - * not just those in the first mask word. */ -static int rm_from_queue_full(sigset_t *mask, struct sigpending *s) +static int flush_sigqueue_mask(sigset_t *mask, struct sigpending *s) { struct sigqueue *q, *n; sigset_t m; @@ -852,18 +849,18 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) * This is a stop signal. Remove SIGCONT from all queues. */ siginitset(&flush, sigmask(SIGCONT)); - rm_from_queue_full(&flush, &signal->shared_pending); + flush_sigqueue_mask(&flush, &signal->shared_pending); for_each_thread(p, t) - rm_from_queue_full(&flush, &t->pending); + flush_sigqueue_mask(&flush, &t->pending); } else if (sig == SIGCONT) { unsigned int why; /* * Remove all stop signals from all queues, wake all threads. 
*/ siginitset(&flush, SIG_KERNEL_STOP_MASK); - rm_from_queue_full(&flush, &signal->shared_pending); + flush_sigqueue_mask(&flush, &signal->shared_pending); for_each_thread(p, t) { - rm_from_queue_full(&flush, &t->pending); + flush_sigqueue_mask(&flush, &t->pending); task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); if (likely(!(t->ptrace & PT_SEIZED))) wake_up_state(t, __TASK_STOPPED); @@ -3102,9 +3099,9 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) if (sig_handler_ignored(sig_handler(t, sig), sig)) { sigemptyset(&mask); sigaddset(&mask, sig); - rm_from_queue_full(&mask, &t->signal->shared_pending); + flush_sigqueue_mask(&mask, &t->signal->shared_pending); do { - rm_from_queue_full(&mask, &t->pending); + flush_sigqueue_mask(&mask, &t->pending); } while_each_thread(current, t); } } @@ -3113,7 +3110,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) return 0; } -static int +static int do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) { stack_t oss; -- cgit From afe2b0386ac22b6a189a2067b25282cade3fbb4d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Jun 2014 14:36:51 -0700 Subject: signals: cleanup the usage of t/current in do_sigaction() The usage of "task_struct *t" and "current" in do_sigaction() looks really annoying and chaotic. Initially "t" is used as a cached value of current but not consistently, then it is reused as a loop variable and we have to use "current" again. Clean up this mess and also convert the code to use for_each_thread(). Signed-off-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Al Viro Cc: David Woodhouse Cc: Frederic Weisbecker Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: Mathieu Desnoyers Cc: Richard Weinberger Cc: Steven Rostedt Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index c088011f873c..a6d8c3af0ad6 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3068,16 +3068,16 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo, int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) { - struct task_struct *t = current; + struct task_struct *p = current, *t; struct k_sigaction *k; sigset_t mask; if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) return -EINVAL; - k = &t->sighand->action[sig-1]; + k = &p->sighand->action[sig-1]; - spin_lock_irq(&current->sighand->siglock); + spin_lock_irq(&p->sighand->siglock); if (oact) *oact = *k; @@ -3096,17 +3096,16 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) * (for example, SIGCHLD), shall cause the pending signal to * be discarded, whether or not it is blocked" */ - if (sig_handler_ignored(sig_handler(t, sig), sig)) { + if (sig_handler_ignored(sig_handler(p, sig), sig)) { sigemptyset(&mask); sigaddset(&mask, sig); - flush_sigqueue_mask(&mask, &t->signal->shared_pending); - do { + flush_sigqueue_mask(&mask, &p->signal->shared_pending); + for_each_thread(p, t) flush_sigqueue_mask(&mask, &t->pending); - } while_each_thread(current, t); } } - spin_unlock_irq(&current->sighand->siglock); + spin_unlock_irq(&p->sighand->siglock); return 0; } -- cgit From 0341729b4b832e753c5e745c6ba0e797f6198be0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Jun 2014 14:36:53 -0700 Subject: signals: mv {dis,}allow_signal() from sched.h/exit.c to signal.[ch] Move the declaration/definition of
allow_signal/disallow_signal to signal.h/signal.c. The new place is more logical and allows us to use the static helpers in signal.c (see the next changes). While at it, make them return void and remove the valid_signal() check. Nobody checks the returned value, and in-kernel users must not pass the wrong signal number. Signed-off-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Al Viro Cc: David Woodhouse Cc: Frederic Weisbecker Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: Mathieu Desnoyers Cc: Richard Weinberger Cc: Steven Rostedt Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 3 --- include/linux/signal.h | 2 ++ kernel/exit.c | 39 --------------------------------------- kernel/signal.c | 29 +++++++++++++++++++++++++++++ 4 files changed, 31 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8fcd0e6098d9..ea74596014a2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2414,9 +2414,6 @@ extern void flush_itimer_signals(void); extern void do_group_exit(int); -extern int allow_signal(int); -extern int disallow_signal(int); - extern int do_execve(struct filename *, const char __user * const __user *, const char __user * const __user *); diff --git a/include/linux/signal.h b/include/linux/signal.h index ae744c314630..ac83c593f4b9 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -284,6 +284,8 @@ extern int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, extern void signal_setup_done(int failed, struct ksignal *ksig, int stepping); extern void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs *regs, int stepping); extern void exit_signals(struct task_struct *tsk); +extern void allow_signal(int); +extern void disallow_signal(int); /* * Eventually that'll replace get_signal_to_deliver(); macro for now, diff --git a/kernel/exit.c b/kernel/exit.c index 750c2e594617..e5c4668f1799 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -313,45 +313,6 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) } } -/* - * Let kernel threads use this to say that they allow a certain signal. - * Must not be used if kthread was cloned with CLONE_SIGHAND. - */ -int allow_signal(int sig) -{ - if (!valid_signal(sig) || sig < 1) - return -EINVAL; - - spin_lock_irq(&current->sighand->siglock); - /* This is only needed for daemonize()'ed kthreads */ - sigdelset(&current->blocked, sig); - /* - * Kernel threads handle their own signals. Let the signal code - * know it'll be handled, so that they don't get converted to - * SIGKILL or just silently dropped. - */ - current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2; - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); - return 0; -} - -EXPORT_SYMBOL(allow_signal); - -int disallow_signal(int sig) -{ - if (!valid_signal(sig) || sig < 1) - return -EINVAL; - - spin_lock_irq(&current->sighand->siglock); - current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN; - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); - return 0; -} - -EXPORT_SYMBOL(disallow_signal); - #ifdef CONFIG_MEMCG /* * A task is exiting. If it owned this mm, find a new owner for the mm. diff --git a/kernel/signal.c b/kernel/signal.c index a6d8c3af0ad6..7d6ff8b18509 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3066,6 +3066,35 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo, } #endif +/* + * Let kernel threads use this to say that they allow a certain signal.
+ * Must not be used if kthread was cloned with CLONE_SIGHAND. + */ +void allow_signal(int sig) +{ + spin_lock_irq(&current->sighand->siglock); + /* This is only needed for daemonize()'ed kthreads */ + sigdelset(&current->blocked, sig); + /* + * Kernel threads handle their own signals. Let the signal code + * know it'll be handled, so that they don't get converted to + * SIGKILL or just silently dropped. + */ + current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2; + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); +} +EXPORT_SYMBOL(allow_signal); + +void disallow_signal(int sig) +{ + spin_lock_irq(&current->sighand->siglock); + current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN; + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); +} +EXPORT_SYMBOL(disallow_signal); + int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) { struct task_struct *p = current, *t; -- cgit From ec5955b8fdc1b0bc62495e13869769be732cc6c3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Jun 2014 14:36:57 -0700 Subject: signals: kill the obsolete sigdelset() and recalc_sigpending() in allow_signal() allow_signal() does sigdelset(current->blocked) for historic reasons: previously it could be called by a daemonize()'ed kthread, and daemonize() played with current->blocked. Now that daemonize() has gone away we can remove sigdelset() and recalc_sigpending(). If a user really wants to unblock a signal, it must use sigprocmask() or set_current_blocked() explicitly. Signed-off-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Al Viro Cc: David Woodhouse Cc: Frederic Weisbecker Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: Mathieu Desnoyers Cc: Richard Weinberger Cc: Steven Rostedt Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 7d6ff8b18509..c64c891ca0a6 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3072,16 +3072,13 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo, */ void allow_signal(int sig) { - spin_lock_irq(&current->sighand->siglock); - /* This is only needed for daemonize()'ed kthreads */ - sigdelset(&current->blocked, sig); /* * Kernel threads handle their own signals. Let the signal code * know it'll be handled, so that they don't get converted to * SIGKILL or just silently dropped. */ + spin_lock_irq(&current->sighand->siglock); current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2; - recalc_sigpending(); spin_unlock_irq(&current->sighand->siglock); } EXPORT_SYMBOL(allow_signal); -- cgit From 580d34e42ad621736a3c53c9c11a2152c18a4068 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Jun 2014 14:36:58 -0700 Subject: signals: disallow_signal() should flush the potentially pending signal disallow_signal() simply sets SIG_IGN; this is not enough, and recalc_sigpending() is simply pointless because it can never change the state of TIF_SIGPENDING. If we ignore a signal, we also need to do flush_sigqueue_mask() for the case when this signal is pending; this way recalc_sigpending() can actually clear TIF_SIGPENDING and we do not "leak" the allocated siginfo's.
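As an illustration of where this matters, a minimal kthread sketch (the thread body and the choice of SIGTERM are hypothetical, not taken from this patch):

	static int my_worker(void *data)	/* hypothetical kthread */
	{
		allow_signal(SIGTERM);		/* handle SIGTERM ourselves */
		while (!kthread_should_stop()) {
			if (signal_pending(current))
				break;		/* someone sent us SIGTERM */
			schedule_timeout_interruptible(HZ);
		}
		disallow_signal(SIGTERM);	/* now also drops a queued SIGTERM */
		return 0;
	}

Without the flush, a SIGTERM queued just before disallow_signal() would leave TIF_SIGPENDING set and its siginfo allocated even though the signal is now ignored.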
Signed-off-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Al Viro Cc: David Woodhouse Cc: Frederic Weisbecker Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: Mathieu Desnoyers Cc: Richard Weinberger Cc: Steven Rostedt Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index c64c891ca0a6..3ec405132c79 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3085,8 +3085,15 @@ EXPORT_SYMBOL(allow_signal); void disallow_signal(int sig) { + sigset_t mask; + + sigemptyset(&mask); + sigaddset(&mask, sig); + spin_lock_irq(&current->sighand->siglock); current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN; + flush_sigqueue_mask(&mask, &current->signal->shared_pending); + flush_sigqueue_mask(&mask, &current->pending); recalc_sigpending(); spin_unlock_irq(&current->sighand->siglock); } -- cgit From b4e74264eb0b03f42097fa70a0766312156244a0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Jun 2014 14:37:00 -0700 Subject: signals: introduce kernel_sigaction() Now that allow_signal() is really trivial we can unify it with disallow_signal(). Add the new helper, kernel_sigaction(), and reimplement allow_signal/disallow_signal as trivial wrappers. This saves one EXPORT_SYMBOL() and the new helper can have more users. Signed-off-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Al Viro Cc: David Woodhouse Cc: Frederic Weisbecker Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: Mathieu Desnoyers Cc: Richard Weinberger Cc: Steven Rostedt Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/signal.h | 18 ++++++++++++++++-- kernel/signal.c | 36 ++++++++++++------------------------ 2 files changed, 28 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/include/linux/signal.h b/include/linux/signal.h index ac83c593f4b9..c9e65360c49a 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -284,8 +284,22 @@ extern int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, extern void signal_setup_done(int failed, struct ksignal *ksig, int stepping); extern void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs *regs, int stepping); extern void exit_signals(struct task_struct *tsk); -extern void allow_signal(int); -extern void disallow_signal(int); +extern void kernel_sigaction(int, __sighandler_t); + +static inline void allow_signal(int sig) +{ + /* + * Kernel threads handle their own signals. Let the signal code + * know it'll be handled, so that they don't get converted to + * SIGKILL or just silently dropped. + */ + kernel_sigaction(sig, (__force __sighandler_t)2); +} + +static inline void disallow_signal(int sig) +{ + kernel_sigaction(sig, SIG_IGN); +} /* * Eventually that'll replace get_signal_to_deliver(); macro for now, diff --git a/kernel/signal.c b/kernel/signal.c index 3ec405132c79..a4077e90f19f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3067,37 +3067,25 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo, #endif /* - * Let kernel threads use this to say that they allow a certain signal. - * Must not be used if kthread was cloned with CLONE_SIGHAND. + * For kthreads only, must not be used if cloned with CLONE_SIGHAND */ -void allow_signal(int sig) +void kernel_sigaction(int sig, __sighandler_t action) { - /* - * Kernel threads handle their own signals. Let the signal code - * know it'll be handled, so that they don't get converted to - * SIGKILL or just silently dropped.
- */ spin_lock_irq(&current->sighand->siglock); - current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2; - spin_unlock_irq(&current->sighand->siglock); -} -EXPORT_SYMBOL(allow_signal); + current->sighand->action[sig - 1].sa.sa_handler = action; + if (action == SIG_IGN) { + sigset_t mask; -void disallow_signal(int sig) -{ - sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, sig); - sigemptyset(&mask); - sigaddset(&mask, sig); - - spin_lock_irq(&current->sighand->siglock); - current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN; - flush_sigqueue_mask(&mask, &current->signal->shared_pending); - flush_sigqueue_mask(&mask, &current->pending); - recalc_sigpending(); + flush_sigqueue_mask(&mask, &current->signal->shared_pending); + flush_sigqueue_mask(&mask, &current->pending); + recalc_sigpending(); + } spin_unlock_irq(&current->sighand->siglock); } -EXPORT_SYMBOL(disallow_signal); +EXPORT_SYMBOL(kernel_sigaction); int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) { -- cgit From 76e0a6f40b198f08fca69221c3d0112d1e9713a7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 6 Jun 2014 14:37:02 -0700 Subject: signals: change wait_for_helper() to use kernel_sigaction() Now that we have kernel_sigaction() we can change wait_for_helper() to use it and clean up the code a bit. Signed-off-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Al Viro Cc: David Woodhouse Cc: Frederic Weisbecker Cc: Geert Uytterhoeven Cc: Ingo Molnar Cc: Mathieu Desnoyers Cc: Richard Weinberger Cc: Steven Rostedt Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 0ac67a5861c5..8637e041a247 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -285,10 +285,7 @@ static int wait_for_helper(void *data) pid_t pid; /* If SIGCLD is ignored sys_wait4 won't populate the status. */ - spin_lock_irq(&current->sighand->siglock); - current->sighand->action[SIGCHLD-1].sa.sa_handler = SIG_DFL; - spin_unlock_irq(&current->sighand->siglock); - + kernel_sigaction(SIGCHLD, SIG_DFL); pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); if (pid < 0) { sub_info->retval = pid; -- cgit From a219ccf4637396a2392bfbec7c12acbfe2b06b46 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Fri, 6 Jun 2014 14:37:05 -0700 Subject: smp: print more useful debug info upon receiving IPI on an offline CPU There is a longstanding problem related to CPU hotplug which causes IPIs to be delivered to offline CPUs, and the smp-call-function IPI handler code prints out a warning whenever this is detected. Every once in a while this (usually harmless) warning gets reported on LKML, but so far it has not been completely fixed. Usually the solution involves finding out the IPI sender and fixing it by adding appropriate synchronization with CPU hotplug. However, while going through one such internal bug report, I found that there is a significant bug in the receiver side itself (more specifically, in stop-machine) that can lead to this problem even when the sender code is perfectly fine. This patchset fixes that synchronization problem in the CPU hotplug stop-machine code. Patch 1 adds some additional debug code to the smp-call-function framework, to help debug such issues easily. Patch 2 modifies the stop-machine code to ensure that any IPIs that were sent while the target CPU was online would be noticed and handled by that CPU without fail before it goes offline.
Thus, this avoids scenarios where IPIs are received on offline CPUs (as long as the sender uses proper hotplug synchronization). In fact, I debugged the problem by using Patch 1, and found that the payload of the IPI was always the block layer's trigger_softirq() function. But I was not able to find anything wrong with the block layer code. That's when I started looking at the stop-machine code and realized that there is a race-window which makes the IPI _receiver_ the culprit, not the sender. Patch 2 fixes that race and hence this should put an end to most of the hard-to-debug IPI-to-offline-CPU issues. This patch (of 2): Today the smp-call-function code just prints a warning if we get an IPI on an offline CPU. This info is sufficient to let us know that something went wrong, but often it is very hard to debug exactly who sent the IPI and why, from this info alone. In most cases, we get the warning about the IPI to an offline CPU, immediately after the CPU going offline comes out of the stop-machine phase and reenables interrupts. Since all online CPUs participate in stop-machine, the information regarding the sender of the IPI is already lost by the time we exit the stop-machine loop. So even if we dump the stack on each CPU at this point, we won't find anything useful since all of them will show the stack-trace of the stopper thread. So we need a better way to figure out who sent the IPI and why. To achieve this, when we detect an IPI targeted to an offline CPU, loop through the call-single-data linked list and print out the payload (i.e., the name of the function which was supposed to be executed by the target CPU). This would give us an insight as to who might have sent the IPI and help us debug this further. [akpm@linux-foundation.org: correctly suppress warning output on second and later occurrences] Signed-off-by: Srivatsa S. Bhat Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Tejun Heo Cc: Rusty Russell Cc: Frederic Weisbecker Cc: Christoph Hellwig Cc: Mel Gorman Cc: Rik van Riel Cc: Borislav Petkov Cc: Steven Rostedt Cc: Mike Galbraith Cc: Gautham R Shenoy Cc: "Paul E. McKenney" Cc: Oleg Nesterov Cc: Rafael J. Wysocki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smp.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 06d574e42c72..306f8180b0d5 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -185,14 +185,26 @@ void generic_smp_call_function_single_interrupt(void) { struct llist_node *entry; struct call_single_data *csd, *csd_next; + static bool warned; + + entry = llist_del_all(&__get_cpu_var(call_single_queue)); + entry = llist_reverse_order(entry); /* * Shouldn't receive this interrupt on a cpu that is not yet online. */ - WARN_ON_ONCE(!cpu_online(smp_processor_id())); + if (unlikely(!cpu_online(smp_processor_id()) && !warned)) { + warned = true; + WARN(1, "IPI on offline CPU %d\n", smp_processor_id()); - entry = llist_del_all(&__get_cpu_var(call_single_queue)); - entry = llist_reverse_order(entry); + /* + * We don't have to use the _safe() variant here + * because we are not invoking the IPI handlers yet. 
+ */ + llist_for_each_entry(csd, entry, llist) + pr_warn("IPI callback %pS sent to offline CPU\n", + csd->func); + } llist_for_each_entry_safe(csd, csd_next, entry, llist) { csd->func(csd->info); -- cgit From f06e5153f4ae2e2f3b0300f0e260e40cb7fefd45 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 6 Jun 2014 14:37:07 -0700 Subject: kernel/panic.c: add "crash_kexec_post_notifiers" option for kdump after panic_notifiers Add a "crash_kexec_post_notifiers" boot option to run kdump after running panic_notifiers and dumping kmsg. This can help in rare situations where kdump fails because of an unstable crashed kernel or a hardware failure (memory corruption on critical data/code), or when the 2nd kernel is already broken by the 1st kernel (it's a broken behavior, but who can guarantee that the "crashed" kernel works correctly?). Usage: add "crash_kexec_post_notifiers" to the kernel boot options. Note that this actually increases the risk of kdump failure. This option should be set only if you worry about the rare case of kdump failure rather than increasing the chance of success. Signed-off-by: Masami Hiramatsu Acked-by: Motohiro Kosaki Acked-by: Vivek Goyal Cc: Eric Biederman Cc: Yoshihiro YUNOMAE Cc: Satoru MORIYA Cc: Tomoki Sekiyama Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 8 ++++++++ kernel/panic.c | 23 +++++++++++++++++++++-- 2 files changed, 29 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 9973a7e2e0ac..b9f67781c577 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2361,6 +2361,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted. timeout < 0: reboot immediately Format: + crash_kexec_post_notifiers + Run kdump after running panic-notifiers and dumping + kmsg. This is only for the users who doubt kdump always + succeeds in any situation. + Note that this also increases risks of kdump failure, + because some panic notifiers can make the crashed + kernel more unstable. + parkbd.port= [HW] Parallel port number the keyboard adapter is connected to, default is 0. Format: diff --git a/kernel/panic.c b/kernel/panic.c index d02fa9fef46a..62e16cef9cc2 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -32,6 +32,7 @@ static unsigned long tainted_mask; static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); +static bool crash_kexec_post_notifiers; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); @@ -112,9 +113,11 @@ void panic(const char *fmt, ...) /* * If we have crashed and we have a crash kernel loaded let it handle * everything else. - * Do we want to call this before we try to display a message? + * If we want to run this after calling panic_notifiers, pass + * the "crash_kexec_post_notifiers" option to the kernel. */ - crash_kexec(NULL); + if (!crash_kexec_post_notifiers) + crash_kexec(NULL); /* * Note smp_send_stop is the usual smp shutdown function, which @@ -131,6 +134,15 @@ void panic(const char *fmt, ...) kmsg_dump(KMSG_DUMP_PANIC); + /* + * If you doubt kdump always works fine in any situation, + * "crash_kexec_post_notifiers" offers you a chance to run + * panic_notifiers and dump kmsg before kdump. + * Note: since some panic_notifiers can make the crashed kernel + * more unstable, it can increase risks of the kdump failure too.
+ */ + crash_kexec(NULL); + bust_spinlocks(0); if (!panic_blink) @@ -472,6 +484,13 @@ EXPORT_SYMBOL(__stack_chk_fail); core_param(panic, panic_timeout, int, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); +static int __init setup_crash_kexec_post_notifiers(char *s) +{ + crash_kexec_post_notifiers = true; + return 0; +} +early_param("crash_kexec_post_notifiers", setup_crash_kexec_post_notifiers); + static int __init oops_setup(char *s) { if (!s) -- cgit From e1bebcf41ed0aa15f11cec186cbd5141730bcafc Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Fri, 6 Jun 2014 14:37:09 -0700 Subject: kernel/kexec.c: convert printk to pr_foo() + some pr_warning -> pr_warn and checkpatch warning fixes Signed-off-by: Fabian Frederick Cc: Eric Biederman Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 69 +++++++++++++++++++++++++++------------------------------- 1 file changed, 32 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 28c57069ef68..6748688813d0 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -125,8 +125,8 @@ static struct page *kimage_alloc_page(struct kimage *image, unsigned long dest); static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) + unsigned long nr_segments, + struct kexec_segment __user *segments) { size_t segment_bytes; struct kimage *image; @@ -257,13 +257,13 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, image->control_code_page = kimage_alloc_control_pages(image, get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { - printk(KERN_ERR "Could not allocate control_code_buffer\n"); + pr_err("Could not allocate control_code_buffer\n"); goto out_free; } image->swap_page = kimage_alloc_control_pages(image, 0); if (!image->swap_page) { - printk(KERN_ERR "Could not allocate swap buffer\n"); + pr_err("Could not allocate swap buffer\n"); goto out_free; } @@ -332,7 +332,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, image->control_code_page = kimage_alloc_control_pages(image, get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { - printk(KERN_ERR "Could not allocate control_code_buffer\n"); + pr_err("Could not allocate control_code_buffer\n"); goto out_free; } @@ -621,8 +621,8 @@ static void kimage_terminate(struct kimage *image) #define for_each_kimage_entry(image, ptr, entry) \ for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ - ptr = (entry & IND_INDIRECTION)? \ - phys_to_virt((entry & PAGE_MASK)): ptr +1) + ptr = (entry & IND_INDIRECTION) ? \ + phys_to_virt((entry & PAGE_MASK)) : ptr + 1) static void kimage_free_entry(kimage_entry_t entry) { @@ -650,8 +650,7 @@ static void kimage_free(struct kimage *image) * done with it. */ ind = entry; - } - else if (entry & IND_SOURCE) + } else if (entry & IND_SOURCE) kimage_free_entry(entry); } /* Free the final indirection page */ @@ -774,8 +773,7 @@ static struct page *kimage_alloc_page(struct kimage *image, addr = old_addr; page = old_page; break; - } - else { + } else { /* Place the page on the destination list I * will use it later. 
*/ @@ -1059,7 +1057,7 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, return -EINVAL; ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); - for (i=0; i < nr_segments; i++) { + for (i = 0; i < nr_segments; i++) { result = copy_from_user(&in, &segments[i], sizeof(in)); if (result) return -EFAULT; @@ -1214,14 +1212,14 @@ void crash_save_cpu(struct pt_regs *regs, int cpu) * squirrelled away. ELF notes happen to provide * all of that, so there is no need to invent something new. */ - buf = (u32*)per_cpu_ptr(crash_notes, cpu); + buf = (u32 *)per_cpu_ptr(crash_notes, cpu); if (!buf) return; memset(&prstatus, 0, sizeof(prstatus)); prstatus.pr_pid = current->pid; elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, - &prstatus, sizeof(prstatus)); + &prstatus, sizeof(prstatus)); final_note(buf); } @@ -1230,8 +1228,7 @@ static int __init crash_notes_memory_init(void) /* Allocate memory for saving cpu registers. */ crash_notes = alloc_percpu(note_buf_t); if (!crash_notes) { - printk("Kexec: Memory allocation for saving cpu register" - " states failed\n"); + pr_warn("Kexec: Memory allocation for saving cpu register states failed\n"); return -ENOMEM; } return 0; @@ -1253,10 +1250,10 @@ subsys_initcall(crash_notes_memory_init); * * The function returns 0 on success and -EINVAL on failure. */ -static int __init parse_crashkernel_mem(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) +static int __init parse_crashkernel_mem(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) { char *cur = cmdline, *tmp; @@ -1267,12 +1264,12 @@ static int __init parse_crashkernel_mem(char *cmdline, /* get the start of the range */ start = memparse(cur, &tmp); if (cur == tmp) { - pr_warning("crashkernel: Memory value expected\n"); + pr_warn("crashkernel: Memory value expected\n"); return -EINVAL; } cur = tmp; if (*cur != '-') { - pr_warning("crashkernel: '-' expected\n"); + pr_warn("crashkernel: '-' expected\n"); return -EINVAL; } cur++; @@ -1281,31 +1278,30 @@ static int __init parse_crashkernel_mem(char *cmdline, if (*cur != ':') { end = memparse(cur, &tmp); if (cur == tmp) { - pr_warning("crashkernel: Memory " - "value expected\n"); + pr_warn("crashkernel: Memory value expected\n"); return -EINVAL; } cur = tmp; if (end <= start) { - pr_warning("crashkernel: end <= start\n"); + pr_warn("crashkernel: end <= start\n"); return -EINVAL; } } if (*cur != ':') { - pr_warning("crashkernel: ':' expected\n"); + pr_warn("crashkernel: ':' expected\n"); return -EINVAL; } cur++; size = memparse(cur, &tmp); if (cur == tmp) { - pr_warning("Memory value expected\n"); + pr_warn("Memory value expected\n"); return -EINVAL; } cur = tmp; if (size >= system_ram) { - pr_warning("crashkernel: invalid size\n"); + pr_warn("crashkernel: invalid size\n"); return -EINVAL; } @@ -1323,8 +1319,7 @@ static int __init parse_crashkernel_mem(char *cmdline, cur++; *crash_base = memparse(cur, &tmp); if (cur == tmp) { - pr_warning("Memory value expected " - "after '@'\n"); + pr_warn("Memory value expected after '@'\n"); return -EINVAL; } } @@ -1336,26 +1331,26 @@ static int __init parse_crashkernel_mem(char *cmdline, /* * That function parses "simple" (old) crashkernel command lines like * - * crashkernel=size[@offset] + * crashkernel=size[@offset] * * It returns 0 on success and -EINVAL on failure. 
*/ -static int __init parse_crashkernel_simple(char *cmdline, - unsigned long long *crash_size, - unsigned long long *crash_base) +static int __init parse_crashkernel_simple(char *cmdline, + unsigned long long *crash_size, + unsigned long long *crash_base) { char *cur = cmdline; *crash_size = memparse(cmdline, &cur); if (cmdline == cur) { - pr_warning("crashkernel: memory value expected\n"); + pr_warn("crashkernel: memory value expected\n"); return -EINVAL; } if (*cur == '@') *crash_base = memparse(cur+1, &cur); else if (*cur != ' ' && *cur != '\0') { - pr_warning("crashkernel: unrecognized char\n"); + pr_warn("crashkernel: unrecognized char\n"); return -EINVAL; } @@ -1691,7 +1686,7 @@ int kernel_kexec(void) * CPU hotplug again; so re-enable it here. */ cpu_hotplug_enable(); - printk(KERN_EMERG "Starting new kernel\n"); + pr_emerg("Starting new kernel\n"); machine_shutdown(); } -- cgit From f88083005ab319abba5d0b2e4e997558245493c8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 6 Jun 2014 14:37:17 -0700 Subject: sysctl: clean up char buffer arguments When writing to a sysctl string, each write, regardless of VFS position, began writing the string from the start. This meant the contents of the last write to the sysctl controlled the string contents instead of the first. This misbehavior was featured in an exploit against Chrome OS. While it's not in itself a vulnerability, it's a weirdness that isn't on the mind of most auditors: "This filter looks correct, the first line written would not be meaningful to sysctl" doesn't apply here, since the size of the write and the contents of the final write are what matter when writing to sysctls. This adds the sysctl kernel.sysctl_writes_strict to control the write behavior. The default (0) reports when VFS position is non-0 on a write, but retains legacy behavior, -1 disables the warning, and 1 enables the position-respecting behavior. The long-term plan here is to wait for userspace to be fixed in response to the new warning and to then switch the default kernel behavior to the new position-respecting behavior. This patch (of 4): The char buffer arguments are needlessly cast in weird places. Clean it up so things are easier to read. 
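For context, the handler being tidied here sits behind ctl_table entries like the following generic sketch (the names are hypothetical, not from this patch); proc_dostring() hands the table's data buffer and maxlen to _proc_do_string():

	static char my_string[256];		/* hypothetical backing buffer */

	static struct ctl_table my_table[] = {
		{
			.procname	= "my_string",
			.data		= my_string,
			.maxlen		= sizeof(my_string),
			.mode		= 0644,
			.proc_handler	= proc_dostring,
		},
		{ }	/* zero-filled terminating entry */
	};

	/* registered with, e.g., register_sysctl("kernel", my_table) */

This is also the write path reworked by the next two sysctl patches.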
Signed-off-by: Kees Cook Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 40ce2d983b12..3e214beabbe9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1703,8 +1703,8 @@ int __init sysctl_init(void) #ifdef CONFIG_PROC_SYSCTL -static int _proc_do_string(void* data, int maxlen, int write, - void __user *buffer, +static int _proc_do_string(char *data, int maxlen, int write, + char __user *buffer, size_t *lenp, loff_t *ppos) { size_t len; @@ -1730,7 +1730,7 @@ static int _proc_do_string(void* data, int maxlen, int write, len = maxlen-1; if(copy_from_user(data, buffer, len)) return -EFAULT; - ((char *) data)[len] = 0; + data[len] = 0; *ppos += *lenp; } else { len = strlen(data); @@ -1748,10 +1748,10 @@ static int _proc_do_string(void* data, int maxlen, int write, if (len > *lenp) len = *lenp; if (len) - if(copy_to_user(buffer, data, len)) + if (copy_to_user(buffer, data, len)) return -EFAULT; if (len < *lenp) { - if(put_user('\n', ((char __user *) buffer) + len)) + if (put_user('\n', buffer + len)) return -EFAULT; len++; } @@ -1781,8 +1781,8 @@ static int _proc_do_string(void* data, int maxlen, int write, int proc_dostring(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return _proc_do_string(table->data, table->maxlen, write, - buffer, lenp, ppos); + return _proc_do_string((char *)(table->data), table->maxlen, write, + (char __user *)buffer, lenp, ppos); } static size_t proc_skip_spaces(char **buf) -- cgit From 2ca9bb456ada8bcbdc8f77f8fc78207653bbaa92 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 6 Jun 2014 14:37:18 -0700 Subject: sysctl: refactor sysctl string writing logic Consolidate buffer length checking with new-line/end-of-line checking. Additionally, instead of reading user memory twice, just do the assignment during the loop. This change doesn't affect the potential races here. It was already possible to read a sysctl that was in the middle of a write. In both cases, the string will always be NULL terminated. The pre-existing race remains a problem to be solved. Signed-off-by: Kees Cook Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3e214beabbe9..ac6847feaa83 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1717,21 +1717,18 @@ static int _proc_do_string(char *data, int maxlen, int write, } if (write) { + /* Start writing from beginning of buffer. */ len = 0; + *ppos += *lenp; p = buffer; - while (len < *lenp) { + while ((p - buffer) < *lenp && len < maxlen - 1) { if (get_user(c, p++)) return -EFAULT; if (c == 0 || c == '\n') break; - len++; + data[len++] = c; } - if (len >= maxlen) - len = maxlen-1; - if(copy_from_user(data, buffer, len)) - return -EFAULT; data[len] = 0; - *ppos += *lenp; } else { len = strlen(data); if (len > maxlen) -- cgit From f4aacea2f5d1a5f7e3154e967d70cf3f711bcd61 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 6 Jun 2014 14:37:19 -0700 Subject: sysctl: allow for strict write position handling When writing to a sysctl string, each write, regardless of VFS position, begins writing the string from the start. 
This means the contents of the last write to the sysctl controls the string contents instead of the first: open("/proc/sys/kernel/modprobe", O_WRONLY) = 1 write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 4096) = 4096 write(1, "/bin/true", 9) = 9 close(1) = 0 $ cat /proc/sys/kernel/modprobe /bin/true Expected behaviour would be to have the sysctl be "AAAA..." capped at maxlen (in this case KMOD_PATH_LEN: 256), instead of truncating to the contents of the second write. Similarly, multiple short writes would not append to the sysctl. The old behavior is unlike regular POSIX files enough that doing audits of software that interact with sysctls can end up in unexpected or dangerous situations. For example, "as long as the input starts with a trusted path" turns out to be an insufficient filter, as what must also happen is for the input to be entirely contained in a single write syscall -- not a common consideration, especially for high level tools. This provides kernel.sysctl_writes_strict as a way to make this behavior act in a less surprising manner for strings, and disallows non-zero file position when writing numeric sysctls (similar to what is already done when reading from non-zero file positions). For now, the default (0) is to warn about non-zero file position use, but retain the legacy behavior. Setting this to -1 disables the warning, and setting this to 1 enables the file position respecting behavior. [akpm@linux-foundation.org: fix build] [akpm@linux-foundation.org: move misplaced hunk, per Randy] Signed-off-by: Kees Cook Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/kernel.txt | 21 +++++++++++++ kernel/sysctl.c | 69 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 88 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 9886c3d57fc2..708bb7f1b7e0 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -77,6 +77,7 @@ show up in /proc/sys/kernel: - shmmni - stop-a [ SPARC only ] - sysrq ==> Documentation/sysrq.txt +- sysctl_writes_strict - tainted - threads-max - unknown_nmi_panic @@ -762,6 +763,26 @@ without users and with a dead originative process will be destroyed. ============================================================== +sysctl_writes_strict: + +Control how file position affects the behavior of updating sysctl values +via the /proc/sys interface: + + -1 - Legacy per-write sysctl value handling, with no printk warnings. + Each write syscall must fully contain the sysctl value to be + written, and multiple writes on the same sysctl file descriptor + will rewrite the sysctl value, regardless of file position. + 0 - (default) Same behavior as above, but warn about processes that + perform writes to a sysctl file descriptor when the file position + is not 0. + 1 - Respect file position when writing sysctl strings. Multiple writes + will append to the sysctl value buffer. Anything past the max length + of the sysctl value buffer will be ignored. Writes to numeric sysctl + entries must always be at file position 0 and the value must be + fully contained in the buffer sent in the write syscall. + +============================================================== + tainted: Non-zero if the kernel has been tainted. 
Numeric values, which diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ac6847feaa83..7a910b9081e8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -173,6 +173,13 @@ extern int no_unaligned_warning; #endif #ifdef CONFIG_PROC_SYSCTL + +#define SYSCTL_WRITES_LEGACY -1 +#define SYSCTL_WRITES_WARN 0 +#define SYSCTL_WRITES_STRICT 1 + +static int sysctl_writes_strict = SYSCTL_WRITES_WARN; + static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); static int proc_taint(struct ctl_table *table, int write, @@ -495,6 +502,15 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_taint, }, + { + .procname = "sysctl_writes_strict", + .data = &sysctl_writes_strict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &neg_one, + .extra2 = &one, + }, #endif #ifdef CONFIG_LATENCYTOP { @@ -1717,8 +1733,20 @@ static int _proc_do_string(char *data, int maxlen, int write, } if (write) { - /* Start writing from beginning of buffer. */ - len = 0; + if (sysctl_writes_strict == SYSCTL_WRITES_STRICT) { + /* Only continue writes not past the end of buffer. */ + len = strlen(data); + if (len > maxlen - 1) + len = maxlen - 1; + + if (*ppos > len) + return 0; + len = *ppos; + } else { + /* Start writing from beginning of buffer. */ + len = 0; + } + *ppos += *lenp; p = buffer; while ((p - buffer) < *lenp && len < maxlen - 1) { @@ -1758,6 +1786,14 @@ static int _proc_do_string(char *data, int maxlen, int write, return 0; } +static void warn_sysctl_write(struct ctl_table *table) +{ + pr_warn_once("%s wrote to %s when file position was not 0!\n" + "This will not be supported in the future. To silence this\n" + "warning, set kernel.sysctl_writes_strict = -1\n", + current->comm, table->procname); +} + /** * proc_dostring - read a string sysctl * @table: the sysctl table @@ -1778,6 +1814,9 @@ static int _proc_do_string(char *data, int maxlen, int write, int proc_dostring(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + if (write && *ppos && sysctl_writes_strict == SYSCTL_WRITES_WARN) + warn_sysctl_write(table); + return _proc_do_string((char *)(table->data), table->maxlen, write, (char __user *)buffer, lenp, ppos); } @@ -1953,6 +1992,18 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, conv = do_proc_dointvec_conv; if (write) { + if (*ppos) { + switch (sysctl_writes_strict) { + case SYSCTL_WRITES_STRICT: + goto out; + case SYSCTL_WRITES_WARN: + warn_sysctl_write(table); + break; + default: + break; + } + } + if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; page = __get_free_page(GFP_TEMPORARY); @@ -2010,6 +2061,7 @@ free: return err ? : -EINVAL; } *lenp -= left; +out: *ppos += *lenp; return err; } @@ -2202,6 +2254,18 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int left = *lenp; if (write) { + if (*ppos) { + switch (sysctl_writes_strict) { + case SYSCTL_WRITES_STRICT: + goto out; + case SYSCTL_WRITES_WARN: + warn_sysctl_write(table); + break; + default: + break; + } + } + if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; page = __get_free_page(GFP_TEMPORARY); @@ -2257,6 +2321,7 @@ free: return err ? 
: -EINVAL; } *lenp -= left; +out: *ppos += *lenp; return err; } -- cgit From 68a9a435e4efb63c49a0c0a25756e3b71a5634ed Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Fri, 6 Jun 2014 14:37:21 -0700 Subject: kernel/user_namespace.c: kernel-doc/checkpatch fixes -uid->gid -split some function declarations -if/then/else warning Signed-off-by: Fabian Frederick Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/user_namespace.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index bf71b4b2d632..fcc02560fd6b 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -286,7 +286,7 @@ EXPORT_SYMBOL(from_kuid_munged); /** * make_kgid - Map a user-namespace gid pair into a kgid. * @ns: User namespace that the gid is in - * @uid: group identifier + * @gid: group identifier * * Maps a user-namespace gid pair into a kernel internal kgid, * and returns that kgid. @@ -482,7 +482,8 @@ static int projid_m_show(struct seq_file *seq, void *v) return 0; } -static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) +static void *m_start(struct seq_file *seq, loff_t *ppos, + struct uid_gid_map *map) { struct uid_gid_extent *extent = NULL; loff_t pos = *ppos; @@ -546,7 +547,8 @@ struct seq_operations proc_projid_seq_operations = { .show = projid_m_show, }; -static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent) +static bool mappings_overlap(struct uid_gid_map *new_map, + struct uid_gid_extent *extent) { u32 upper_first, lower_first, upper_last, lower_last; unsigned idx; @@ -653,7 +655,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, ret = -EINVAL; pos = kbuf; new_map.nr_extents = 0; - for (;pos; pos = next_line) { + for (; pos; pos = next_line) { extent = &new_map.extent[new_map.nr_extents]; /* Find the end of line and ensure I don't look past it */ @@ -687,13 +689,16 @@ static ssize_t map_write(struct file *file, const char __user *buf, /* Verify we have been given valid starting values */ if ((extent->first == (u32) -1) || - (extent->lower_first == (u32) -1 )) + (extent->lower_first == (u32) -1)) goto out; - /* Verify count is not zero and does not cause the extent to wrap */ + /* Verify count is not zero and does not cause the + * extent to wrap + */ if ((extent->first + extent->count) <= extent->first) goto out; - if ((extent->lower_first + extent->count) <= extent->lower_first) + if ((extent->lower_first + extent->count) <= + extent->lower_first) goto out; /* Do the ranges in extent overlap any previous extents? 
*/ @@ -751,7 +756,8 @@ out: return ret; } -ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) +ssize_t proc_uid_map_write(struct file *file, const char __user *buf, + size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; @@ -767,7 +773,8 @@ ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t siz &ns->uid_map, &ns->parent->uid_map); } -ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) +ssize_t proc_gid_map_write(struct file *file, const char __user *buf, + size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; @@ -783,7 +790,8 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz &ns->gid_map, &ns->parent->gid_map); } -ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) +ssize_t proc_projid_map_write(struct file *file, const char __user *buf, + size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; @@ -800,7 +808,7 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t &ns->projid_map, &ns->parent->projid_map); } -static bool new_idmap_permitted(const struct file *file, +static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { @@ -811,8 +819,7 @@ static bool new_idmap_permitted(const struct file *file, kuid_t uid = make_kuid(ns->parent, id); if (uid_eq(uid, file->f_cred->fsuid)) return true; - } - else if (cap_setid == CAP_SETGID) { + } else if (cap_setid == CAP_SETGID) { kgid_t gid = make_kgid(ns->parent, id); if (gid_eq(gid, file->f_cred->fsgid)) return true; -- cgit From aba871f1e9a8dad07d442913998cd679529f2784 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Fri, 6 Jun 2014 14:37:29 -0700 Subject: kernel/profile.c: convert printk to pr_foo() Signed-off-by: Fabian Frederick Cc: Paul Gortmaker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/profile.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index cb980f0c731b..c1204e2af5b2 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -64,12 +64,10 @@ int profile_setup(char *str) str += strlen(sleepstr) + 1; if (get_option(&str, &par)) prof_shift = par; - printk(KERN_INFO - "kernel sleep profiling enabled (shift: %ld)\n", + pr_info("kernel sleep profiling enabled (shift: %ld)\n", prof_shift); #else - printk(KERN_WARNING - "kernel sleep profiling requires CONFIG_SCHEDSTATS\n"); + pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n"); #endif /* CONFIG_SCHEDSTATS */ } else if (!strncmp(str, schedstr, strlen(schedstr))) { prof_on = SCHED_PROFILING; @@ -77,8 +75,7 @@ int profile_setup(char *str) str += strlen(schedstr) + 1; if (get_option(&str, &par)) prof_shift = par; - printk(KERN_INFO - "kernel schedule profiling enabled (shift: %ld)\n", + pr_info("kernel schedule profiling enabled (shift: %ld)\n", prof_shift); } else if (!strncmp(str, kvmstr, strlen(kvmstr))) { prof_on = KVM_PROFILING; @@ -86,13 +83,12 @@ int profile_setup(char *str) str += strlen(kvmstr) + 1; if (get_option(&str, &par)) prof_shift = par; - printk(KERN_INFO - "kernel KVM profiling enabled (shift: %ld)\n", + pr_info("kernel KVM profiling enabled (shift: %ld)\n", prof_shift); } else 
if (get_option(&str, &par)) { prof_shift = par; prof_on = CPU_PROFILING; - printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n", + pr_info("kernel profiling enabled (shift: %ld)\n", prof_shift); } return 1; -- cgit From f3da64d1ea12ed6cfe18e6dedbc9739743e0b884 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Fri, 6 Jun 2014 14:37:30 -0700 Subject: kernel/profile.c: use static const char instead of static char schedstr, sleepstr and kvmstr are only used in strcmp & strlen Signed-off-by: Fabian Frederick Cc: Paul Gortmaker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/profile.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index c1204e2af5b2..54bf5ba26420 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -52,9 +52,9 @@ static DEFINE_MUTEX(profile_flip_mutex); int profile_setup(char *str) { - static char schedstr[] = "schedule"; - static char sleepstr[] = "sleep"; - static char kvmstr[] = "kvm"; + static const char schedstr[] = "schedule"; + static const char sleepstr[] = "sleep"; + static const char kvmstr[] = "kvm"; int par; if (!strncmp(str, sleepstr, strlen(sleepstr))) { -- cgit From 7153e402731c3e72331633d1ac15a654768aecac Mon Sep 17 00:00:00 2001 From: Paul McQuade Date: Fri, 6 Jun 2014 14:37:37 -0700 Subject: ipc, kernel: use Linux headers Use #include instead of Use #include instead of Signed-off-by: Paul McQuade Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/compat.c | 2 +- ipc/compat_mq.c | 2 +- ipc/msg.c | 2 +- ipc/sem.c | 2 +- ipc/shm.c | 2 +- kernel/acct.c | 2 +- kernel/audit.c | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/ipc/compat.c b/ipc/compat.c index 45d035d4cedc..b5ef4f7946dc 100644 --- a/ipc/compat.c +++ b/ipc/compat.c @@ -30,7 +30,7 @@ #include #include -#include +#include #include "util.h" diff --git a/ipc/compat_mq.c b/ipc/compat_mq.c index 90d29f59cac6..ef6f91cc4490 100644 --- a/ipc/compat_mq.c +++ b/ipc/compat_mq.c @@ -12,7 +12,7 @@ #include #include -#include +#include struct compat_mq_attr { compat_long_t mq_flags; /* message queue flags */ diff --git a/ipc/msg.c b/ipc/msg.c index 35e4018de53c..d608e6dde919 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -39,7 +39,7 @@ #include #include -#include +#include #include "util.h" /* diff --git a/ipc/sem.c b/ipc/sem.c index 3fcbc96abee9..e8dcc72d5bef 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -87,7 +87,7 @@ #include #include -#include +#include #include "util.h" /* One semaphore structure for each semaphore in the system. 
*/ diff --git a/ipc/shm.c b/ipc/shm.c index b54c93f6d117..fe49fdd240a8 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -43,7 +43,7 @@ #include #include -#include +#include #include "util.h" diff --git a/kernel/acct.c b/kernel/acct.c index 8d6e145138bb..df2851905d14 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -55,7 +55,7 @@ #include #include #include -#include +#include #include #include /* sector_div */ #include diff --git a/kernel/audit.c b/kernel/audit.c index 47845c57eb19..f30106459a32 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -44,7 +44,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include -#include +#include #include #include #include -- cgit From 46c0a8ca3e841b14a1d981e2116eaf2d1c7f2235 Mon Sep 17 00:00:00 2001 From: Paul McQuade Date: Fri, 6 Jun 2014 14:37:37 -0700 Subject: ipc, kernel: clear whitespace trailing whitespace Signed-off-by: Paul McQuade Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/msg.c | 31 +++++++++++++++---------------- ipc/sem.c | 10 +++++----- ipc/shm.c | 2 +- ipc/util.c | 4 ++-- ipc/util.h | 8 ++++---- kernel/acct.c | 4 ++-- 6 files changed, 29 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/ipc/msg.c b/ipc/msg.c index d608e6dde919..7ed1ef338e76 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -611,23 +611,22 @@ SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf) static int testmsg(struct msg_msg *msg, long type, int mode) { - switch (mode) - { - case SEARCH_ANY: - case SEARCH_NUMBER: + switch (mode) { + case SEARCH_ANY: + case SEARCH_NUMBER: + return 1; + case SEARCH_LESSEQUAL: + if (msg->m_type <= type) return 1; - case SEARCH_LESSEQUAL: - if (msg->m_type <= type) - return 1; - break; - case SEARCH_EQUAL: - if (msg->m_type == type) - return 1; - break; - case SEARCH_NOTEQUAL: - if (msg->m_type != type) - return 1; - break; + break; + case SEARCH_EQUAL: + if (msg->m_type == type) + return 1; + break; + case SEARCH_NOTEQUAL: + if (msg->m_type != type) + return 1; + break; } return 0; } diff --git a/ipc/sem.c b/ipc/sem.c index e8dcc72d5bef..fe0928a3d08b 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -160,7 +160,7 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it); * sem_array.pending{_alter,_cont}, * sem_array.sem_undo: global sem_lock() for read/write * sem_undo.proc_next: only "current" is allowed to read/write that field. - * + * * sem_array.sem_base[i].pending_{const,alter}: * global or semaphore sem_lock() for read/write */ @@ -1161,7 +1161,7 @@ static int semctl_nolock(struct ipc_namespace *ns, int semid, err = security_sem_semctl(NULL, cmd); if (err) return err; - + memset(&seminfo, 0, sizeof(seminfo)); seminfo.semmni = ns->sc_semmni; seminfo.semmns = ns->sc_semmns; @@ -1181,7 +1181,7 @@ static int semctl_nolock(struct ipc_namespace *ns, int semid, } max_id = ipc_get_maxid(&sem_ids(ns)); up_read(&sem_ids(ns).rwsem); - if (copy_to_user(p, &seminfo, sizeof(struct seminfo))) + if (copy_to_user(p, &seminfo, sizeof(struct seminfo))) return -EFAULT; return (max_id < 0) ? 0 : max_id; } @@ -1883,7 +1883,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, /* We need to sleep on this operation, so we put the current * task into the pending queue and go to sleep. 
*/ - + queue.sops = sops; queue.nsops = nsops; queue.undo = un; @@ -2016,7 +2016,7 @@ int copy_semundo(unsigned long clone_flags, struct task_struct *tsk) return error; atomic_inc(&undo_list->refcnt); tsk->sysvsem.undo_list = undo_list; - } else + } else tsk->sysvsem.undo_list = NULL; return 0; diff --git a/ipc/shm.c b/ipc/shm.c index fe49fdd240a8..2b64b0d25bba 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -694,7 +694,7 @@ static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminf out.shmmin = in->shmmin; out.shmmni = in->shmmni; out.shmseg = in->shmseg; - out.shmall = in->shmall; + out.shmall = in->shmall; return copy_to_user(buf, &out, sizeof(out)); } diff --git a/ipc/util.c b/ipc/util.c index 9b3fa38afe2c..27d74e69fd57 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -183,7 +183,7 @@ void __init ipc_init_proc_interface(const char *path, const char *header, * ipc_findkey - find a key in an ipc identifier set * @ids: ipc identifier set * @key: key to find - * + * * Returns the locked pointer to the ipc structure if found or NULL * otherwise. If key is found ipc points to the owning ipc structure * @@ -538,7 +538,7 @@ int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flag) else if (in_group_p(ipcp->cgid) || in_group_p(ipcp->gid)) granted_mode >>= 3; /* is there some bit set in requested_mode but not in granted_mode? */ - if ((requested_mode & ~granted_mode & 0007) && + if ((requested_mode & ~granted_mode & 0007) && !ns_capable(ns->user_ns, CAP_IPC_OWNER)) return -1; diff --git a/ipc/util.h b/ipc/util.h index e1153ad574b7..1a5a0fcd099c 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -78,9 +78,9 @@ struct ipc_params { * . routine to call for an extra check if needed */ struct ipc_ops { - int (*getnew) (struct ipc_namespace *, struct ipc_params *); - int (*associate) (struct kern_ipc_perm *, int); - int (*more_checks) (struct kern_ipc_perm *, struct ipc_params *); + int (*getnew)(struct ipc_namespace *, struct ipc_params *); + int (*associate)(struct kern_ipc_perm *, int); + int (*more_checks)(struct kern_ipc_perm *, struct ipc_params *); }; struct seq_file; @@ -142,7 +142,7 @@ struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns, struct ipc64_perm *perm, int extra_perm); #ifndef CONFIG_ARCH_WANT_IPC_PARSE_VERSION - /* On IA-64, we always use the "64-bit version" of the IPC structures. */ +/* On IA-64, we always use the "64-bit version" of the IPC structures. */ # define ipc_parse_version(cmd) IPC_64 #else int ipc_parse_version(int *cmd); diff --git a/kernel/acct.c b/kernel/acct.c index df2851905d14..808a86ff229d 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -134,7 +134,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file) spin_lock(&acct_lock); if (file != acct->file) { if (act) - res = act>0; + res = act > 0; goto out; } @@ -262,7 +262,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name) if (name) { struct filename *tmp = getname(name); if (IS_ERR(tmp)) - return (PTR_ERR(tmp)); + return PTR_ERR(tmp); error = acct_on(tmp); putname(tmp); } else { -- cgit From 119ce5c8b99cd6b874cec58c81ce9b56e52160c3 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Fri, 6 Jun 2014 14:37:53 -0700 Subject: kernel/seccomp.c: kernel-doc warning fix + fix small typo Signed-off-by: Fabian Frederick Cc: "David S. 
Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/seccomp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index b35c21503a36..f6d76bebe69f 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -39,7 +39,7 @@ * is only needed for handling filters shared across tasks. * @prev: points to a previously installed, or inherited, filter * @len: the number of instructions in the program - * @insns: the BPF program instructions to evaluate + * @insnsi: the BPF program instructions to evaluate * * seccomp_filter objects are organized in a tree linked via the @prev * pointer. For any task, it appears to be a singly-linked list starting @@ -220,7 +220,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) return -ENOMEM; /* - * Installing a seccomp filter requires that the task have + * Installing a seccomp filter requires that the task has * CAP_SYS_ADMIN in its namespace or be running with no_new_privs. * This avoids scenarios where unprivileged tasks can affect the * behavior of privileged children. -- cgit From 6f8fd1d77e64dd6be1075041cdfd8de4c2ca4e2b Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 6 Jun 2014 14:38:08 -0700 Subject: sysctl: convert use of typedef ctl_table to struct ctl_table This typedef is unnecessary and should just be removed. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 +- kernel/utsname_sysctl.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7a910b9081e8..db19e3e2aa4b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -202,7 +202,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, /* Note: sysrq code uses it's own private copy */ static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE; -static int sysrq_sysctl_handler(ctl_table *table, int write, +static int sysrq_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 6fbe811c7ad1..c8eac43267e9 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -17,7 +17,7 @@ #ifdef CONFIG_PROC_SYSCTL -static void *get_uts(ctl_table *table, int write) +static void *get_uts(struct ctl_table *table, int write) { char *which = table->data; struct uts_namespace *uts_ns; @@ -32,7 +32,7 @@ static void *get_uts(ctl_table *table, int write) return which; } -static void put_uts(ctl_table *table, int write, void *which) +static void put_uts(struct ctl_table *table, int write, void *which) { if (!write) up_read(&uts_sem); @@ -44,7 +44,7 @@ static void put_uts(ctl_table *table, int write, void *which) * Special case of dostring for the UTS structure. This has locks * to observe. Should this be in kernel/sys.c ???? */ -static int proc_do_uts_string(ctl_table *table, int write, +static int proc_do_uts_string(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table uts_table; -- cgit From a9fcaaac37b3baba1343f906f52aeb65c4d4e356 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 6 Jun 2014 23:17:28 -0400 Subject: tracing: Fix memory leak on instance deletion When an instance is created, it also gets a snapshot ring buffer allocated (with minimum of pages). But when it is deleted the snapshot buffer is not. 
There was a helper function added to match the allocation of these ring buffers to a way to free them, but it wasn't used by the deletion of an instance. Using that helper function solves this memory leak. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 26cfff38e2ab..16f7038d1f4d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6349,8 +6349,7 @@ static int instance_delete(const char *name) event_trace_del_tracer(tr); ftrace_destroy_function_files(tr); debugfs_remove_recursive(tr->dir); - free_percpu(tr->trace_buffer.data); - ring_buffer_free(tr->trace_buffer.buffer); + free_trace_buffers(tr); kfree(tr->name); kfree(tr); -- cgit From 3d5c9340d1949733eb37616abd15db36aef9a57c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 5 Jun 2014 12:34:23 +0200 Subject: rtmutex: Handle deadlock detection smarter Even in the case when deadlock detection is not requested by the caller, we can detect deadlocks. Right now the code stops the lock chain walk and keeps the waiter enqueued, even on itself. Silly not to yell when such a scenario is detected and to keep the waiter enqueued. Return -EDEADLK unconditionally and handle it at the call sites. The futex calls return -EDEADLK. The non futex ones dequeue the waiter, throw a warning and put the task into a schedule loop. Tagged for stable as it makes the code more robust. Signed-off-by: Thomas Gleixner Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Brad Mouring Link: http://lkml.kernel.org/r/20140605152801.836501969@linutronix.de Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/locking/rtmutex-debug.h | 5 +++++ kernel/locking/rtmutex.c | 33 ++++++++++++++++++++++++++++----- kernel/locking/rtmutex.h | 5 +++++ 3 files changed, 38 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h index 14193d596d78..ab29b6a22669 100644 --- a/kernel/locking/rtmutex-debug.h +++ b/kernel/locking/rtmutex-debug.h @@ -31,3 +31,8 @@ static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, { return (waiter != NULL); } + +static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w) +{ + debug_rt_mutex_print_deadlock(w); +} diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index a620d4d08ca6..eb7a46327798 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -314,7 +314,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, } put_task_struct(task); - return deadlock_detect ? -EDEADLK : 0; + return -EDEADLK; } retry: /* @@ -377,7 +377,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); raw_spin_unlock(&lock->wait_lock); - ret = deadlock_detect ? -EDEADLK : 0; + ret = -EDEADLK; goto out_unlock_pi; } @@ -548,7 +548,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, * which is wrong, as the other waiter is not in a deadlock * situation. 
*/ - if (detect_deadlock && owner == task) + if (owner == task) return -EDEADLK; raw_spin_lock_irqsave(&task->pi_lock, flags); @@ -763,6 +763,26 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, return ret; } +static void rt_mutex_handle_deadlock(int res, int detect_deadlock, + struct rt_mutex_waiter *w) +{ + /* + * If the result is not -EDEADLOCK or the caller requested + * deadlock detection, nothing to do here. + */ + if (res != -EDEADLOCK || detect_deadlock) + return; + + /* + * Yell loudly and stop the task right here. + */ + rt_mutex_print_deadlock(w); + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } +} + /* * Slow path lock function: */ @@ -802,8 +822,10 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, set_current_state(TASK_RUNNING); - if (unlikely(ret)) + if (unlikely(ret)) { remove_waiter(lock, &waiter); + rt_mutex_handle_deadlock(ret, detect_deadlock, &waiter); + } /* * try_to_take_rt_mutex() sets the waiter bit @@ -1112,7 +1134,8 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, return 1; } - ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); + /* We enforce deadlock detection for futexes */ + ret = task_blocks_on_rt_mutex(lock, waiter, task, 1); if (ret && !rt_mutex_owner(lock)) { /* diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h index a1a1dd06421d..f6a1f3c133b1 100644 --- a/kernel/locking/rtmutex.h +++ b/kernel/locking/rtmutex.h @@ -24,3 +24,8 @@ #define debug_rt_mutex_print_deadlock(w) do { } while (0) #define debug_rt_mutex_detect_deadlock(w,d) (d) #define debug_rt_mutex_reset_waiter(w) do { } while (0) + +static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w) +{ + WARN(1, "rtmutex deadlock detected\n"); +} -- cgit From 82084984383babe728e6e3c9a8e5c46278091315 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 5 Jun 2014 11:16:12 +0200 Subject: rtmutex: Detect changes in the pi lock chain When we walk the lock chain, we drop all locks after each step. So the lock chain can change under us before we reacquire the locks. That's harmless in principle as we just follow the wrong lock path. But it can lead to a false positive in the deadlock detection logic: T0 holds L0 T0 blocks on L1 held by T1 T1 blocks on L2 held by T2 T2 blocks on L3 held by T3 T3 blocks on L4 held by T4 Now we walk the chain lock T1 -> lock L2 -> adjust L2 -> unlock T1 -> lock T2 -> adjust T2 -> drop locks T2 times out and blocks on L0 Now we continue: lock T2 -> lock L0 -> deadlock detected, but it's not a deadlock at all. Brad tried to work around that in the deadlock detection logic itself, but the more I looked at it the less I liked it, because it's crystal ball magic after the fact. We actually can detect a chain change very simply: lock T1 -> lock L2 -> adjust L2 -> unlock T1 -> lock T2 -> adjust T2 -> next_lock = T2->pi_blocked_on->lock; drop locks T2 times out and blocks on L0 Now we continue: lock T2 -> if (next_lock != T2->pi_blocked_on->lock) return; So if we detect that T2 is now blocked on a different lock we stop the chain walk. That's also correct in the following scenario: lock T1 -> lock L2 -> adjust L2 -> unlock T1 -> lock T2 -> adjust T2 -> next_lock = T2->pi_blocked_on->lock; drop locks T3 times out and drops L3 T2 acquires L3 and blocks on L4 now Now we continue: lock T2 -> if (next_lock != T2->pi_blocked_on->lock) return; We don't have to follow up the chain at that point, because T2 propagated our priority up to T4 already.
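Condensed to its essence, the fix brackets the lock-dropping window with a recheck; a sketch using the helper the patch introduces (the real walk in rt_mutex_adjust_prio_chain() carries more state than shown here):

        /* While still holding task->pi_lock: where does @task block next? */
        next_lock = task_blocked_on_lock(task); /* pi_blocked_on->lock or NULL */
        raw_spin_unlock_irqrestore(&task->pi_lock, flags);

        /* locks dropped: the chain may change under us here */

        raw_spin_lock_irqsave(&task->pi_lock, flags);
        waiter = task->pi_blocked_on;
        if (!waiter || next_lock != waiter->lock)
                goto out_unlock_pi;     /* chain changed, stop the walk */
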
[ Folded a cleanup patch from peterz ] Signed-off-by: Thomas Gleixner Reported-by: Brad Mouring Cc: Steven Rostedt Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20140605152801.930031935@linutronix.de Cc: stable@vger.kernel.org --- kernel/locking/rtmutex.c | 95 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 71 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index eb7a46327798..a8a83a22bb91 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -260,27 +260,36 @@ static void rt_mutex_adjust_prio(struct task_struct *task) */ int max_lock_depth = 1024; +static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p) +{ + return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL; +} + /* * Adjust the priority chain. Also used for deadlock detection. * Decreases task's usage by one - may thus free the task. * - * @task: the task owning the mutex (owner) for which a chain walk is probably - * needed + * @task: the task owning the mutex (owner) for which a chain walk is + * probably needed * @deadlock_detect: do we have to carry out deadlock detection? - * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck - * things for a task that has just got its priority adjusted, and - * is waiting on a mutex) + * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck + * things for a task that has just got its priority adjusted, and + * is waiting on a mutex) + * @next_lock: the mutex on which the owner of @orig_lock was blocked before + * we dropped its pi_lock. Is never dereferenced, only used for + * comparison to detect lock chain changes. * @orig_waiter: rt_mutex_waiter struct for the task that has just donated - * its priority to the mutex owner (can be NULL in the case - * depicted above or if the top waiter is gone away and we are - * actually deboosting the owner) - * @top_task: the current top waiter + * its priority to the mutex owner (can be NULL in the case + * depicted above or if the top waiter is gone away and we are + * actually deboosting the owner) + * @top_task: the current top waiter * * Returns 0 or -EDEADLK. */ static int rt_mutex_adjust_prio_chain(struct task_struct *task, int deadlock_detect, struct rt_mutex *orig_lock, + struct rt_mutex *next_lock, struct rt_mutex_waiter *orig_waiter, struct task_struct *top_task) { @@ -338,6 +347,18 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, if (orig_waiter && !rt_mutex_owner(orig_lock)) goto out_unlock_pi; + /* + * We dropped all locks after taking a refcount on @task, so + * the task might have moved on in the lock chain or even left + * the chain completely and blocks now on an unrelated lock or + * on @orig_lock. + * + * We stored the lock on which @task was blocked in @next_lock, + * so we can detect the chain change. + */ + if (next_lock != waiter->lock) + goto out_unlock_pi; + /* * Drop out, when the task has no waiters. Note, * top_waiter can be NULL, when we are in the deboosting @@ -422,11 +443,26 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, __rt_mutex_adjust_prio(task); } + /* + * Check whether the task which owns the current lock is pi + * blocked itself. If yes we store a pointer to the lock for + * the lock chain change detection above. After we dropped + * task->pi_lock next_lock cannot be dereferenced anymore. 
+ */ + next_lock = task_blocked_on_lock(task); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); top_waiter = rt_mutex_top_waiter(lock); raw_spin_unlock(&lock->wait_lock); + /* + * We reached the end of the lock chain. Stop right here. No + * point to go back just to figure that out. + */ + if (!next_lock) + goto out_put_task; + if (!detect_deadlock && waiter != top_waiter) goto out_put_task; @@ -536,8 +572,9 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, { struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex_waiter *top_waiter = waiter; - unsigned long flags; + struct rt_mutex *next_lock; int chain_walk = 0, res; + unsigned long flags; /* * Early deadlock detection. We really don't want the task to @@ -569,20 +606,28 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, if (!owner) return 0; + raw_spin_lock_irqsave(&owner->pi_lock, flags); if (waiter == rt_mutex_top_waiter(lock)) { - raw_spin_lock_irqsave(&owner->pi_lock, flags); rt_mutex_dequeue_pi(owner, top_waiter); rt_mutex_enqueue_pi(owner, waiter); __rt_mutex_adjust_prio(owner); if (owner->pi_blocked_on) chain_walk = 1; - raw_spin_unlock_irqrestore(&owner->pi_lock, flags); - } - else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) + } else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) { chain_walk = 1; + } - if (!chain_walk) + /* Store the lock on which owner is blocked or NULL */ + next_lock = task_blocked_on_lock(owner); + + raw_spin_unlock_irqrestore(&owner->pi_lock, flags); + /* + * Even if full deadlock detection is on, if the owner is not + * blocked itself, we can avoid finding this out in the chain + * walk. + */ + if (!chain_walk || !next_lock) return 0; /* @@ -594,8 +639,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, raw_spin_unlock(&lock->wait_lock); - res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, - task); + res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, + next_lock, waiter, task); raw_spin_lock(&lock->wait_lock); @@ -644,8 +689,8 @@ static void remove_waiter(struct rt_mutex *lock, { int first = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); + struct rt_mutex *next_lock = NULL; unsigned long flags; - int chain_walk = 0; raw_spin_lock_irqsave(¤t->pi_lock, flags); rt_mutex_dequeue(lock, waiter); @@ -669,13 +714,13 @@ static void remove_waiter(struct rt_mutex *lock, } __rt_mutex_adjust_prio(owner); - if (owner->pi_blocked_on) - chain_walk = 1; + /* Store the lock on which owner is blocked or NULL */ + next_lock = task_blocked_on_lock(owner); raw_spin_unlock_irqrestore(&owner->pi_lock, flags); } - if (!chain_walk) + if (!next_lock) return; /* gets dropped in rt_mutex_adjust_prio_chain()! */ @@ -683,7 +728,7 @@ static void remove_waiter(struct rt_mutex *lock, raw_spin_unlock(&lock->wait_lock); - rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); + rt_mutex_adjust_prio_chain(owner, 0, lock, next_lock, NULL, current); raw_spin_lock(&lock->wait_lock); } @@ -696,6 +741,7 @@ static void remove_waiter(struct rt_mutex *lock, void rt_mutex_adjust_pi(struct task_struct *task) { struct rt_mutex_waiter *waiter; + struct rt_mutex *next_lock; unsigned long flags; raw_spin_lock_irqsave(&task->pi_lock, flags); @@ -706,12 +752,13 @@ void rt_mutex_adjust_pi(struct task_struct *task) raw_spin_unlock_irqrestore(&task->pi_lock, flags); return; } - + next_lock = waiter->lock; raw_spin_unlock_irqrestore(&task->pi_lock, flags); /* gets dropped in rt_mutex_adjust_prio_chain()! 
*/ get_task_struct(task); - rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); + + rt_mutex_adjust_prio_chain(task, 0, NULL, next_lock, NULL, task); } /** -- cgit From 1662867a9b2574bfdb9d4e97186aa131218d7210 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Sun, 8 Jun 2014 16:55:57 -0400 Subject: numa,sched: fix load_to_imbalanced logic inversion This function is supposed to return true if the new load imbalance is worse than the old one. It didn't. I can only hope brown paper bags are in style. Now things converge much better on both the 4 node and 8 node systems. I am not sure why this did not seem to impact specjbb performance on the 4 node system, which is the system I have full-time access to. This bug was introduced recently, with commit e63da03639cc ("sched/numa: Allow task switch if load imbalance improves") Signed-off-by: Rik van Riel Signed-off-by: Linus Torvalds --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 17de1956ddad..9855e87d671a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1120,7 +1120,7 @@ static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct; /* Would this change make things worse? */ - return (old_imb > imb); + return (imb > old_imb); } /* -- cgit From f972eb63b1003fae68d7b7e9b674d4ba5db681c2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 19 May 2014 15:13:47 -0400 Subject: perf: Pass protection and flags bits through mmap2 interface The mmap2 interface was missing the protection and flags bits needed to accurately determine if a mmap memory area was shared or private and if it was readable or not. Signed-off-by: Peter Zijlstra [tweaked patch to compile and wrote changelog] Signed-off-by: Don Zickus Link: http://lkml.kernel.org/r/1400526833-141779-2-git-send-email-dzickus@redhat.com Signed-off-by: Jiri Olsa --- include/uapi/linux/perf_event.h | 1 + kernel/events/core.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) (limited to 'kernel') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 5312fae47218..9269de254874 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -705,6 +705,7 @@ enum perf_event_type { * u32 min; * u64 ino; * u64 ino_generation; + * u32 prot, flags; * char filename[]; * struct sample_id sample_id; * }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 7da5e561e89a..eea1955c7868 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -40,6 +40,7 @@ #include #include #include +#include #include "internal.h" @@ -5127,6 +5128,7 @@ struct perf_mmap_event { int maj, min; u64 ino; u64 ino_generation; + u32 prot, flags; struct { struct perf_event_header header; @@ -5168,6 +5170,8 @@ static void perf_event_mmap_output(struct perf_event *event, mmap_event->event_id.header.size += sizeof(mmap_event->min); mmap_event->event_id.header.size += sizeof(mmap_event->ino); mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation); + mmap_event->event_id.header.size += sizeof(mmap_event->prot); + mmap_event->event_id.header.size += sizeof(mmap_event->flags); } perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); @@ -5186,6 +5190,8 @@ static void perf_event_mmap_output(struct perf_event *event, perf_output_put(&handle, mmap_event->min); perf_output_put(&handle, mmap_event->ino); 
perf_output_put(&handle, mmap_event->ino_generation); + perf_output_put(&handle, mmap_event->prot); + perf_output_put(&handle, mmap_event->flags); } __output_copy(&handle, mmap_event->file_name, @@ -5204,6 +5210,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) struct file *file = vma->vm_file; int maj = 0, min = 0; u64 ino = 0, gen = 0; + u32 prot = 0, flags = 0; unsigned int size; char tmp[16]; char *buf = NULL; @@ -5234,6 +5241,28 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) gen = inode->i_generation; maj = MAJOR(dev); min = MINOR(dev); + + if (vma->vm_flags & VM_READ) + prot |= PROT_READ; + if (vma->vm_flags & VM_WRITE) + prot |= PROT_WRITE; + if (vma->vm_flags & VM_EXEC) + prot |= PROT_EXEC; + + if (vma->vm_flags & VM_MAYSHARE) + flags = MAP_SHARED; + else + flags = MAP_PRIVATE; + + if (vma->vm_flags & VM_DENYWRITE) + flags |= MAP_DENYWRITE; + if (vma->vm_flags & VM_MAYEXEC) + flags |= MAP_EXECUTABLE; + if (vma->vm_flags & VM_LOCKED) + flags |= MAP_LOCKED; + if (vma->vm_flags & VM_HUGETLB) + flags |= MAP_HUGETLB; + goto got_name; } else { name = (char *)arch_vma_name(vma); @@ -5274,6 +5303,8 @@ got_name: mmap_event->min = min; mmap_event->ino = ino; mmap_event->ino_generation = gen; + mmap_event->prot = prot; + mmap_event->flags = flags; if (!(vma->vm_flags & VM_EXEC)) mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; @@ -5314,6 +5345,8 @@ void perf_event_mmap(struct vm_area_struct *vma) /* .min (attr_mmap2 only) */ /* .ino (attr_mmap2 only) */ /* .ino_generation (attr_mmap2 only) */ + /* .prot (attr_mmap2 only) */ + /* .flags (attr_mmap2 only) */ }; perf_event_mmap_event(&mmap_event); -- cgit From a5a5ba72843dd05f991184d6cb9a4471acce1005 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 30 May 2014 10:49:42 -0400 Subject: Revert "perf: Disable PERF_RECORD_MMAP2 support" This reverts commit 3090ffb5a2515990182f3f55b0688a7817325488. Re-enable the mmap2 interface as we will have a user soon. 
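From user space, opting back in is just a matter of setting the attr bit again; a minimal sketch (event choice arbitrary, error handling omitted):

        #include <string.h>
        #include <linux/perf_event.h>

        static void setup_attr(struct perf_event_attr *attr)
        {
                memset(attr, 0, sizeof(*attr));
                attr->size = sizeof(*attr);
                attr->type = PERF_TYPE_HARDWARE;
                attr->config = PERF_COUNT_HW_CPU_CYCLES;
                attr->mmap2 = 1;        /* request PERF_RECORD_MMAP2 records */
        }
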
Since things have changed since perf disabled mmap2, small tweaks to the revert had to be done: o commit 9d4ecc88 forced (n!=8) to become (n<7) o a new libunwind test needed updating to use mmap2 interface Signed-off-by: Don Zickus Link: http://lkml.kernel.org/r/1401461382-209586-1-git-send-email-dzickus@redhat.com Signed-off-by: Jiri Olsa --- kernel/events/core.c | 4 ---- tools/perf/tests/dwarf-unwind.c | 2 +- tools/perf/util/event.c | 34 ++++++++++++++++++++-------------- tools/perf/util/evsel.c | 1 + 4 files changed, 22 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index eea1955c7868..cd28335b3d28 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6929,10 +6929,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (ret) return -EFAULT; - /* disabled for now */ - if (attr->mmap2) - return -EINVAL; - if (attr->__reserved_1) return -EINVAL; diff --git a/tools/perf/tests/dwarf-unwind.c b/tools/perf/tests/dwarf-unwind.c index 108f0cd49f4e..96adb730b744 100644 --- a/tools/perf/tests/dwarf-unwind.c +++ b/tools/perf/tests/dwarf-unwind.c @@ -15,7 +15,7 @@ static int mmap_handler(struct perf_tool *tool __maybe_unused, struct perf_sample *sample __maybe_unused, struct machine *machine) { - return machine__process_mmap_event(machine, event, NULL); + return machine__process_mmap2_event(machine, event, NULL); } static int init_live_machine(struct machine *machine) diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index ce43cbae3bd5..d0281bdfa582 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -179,13 +179,14 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool, return -1; } - event->header.type = PERF_RECORD_MMAP; + event->header.type = PERF_RECORD_MMAP2; while (1) { char bf[BUFSIZ]; char prot[5]; char execname[PATH_MAX]; char anonstr[] = "//anon"; + unsigned int ino; size_t size; ssize_t n; @@ -196,15 +197,20 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool, strcpy(execname, ""); /* 00400000-0040c000 r-xp 00000000 fd:01 41038 /bin/cat */ - n = sscanf(bf, "%"PRIx64"-%"PRIx64" %s %"PRIx64" %*x:%*x %*u %s\n", - &event->mmap.start, &event->mmap.len, prot, - &event->mmap.pgoff, - execname); + n = sscanf(bf, "%"PRIx64"-%"PRIx64" %s %"PRIx64" %x:%x %u %s\n", + &event->mmap2.start, &event->mmap2.len, prot, + &event->mmap2.pgoff, &event->mmap2.maj, + &event->mmap2.min, + &ino, execname); + /* * Anon maps don't have the execname. 
*/ - if (n < 4) + if (n < 7) continue; + + event->mmap2.ino = (u64)ino; + /* * Just like the kernel, see __perf_event_mmap in kernel/perf_event.c */ @@ -239,15 +245,15 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool, strcpy(execname, anonstr); size = strlen(execname) + 1; - memcpy(event->mmap.filename, execname, size); + memcpy(event->mmap2.filename, execname, size); size = PERF_ALIGN(size, sizeof(u64)); - event->mmap.len -= event->mmap.start; - event->mmap.header.size = (sizeof(event->mmap) - - (sizeof(event->mmap.filename) - size)); - memset(event->mmap.filename + size, 0, machine->id_hdr_size); - event->mmap.header.size += machine->id_hdr_size; - event->mmap.pid = tgid; - event->mmap.tid = pid; + event->mmap2.len -= event->mmap.start; + event->mmap2.header.size = (sizeof(event->mmap2) - + (sizeof(event->mmap2.filename) - size)); + memset(event->mmap2.filename + size, 0, machine->id_hdr_size); + event->mmap2.header.size += machine->id_hdr_size; + event->mmap2.pid = tgid; + event->mmap2.tid = pid; if (process(tool, event, &synth_sample, machine) != 0) { rc = -1; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 5c28d82b76c4..21154dabc5fa 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -659,6 +659,7 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts) perf_evsel__set_sample_bit(evsel, WEIGHT); attr->mmap = track; + attr->mmap2 = track && !perf_missing_features.mmap2; attr->comm = track; if (opts->sample_transaction) -- cgit From 8b8b36834d0fff67fc8668093f4312dd04dcf21d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 10 Jun 2014 09:46:00 -0400 Subject: ring-buffer: Check if buffer exists before polling The per_cpu buffers are created one per possible CPU. But these do not mean that those CPUs are online, nor do they even exist. With the addition of the ring buffer polling, it assumes that the caller polls on an existing buffer. But this is not the case if the user reads trace_pipe from a CPU that does not exist, and this causes the kernel to crash. Simple fix is to check the cpu against buffer bitmask against to see if the buffer was allocated or not and return -ENODEV if it is not. More updates were done to pass the -ENODEV back up to userspace. Link: http://lkml.kernel.org/r/5393DB61.6060707@oracle.com Reported-by: Sasha Levin Cc: stable@vger.kernel.org # 3.10+ Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 2 +- kernel/trace/ring_buffer.c | 5 ++++- kernel/trace/trace.c | 22 ++++++++++++++++------ 3 files changed, 21 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index d69cf637a15a..49a4d6f59108 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -97,7 +97,7 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k __ring_buffer_alloc((size), (flags), &__key); \ }) -void ring_buffer_wait(struct ring_buffer *buffer, int cpu); +int ring_buffer_wait(struct ring_buffer *buffer, int cpu); int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, struct file *filp, poll_table *poll_table); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c634868c2921..7c56c3d06943 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -543,7 +543,7 @@ static void rb_wake_up_waiters(struct irq_work *work) * as data is added to any of the @buffer's cpu buffers. 
Otherwise * it will wait for data to be added to a specific cpu buffer. */ -void ring_buffer_wait(struct ring_buffer *buffer, int cpu) +int ring_buffer_wait(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; DEFINE_WAIT(wait); @@ -557,6 +557,8 @@ void ring_buffer_wait(struct ring_buffer *buffer, int cpu) if (cpu == RING_BUFFER_ALL_CPUS) work = &buffer->irq_work; else { + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -ENODEV; cpu_buffer = buffer->buffers[cpu]; work = &cpu_buffer->irq_work; } @@ -591,6 +593,7 @@ void ring_buffer_wait(struct ring_buffer *buffer, int cpu) schedule(); finish_wait(&work->waiters, &wait); + return 0; } /** diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 16f7038d1f4d..56422f1decba 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1085,13 +1085,13 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) } #endif /* CONFIG_TRACER_MAX_TRACE */ -static void wait_on_pipe(struct trace_iterator *iter) +static int wait_on_pipe(struct trace_iterator *iter) { /* Iterators are static, they should be filled or empty */ if (trace_buffer_iter(iter, iter->cpu_file)) - return; + return 0; - ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); + return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); } #ifdef CONFIG_FTRACE_STARTUP_TEST @@ -4378,6 +4378,7 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table) static int tracing_wait_pipe(struct file *filp) { struct trace_iterator *iter = filp->private_data; + int ret; while (trace_empty(iter)) { @@ -4399,10 +4400,13 @@ static int tracing_wait_pipe(struct file *filp) mutex_unlock(&iter->mutex); - wait_on_pipe(iter); + ret = wait_on_pipe(iter); mutex_lock(&iter->mutex); + if (ret) + return ret; + if (signal_pending(current)) return -EINTR; } @@ -5327,8 +5331,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, goto out_unlock; } mutex_unlock(&trace_types_lock); - wait_on_pipe(iter); + ret = wait_on_pipe(iter); mutex_lock(&trace_types_lock); + if (ret) { + size = ret; + goto out_unlock; + } if (signal_pending(current)) { size = -EINTR; goto out_unlock; @@ -5538,8 +5546,10 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, goto out; } mutex_unlock(&trace_types_lock); - wait_on_pipe(iter); + ret = wait_on_pipe(iter); mutex_lock(&trace_types_lock); + if (ret) + goto out; if (signal_pending(current)) { ret = -EINTR; goto out; -- cgit From a6af8fbf17989e41fef5cacf3988a724fb687d78 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 10 Jun 2014 16:11:35 +0900 Subject: tracing: Cleanup saved_cmdlines_size changes The recent addition of saved_cmdlines_size file had some remaining (minor - mostly coding style) issues. Fix them by passing pointer name to sizeof() and using scnprintf(). 
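Passing the pointer name to sizeof() is the usual kernel idiom because the allocation size then follows the pointer's type automatically; a generic illustration (hypothetical struct, not part of this patch):

        struct foo *p;

        p = kmalloc(sizeof(*p), GFP_KERNEL);           /* stays correct if p's type changes */
        p = kmalloc(sizeof(struct foo), GFP_KERNEL);   /* quietly wrong once p's type does */

scnprintf(), unlike sprintf(), is bounded by the buffer size and returns the number of characters actually written.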
Link: http://lkml.kernel.org/p/1402384295-23680-1-git-send-email-namhyung@kernel.org Cc: Namhyung Kim Cc: Ingo Molnar Cc: Yoshihiro YUNOMAE Signed-off-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 56422f1decba..2b458c60e0da 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1338,7 +1338,7 @@ static int trace_create_savedcmd(void) { int ret; - savedcmd = kmalloc(sizeof(struct saved_cmdlines_buffer), GFP_KERNEL); + savedcmd = kmalloc(sizeof(*savedcmd), GFP_KERNEL); if (!savedcmd) return -ENOMEM; @@ -3840,7 +3840,7 @@ tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf, int r; arch_spin_lock(&trace_cmdline_lock); - r = sprintf(buf, "%u\n", savedcmd->cmdline_num); + r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num); arch_spin_unlock(&trace_cmdline_lock); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); @@ -3857,7 +3857,7 @@ static int tracing_resize_saved_cmdlines(unsigned int val) { struct saved_cmdlines_buffer *s, *savedcmd_temp; - s = kmalloc(sizeof(struct saved_cmdlines_buffer), GFP_KERNEL); + s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) return -ENOMEM; -- cgit From a3c54931199565930d6d84f4c3456f6440aefd41 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 28 May 2014 23:09:58 -0400 Subject: auditsc: audit_krule mask accesses need bounds checking Fixes an easy DoS and possible information disclosure. This does nothing about the broken state of x32 auditing. eparis: If the admin has enabled auditd and has specifically loaded audit rules. This bug has been around since before git. Wow... Cc: stable@vger.kernel.org Signed-off-by: Andy Lutomirski Signed-off-by: Eric Paris Signed-off-by: Linus Torvalds --- kernel/auditsc.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index f251a5e8d17a..21eae3c05ec0 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -728,6 +728,22 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key) return AUDIT_BUILD_CONTEXT; } +static int audit_in_mask(const struct audit_krule *rule, unsigned long val) +{ + int word, bit; + + if (val > 0xffffffff) + return false; + + word = AUDIT_WORD(val); + if (word >= AUDIT_BITMASK_SIZE) + return false; + + bit = AUDIT_BIT(val); + + return rule->mask[word] & bit; +} + /* At syscall entry and exit time, this filter is called if the * audit_state is not low enough that auditing cannot take place, but is * also not high enough that we already know we have to write an audit @@ -745,11 +761,8 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, rcu_read_lock(); if (!list_empty(list)) { - int word = AUDIT_WORD(ctx->major); - int bit = AUDIT_BIT(ctx->major); - list_for_each_entry_rcu(e, list, list) { - if ((e->rule.mask[word] & bit) == bit && + if (audit_in_mask(&e->rule, ctx->major) && audit_filter_rules(tsk, &e->rule, ctx, NULL, &state, false)) { rcu_read_unlock(); @@ -769,20 +782,16 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, static int audit_filter_inode_name(struct task_struct *tsk, struct audit_names *n, struct audit_context *ctx) { - int word, bit; int h = audit_hash_ino((u32)n->ino); struct list_head *list = &audit_inode_hash[h]; struct audit_entry *e; enum audit_state state; - word = AUDIT_WORD(ctx->major); - bit = 
AUDIT_BIT(ctx->major); - if (list_empty(list)) return 0; list_for_each_entry_rcu(e, list, list) { - if ((e->rule.mask[word] & bit) == bit && + if (audit_in_mask(&e->rule, ctx->major) && audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) { ctx->current_state = state; return 1; -- cgit From f0b70cc48cc282cb326a4d71b3d1dda7d8fafd2a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 10 Jun 2014 12:06:30 -0400 Subject: tracing: Fix leak of per cpu max data in instances The freeing of an instance, if max data is configured, there will be per cpu data structures created. But these are not freed when the instance is deleted, which causes a memory leak. A new helper function is added that frees the individual buffers within a trace array, instead of duplicating the code. This way changes made for one are applied to the other (normal buffer vs max buffer). Link: http://lkml.kernel.org/r/87k38pbake.fsf@sejong.aot.lge.com Reported-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2b458c60e0da..384ede311717 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6242,22 +6242,25 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) return 0; } +static void free_trace_buffer(struct trace_buffer *buf) +{ + if (buf->buffer) { + ring_buffer_free(buf->buffer); + buf->buffer = NULL; + free_percpu(buf->data); + buf->data = NULL; + } +} + static void free_trace_buffers(struct trace_array *tr) { if (!tr) return; - if (tr->trace_buffer.buffer) { - ring_buffer_free(tr->trace_buffer.buffer); - tr->trace_buffer.buffer = NULL; - free_percpu(tr->trace_buffer.data); - } + free_trace_buffer(&tr->trace_buffer); #ifdef CONFIG_TRACER_MAX_TRACE - if (tr->max_buffer.buffer) { - ring_buffer_free(tr->max_buffer.buffer); - tr->max_buffer.buffer = NULL; - } + free_trace_buffer(&tr->max_buffer); #endif } -- cgit From da9c3413a27be5ba6f996e90495c836dd30b8841 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 10 Jun 2014 13:53:50 -0400 Subject: tracing: Fix check of ftrace_trace_arrays list_empty() check The check that tests if ftrace_trace_arrays is empty in top_trace_array(), uses the .prev pointer: if (list_empty(ftrace_trace_arrays.prev)) instead of testing the variable itself: if (list_empty(&ftrace_trace_arrays)) Although it is technically correct, it is awkward and confusing. Use the proper method. Link: http://lkml.kernel.org/r/87oay1bas8.fsf@sejong.aot.lge.com Reported-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9e82551dd566..9258f5a815db 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -252,7 +252,7 @@ static inline struct trace_array *top_trace_array(void) { struct trace_array *tr; - if (list_empty(ftrace_trace_arrays.prev)) + if (list_empty(&ftrace_trace_arrays)) return NULL; tr = list_entry(ftrace_trace_arrays.prev, -- cgit From 23adbe12ef7d3d4195e80800ab36b37bee28cd03 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 10 Jun 2014 12:45:42 -0700 Subject: fs,userns: Change inode_capable to capable_wrt_inode_uidgid The kernel has no concept of capabilities with respect to inodes; inodes exist independently of namespaces. 
For example, inode_capable(inode, CAP_LINUX_IMMUTABLE) would be nonsense. This patch changes inode_capable to check for uid and gid mappings and renames it to capable_wrt_inode_uidgid, which should make it more obvious what it does. Fixes CVE-2014-4014. Cc: Theodore Ts'o Cc: Serge Hallyn Cc: "Eric W. Biederman" Cc: Dave Chinner Cc: stable@vger.kernel.org Signed-off-by: Andy Lutomirski Signed-off-by: Linus Torvalds --- fs/attr.c | 8 ++++---- fs/inode.c | 10 +++++++--- fs/namei.c | 11 ++++++----- fs/xfs/xfs_ioctl.c | 2 +- include/linux/capability.h | 2 +- kernel/capability.c | 20 ++++++++------------ 6 files changed, 27 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/fs/attr.c b/fs/attr.c index 5d4e59d56e85..6530ced19697 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -50,14 +50,14 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr) if ((ia_valid & ATTR_UID) && (!uid_eq(current_fsuid(), inode->i_uid) || !uid_eq(attr->ia_uid, inode->i_uid)) && - !inode_capable(inode, CAP_CHOWN)) + !capable_wrt_inode_uidgid(inode, CAP_CHOWN)) return -EPERM; /* Make sure caller can chgrp. */ if ((ia_valid & ATTR_GID) && (!uid_eq(current_fsuid(), inode->i_uid) || (!in_group_p(attr->ia_gid) && !gid_eq(attr->ia_gid, inode->i_gid))) && - !inode_capable(inode, CAP_CHOWN)) + !capable_wrt_inode_uidgid(inode, CAP_CHOWN)) return -EPERM; /* Make sure a caller can chmod. */ @@ -67,7 +67,7 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr) /* Also check the setgid bit! */ if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid : inode->i_gid) && - !inode_capable(inode, CAP_FSETID)) + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) attr->ia_mode &= ~S_ISGID; } @@ -160,7 +160,7 @@ void setattr_copy(struct inode *inode, const struct iattr *attr) umode_t mode = attr->ia_mode; if (!in_group_p(inode->i_gid) && - !inode_capable(inode, CAP_FSETID)) + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) mode &= ~S_ISGID; inode->i_mode = mode; } diff --git a/fs/inode.c b/fs/inode.c index 2feb9b69f1be..6eecb7ff0b9a 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1839,14 +1839,18 @@ EXPORT_SYMBOL(inode_init_owner); * inode_owner_or_capable - check current task permissions to inode * @inode: inode being checked * - * Return true if current either has CAP_FOWNER to the inode, or - * owns the file. + * Return true if current either has CAP_FOWNER in a namespace with the + * inode owner uid mapped, or owns the file. */ bool inode_owner_or_capable(const struct inode *inode) { + struct user_namespace *ns; + if (uid_eq(current_fsuid(), inode->i_uid)) return true; - if (inode_capable(inode, CAP_FOWNER)) + + ns = current_user_ns(); + if (ns_capable(ns, CAP_FOWNER) && kuid_has_mapping(ns, inode->i_uid)) return true; return false; } diff --git a/fs/namei.c b/fs/namei.c index 80168273396b..985c6f368485 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -332,10 +332,11 @@ int generic_permission(struct inode *inode, int mask) if (S_ISDIR(inode->i_mode)) { /* DACs are overridable for directories */ - if (inode_capable(inode, CAP_DAC_OVERRIDE)) + if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE)) return 0; if (!(mask & MAY_WRITE)) - if (inode_capable(inode, CAP_DAC_READ_SEARCH)) + if (capable_wrt_inode_uidgid(inode, + CAP_DAC_READ_SEARCH)) return 0; return -EACCES; } @@ -345,7 +346,7 @@ int generic_permission(struct inode *inode, int mask) * at least one exec bit set. 
*/ if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) - if (inode_capable(inode, CAP_DAC_OVERRIDE)) + if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE)) return 0; /* @@ -353,7 +354,7 @@ int generic_permission(struct inode *inode, int mask) */ mask &= MAY_READ | MAY_WRITE | MAY_EXEC; if (mask == MAY_READ) - if (inode_capable(inode, CAP_DAC_READ_SEARCH)) + if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH)) return 0; return -EACCES; @@ -2379,7 +2380,7 @@ static inline int check_sticky(struct inode *dir, struct inode *inode) return 0; if (uid_eq(dir->i_uid, fsuid)) return 0; - return !inode_capable(inode, CAP_FOWNER); + return !capable_wrt_inode_uidgid(inode, CAP_FOWNER); } /* diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 0b18776b075e..6152cbe353e8 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1215,7 +1215,7 @@ xfs_ioctl_setattr( * cleared upon successful return from chown() */ if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && - !inode_capable(VFS_I(ip), CAP_FSETID)) + !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID)) ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); /* diff --git a/include/linux/capability.h b/include/linux/capability.h index a6ee1f9a5018..84b13ad67c1c 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -210,7 +210,7 @@ extern bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap); extern bool capable(int cap); extern bool ns_capable(struct user_namespace *ns, int cap); -extern bool inode_capable(const struct inode *inode, int cap); +extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap); extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); /* audit system wants to get cap info from files as well */ diff --git a/kernel/capability.c b/kernel/capability.c index 84b2bbf443e7..a5cf13c018ce 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -424,23 +424,19 @@ bool capable(int cap) EXPORT_SYMBOL(capable); /** - * inode_capable - Check superior capability over inode + * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped * @inode: The inode in question * @cap: The capability in question * - * Return true if the current task has the given superior capability - * targeted at it's own user namespace and that the given inode is owned - * by the current user namespace or a child namespace. - * - * Currently we check to see if an inode is owned by the current - * user namespace by seeing if the inode's owner maps into the - * current user namespace. - * + * Return true if the current task has the given capability targeted at + * its own user namespace and that the given inode's uid and gid are + * mapped into the current user namespace. */ -bool inode_capable(const struct inode *inode, int cap) +bool capable_wrt_inode_uidgid(const struct inode *inode, int cap) { struct user_namespace *ns = current_user_ns(); - return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid); + return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid) && + kgid_has_mapping(ns, inode->i_gid); } -EXPORT_SYMBOL(inode_capable); +EXPORT_SYMBOL(capable_wrt_inode_uidgid); -- cgit From a992bf836f9c3039a16f4bd068d161c86c6c3e2c Mon Sep 17 00:00:00 2001 From: Yuan Pengfei Date: Tue, 10 Jun 2014 15:18:39 -0700 Subject: gcov: add support for GCC 4.9 This patch handles the gcov-related changes in GCC 4.9: A new counter (time profile) is added. The total number is 9 now. A new profile merge function __gcov_merge_time_profile is added. 
See gcc/gcov-io.h and libgcc/libgcov-merge.c. For the first change, the layout of struct gcov_info is affected. For the second one, a dummy function is added to kernel/gcov/base.c similarly. Signed-off-by: Yuan Pengfei Acked-by: Peter Oberparleiter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/gcov/base.c | 6 ++++++ kernel/gcov/gcc_4_7.c | 5 +++++ 2 files changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index f45b75b713c0..b358a802fd18 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -85,6 +85,12 @@ void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters) } EXPORT_SYMBOL(__gcov_merge_ior); +void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_time_profile); + /** * gcov_enable_events - enable event reporting through gcov_event() * diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index 2c6e4631c814..826ba9fb5e32 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -18,7 +18,12 @@ #include #include "gcov.h" +#if __GNUC__ == 4 && __GNUC_MINOR__ >= 9 +#define GCOV_COUNTERS 9 +#else #define GCOV_COUNTERS 8 +#endif + #define GCOV_TAG_FUNCTION_LENGTH 3 static struct gcov_info *gcov_info_head; -- cgit From 4cdf77a828b056258f48a9f6078bd2f77d9704bb Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 14 Jun 2014 06:47:12 +0000 Subject: x86/kprobes: Fix build errors and blacklist context_track_user This essentially reverts commit: ecd50f714c42 ("kprobes, x86: Call exception_enter after kprobes handled") since it causes build errors with CONFIG_CONTEXT_TRACKING and was based on a misunderstanding; context_track_user_*() doesn't involve much in interrupt context, it just returns if in_interrupt() is true. Instead of changing do_debug/int3(), this just adds context_track_user_*() to the kprobes blacklist, since those functions can still be called right before kprobes handles int3 and debug exceptions, and probing them would cause an infinite loop.
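(For reference, the blacklisting added below boils down to the following pattern; a minimal sketch of the NOKPROBE_SYMBOL() interface, with the function body elided:)

	#include <linux/kprobes.h>

	void context_tracking_user_enter(void)
	{
		/* Can run right before kprobes has handled an int3 or
		 * debug exception, so it must never be probed itself. */
	}
	/* Record the address in the kprobe blacklist so that a later
	 * register_kprobe() on it fails with -EINVAL instead of
	 * installing a breakpoint that would recurse forever. */
	NOKPROBE_SYMBOL(context_tracking_user_enter);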
Reported-by: Frederic Weisbecker Signed-off-by: Masami Hiramatsu Cc: Borislav Petkov Cc: Kees Cook Cc: Jiri Kosina Cc: Rusty Russell Cc: Steven Rostedt Cc: Seiji Aguchi Cc: Andrew Morton Cc: Kees Cook Link: http://lkml.kernel.org/r/20140614064711.7865.45957.stgit@kbuild-fedora.novalocal Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps.c | 7 ++++--- kernel/context_tracking.c | 3 +++ 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index c6eb418c5627..0d0e922fafc1 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -343,6 +343,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) if (poke_int3_handler(regs)) return; + prev_state = exception_enter(); #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, SIGTRAP) == NOTIFY_STOP) @@ -351,9 +352,8 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) #ifdef CONFIG_KPROBES if (kprobe_int3_handler(regs)) - return; + goto exit; #endif - prev_state = exception_enter(); if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, SIGTRAP) == NOTIFY_STOP) @@ -433,6 +433,8 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) unsigned long dr6; int si_code; + prev_state = exception_enter(); + get_debugreg(dr6, 6); /* Filter out all the reserved bits which are preset to 1 */ @@ -465,7 +467,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) if (kprobe_debug_handler(regs)) goto exit; #endif - prev_state = exception_enter(); if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, error_code, SIGTRAP) == NOTIFY_STOP) diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 019d45008448..5664985c46a0 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -19,6 +19,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -104,6 +105,7 @@ void context_tracking_user_enter(void) } local_irq_restore(flags); } +NOKPROBE_SYMBOL(context_tracking_user_enter); #ifdef CONFIG_PREEMPT /** @@ -181,6 +183,7 @@ void context_tracking_user_exit(void) } local_irq_restore(flags); } +NOKPROBE_SYMBOL(context_tracking_user_exit); /** * __context_tracking_task_switch - context switch the syscall callbacks -- cgit From 27e35715df54cbc4f2d044f681802ae30479e7fb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 11 Jun 2014 18:44:04 +0000 Subject: rtmutex: Plug slow unlock race When the rtmutex fast path is enabled the slow unlock function can create the following situation: spin_lock(foo->m->wait_lock); foo->m->owner = NULL; rt_mutex_lock(foo->m); <-- fast path free = atomic_dec_and_test(foo->refcnt); rt_mutex_unlock(foo->m); <-- fast path if (free) kfree(foo); spin_unlock(foo->m->wait_lock); <--- Use after free. 
Plug the race by changing the slow unlock to the following scheme:

	while (!rt_mutex_has_waiters(m)) {
		/* Clear the waiters bit in m->owner */
		clear_rt_mutex_waiters(m);
		owner = rt_mutex_owner(m);
		spin_unlock(m->wait_lock);
		if (cmpxchg(m->owner, owner, 0) == owner)
			return;
		spin_lock(m->wait_lock);
	}

So in case a new waiter comes in while the owner tries the slow path unlock, we have two situations:

	unlock(wait_lock);
					lock(wait_lock);
	cmpxchg(p, owner, 0) == owner
					mark_rt_mutex_waiters(lock);
					acquire(lock);

Or:

	unlock(wait_lock);
					lock(wait_lock);
					mark_rt_mutex_waiters(lock);
	cmpxchg(p, owner, 0) != owner
					enqueue_waiter();
					unlock(wait_lock);
	lock(wait_lock);
	wakeup_next_waiter();
	unlock(wait_lock);
					lock(wait_lock);
					acquire(lock);

If the fast path is disabled, then the simple

	m->owner = NULL;
	unlock(m->wait_lock);

is sufficient as all access to m->owner is serialized via m->wait_lock. Also document and clarify the wakeup_next_waiter function as suggested by Oleg Nesterov. Reported-by: Steven Rostedt Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20140611183852.937945560@linutronix.de Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/locking/rtmutex.c | 115 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 109 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index a8a83a22bb91..fc605941b9b8 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -83,6 +83,47 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) owner = *p; } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); } + +/* + * Safe fastpath aware unlock: + * 1) Clear the waiters bit + * 2) Drop lock->wait_lock + * 3) Try to unlock the lock with cmpxchg + */ +static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock) + __releases(lock->wait_lock) +{ + struct task_struct *owner = rt_mutex_owner(lock); + + clear_rt_mutex_waiters(lock); + raw_spin_unlock(&lock->wait_lock); + /* + * If a new waiter comes in between the unlock and the cmpxchg + * we have two situations: + * + * unlock(wait_lock); + * lock(wait_lock); + * cmpxchg(p, owner, 0) == owner + * mark_rt_mutex_waiters(lock); + * acquire(lock); + * or: + * + * unlock(wait_lock); + * lock(wait_lock); + * mark_rt_mutex_waiters(lock); + * + * cmpxchg(p, owner, 0) != owner + * enqueue_waiter(); + * unlock(wait_lock); + * lock(wait_lock); + * wake waiter(); + * unlock(wait_lock); + * lock(wait_lock); + * acquire(lock); + */ + return rt_mutex_cmpxchg(lock, owner, NULL); +} + #else # define rt_mutex_cmpxchg(l,c,n) (0) static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) @@ -90,6 +131,17 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) lock->owner = (struct task_struct *) ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); } + +/* + * Simple slow path only version: lock->owner is protected by lock->wait_lock. + */ +static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock) + __releases(lock->wait_lock) +{ + lock->owner = NULL; + raw_spin_unlock(&lock->wait_lock); + return true; +} #endif static inline int @@ -650,7 +702,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, /* * Wake up the next waiter on the lock. * - * Remove the top waiter from the current tasks waiter list and wake it up. + * Remove the top waiter from the current tasks pi waiter list and + * wake it up. * * Called with lock->wait_lock held.
*/ @@ -671,10 +724,23 @@ static void wakeup_next_waiter(struct rt_mutex *lock) */ rt_mutex_dequeue_pi(current, waiter); - rt_mutex_set_owner(lock, NULL); + /* + * As we are waking up the top waiter, and the waiter stays + * queued on the lock until it gets the lock, this lock + * obviously has waiters. Just set the bit here and this has + * the added benefit of forcing all new tasks into the + * slow path making sure no task of lower priority than + * the top waiter can steal this lock. + */ + lock->owner = (void *) RT_MUTEX_HAS_WAITERS; raw_spin_unlock_irqrestore(¤t->pi_lock, flags); + /* + * It's safe to dereference waiter as it cannot go away as + * long as we hold lock->wait_lock. The waiter task needs to + * acquire it in order to dequeue the waiter. + */ wake_up_process(waiter->task); } @@ -928,12 +994,49 @@ rt_mutex_slowunlock(struct rt_mutex *lock) rt_mutex_deadlock_account_unlock(current); - if (!rt_mutex_has_waiters(lock)) { - lock->owner = NULL; - raw_spin_unlock(&lock->wait_lock); - return; + /* + * We must be careful here if the fast path is enabled. If we + * have no waiters queued we cannot set owner to NULL here + * because of: + * + * foo->lock->owner = NULL; + * rtmutex_lock(foo->lock); <- fast path + * free = atomic_dec_and_test(foo->refcnt); + * rtmutex_unlock(foo->lock); <- fast path + * if (free) + * kfree(foo); + * raw_spin_unlock(foo->lock->wait_lock); + * + * So for the fastpath enabled kernel: + * + * Nothing can set the waiters bit as long as we hold + * lock->wait_lock. So we do the following sequence: + * + * owner = rt_mutex_owner(lock); + * clear_rt_mutex_waiters(lock); + * raw_spin_unlock(&lock->wait_lock); + * if (cmpxchg(&lock->owner, owner, 0) == owner) + * return; + * goto retry; + * + * The fastpath disabled variant is simple as all access to + * lock->owner is serialized by lock->wait_lock: + * + * lock->owner = NULL; + * raw_spin_unlock(&lock->wait_lock); + */ + while (!rt_mutex_has_waiters(lock)) { + /* Drops lock->wait_lock ! */ + if (unlock_rt_mutex_safe(lock) == true) + return; + /* Relock the rtmutex and try again */ + raw_spin_lock(&lock->wait_lock); } + /* + * The wakeup next waiter path does not suffer from the above + * race. See the comments there. + */ wakeup_next_waiter(lock); raw_spin_unlock(&lock->wait_lock); -- cgit From a6e15a39048ec3229b9a53425f4384f55f6cc1b3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 13 Jun 2014 13:30:35 -0700 Subject: PM / hibernate: introduce "nohibernate" boot parameter To support using kernel features that are not compatible with hibernation, this creates the "nohibernate" kernel boot parameter to disable both hibernation and resume. This allows hibernation support to be a boot-time choice instead of only a compile-time choice. Signed-off-by: Kees Cook Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- Documentation/kernel-parameters.txt | 3 +++ include/linux/suspend.h | 2 ++ kernel/power/hibernate.c | 31 ++++++++++++++++++++++++++++++- kernel/power/main.c | 6 ++---- kernel/power/user.c | 3 +++ 5 files changed, 40 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 6eaa9cdb7094..f8f0466b8b1d 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2184,6 +2184,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted. in certain environments such as networked servers or real-time systems. + nohibernate [HIBERNATION] Disable hibernation and resume. 
+ nohz= [KNL] Boottime enable/disable dynamic ticks Valid arguments: on, off Default: on @@ -2980,6 +2982,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. noresume Don't check if there's a hibernation image present during boot. nocompress Don't compress/decompress hibernation images. + no Disable hibernation and resume. retain_initrd [RAM] Keep initrd memory after extraction diff --git a/include/linux/suspend.h b/include/linux/suspend.h index f76994b9396c..519064e0c943 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -327,6 +327,7 @@ extern unsigned long get_safe_page(gfp_t gfp_mask); extern void hibernation_set_ops(const struct platform_hibernation_ops *ops); extern int hibernate(void); extern bool system_entering_hibernation(void); +extern bool hibernation_available(void); asmlinkage int swsusp_save(void); extern struct pbe *restore_pblist; #else /* CONFIG_HIBERNATION */ @@ -339,6 +340,7 @@ static inline void swsusp_unset_page_free(struct page *p) {} static inline void hibernation_set_ops(const struct platform_hibernation_ops *ops) {} static inline int hibernate(void) { return -ENOSYS; } static inline bool system_entering_hibernation(void) { return false; } +static inline bool hibernation_available(void) { return false; } #endif /* CONFIG_HIBERNATION */ /* Hibernation and suspend events */ diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 49e0a20fd010..258f492f0347 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -35,6 +35,7 @@ static int nocompress; static int noresume; +static int nohibernate; static int resume_wait; static unsigned int resume_delay; static char resume_file[256] = CONFIG_PM_STD_PARTITION; @@ -62,6 +63,11 @@ bool freezer_test_done; static const struct platform_hibernation_ops *hibernation_ops; +bool hibernation_available(void) +{ + return (nohibernate == 0); +} + /** * hibernation_set_ops - Set the global hibernate operations. * @ops: Hibernation operations to use in subsequent hibernation transitions. @@ -642,6 +648,11 @@ int hibernate(void) { int error; + if (!hibernation_available()) { + pr_debug("PM: Hibernation not available.\n"); + return -EPERM; + } + lock_system_sleep(); /* The snapshot device should not be opened while we're running */ if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { @@ -734,7 +745,7 @@ static int software_resume(void) /* * If the user said "noresume".. bail out early. */ - if (noresume) + if (noresume || !hibernation_available()) return 0; /* @@ -900,6 +911,9 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, int i; char *start = buf; + if (!hibernation_available()) + return sprintf(buf, "[disabled]\n"); + for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { if (!hibernation_modes[i]) continue; @@ -934,6 +948,9 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, char *p; int mode = HIBERNATION_INVALID; + if (!hibernation_available()) + return -EPERM; + p = memchr(buf, '\n', n); len = p ? 
p - buf : n; @@ -1101,6 +1118,10 @@ static int __init hibernate_setup(char *str) noresume = 1; else if (!strncmp(str, "nocompress", 10)) nocompress = 1; + else if (!strncmp(str, "no", 2)) { + noresume = 1; + nohibernate = 1; + } return 1; } @@ -1125,9 +1146,17 @@ static int __init resumedelay_setup(char *str) return 1; } +static int __init nohibernate_setup(char *str) +{ + noresume = 1; + nohibernate = 1; + return 1; +} + __setup("noresume", noresume_setup); __setup("resume_offset=", resume_offset_setup); __setup("resume=", resume_setup); __setup("hibernate=", hibernate_setup); __setup("resumewait", resumewait_setup); __setup("resumedelay=", resumedelay_setup); +__setup("nohibernate", nohibernate_setup); diff --git a/kernel/power/main.c b/kernel/power/main.c index 573410d6647e..8e90f330f139 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -300,13 +300,11 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, s += sprintf(s,"%s ", pm_states[i].label); #endif -#ifdef CONFIG_HIBERNATION - s += sprintf(s, "%s\n", "disk"); -#else + if (hibernation_available()) + s += sprintf(s, "disk "); if (s != buf) /* convert the last space to a newline */ *(s-1) = '\n'; -#endif return (s - buf); } diff --git a/kernel/power/user.c b/kernel/power/user.c index 98d357584cd6..526e8911460a 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -49,6 +49,9 @@ static int snapshot_open(struct inode *inode, struct file *filp) struct snapshot_data *data; int error; + if (!hibernation_available()) + return -EPERM; + lock_system_sleep(); if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { -- cgit From 24f2e0273f80ec262a772059e140a0adef35296d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 13 Jun 2014 13:30:36 -0700 Subject: x86, kaslr: boot-time selectable with hibernation Changes kASLR from being compile-time selectable (blocked by CONFIG_HIBERNATION), to being boot-time selectable (with hibernation available by default) via the "kaslr" kernel command line. Signed-off-by: Kees Cook Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- Documentation/kernel-parameters.txt | 11 +++++++---- arch/x86/Kconfig | 1 - arch/x86/boot/compressed/aslr.c | 9 ++++++++- kernel/power/hibernate.c | 6 ++++++ 4 files changed, 21 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index f8f0466b8b1d..884904975d0b 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1474,6 +1474,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted. js= [HW,JOY] Analog joystick See Documentation/input/joystick.txt. + kaslr/nokaslr [X86] + Enable/disable kernel and module base offset ASLR + (Address Space Layout Randomization) if built into + the kernel. When CONFIG_HIBERNATION is selected, + kASLR is disabled by default. When kASLR is enabled, + hibernation will be disabled. + keepinitrd [HW,ARM] kernelcore=nn[KMG] [KNL,X86,IA-64,PPC] This parameter @@ -2110,10 +2117,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted. noapic [SMP,APIC] Tells the kernel to not make use of any IOAPICs that may be present in the system. - nokaslr [X86] - Disable kernel and module base offset ASLR (Address - Space Layout Randomization) if built into the kernel. - noautogroup Disable scheduler automatic task group creation. 
nobats [PPC] Do not use BATs for mapping kernel lowmem diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fcefdda5136d..a8f749ef0fdc 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1672,7 +1672,6 @@ config RELOCATABLE config RANDOMIZE_BASE bool "Randomize the address of the kernel image" depends on RELOCATABLE - depends on !HIBERNATION default n ---help--- Randomizes the physical and virtual address at which the diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c index 4dbf967da50d..fc6091abedb7 100644 --- a/arch/x86/boot/compressed/aslr.c +++ b/arch/x86/boot/compressed/aslr.c @@ -289,10 +289,17 @@ unsigned char *choose_kernel_location(unsigned char *input, unsigned long choice = (unsigned long)output; unsigned long random; +#ifdef CONFIG_HIBERNATION + if (!cmdline_find_option_bool("kaslr")) { + debug_putstr("KASLR disabled by default...\n"); + goto out; + } +#else if (cmdline_find_option_bool("nokaslr")) { - debug_putstr("KASLR disabled...\n"); + debug_putstr("KASLR disabled by cmdline...\n"); goto out; } +#endif /* Record the various known unsafe memory ranges. */ mem_avoid_init((unsigned long)input, input_size, diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 258f492f0347..fcc2611d3f14 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1153,6 +1153,11 @@ static int __init nohibernate_setup(char *str) return 1; } +static int __init kaslr_nohibernate_setup(char *str) +{ + return nohibernate_setup(str); +} + __setup("noresume", noresume_setup); __setup("resume_offset=", resume_offset_setup); __setup("resume=", resume_setup); @@ -1160,3 +1165,4 @@ __setup("hibernate=", hibernate_setup); __setup("resumewait", resumewait_setup); __setup("resumedelay=", resumedelay_setup); __setup("nohibernate", nohibernate_setup); +__setup("kaslr", kaslr_nohibernate_setup); -- cgit From 99bae5f94185c2cc65701e95c54e31e2f4345b88 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 12 Jun 2014 14:31:31 +0800 Subject: cgroup: fix broken css_has_online_children() After running: # mount -t cgroup cpu xxx /cgroup && mkdir /cgroup/sub && \ rmdir /cgroup/sub && umount /cgroup I found the cgroup root still existed: # cat /proc/cgroups #subsys_name hierarchy num_cgroups enabled cpuset 0 1 1 cpu 1 1 1 ... It turned out css_has_online_children() is broken. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7868fc3c0bc5..d9a8be911f5b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3328,7 +3328,7 @@ bool css_has_online_children(struct cgroup_subsys_state *css) rcu_read_lock(); css_for_each_child(child, css) { - if (css->flags & CSS_ONLINE) { + if (child->flags & CSS_ONLINE) { ret = true; break; } -- cgit From 4af4206be2bd1933cae20c2b6fb2058dbc887f7c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 13 Apr 2014 20:58:54 +0200 Subject: tracing: Fix syscall_*regfunc() vs copy_process() race syscall_regfunc() and syscall_unregfunc() should set/clear TIF_SYSCALL_TRACEPOINT system-wide, but do_each_thread() can race with copy_process() and miss the new child which was not added to the process/thread lists yet. Change copy_process() to update the child's TIF_SYSCALL_TRACEPOINT under tasklist. Link: http://lkml.kernel.org/p/20140413185854.GB20668@redhat.com Cc: stable@vger.kernel.org # 2.6.33 Fixes: a871bd33a6c0 "tracing: Add syscall tracepoints" Acked-by: Frederic Weisbecker Acked-by: Paul E.
McKenney Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- include/trace/syscall.h | 15 +++++++++++++++ kernel/fork.c | 2 ++ 2 files changed, 17 insertions(+) (limited to 'kernel') diff --git a/include/trace/syscall.h b/include/trace/syscall.h index fed853f3d7aa..9674145e2f6a 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -4,6 +4,7 @@ #include #include #include +#include #include @@ -32,4 +33,18 @@ struct syscall_metadata { struct ftrace_event_call *exit_event; }; +#if defined(CONFIG_TRACEPOINTS) && defined(CONFIG_HAVE_SYSCALL_TRACEPOINTS) +static inline void syscall_tracepoint_update(struct task_struct *p) +{ + if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) + set_tsk_thread_flag(p, TIF_SYSCALL_TRACEPOINT); + else + clear_tsk_thread_flag(p, TIF_SYSCALL_TRACEPOINT); +} +#else +static inline void syscall_tracepoint_update(struct task_struct *p) +{ +} +#endif + #endif /* _TRACE_SYSCALL_H */ diff --git a/kernel/fork.c b/kernel/fork.c index d2799d1fc952..6a13c46cd87d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1487,7 +1487,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, total_forks++; spin_unlock(¤t->sighand->siglock); + syscall_tracepoint_update(p); write_unlock_irq(&tasklist_lock); + proc_fork_connector(p); cgroup_post_fork(p); if (clone_flags & CLONE_THREAD) -- cgit From 8063e41d2ffc0b0ce974ea802158be35902072f3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 13 Apr 2014 20:59:18 +0200 Subject: tracing: Change syscall_*regfunc() to check PF_KTHREAD and use for_each_process_thread() 1. Remove _irqsafe from syscall_regfunc/syscall_unregfunc, read_lock(tasklist) doesn't need to disable irqs. 2. Change this code to avoid the deprecated do_each_thread() and use for_each_process_thread() (stolen from the patch from Frederic). 3. Change syscall_regfunc() to check PF_KTHREAD to skip the kernel threads, ->mm != NULL is the common mistake. Note: probably this check should be simply removed, needs another patch. [fweisbec@gmail.com: s/do_each_thread/for_each_process_thread/] Link: http://lkml.kernel.org/p/20140413185918.GC20668@redhat.com Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/tracepoint.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 33cbd8c203f8..9cf12640de5a 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -492,33 +492,31 @@ static int sys_tracepoint_refcount; void syscall_regfunc(void) { - unsigned long flags; - struct task_struct *g, *t; + struct task_struct *p, *t; if (!sys_tracepoint_refcount) { - read_lock_irqsave(&tasklist_lock, flags); - do_each_thread(g, t) { + read_lock(&tasklist_lock); + for_each_process_thread(p, t) { /* Skip kernel threads. 
*/ - if (t->mm) + if (!(t->flags & PF_KTHREAD)) set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); - } while_each_thread(g, t); - read_unlock_irqrestore(&tasklist_lock, flags); + } + read_unlock(&tasklist_lock); } sys_tracepoint_refcount++; } void syscall_unregfunc(void) { - unsigned long flags; - struct task_struct *g, *t; + struct task_struct *p, *t; sys_tracepoint_refcount--; if (!sys_tracepoint_refcount) { - read_lock_irqsave(&tasklist_lock, flags); - do_each_thread(g, t) { + read_lock(&tasklist_lock); + for_each_process_thread(p, t) { clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); - } while_each_thread(g, t); - read_unlock_irqrestore(&tasklist_lock, flags); + } + read_unlock(&tasklist_lock); } } #endif -- cgit From ea73c79e33c45e1fa0071e216f06fd5682314490 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 13 Apr 2014 20:59:38 +0200 Subject: tracing: syscall_regfunc() should not skip kernel threads syscall_regfunc() ignores the kernel threads because "it has no effect", see cc3b13c1 "Don't trace kernel thread syscalls" which added this check. However, this means that a user-space task spawned by call_usermodehelper() will run without TIF_SYSCALL_TRACEPOINT if sys_tracepoint_refcount != 0. Remove this check. The unnecessary report from ret_from_fork path mentioned by cc3b13c1 is no longer possible, see commit fb45550d76bb5 "make sure that kernel_thread() callbacks call do_exit() themselves". A kernel_thread() callback can only return and take the int_ret_from_sys_call path after do_execve() succeeds, otherwise the kernel will crash. But in this case it is no longer a kernel thread and thus it needs TIF_SYSCALL_TRACEPOINT. Link: http://lkml.kernel.org/p/20140413185938.GD20668@redhat.com Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/tracepoint.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 9cf12640de5a..3490407dc7b7 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -497,9 +497,7 @@ void syscall_regfunc(void) if (!sys_tracepoint_refcount) { read_lock(&tasklist_lock); for_each_process_thread(p, t) { - /* Skip kernel threads. */ - if (!(t->flags & PF_KTHREAD)) - set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); + set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); } read_unlock(&tasklist_lock); } -- cgit From bddbceb688c6d0decaabc7884fede319d02f96c8 Mon Sep 17 00:00:00 2001 From: Maxime Bizon Date: Mon, 23 Jun 2014 16:35:35 +0200 Subject: workqueue: fix dev_set_uevent_suppress() imbalance Uevents are suppressed during attributes registration, but never restored, so kobject_uevent() does nothing. Signed-off-by: Maxime Bizon Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org Fixes: 226223ab3c4118ddd10688cc2c131135848371ab --- kernel/workqueue.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6203d2900877..6f5f9c7323f4 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3284,6 +3284,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq) } } + dev_set_uevent_suppress(&wq_dev->dev, false); kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); return 0; } -- cgit From 8d056c48e486249e6487910b83e0f3be7c14acf7 Mon Sep 17 00:00:00 2001 From: "Srivatsa S.
Bhat" Date: Mon, 23 Jun 2014 13:22:02 -0700 Subject: CPU hotplug, smp: flush any pending IPI callbacks before CPU offline There is a race between the CPU offline code (within stop-machine) and the smp-call-function code, which can lead to getting IPIs on the outgoing CPU, *after* it has gone offline. Specifically, this can happen when using smp_call_function_single_async() to send the IPI, since this API allows sending asynchronous IPIs from IRQ disabled contexts. The exact race condition is described below. During CPU offline, in stop-machine, we don't enforce any rule in the _DISABLE_IRQ stage, regarding the order in which the outgoing CPU and the other CPUs disable their local interrupts. Due to this, we can encounter a situation in which an IPI is sent by one of the other CPUs to the outgoing CPU (while it is *still* online), but the outgoing CPU ends up noticing it only *after* it has gone offline. CPU 1 CPU 2 (Online CPU) (CPU going offline) Enter _PREPARE stage Enter _PREPARE stage Enter _DISABLE_IRQ stage = Got a device interrupt, and | Didn't notice the IPI the interrupt handler sent an | since interrupts were IPI to CPU 2 using | disabled on this CPU. smp_call_function_single_async() | = Enter _DISABLE_IRQ stage Enter _RUN stage Enter _RUN stage = Busy loop with interrupts | Invoke take_cpu_down() disabled. | and take CPU 2 offline = Enter _EXIT stage Enter _EXIT stage Re-enable interrupts Re-enable interrupts The pending IPI is noted immediately, but alas, the CPU is offline at this point. This of course, makes the smp-call-function IPI handler code running on CPU 2 unhappy and it complains about "receiving an IPI on an offline CPU". One real example of the scenario on CPU 1 is the block layer's complete-request call-path: __blk_complete_request() [interrupt-handler] raise_blk_irq() smp_call_function_single_async() However, if we look closely, the block layer does check that the target CPU is online before firing the IPI. So in this case, it is actually the unfortunate ordering/timing of events in the stop-machine phase that leads to receiving IPIs after the target CPU has gone offline. In reality, getting a late IPI on an offline CPU is not too bad by itself (this can happen even due to hardware latencies in IPI send-receive). It is a bug only if the target CPU really went offline without executing all the callbacks queued on its list. (Note that a CPU is free to execute its pending smp-call-function callbacks in a batch, without waiting for the corresponding IPIs to arrive for each one of those callbacks). So, fixing this issue can be broken up into two parts: 1. Ensure that a CPU goes offline only after executing all the callbacks queued on it. 2. Modify the warning condition in the smp-call-function IPI handler code such that it warns only if an offline CPU got an IPI *and* that CPU had gone offline with callbacks still pending in its queue. Achieving part 1 is straight-forward - just flush (execute) all the queued callbacks on the outgoing CPU in the CPU_DYING stage[1], including those callbacks for which the source CPU's IPIs might not have been received on the outgoing CPU yet. Once we do this, an IPI that arrives late on the CPU going offline (either due to the race mentioned above, or due to hardware latencies) will be completely harmless, since the outgoing CPU would have executed all the queued callbacks before going offline. 
Overall, this fix (parts 1 and 2 put together) additionally guarantees that we will see a warning only when the *IPI-sender code* is buggy - that is, if it queues the callback _after_ the target CPU has gone offline. [1]. The CPU_DYING part needs a little more explanation: by the time we execute the CPU_DYING notifier callbacks, the CPU would have already been marked offline. But we want to flush out the pending callbacks at this stage, ignoring the fact that the CPU is offline. So restructure the IPI handler code so that we can by-pass the "is-cpu-offline?" check in this particular case. (Of course, the right solution here is to fix CPU hotplug to mark the CPU offline _after_ invoking the CPU_DYING notifiers, but this requires a lot of audit to ensure that this change doesn't break any existing code; hence lets go with the solution proposed above until that is done). [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Srivatsa S. Bhat Suggested-by: Frederic Weisbecker Cc: "Paul E. McKenney" Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Frederic Weisbecker Cc: Gautham R Shenoy Cc: Ingo Molnar Cc: Mel Gorman Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Rik van Riel Cc: Rusty Russell Cc: Steven Rostedt Cc: Tejun Heo Cc: Thomas Gleixner Tested-by: Sachin Kamat Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smp.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 49 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 306f8180b0d5..80c33f8de14f 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -29,6 +29,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); +static void flush_smp_call_function_queue(bool warn_cpu_offline); + static int hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -51,12 +53,27 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: + /* Fall-through to the CPU_DEAD[_FROZEN] case. */ case CPU_DEAD: case CPU_DEAD_FROZEN: free_cpumask_var(cfd->cpumask); free_percpu(cfd->csd); break; + + case CPU_DYING: + case CPU_DYING_FROZEN: + /* + * The IPIs for the smp-call-function callbacks queued by other + * CPUs might arrive late, either due to hardware latencies or + * because this CPU disabled interrupts (inside stop-machine) + * before the IPIs were sent. So flush out any pending callbacks + * explicitly (without waiting for the IPIs to arrive), to + * ensure that the outgoing CPU doesn't go offline with work + * still pending. + */ + flush_smp_call_function_queue(false); + break; #endif }; @@ -177,23 +194,47 @@ static int generic_exec_single(int cpu, struct call_single_data *csd, return 0; } -/* - * Invoked by arch to handle an IPI for call function single. Must be - * called from the arch with interrupts disabled. +/** + * generic_smp_call_function_single_interrupt - Execute SMP IPI callbacks + * + * Invoked by arch to handle an IPI for call function single. + * Must be called with interrupts disabled. */ void generic_smp_call_function_single_interrupt(void) { + flush_smp_call_function_queue(true); +} + +/** + * flush_smp_call_function_queue - Flush pending smp-call-function callbacks + * + * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an + * offline CPU. 
Skip this check if set to 'false'. + * + * Flush any pending smp-call-function callbacks queued on this CPU. This is + * invoked by the generic IPI handler, as well as by a CPU about to go offline, + * to ensure that all pending IPI callbacks are run before it goes completely + * offline. + * + * Loop through the call_single_queue and run all the queued callbacks. + * Must be called with interrupts disabled. + */ +static void flush_smp_call_function_queue(bool warn_cpu_offline) +{ + struct llist_head *head; struct llist_node *entry; struct call_single_data *csd, *csd_next; static bool warned; - entry = llist_del_all(&__get_cpu_var(call_single_queue)); + WARN_ON(!irqs_disabled()); + + head = &__get_cpu_var(call_single_queue); + entry = llist_del_all(head); entry = llist_reverse_order(entry); - /* - * Shouldn't receive this interrupt on a cpu that is not yet online. - */ - if (unlikely(!cpu_online(smp_processor_id()) && !warned)) { + /* There shouldn't be any pending callbacks on an offline CPU. */ + if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) && + !warned && !llist_empty(head))) { warned = true; WARN(1, "IPI on offline CPU %d\n", smp_processor_id()); -- cgit From b3acc56bfe1287c6b666e80edc70b89eea2a1a80 Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Mon, 23 Jun 2014 13:22:03 -0700 Subject: kexec: save PG_head_mask in VMCOREINFO To allow filtering of huge pages, makedumpfile must be able to identify them in the dump. This can be done by checking the appropriate page flag, so communicate its value to makedumpfile through the VMCOREINFO interface. There's only one small catch. Depending on how many page flags are available on a given architecture, this bit can be called PG_head or PG_compound. I sent a similar patch back in 2012, but Eric Biederman did not like using an #ifdef. So, this time I'm adding a common symbol (PG_head_mask) instead. See https://lkml.org/lkml/2012/11/28/91 for the previous version. 
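For illustration, a dump filter could consume the exported value roughly like this (a hypothetical consumer-side sketch, not part of this patch):

	/* parsed from the VMCOREINFO note: "NUMBER(PG_head_mask)=..." */
	static unsigned long pg_head_mask;

	/* A page whose flags contain the head bit starts a compound
	 * (huge) page; the filter can then skip its tail pages. */
	static int page_starts_hugepage(unsigned long page_flags)
	{
		return (page_flags & pg_head_mask) != 0;
	}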
Signed-off-by: Petr Tesarik Acked-by: Vivek Goyal Cc: Eric Biederman Cc: Paul Mackerras Cc: Fengguang Wu Cc: Benjamin Herrenschmidt Cc: Shaohua Li Cc: Alexey Kardashevskiy Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 3 +++ kernel/kexec.c | 1 + 2 files changed, 4 insertions(+) (limited to 'kernel') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 3c545b48aeab..8304959ad336 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -360,6 +360,9 @@ static inline void ClearPageCompound(struct page *page) ClearPageHead(page); } #endif + +#define PG_head_mask ((1L << PG_head)) + #else /* * Reduce page flag use as much as possible by overlapping diff --git a/kernel/kexec.c b/kernel/kexec.c index 6748688813d0..369f41a94124 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1617,6 +1617,7 @@ static int __init crash_save_vmcoreinfo_init(void) #ifdef CONFIG_MEMORY_FAILURE VMCOREINFO_NUMBER(PG_hwpoison); #endif + VMCOREINFO_NUMBER(PG_head_mask); VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); arch_crash_save_vmcoreinfo(); -- cgit From bde92cf455a03a91badb7046855592d8c008e929 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Mon, 23 Jun 2014 13:22:03 -0700 Subject: kernel/watchdog.c: remove preemption restrictions when restarting lockup detector Peter Wu noticed the following splat on his machine when updating /proc/sys/kernel/watchdog_thresh: BUG: sleeping function called from invalid context at mm/slub.c:965 in_atomic(): 1, irqs_disabled(): 0, pid: 1, name: init 3 locks held by init/1: #0: (sb_writers#3){.+.+.+}, at: [] vfs_write+0x143/0x180 #1: (watchdog_proc_mutex){+.+.+.}, at: [] proc_dowatchdog+0x33/0x110 #2: (cpu_hotplug.lock){.+.+.+}, at: [] get_online_cpus+0x32/0x80 Preemption disabled at:[] proc_dowatchdog+0xe4/0x110 CPU: 0 PID: 1 Comm: init Not tainted 3.16.0-rc1-testing #34 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: dump_stack+0x4e/0x7a __might_sleep+0x11d/0x190 kmem_cache_alloc_trace+0x4e/0x1e0 perf_event_alloc+0x55/0x440 perf_event_create_kernel_counter+0x26/0xe0 watchdog_nmi_enable+0x75/0x140 update_timers_all_cpus+0x53/0xa0 proc_dowatchdog+0xe4/0x110 proc_sys_call_handler+0xb3/0xc0 proc_sys_write+0x14/0x20 vfs_write+0xad/0x180 SyS_write+0x49/0xb0 system_call_fastpath+0x16/0x1b NMI watchdog: disabled (cpu0): hardware events not enabled What happened is after updating the watchdog_thresh, the lockup detector is restarted to utilize the new value. Part of this process involved disabling preemption. Once preemption was disabled, perf tried to allocate a new event (as part of the restart). This caused the above BUG_ON as you can't sleep with preemption disabled. The preemption restriction seemed aggressive as we are not doing anything on that particular cpu, but with all the online cpus (which are protected by the get_online_cpus lock). Remove the restriction and the BUG_ON goes away.
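(Reduced to a sketch, the invalid pattern was the following; any blocking GFP_KERNEL allocation under preempt_disable() trips the might-sleep check:)

	preempt_disable();
	/* perf_event_alloc() allocates with GFP_KERNEL: may sleep */
	watchdog_nmi_enable(cpu);
	preempt_enable();

whereas walking the online CPUs only needs hotplug protection:

	get_online_cpus();		/* sleeping is fine in here */
	for_each_online_cpu(cpu)
		update_timers(cpu);
	put_online_cpus();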
Signed-off-by: Don Zickus Acked-by: Michal Hocko Reported-by: Peter Wu Tested-by: Peter Wu Acked-by: David Rientjes Cc: [3.13+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 516203e665fc..30e482240dae 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -527,10 +527,8 @@ static void update_timers_all_cpus(void) int cpu; get_online_cpus(); - preempt_disable(); for_each_online_cpu(cpu) update_timers(cpu); - preempt_enable(); put_online_cpus(); } -- cgit From 7cd2b0a34ab8e4db971920eef8982f985441adfb Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 23 Jun 2014 13:22:04 -0700 Subject: mm, pcp: allow restoring percpu_pagelist_fraction default Oleg reports a division by zero error on zero-length write() to the percpu_pagelist_fraction sysctl: divide error: 0000 [#1] SMP DEBUG_PAGEALLOC CPU: 1 PID: 9142 Comm: badarea_io Not tainted 3.15.0-rc2-vm-nfs+ #19 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 task: ffff8800d5aeb6e0 ti: ffff8800d87a2000 task.ti: ffff8800d87a2000 RIP: 0010: percpu_pagelist_fraction_sysctl_handler+0x84/0x120 RSP: 0018:ffff8800d87a3e78 EFLAGS: 00010246 RAX: 0000000000000f89 RBX: ffff88011f7fd000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000010 RBP: ffff8800d87a3e98 R08: ffffffff81d002c8 R09: ffff8800d87a3f50 R10: 000000000000000b R11: 0000000000000246 R12: 0000000000000060 R13: ffffffff81c3c3e0 R14: ffffffff81cfddf8 R15: ffff8801193b0800 FS: 00007f614f1e9740(0000) GS:ffff88011f440000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00007f614f1fa000 CR3: 00000000d9291000 CR4: 00000000000006e0 Call Trace: proc_sys_call_handler+0xb3/0xc0 proc_sys_write+0x14/0x20 vfs_write+0xba/0x1e0 SyS_write+0x46/0xb0 tracesys+0xe1/0xe6 However, if the percpu_pagelist_fraction sysctl is set by the user, it is also impossible to restore it to the kernel default since the user cannot write 0 to the sysctl. This patch allows the user to write 0 to restore the default behavior. It still requires a fraction equal to or larger than 8, however, as stated by the documentation for sanity. If a value in the range [1, 7] is written, the sysctl will return EINVAL. This successfully solves the divide by zero issue at the same time. Signed-off-by: David Rientjes Reported-by: Oleg Drokin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 3 ++- kernel/sysctl.c | 3 +-- mm/page_alloc.c | 40 ++++++++++++++++++++++++++++------------ 3 files changed, 31 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index bd4b34c03738..4415aa915681 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -702,7 +702,8 @@ The batch value of each per cpu pagelist is also updated as a result. It is set to pcp->high/4. The upper limit of batch is (PAGE_SHIFT * 8) The initial value is zero. Kernel does not use this value at boot time to set -the high water marks for each per cpu page list. +the high water marks for each per cpu page list. If the user writes '0' to this +sysctl, it will revert to this default behavior. 
============================================================== diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7de6555cfea0..075d1903138f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -136,7 +136,6 @@ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; static int minolduid; -static int min_percpu_pagelist_fract = 8; static int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; @@ -1317,7 +1316,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(percpu_pagelist_fraction), .mode = 0644, .proc_handler = percpu_pagelist_fraction_sysctl_handler, - .extra1 = &min_percpu_pagelist_fract, + .extra1 = &zero, }, #ifdef CONFIG_MMU { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4f59fa29eda8..20d17f8266fe 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -69,6 +69,7 @@ /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); +#define MIN_PERCPU_PAGELIST_FRACTION (8) #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); @@ -4145,7 +4146,7 @@ static void __meminit zone_init_free_lists(struct zone *zone) memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) #endif -static int __meminit zone_batchsize(struct zone *zone) +static int zone_batchsize(struct zone *zone) { #ifdef CONFIG_MMU int batch; @@ -4261,8 +4262,8 @@ static void pageset_set_high(struct per_cpu_pageset *p, pageset_update(&p->pcp, high, batch); } -static void __meminit pageset_set_high_and_batch(struct zone *zone, - struct per_cpu_pageset *pcp) +static void pageset_set_high_and_batch(struct zone *zone, + struct per_cpu_pageset *pcp) { if (percpu_pagelist_fraction) pageset_set_high(pcp, @@ -5881,23 +5882,38 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; - unsigned int cpu; + int old_percpu_pagelist_fraction; int ret; + mutex_lock(&pcp_batch_high_lock); + old_percpu_pagelist_fraction = percpu_pagelist_fraction; + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); - if (!write || (ret < 0)) - return ret; + if (!write || ret < 0) + goto out; + + /* Sanity checking to avoid pcp imbalance */ + if (percpu_pagelist_fraction && + percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) { + percpu_pagelist_fraction = old_percpu_pagelist_fraction; + ret = -EINVAL; + goto out; + } + + /* No change? */ + if (percpu_pagelist_fraction == old_percpu_pagelist_fraction) + goto out; - mutex_lock(&pcp_batch_high_lock); for_each_populated_zone(zone) { - unsigned long high; - high = zone->managed_pages / percpu_pagelist_fraction; + unsigned int cpu; + for_each_possible_cpu(cpu) - pageset_set_high(per_cpu_ptr(zone->pageset, cpu), - high); + pageset_set_high_and_batch(zone, + per_cpu_ptr(zone->pageset, cpu)); } +out: mutex_unlock(&pcp_batch_high_lock); - return 0; + return ret; } int hashdist = HASHDIST_DEFAULT; -- cgit From ed235875e2ca983197831337a986f0517074e1a0 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Mon, 23 Jun 2014 13:22:05 -0700 Subject: kernel/watchdog.c: print traces for all cpus on lockup detection A 'softlockup' is defined as a bug that causes the kernel to loop in kernel mode for more than a predefined period of time, without giving other tasks a chance to run.
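(For illustration, a kernel-mode loop of the following shape — a contrived sketch — trips the detector: the watchdog's hrtimer interrupt keeps firing, but the hogging task never schedules, so the per-cpu watchdog task cannot run:)

	preempt_disable();
	while (!done)			/* 'done' never becomes true */
		cpu_relax();
	preempt_enable();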
Currently, upon detection of this condition by the per-cpu watchdog task, debug information (including a stack trace) is sent to the system log. On some occasions, we have observed that the "victim" rather than the actual "culprit" (i.e. the owner/holder of the contended resource) is reported to the user. Often this information has proven to be insufficient to assist debugging efforts. To avoid loss of useful debug information, for architectures which support NMI, this patch makes it possible to improve soft lockup reporting. This is accomplished by issuing an NMI to each cpu to obtain a stack trace. If NMI is not supported we just revert back to the old method. A sysctl and boot-time parameter is available to toggle this feature. [dzickus@redhat.com: add CONFIG_SMP in certain areas] [akpm@linux-foundation.org: additional CONFIG_SMP=n optimisations] [mq@suse.cz: fix warning] Signed-off-by: Aaron Tomlin Signed-off-by: Don Zickus Cc: David S. Miller Cc: Mateusz Guzik Cc: Oleg Nesterov Signed-off-by: Jan Moskyto Matejka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 5 +++++ Documentation/sysctl/kernel.txt | 17 ++++++++++++++++ include/linux/nmi.h | 1 + kernel/sysctl.c | 11 +++++++++++ kernel/watchdog.c | 39 +++++++++++++++++++++++++++++++++++++ 5 files changed, 73 insertions(+) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 884904975d0b..c1b9aa8c5a52 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3130,6 +3130,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. [KNL] Should the soft-lockup detector generate panics. Format: + softlockup_all_cpu_backtrace= + [KNL] Should the soft-lockup detector generate + backtraces on all cpus. + Format: + sonypi.*= [HW] Sony Programmable I/O Control Device driver See Documentation/laptops/sonypi.txt diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 708bb7f1b7e0..c14374e71775 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -75,6 +75,7 @@ show up in /proc/sys/kernel: - shmall - shmmax [ sysv ipc ] - shmmni +- softlockup_all_cpu_backtrace - stop-a [ SPARC only ] - sysrq ==> Documentation/sysrq.txt - sysctl_writes_strict @@ -783,6 +784,22 @@ via the /proc/sys interface: ============================================================== +softlockup_all_cpu_backtrace: + +This value controls the soft lockup detector thread's behavior +when a soft lockup condition is detected as to whether or not +to gather further debug information. If enabled, each cpu will +be issued an NMI and instructed to capture stack trace. + +This feature is only applicable for architectures which support +NMI. + +0: do nothing. This is the default behavior. + +1: on detection capture more debug information. + +============================================================== + tainted: Non-zero if the kernel has been tainted. 
Numeric values, which diff --git a/include/linux/nmi.h b/include/linux/nmi.h index a17ab6398d7c..447775ee2c4b 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -57,6 +57,7 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *); u64 hw_nmi_get_sample_period(int watchdog_thresh); extern int watchdog_user_enabled; extern int watchdog_thresh; +extern int sysctl_softlockup_all_cpu_backtrace; struct ctl_table; extern int proc_dowatchdog(struct ctl_table *, int , void __user *, size_t *, loff_t *); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 075d1903138f..75b22e22a72c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -860,6 +860,17 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, +#ifdef CONFIG_SMP + { + .procname = "softlockup_all_cpu_backtrace", + .data = &sysctl_softlockup_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif /* CONFIG_SMP */ { .procname = "nmi_watchdog", .data = &watchdog_user_enabled, diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 30e482240dae..c3319bd1b040 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -31,6 +31,12 @@ int watchdog_user_enabled = 1; int __read_mostly watchdog_thresh = 10; +#ifdef CONFIG_SMP +int __read_mostly sysctl_softlockup_all_cpu_backtrace; +#else +#define sysctl_softlockup_all_cpu_backtrace 0 +#endif + static int __read_mostly watchdog_running; static u64 __read_mostly sample_period; @@ -47,6 +53,7 @@ static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); #endif +static unsigned long soft_lockup_nmi_warn; /* boot commands */ /* @@ -95,6 +102,15 @@ static int __init nosoftlockup_setup(char *str) } __setup("nosoftlockup", nosoftlockup_setup); /* */ +#ifdef CONFIG_SMP +static int __init softlockup_all_cpu_backtrace_setup(char *str) +{ + sysctl_softlockup_all_cpu_backtrace = + !!simple_strtol(str, NULL, 0); + return 1; +} +__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); +#endif /* * Hard-lockup warnings should be triggered after just a few seconds. Soft- @@ -271,6 +287,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts); struct pt_regs *regs = get_irq_regs(); int duration; + int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; /* kick the hardlockup detector */ watchdog_interrupt_count(); @@ -317,6 +334,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) if (__this_cpu_read(soft_watchdog_warn) == true) return HRTIMER_RESTART; + if (softlockup_all_cpu_backtrace) { + /* Prevent multiple soft-lockup reports if one cpu is already + * engaged in dumping cpu back traces + */ + if (test_and_set_bit(0, &soft_lockup_nmi_warn)) { + /* Someone else will report us. Let's give up */ + __this_cpu_write(soft_watchdog_warn, true); + return HRTIMER_RESTART; + } + } + printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! 
[%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); @@ -327,6 +355,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) else dump_stack(); + if (softlockup_all_cpu_backtrace) { + /* Avoid generating two back traces for current + * given that one is already made above + */ + trigger_allbutself_cpu_backtrace(); + + clear_bit(0, &soft_lockup_nmi_warn); + /* Barrier to sync with other cpus */ + smp_mb__after_atomic(); + } + if (softlockup_panic) panic("softlockup: hung tasks"); __this_cpu_write(soft_watchdog_warn, true); -- cgit From 391acf970d21219a2a5446282d3b20eace0c0d7a Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Wed, 25 Jun 2014 09:57:18 +0800 Subject: cpuset,mempolicy: fix sleeping function called from invalid context When runing with the kernel(3.15-rc7+), the follow bug occurs: [ 9969.258987] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:586 [ 9969.359906] in_atomic(): 1, irqs_disabled(): 0, pid: 160655, name: python [ 9969.441175] INFO: lockdep is turned off. [ 9969.488184] CPU: 26 PID: 160655 Comm: python Tainted: G A 3.15.0-rc7+ #85 [ 9969.581032] Hardware name: FUJITSU-SV PRIMEQUEST 1800E/SB, BIOS PRIMEQUEST 1000 Series BIOS Version 1.39 11/16/2012 [ 9969.706052] ffffffff81a20e60 ffff8803e941fbd0 ffffffff8162f523 ffff8803e941fd18 [ 9969.795323] ffff8803e941fbe0 ffffffff8109995a ffff8803e941fc58 ffffffff81633e6c [ 9969.884710] ffffffff811ba5dc ffff880405c6b480 ffff88041fdd90a0 0000000000002000 [ 9969.974071] Call Trace: [ 9970.003403] [] dump_stack+0x4d/0x66 [ 9970.065074] [] __might_sleep+0xfa/0x130 [ 9970.130743] [] mutex_lock_nested+0x3c/0x4f0 [ 9970.200638] [] ? kmem_cache_alloc+0x1bc/0x210 [ 9970.272610] [] cpuset_mems_allowed+0x27/0x140 [ 9970.344584] [] ? __mpol_dup+0x63/0x150 [ 9970.409282] [] __mpol_dup+0xe5/0x150 [ 9970.471897] [] ? __mpol_dup+0x63/0x150 [ 9970.536585] [] ? copy_process.part.23+0x606/0x1d40 [ 9970.613763] [] ? trace_hardirqs_on+0xd/0x10 [ 9970.683660] [] ? monotonic_to_bootbased+0x2f/0x50 [ 9970.759795] [] copy_process.part.23+0x670/0x1d40 [ 9970.834885] [] do_fork+0xd8/0x380 [ 9970.894375] [] ? __audit_syscall_entry+0x9c/0xf0 [ 9970.969470] [] SyS_clone+0x16/0x20 [ 9971.030011] [] stub_clone+0x69/0x90 [ 9971.091573] [] ? system_call_fastpath+0x16/0x1b The cause is that cpuset_mems_allowed() try to take mutex_lock(&callback_mutex) under the rcu_read_lock(which was hold in __mpol_dup()). And in cpuset_mems_allowed(), the access to cpuset is under rcu_read_lock, so in __mpol_dup, we can reduce the rcu_read_lock protection region to protect the access to cpuset only in current_cpuset_is_being_rebound(). So that we can avoid this bug. This patch is a temporary solution that just addresses the bug mentioned above, can not fix the long-standing issue about cpuset.mems rebinding on fork(): "When the forker's task_struct is duplicated (which includes ->mems_allowed) and it races with an update to cpuset_being_rebound in update_tasks_nodemask() then the task's mems_allowed doesn't get updated. And the child task's mems_allowed can be wrong if the cpuset's nodemask changes before the child has been added to the cgroup's tasklist." 
Signed-off-by: Gu Zheng Acked-by: Li Zefan Signed-off-by: Tejun Heo Cc: stable
--- kernel/cpuset.c | 8 +++++++- mm/mempolicy.c | 2 -- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel')
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f6b33c696224..d3df02e76643 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1181,7 +1181,13 @@ done: int current_cpuset_is_being_rebound(void) { - return task_cs(current) == cpuset_being_rebound; + int ret; + + rcu_read_lock(); + ret = task_cs(current) == cpuset_being_rebound; + rcu_read_unlock(); + + return ret; } static int update_relax_domain_level(struct cpuset *cs, s64 val)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 284974230459..9a3783ccff67 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2145,7 +2145,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) } else *new = *old; - rcu_read_lock(); if (current_cpuset_is_being_rebound()) { nodemask_t mems = cpuset_mems_allowed(current); if (new->flags & MPOL_F_REBINDING) @@ -2153,7 +2152,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) else mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE); } - rcu_read_unlock(); atomic_set(&new->refcnt, 1); return new; }
-- cgit
From 970317aa48c6ef66cd023c039c2650c897bad927 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 30 Jun 2014 11:49:58 +0800 Subject: cgroup: fix mount failure in a corner case
# cat test.sh #! /bin/bash mount -t cgroup -o cpu xxx /cgroup umount /cgroup mount -t cgroup -o cpu,cpuacct xxx /cgroup umount /cgroup # ./test.sh mount: xxx already mounted or /cgroup busy mount: according to mtab, xxx is already mounted on /cgroup
This is because the cgroupfs_root of the first mount was still being destroyed asynchronously. Fix this by delaying and then retrying mount for this case.
v3: - put the refcnt immediately after getting it. (Tejun) v2: - use percpu_ref_tryget_live() rather than introducing percpu_ref_alive(). (Tejun) - adjust comment. tj: Updated the comment a bit. Cc: # 3.15 Signed-off-by: Li Zefan Signed-off-by: Tejun Heo
--- kernel/cgroup.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'kernel')
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d9a8be911f5b..64068667be84 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1648,10 +1648,12 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { + struct cgroup_subsys *ss; struct cgroup_root *root; struct cgroup_sb_opts opts; struct dentry *dentry; int ret; + int i; bool new_sb; /* @@ -1677,6 +1679,27 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, goto out_unlock; } + /* + * Destruction of cgroup root is asynchronous, so subsystems may + * still be dying after the previous unmount. Let's drain the + * dying subsystems. We just need to ensure that the ones + * unmounted previously finish dying and don't care about new ones + * starting. Testing ref liveliness is good enough.
+ */ + for_each_subsys(ss, i) { + if (!(opts.subsys_mask & (1 << i)) || + ss->root == &cgrp_dfl_root) + continue; + + if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) { + mutex_unlock(&cgroup_mutex); + msleep(10); + ret = restart_syscall(); + goto out_free; + } + cgroup_put(&ss->root->cgrp); + } + for_each_root(root) { bool name_match = false;
-- cgit
From 3a32bd72d77058d768dbb38183ad517f720dd1bc Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 30 Jun 2014 11:50:59 +0800 Subject: cgroup: fix a race between cgroup_mount() and cgroup_kill_sb()
We've converted cgroup to kernfs so cgroup won't be intertwined with vfs objects and locking, but there are dark areas. Run two instances of this script concurrently: for ((; ;)) { mount -t cgroup -o cpuacct xxx /cgroup umount /cgroup } After a while, I saw two mount processes were stuck at retrying, because they were waiting for a subsystem to become free, but the root associated with this subsystem never got freed.
This can happen if thread A is in the process of killing the superblock but hasn't called percpu_ref_kill(), and at this time thread B is mounting the same cgroup root and finds the root in the root list and performs percpu_ref_try_get().
To fix this, we try to increase both the refcnt of the superblock and the percpu refcnt of the cgroup root.
v2: - we should try to get both the superblock refcnt and cgroup_root refcnt, because cgroup_root may have no superblock associated with it. - adjust/add comments. tj: Updated comments. Renamed @sb to @pinned_sb. Cc: # 3.15 Signed-off-by: Li Zefan Signed-off-by: Tejun Heo
--- kernel/cgroup.c | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) (limited to 'kernel')
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 64068667be84..70776aec2562 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1648,6 +1648,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { + struct super_block *pinned_sb = NULL; struct cgroup_subsys *ss; struct cgroup_root *root; struct cgroup_sb_opts opts; @@ -1740,15 +1741,23 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, } /* - * A root's lifetime is governed by its root cgroup. - * tryget_live failure indicate that the root is being - * destroyed. Wait for destruction to complete so that the - * subsystems are free. We can use wait_queue for the wait - * but this path is super cold. Let's just sleep for a bit - * and retry. + * We want to reuse @root whose lifetime is governed by its + * ->cgrp. Let's check whether @root is alive and keep it + * that way. As cgroup_kill_sb() can happen anytime, we + * want to block it by pinning the sb so that @root doesn't + * get killed before mount is complete. + * + * With the sb pinned, tryget_live can reliably indicate + * whether @root can be reused. If it's being killed, + * drain it. We can use wait_queue for the wait but this + * path is super cold. Let's just sleep a bit and retry.
*/ - if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { + pinned_sb = kernfs_pin_sb(root->kf_root, NULL); + if (IS_ERR(pinned_sb) || + !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { mutex_unlock(&cgroup_mutex); + if (!IS_ERR_OR_NULL(pinned_sb)) + deactivate_super(pinned_sb); msleep(10); ret = restart_syscall(); goto out_free; @@ -1793,6 +1802,16 @@ out_free: CGROUP_SUPER_MAGIC, &new_sb); if (IS_ERR(dentry) || !new_sb) cgroup_put(&root->cgrp); + + /* + * If @pinned_sb, we're reusing an existing root and holding an + * extra ref on its sb. Mount is complete. Put the extra ref. + */ + if (pinned_sb) { + WARN_ON(new_sb); + deactivate_super(pinned_sb); + } + return dentry; }
-- cgit
From 48212542067a7ff6cbe829dbae279c2ff7557b44 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 27 Jun 2014 19:01:36 +0200 Subject: tracing/uprobes: Revert "Support mix of ftrace and perf"
This reverts commit 43fe98913c9f67e3b523615ee3316f9520a623e0. This patch is very wrong. Firstly, this change leads to unbalanced uprobe_unregister(). Just for example, # perf probe -x /lib/libc.so.6 syscall # echo 1 >> /sys/kernel/debug/tracing/events/probe_libc/enable # perf record -e probe_libc:syscall whatever after that uprobe is dead (unregistered) but the user of ftrace/perf can't know this, and it looks as if nobody hits this probe.
This would be easy to fix, but there are other reasons why it is not simple to mix ftrace and perf. If nothing else, they can't share the same ->consumer.filter. This is fixable too, but probably we need to fix the poorly designed uprobe_register() interface first. At least "register" and "apply" should be clearly separated.
Link: http://lkml.kernel.org/p/20140627170136.GA18319@redhat.com Cc: Tom Zanussi Cc: "zhangwei(Jovi)" Cc: stable@vger.kernel.org # v3.14 Acked-by: Namhyung Kim Acked-by: Srikar Dronamraju Reviewed-by: Masami Hiramatsu Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt
--- kernel/trace/trace_uprobe.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel')
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 04fdb5de823c..08e7970bf3f9 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -893,6 +893,9 @@ probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file, int ret; if (file) { + if (tu->tp.flags & TP_FLAG_PROFILE) + return -EINTR; + link = kmalloc(sizeof(*link), GFP_KERNEL); if (!link) return -ENOMEM; @@ -901,8 +904,12 @@ probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file, list_add_tail_rcu(&link->list, &tu->tp.files); tu->tp.flags |= TP_FLAG_TRACE; - } else + } else { + if (tu->tp.flags & TP_FLAG_TRACE) + return -EINTR; + tu->tp.flags |= TP_FLAG_PROFILE; + } ret = uprobe_buffer_enable(); if (ret < 0)
-- cgit
From 06d0713904e508f765e0d7146c14b67bbd248fe7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 27 Jun 2014 19:01:40 +0200 Subject: uprobes: Change unregister/apply to WARN() if uprobe/consumer is gone
Add WARN_ON's into uprobe_unregister() and uprobe_apply() to ensure that nobody tries to play with the dead uprobe/consumer. This helps to catch bugs like the one fixed by the previous patch.
In the longer term we should fix this poorly designed interface. uprobe_register() should return "struct uprobe *" which should be passed to apply/unregister. Plus other semantic changes, see the changelog in commit 41ccba029e94.
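As an illustration of the redesign proposed above, the registration path would return a handle. This is a hypothetical sketch of what the changelog asks for (these signatures do not exist in this tree; they only illustrate the suggested separation of "register" and "apply"):

/* Registration returns the uprobe itself, so apply/unregister act on a
 * known-live object instead of re-looking it up by inode:offset. */
struct uprobe *uprobe_register(struct inode *inode, loff_t offset,
                               struct uprobe_consumer *uc);
int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add);
void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc);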
Link: http://lkml.kernel.org/p/20140627170140.GA18322@redhat.com Acked-by: Namhyung Kim Acked-by: Srikar Dronamraju Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt
--- kernel/events/uprobes.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel')
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c445e392e93f..6f3254e8c137 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -846,7 +846,7 @@ static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *u { int err; - if (!consumer_del(uprobe, uc)) /* WARN? */ + if (WARN_ON(!consumer_del(uprobe, uc))) return; err = register_for_each_vma(uprobe, NULL); @@ -927,7 +927,7 @@ int uprobe_apply(struct inode *inode, loff_t offset, int ret = -ENOENT; uprobe = find_uprobe(inode, offset); - if (!uprobe) + if (WARN_ON(!uprobe)) return ret; down_write(&uprobe->register_rwsem); @@ -952,7 +952,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume struct uprobe *uprobe; uprobe = find_uprobe(inode, offset); - if (!uprobe) + if (WARN_ON(!uprobe)) return; down_write(&uprobe->register_rwsem);
-- cgit
From f786106e8081bbec57053fec7fcf25dc25d02144 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 27 Jun 2014 19:01:43 +0200 Subject: tracing/uprobes: Kill the bogus UPROBE_HANDLER_REMOVE code in uprobe_dispatcher()
I do not know why dd9fa555d7bb "tracing/uprobes: Move argument fetching to uprobe_dispatcher()" added the UPROBE_HANDLER_REMOVE, but it looks wrong.
OK, perhaps it makes sense to avoid store_trace_args() if the tracee is nacked by uprobe_perf_filter(). But then we should kill the same code in uprobe_perf_func() and unify the TRACE/PROFILE filtering (we need to do this anyway to mix perf/ftrace). Until then this code actually adds a pessimization because uprobe_perf_filter() will be called twice and return true in the likely case.
Link: http://lkml.kernel.org/p/20140627170143.GA18329@redhat.com Acked-by: Namhyung Kim Acked-by: Srikar Dronamraju Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt
--- kernel/trace/trace_uprobe.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel')
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 08e7970bf3f9..c4cf0abd60ba 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1208,12 +1208,6 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) current->utask->vaddr = (unsigned long) &udd; -#ifdef CONFIG_PERF_EVENTS - if ((tu->tp.flags & TP_FLAG_TRACE) == 0 && - !uprobe_perf_filter(&tu->consumer, 0, current->mm)) - return UPROBE_HANDLER_REMOVE; -#endif - if (WARN_ON_ONCE(!uprobe_cpu_buffer)) return 0;
-- cgit
From fb6bab6a5ad46d00b5ffa22268f21df1cd7c59df Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 27 Jun 2014 19:01:46 +0200 Subject: tracing/uprobes: Fix the usage of uprobe_buffer_enable() in probe_event_enable()
The usage of uprobe_buffer_enable() added by dcad1a20 is very wrong: 1. uprobe_buffer_enable() and uprobe_buffer_disable() are not balanced; _enable() should be called only if !enabled. 2. If uprobe_buffer_enable() fails, probe_event_enable() should clear tp.flags and free the event_file_link. 3. If uprobe_register() fails, it should do uprobe_buffer_disable().
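In outline, the fix in the diff below restores the usual goto-based unwind idiom: each setup step runs only on the disabled-to-enabled transition, and completed steps are undone in reverse order on failure. A simplified sketch (helper and label names follow the patch, bodies elided):

if (enabled)
        return 0;                       /* already enabled: nothing to redo */

ret = uprobe_buffer_enable();           /* step 1, only on the 0 -> 1 transition */
if (ret)
        goto err_flags;

ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);    /* step 2 */
if (ret)
        goto err_buffer;
return 0;

err_buffer:
        uprobe_buffer_disable();        /* undo step 1 */
err_flags:
        /* clear the TP_FLAG_* bit and free the event_file_link set up earlier */
        return ret;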
Link: http://lkml.kernel.org/p/20140627170146.GA18332@redhat.com Acked-by: Namhyung Kim Acked-by: Srikar Dronamraju Reviewed-by: Masami Hiramatsu Fixes: dcad1a204f72 "tracing/uprobes: Fetch args before reserving a ring buffer" Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt
--- kernel/trace/trace_uprobe.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'kernel')
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c4cf0abd60ba..3c9b97e6b1f4 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -911,26 +911,33 @@ probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file, tu->tp.flags |= TP_FLAG_PROFILE; } - ret = uprobe_buffer_enable(); - if (ret < 0) - return ret; - WARN_ON(!uprobe_filter_is_empty(&tu->filter)); if (enabled) return 0; + ret = uprobe_buffer_enable(); + if (ret) + goto err_flags; + tu->consumer.filter = filter; ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); - if (ret) { - if (file) { - list_del(&link->list); - kfree(link); - tu->tp.flags &= ~TP_FLAG_TRACE; - } else - tu->tp.flags &= ~TP_FLAG_PROFILE; - } + if (ret) + goto err_buffer; + return 0; + + err_buffer: + uprobe_buffer_disable(); + + err_flags: + if (file) { + list_del(&link->list); + kfree(link); + tu->tp.flags &= ~TP_FLAG_TRACE; + } else { + tu->tp.flags &= ~TP_FLAG_PROFILE; + } return ret; }
-- cgit
From 099ed151675cd1d2dbeae1dac697975f6a68716d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 24 Jun 2014 23:50:09 -0400 Subject: tracing: Remove ftrace_stop/start() from reading the trace file
Disabling reading and writing to the trace file should not be able to disable all function tracing callbacks. There are other users today (like kprobes and perf). Reading a trace file should not stop those from happening.
Cc: stable@vger.kernel.org # 3.0+ Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt
--- kernel/trace/trace.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel')
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 384ede311717..f243444a3772 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1396,7 +1396,6 @@ void tracing_start(void) arch_spin_unlock(&global_trace.max_lock); - ftrace_start(); out: raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); } @@ -1443,7 +1442,6 @@ void tracing_stop(void) struct ring_buffer *buffer; unsigned long flags; - ftrace_stop(); raw_spin_lock_irqsave(&global_trace.start_lock, flags); if (global_trace.stop_count++) goto out;
-- cgit
From 76bb5ab8f6e3e7bebdcefec4146ff305e7d0b465 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 30 Jun 2014 15:47:32 -0400 Subject: cpuset: break kernfs active protection in cpuset_write_resmask()
Writing to either the "cpuset.cpus" or "cpuset.mems" file flushes cpuset_hotplug_work so that cpu or memory hotunplug doesn't end up migrating tasks off a cpuset after new resources are added to it.
As cpuset_hotplug_work calls into cgroup core via cgroup_transfer_tasks(), this flushing adds a dependency on cgroup core locking to cpuset_write_resmask(). This used to be okay because cgroup interface files were protected by a different mutex; however, 8353da1f91f1 ("cgroup: remove cgroup_tree_mutex") simplified the cgroup core locking and this dependency became a deadlock hazard - cgroup file removal performed under cgroup core lock tries to drain an on-going file operation which is trying to flush cpuset_hotplug_work blocked on the same cgroup core lock.
The locking simplification was done because kernfs added a much easier way to deal with circular dependencies involving kernfs active protection. Let's use the same strategy in cpuset and break active protection in cpuset_write_resmask(). While it isn't the prettiest, this is a very rare, likely unique, situation which also goes away on the unified hierarchy.
The commands to trigger the deadlock warning without the patch and the lockdep output follow.
localhost:/ # mount -t cgroup -o cpuset xxx /cpuset localhost:/ # mkdir /cpuset/tmp localhost:/ # echo 1 > /cpuset/tmp/cpuset.cpus localhost:/ # echo 0 > cpuset/tmp/cpuset.mems localhost:/ # echo $$ > /cpuset/tmp/tasks localhost:/ # echo 0 > /sys/devices/system/cpu/cpu1/online
====================================================== [ INFO: possible circular locking dependency detected ] 3.16.0-rc1-0.1-default+ #7 Not tainted ------------------------------------------------------- kworker/1:0/32649 is trying to acquire lock: (cgroup_mutex){+.+.+.}, at: [] cgroup_transfer_tasks+0x37/0x150 but task is already holding lock: (cpuset_hotplug_work){+.+...}, at: [] process_one_work+0x192/0x520 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (cpuset_hotplug_work){+.+...}: ... -> #1 (s_active#175){++++.+}: ... -> #0 (cgroup_mutex){+.+.+.}: ... other info that might help us debug this: Chain exists of: cgroup_mutex --> s_active#175 --> cpuset_hotplug_work Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(cpuset_hotplug_work); lock(s_active#175); lock(cpuset_hotplug_work); lock(cgroup_mutex); *** DEADLOCK *** 2 locks held by kworker/1:0/32649: #0: ("events"){.+.+.+}, at: [] process_one_work+0x192/0x520 #1: (cpuset_hotplug_work){+.+...}, at: [] process_one_work+0x192/0x520 stack backtrace: CPU: 1 PID: 32649 Comm: kworker/1:0 Not tainted 3.16.0-rc1-0.1-default+ #7 ... Call Trace: [] dump_stack+0x72/0x8a [] print_circular_bug+0x10f/0x120 [] check_prev_add+0x43e/0x4b0 [] validate_chain+0x656/0x7c0 [] __lock_acquire+0x382/0x660 [] lock_acquire+0xf9/0x170 [] mutex_lock_nested+0x6f/0x380 [] cgroup_transfer_tasks+0x37/0x150 [] hotplug_update_tasks_insane+0x110/0x1d0 [] cpuset_hotplug_update_tasks+0x13d/0x180 [] cpuset_hotplug_workfn+0x18c/0x630 [] process_one_work+0x254/0x520 [] worker_thread+0x13d/0x3d0 [] kthread+0xf8/0x100 [] ret_from_fork+0x7c/0xb0
Signed-off-by: Tejun Heo Reported-by: Li Zefan Tested-by: Li Zefan
--- kernel/cpuset.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel')
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index d3df02e76643..116a4164720a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1623,7 +1623,17 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, * resources, wait for the previously scheduled operations before * proceeding, so that we don't end up keep removing tasks added * after execution capability is restored. + * + * cpuset_hotplug_work calls back into cgroup core via + * cgroup_transfer_tasks() and waiting for it from a cgroupfs + * operation like this one can lead to a deadlock through kernfs + * active_ref protection. Let's break the protection. Losing the + * protection is okay as we check whether @cs is online after + * grabbing cpuset_mutex anyway. This only happens on the legacy + * hierarchies.
*/ + css_get(&cs->css); + kernfs_break_active_protection(of->kn); flush_work(&cpuset_hotplug_work); mutex_lock(&cpuset_mutex); @@ -1651,6 +1661,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, free_trial_cpuset(trialcs); out_unlock: mutex_unlock(&cpuset_mutex); + kernfs_unbreak_active_protection(of->kn); + css_put(&cs->css); return retval ?: nbytes; }
-- cgit
From d18bbc215f81710e1eab7120becafa910554d68d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 2 Jul 2014 15:22:38 -0700 Subject: kernel/printk/printk.c: revert "printk: enable interrupts before calling console_trylock_for_printk()"
Revert commit 939f04bec1a4 ("printk: enable interrupts before calling console_trylock_for_printk()"). Andreas reported: : None of the post 3.15 kernel boot for me. They all hang at the GRUB : screen telling me it loaded and started the kernel, but the kernel : itself stops before it prints anything (or even replaces the GRUB : background graphics).
939f04bec1a4 is a modest latency reduction. Revert it until we understand the reason for these failures.
Reported-by: Andreas Bombe Cc: Jan Kara Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds
--- kernel/printk/printk.c | 44 ++++++++++++++++++-------------------------- 1 file changed, 18 insertions(+), 26 deletions(-) (limited to 'kernel')
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index ea2d5f6962ed..13e839dbca07 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1416,9 +1416,10 @@ static int have_callable_console(void) /* * Can we actually use the console at this time on this cpu? * - * Console drivers may assume that per-cpu resources have been allocated. So - * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't - * call them until this CPU is officially up. + * Console drivers may assume that per-cpu resources have + * been allocated. So unless they're explicitly marked as + * being able to cope (CON_ANYTIME) don't call them until + * this CPU is officially up. */ static inline int can_use_console(unsigned int cpu) { @@ -1431,10 +1432,8 @@ static inline int can_use_console(unsigned int cpu) * console_lock held, and 'console_locked' set) if it * is successful, false otherwise. */ -static int console_trylock_for_printk(void) +static int console_trylock_for_printk(unsigned int cpu) { - unsigned int cpu = smp_processor_id(); - if (!console_trylock()) return 0; /* @@ -1609,8 +1608,7 @@ asmlinkage int vprintk_emit(int facility, int level, */ if (!oops_in_progress && !lockdep_recursing(current)) { recursion_bug = 1; - local_irq_restore(flags); - return 0; + goto out_restore_irqs; } zap_locks(); } @@ -1718,27 +1716,21 @@ asmlinkage int vprintk_emit(int facility, int level, logbuf_cpu = UINT_MAX; raw_spin_unlock(&logbuf_lock); - lockdep_on(); - local_irq_restore(flags); /* If called from the scheduler, we can not call up(). */ - if (in_sched) - return printed_len; - - /* - * Disable preemption to avoid being preempted while holding - * console_sem which would prevent anyone from printing to console - */ - preempt_disable(); - /* - * Try to acquire and then immediately release the console semaphore. - * The release will print out buffers and wake up /dev/kmsg and syslog() - * users. - */ - if (console_trylock_for_printk()) - console_unlock(); - preempt_enable(); + if (!in_sched) { + /* + * Try to acquire and then immediately release the console + * semaphore. The release will print out buffers and wake up + * /dev/kmsg and syslog() users.
+ */ + if (console_trylock_for_printk(this_cpu)) + console_unlock(); + } + lockdep_on(); +out_restore_irqs: + local_irq_restore(flags); return printed_len; } EXPORT_SYMBOL(vprintk_emit);
-- cgit
From 8844aad89ed61545b4db6a3467e1b21ca1c49460 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 30 Jun 2014 16:24:44 -0600 Subject: genirq: Fix memory leak when calling irq_free_hwirqs()
irq_free_hwirqs() always calls irq_free_descs() with cnt == 0, which makes that call a no-op, because the loop above it decrements the interrupt count to free in cnt itself.
Fixes: 7b6ef1262549f6afc5c881aaef80beb8fd15f908 Signed-off-by: Keith Busch Acked-by: David Rientjes Link: http://lkml.kernel.org/r/1404167084-8070-1-git-send-email-keith.busch@intel.com Signed-off-by: Thomas Gleixner
--- kernel/irq/irqdesc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel')
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 7339e42a85ab..1487a123db5c 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -455,9 +455,9 @@ EXPORT_SYMBOL_GPL(irq_alloc_hwirqs); */ void irq_free_hwirqs(unsigned int from, int cnt) { - int i; + int i, j; - for (i = from; cnt > 0; i++, cnt--) { + for (i = from, j = cnt; j > 0; i++, j--) { irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE); arch_teardown_hwirq(i); }
-- cgit
From 5a6024f1604eef119cf3a6fa413fe0261a81a8f3 Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Mon, 7 Jul 2014 09:56:48 -0400 Subject: workqueue: zero cpumask of wq_numa_possible_cpumask on init
When hot-adding and onlining a CPU, a kernel panic occurs, showing the following call trace.
BUG: unable to handle kernel paging request at 0000000000001d08 IP: [] __alloc_pages_nodemask+0x9d/0xb10 PGD 0 Oops: 0000 [#1] SMP ... Call Trace: [] ? cpumask_next_and+0x35/0x50 [] ? find_busiest_group+0x113/0x8f0 [] ? deactivate_slab+0x349/0x3c0 [] new_slab+0x91/0x300 [] __slab_alloc+0x2bb/0x482 [] ? copy_process.part.25+0xfc/0x14c0 [] ? load_balance+0x218/0x890 [] ? sched_clock+0x9/0x10 [] ? trace_clock_local+0x9/0x10 [] kmem_cache_alloc_node+0x8c/0x200 [] copy_process.part.25+0xfc/0x14c0 [] ? trace_buffer_unlock_commit+0x4d/0x60 [] ? kthread_create_on_node+0x140/0x140 [] do_fork+0xbc/0x360 [] kernel_thread+0x26/0x30 [] kthreadd+0x2c2/0x300 [] ? kthread_create_on_cpu+0x60/0x60 [] ret_from_fork+0x7c/0xb0 [] ? kthread_create_on_cpu+0x60/0x60
In my investigation, I found the root cause is wq_numa_possible_cpumask. All entries of wq_numa_possible_cpumask are allocated by alloc_cpumask_var_node() and are then used without being initialized, so they contain garbage values.
When hot-adding and onlining a CPU, wq_update_unbound_numa() is called. wq_update_unbound_numa() calls alloc_unbound_pwq(). And alloc_unbound_pwq() calls get_unbound_pool(). In get_unbound_pool(), worker_pool->node is set as follows: 3592 /* if cpumask is contained inside a NUMA node, we belong to that node */ 3593 if (wq_numa_enabled) { 3594 for_each_node(node) { 3595 if (cpumask_subset(pool->attrs->cpumask, 3596 wq_numa_possible_cpumask[node])) { 3597 pool->node = node; 3598 break; 3599 } 3600 } 3601 }
But wq_numa_possible_cpumask[node] does not contain a correct cpumask, so the wrong node is selected. As a result, the kernel panics.
With this patch, all entries of wq_numa_possible_cpumask are allocated with zalloc_cpumask_var_node() so that they start out zeroed, and the panic disappeared.
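A note on the distinction: it is the same as kmalloc() versus kzalloc(). The alloc_ variant leaves the cpumask bits uninitialized, so a test such as cpumask_subset() reads garbage until bits are explicitly set or cleared. A minimal sketch (illustration only, not the kernel code):

cpumask_var_t mask;

/* Uninitialized allocation: the mask content is indeterminate, so
 * cpumask_subset()/cpumask_test_cpu() results are meaningless here. */
if (!alloc_cpumask_var_node(&mask, GFP_KERNEL, node))
        return -ENOMEM;

/* Zeroed allocation: every bit starts cleared, so subset tests are
 * well defined even before any cpu is added with cpumask_set_cpu(). */
if (!zalloc_cpumask_var_node(&mask, GFP_KERNEL, node))
        return -ENOMEM;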
Signed-off-by: Yasuaki Ishimatsu Reviewed-by: Lai Jiangshan Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org Fixes: bce903809ab3 ("workqueue: add wq_numa_tbl_len and wq_numa_possible_cpumask[]") --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6f5f9c7323f4..35974ac69600 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4880,7 +4880,7 @@ static void __init wq_numa_init(void) BUG_ON(!tbl); for_each_node(node) - BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL, + BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL, node_online(node) ? node : NUMA_NO_NODE)); for_each_possible_cpu(cpu) { -- cgit
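A closing note on the irq_free_hwirqs() fix a few commits up: the original loop used cnt itself as its counter, so by the time execution reached the trailing irq_free_descs() call (outside the quoted hunk), cnt had already been decremented to zero and nothing was freed. A sketch of the fixed function reconstructed from the changelog (the irq_free_descs() tail is inferred, not shown in the diff):

void irq_free_hwirqs(unsigned int from, int cnt)
{
        int i, j;

        /* Iterate on the copy j so that cnt still holds the original
         * count when it is passed to irq_free_descs() below. */
        for (i = from, j = cnt; j > 0; i++, j--) {
                irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE);
                arch_teardown_hwirq(i);
        }
        irq_free_descs(from, cnt);
}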