From df0734702a7cbba49d6765bd5ba069340bf9c5db Mon Sep 17 00:00:00 2001 From: Song Liu <songliubraving@fb.com> Date: Fri, 2 Nov 2018 10:16:15 -0700 Subject: bpf: show real jited prog address in /proc/kallsyms Currently, /proc/kallsyms shows page address of jited bpf program. The main reason here is to not expose randomized start address. However, This is not ideal for detailed profiling (find hot instructions from stack traces). This patch replaces the page address with real prog start address. This change is OK because these addresses are still protected by sysctl kptr_restrict (see kallsyms_show_value()), and only programs loaded by root are added to kallsyms (see bpf_prog_kallsyms_add()). Signed-off-by: Song Liu <songliubraving@fb.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> --- kernel/bpf/core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 6377225b2082..1a796e0799ec 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -553,7 +553,6 @@ bool is_bpf_text_address(unsigned long addr) int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym) { - unsigned long symbol_start, symbol_end; struct bpf_prog_aux *aux; unsigned int it = 0; int ret = -ERANGE; @@ -566,10 +565,9 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, if (it++ != symnum) continue; - bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); bpf_get_prog_name(aux->prog, sym); - *value = symbol_start; + *value = (unsigned long)aux->prog->bpf_func; *type = BPF_SYM_ELF_TYPE; ret = 0; -- cgit From de57e99ceb65d0d7775cc14a8ba5931d7de1d708 Mon Sep 17 00:00:00 2001 From: Song Liu <songliubraving@fb.com> Date: Fri, 2 Nov 2018 10:16:16 -0700 Subject: bpf: show real jited address in bpf_prog_info->jited_ksyms Currently, jited_ksyms in bpf_prog_info shows page addresses of jited bpf program. The main reason here is to not expose randomized start address. However, this is not ideal for detailed profiling (find hot instructions from stack traces). This patch replaces the page address with real prog start address. This change is OK because bpf_prog_get_info_by_fd() is only available to root. Signed-off-by: Song Liu <songliubraving@fb.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> --- kernel/bpf/syscall.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ccb93277aae2..34a9eef5992c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2172,7 +2172,6 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, user_ksyms = u64_to_user_ptr(info.jited_ksyms); for (i = 0; i < ulen; i++) { ksym_addr = (ulong) prog->aux->func[i]->bpf_func; - ksym_addr &= PAGE_MASK; if (put_user((u64) ksym_addr, &user_ksyms[i])) return -EFAULT; } -- cgit From ff1889fc531f582f902175c0acc80321af540b24 Mon Sep 17 00:00:00 2001 From: Song Liu <songliubraving@fb.com> Date: Fri, 2 Nov 2018 10:16:17 -0700 Subject: bpf: show main program address and length in bpf_prog_info Currently, when there is no subprog (prog->aux->func_cnt == 0), bpf_prog_info does not return any jited_ksyms or jited_func_lens. This patch adds main program address (prog->bpf_func) and main program length (prog->jited_len) to bpf_prog_info. 
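As an illustration of how the new single-entry case surfaces to user space, here is a minimal sketch (not part of the patch) that reads the fields through libbpf's bpf_obj_get_info_by_fd(); the 64-entry buffers, the already-loaded prog_fd, and the trimmed error handling are assumptions of the example:

  #include <bpf/bpf.h>
  #include <linux/bpf.h>
  #include <stdio.h>

  /* prog_fd: an already-loaded BPF program, queried as root so that
   * bpf_dump_raw_ok() passes and real addresses are reported. */
  static void dump_jited_funcs(int prog_fd)
  {
          __u64 addrs[64] = {};
          __u32 lens[64] = {};
          struct bpf_prog_info info = {};
          __u32 info_len = sizeof(info);
          __u32 i, n;

          /* the caller passes its buffer capacity in the nr_* fields */
          info.nr_jited_ksyms = 64;
          info.jited_ksyms = (__u64)(unsigned long)addrs;
          info.nr_jited_func_lens = 64;
          info.jited_func_lens = (__u64)(unsigned long)lens;

          if (bpf_obj_get_info_by_fd(prog_fd, &info, &info_len))
                  return;

          /* with no subprogs the kernel now reports one entry:
           * the main program's start address and jited length */
          n = info.nr_jited_ksyms < 64 ? info.nr_jited_ksyms : 64;
          for (i = 0; i < n; i++)
                  printf("func %u: 0x%llx, %u bytes\n", i,
                         (unsigned long long)addrs[i], lens[i]);
  }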
Signed-off-by: Song Liu <songliubraving@fb.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> --- kernel/bpf/syscall.c | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 34a9eef5992c..9418174c276c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2158,11 +2158,11 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, } ulen = info.nr_jited_ksyms; - info.nr_jited_ksyms = prog->aux->func_cnt; + info.nr_jited_ksyms = prog->aux->func_cnt ? : 1; if (info.nr_jited_ksyms && ulen) { if (bpf_dump_raw_ok()) { + unsigned long ksym_addr; u64 __user *user_ksyms; - ulong ksym_addr; u32 i; /* copy the address of the kernel symbol @@ -2170,9 +2170,17 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, */ ulen = min_t(u32, info.nr_jited_ksyms, ulen); user_ksyms = u64_to_user_ptr(info.jited_ksyms); - for (i = 0; i < ulen; i++) { - ksym_addr = (ulong) prog->aux->func[i]->bpf_func; - if (put_user((u64) ksym_addr, &user_ksyms[i])) + if (prog->aux->func_cnt) { + for (i = 0; i < ulen; i++) { + ksym_addr = (unsigned long) + prog->aux->func[i]->bpf_func; + if (put_user((u64) ksym_addr, + &user_ksyms[i])) + return -EFAULT; + } + } else { + ksym_addr = (unsigned long) prog->bpf_func; + if (put_user((u64) ksym_addr, &user_ksyms[0])) return -EFAULT; } } else { @@ -2181,7 +2189,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, } ulen = info.nr_jited_func_lens; - info.nr_jited_func_lens = prog->aux->func_cnt; + info.nr_jited_func_lens = prog->aux->func_cnt ? : 1; if (info.nr_jited_func_lens && ulen) { if (bpf_dump_raw_ok()) { u32 __user *user_lens; @@ -2190,9 +2198,16 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, /* copy the JITed image lengths for each function */ ulen = min_t(u32, info.nr_jited_func_lens, ulen); user_lens = u64_to_user_ptr(info.jited_func_lens); - for (i = 0; i < ulen; i++) { - func_len = prog->aux->func[i]->jited_len; - if (put_user(func_len, &user_lens[i])) + if (prog->aux->func_cnt) { + for (i = 0; i < ulen; i++) { + func_len = + prog->aux->func[i]->jited_len; + if (put_user(func_len, &user_lens[i])) + return -EFAULT; + } + } else { + func_len = prog->jited_len; + if (put_user(func_len, &user_lens[0])) return -EFAULT; } } else { -- cgit From 28c2fae726bf5003cd209b0d5910a642af98316f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann <daniel@iogearbox.net> Date: Fri, 2 Nov 2018 11:35:46 +0100 Subject: bpf: fix bpf_prog_get_info_by_fd to return 0 func_lens for unpriv While dbecd7388476 ("bpf: get kernel symbol addresses via syscall") zeroed info.nr_jited_ksyms in bpf_prog_get_info_by_fd() for queries from unprivileged users, commit 815581c11cc2 ("bpf: get JITed image lengths of functions via syscall") forgot about doing so and therefore returns the #elems of the user set up buffer which is incorrect. It also needs to indicate a info.nr_jited_func_lens of zero. 
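A short user-space sketch of what the fix guarantees (hypothetical snippet, again assuming libbpf): an unprivileged caller now reads both counters back as zero, consistent with jited_prog_len and xlated_prog_len:

  #include <assert.h>
  #include <bpf/bpf.h>
  #include <linux/bpf.h>

  /* called without CAP_SYS_ADMIN on a prog_fd the process owns */
  static void check_unpriv_view(int prog_fd)
  {
          struct bpf_prog_info info = {};
          __u32 info_len = sizeof(info);

          if (bpf_obj_get_info_by_fd(prog_fd, &info, &info_len))
                  return;

          /* before the fix, nr_jited_func_lens echoed back whatever
           * capacity the caller had set; now both counters are cleared */
          assert(info.nr_jited_ksyms == 0);
          assert(info.nr_jited_func_lens == 0);
  }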
Fixes: 815581c11cc2 ("bpf: get JITed image lengths of functions via syscall") Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Cc: Sandipan Das <sandipan@linux.vnet.ibm.com> Cc: Song Liu <songliubraving@fb.com> Signed-off-by: Alexei Starovoitov <ast@kernel.org> --- kernel/bpf/syscall.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9418174c276c..cf5040fd5434 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2078,6 +2078,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info.jited_prog_len = 0; info.xlated_prog_len = 0; info.nr_jited_ksyms = 0; + info.nr_jited_func_lens = 0; goto done; } -- cgit From 40fa3780bac2b654edf23f6b13f4e2dd550aea10 Mon Sep 17 00:00:00 2001 From: Valentin Schneider <valentin.schneider@arm.com> Date: Tue, 23 Oct 2018 14:37:31 +0100 Subject: sched/core: Take the hotplug lock in sched_init_smp() When running on linux-next (8c60c36d0b8c ("Add linux-next specific files for 20181019")) + CONFIG_PROVE_LOCKING=y on a big.LITTLE system (e.g. Juno or HiKey960), we get the following report: [ 0.748225] Call trace: [ 0.750685] lockdep_assert_cpus_held+0x30/0x40 [ 0.755236] static_key_enable_cpuslocked+0x20/0xc8 [ 0.760137] build_sched_domains+0x1034/0x1108 [ 0.764601] sched_init_domains+0x68/0x90 [ 0.768628] sched_init_smp+0x30/0x80 [ 0.772309] kernel_init_freeable+0x278/0x51c [ 0.776685] kernel_init+0x10/0x108 [ 0.780190] ret_from_fork+0x10/0x18 The static_key in question is 'sched_asym_cpucapacity' introduced by commit: df054e8445a4 ("sched/topology: Add static_key for asymmetric CPU capacity optimizations") In this particular case, we enable it because smp_prepare_cpus() will end up fetching the capacity-dmips-mhz entry from the devicetree, so we already have some asymmetry detected when entering sched_init_smp(). This didn't get detected in tip/sched/core because we were missing: commit cb538267ea1e ("jump_label/lockdep: Assert we hold the hotplug lock for _cpuslocked() operations") Calls to build_sched_domains() post sched_init_smp() will hold the hotplug lock, it just so happens that this very first call is a special case. As stated by a comment in sched_init_smp(), "There's no userspace yet to cause hotplug operations" so this is a harmless warning. However, to both respect the semantics of underlying callees and make lockdep happy, take the hotplug lock in sched_init_smp(). This also satisfies the comment atop sched_init_domains() that says "Callers must hold the hotplug lock". 
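The locking contract being satisfied, reduced to a sketch with hypothetical function names (the cpus_read_lock()/lockdep_assert_cpus_held() APIs are real): a callee that flips a *_cpuslocked static key asserts the hotplug lock, so even a caller that cannot actually race with hotplug has to take it:

  #include <linux/cpu.h>

  /* hypothetical stand-in for build_sched_domains(): it ends up in a
   * *_cpuslocked static-key helper, which asserts the hotplug lock */
  static void enable_asym_capacity_key(void)
  {
          lockdep_assert_cpus_held();
          /* e.g. static_branch_enable_cpuslocked(&some_key); */
  }

  /* hypothetical stand-in for the early sched_init_smp() caller */
  static void early_domain_init(void)
  {
          cpus_read_lock();       /* no userspace yet, so no contention;
                                   * taken purely to honour the callee's
                                   * documented locking contract */
          enable_asym_capacity_key();
          cpus_read_unlock();
  }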
Reported-by: Sudeep Holla <sudeep.holla@arm.com> Tested-by: Sudeep Holla <sudeep.holla@arm.com> Signed-off-by: Valentin Schneider <valentin.schneider@arm.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Dietmar.Eggemann@arm.com Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: morten.rasmussen@arm.com Cc: quentin.perret@arm.com Link: http://lkml.kernel.org/r/1540301851-3048-1-git-send-email-valentin.schneider@arm.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/sched/core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fd2fce8a001b..02a20ef196a6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5859,11 +5859,14 @@ void __init sched_init_smp(void) /* * There's no userspace yet to cause hotplug operations; hence all the * CPU masks are stable and all blatant races in the below code cannot - * happen. + * happen. The hotplug lock is nevertheless taken to satisfy lockdep, + * but there won't be any contention on it. */ + cpus_read_lock(); mutex_lock(&sched_domains_mutex); sched_init_domains(cpu_active_mask); mutex_unlock(&sched_domains_mutex); + cpus_read_unlock(); /* Move init over to a non-isolated CPU */ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) -- cgit From e1ff516a56ad56c476b47795d3811eef79d25fbe Mon Sep 17 00:00:00 2001 From: Yi Wang <wang.yi59@zte.com.cn> Date: Mon, 5 Nov 2018 08:50:13 +0800 Subject: sched/fair: Fix a comment in task_numa_fault() Duplicated 'case it'. Signed-off-by: Yi Wang <wang.yi59@zte.com.cn> Reviewed-by: Xi Xu <xu.xi8@zte.com.cn> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: zhong.weidong@zte.com.cn Link: http://lkml.kernel.org/r/1541379013-11352-1-git-send-email-wang.yi59@zte.com.cn Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ee271bb661cc..3648d0300fdf 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2400,8 +2400,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) local = 1; /* - * Retry task to preferred node migration periodically, in case it - * case it previously failed, or the scheduler moved us. + * Retry to migrate task to preferred node periodically, in case it + * previously failed, or the scheduler moved us. */ if (time_after(jiffies, p->numa_migrate_retry)) { task_numa_placement(p); -- cgit From f75d651587f719a813ebbbfeee570e6570731d55 Mon Sep 17 00:00:00 2001 From: Randy Dunlap <rdunlap@infradead.org> Date: Sun, 4 Nov 2018 18:40:14 -0800 Subject: resource/docs: Fix new kernel-doc warnings The first group of warnings is caused by a "/**" kernel-doc notation marker but the function comments are not in kernel-doc format. Also add another error return value here. 
../kernel/resource.c:337: warning: Function parameter or member 'start' not described in 'find_next_iomem_res' ../kernel/resource.c:337: warning: Function parameter or member 'end' not described in 'find_next_iomem_res' ../kernel/resource.c:337: warning: Function parameter or member 'flags' not described in 'find_next_iomem_res' ../kernel/resource.c:337: warning: Function parameter or member 'desc' not described in 'find_next_iomem_res' ../kernel/resource.c:337: warning: Function parameter or member 'first_lvl' not described in 'find_next_iomem_res' ../kernel/resource.c:337: warning: Function parameter or member 'res' not described in 'find_next_iomem_res' Add the missing function parameter documentation for the other warnings: ../kernel/resource.c:409: warning: Function parameter or member 'arg' not described in 'walk_iomem_res_desc' ../kernel/resource.c:409: warning: Function parameter or member 'func' not described in 'walk_iomem_res_desc' Signed-off-by: Randy Dunlap <rdunlap@infradead.org> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Borislav Petkov <bp@suse.de> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Fixes: b69c2e20f6e4 ("resource: Clean it up a bit") Link: http://lkml.kernel.org/r/dda2e4d8-bedd-3167-20fe-8c7d2d35b354@infradead.org Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/resource.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index b3a3a1fc499e..17bcb189d530 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -318,14 +318,14 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); -/** +/* * Finds the lowest iomem resource that covers part of [start..end]. The * caller must specify start, end, flags, and desc (which may be * IORES_DESC_NONE). * * If a resource is found, returns 0 and *res is overwritten with the part * of the resource that's within [start..end]; if none is found, returns - * -1. + * -1. Returns -EINVAL for other invalid parameters. * * This function walks the whole tree and not just first level children * unless @first_lvl is true. @@ -390,7 +390,9 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, } /** - * Walks through iomem resources and calls func() with matching resource + * walk_iomem_res_desc - walk through iomem resources + * + * Walks through iomem resources and calls @func() with matching resource * ranges. This walks through whole tree and not just first level children. * All the memory ranges which overlap start,end and also match flags and * desc are valid candidates. @@ -399,6 +401,8 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, * @flags: I/O resource flags * @start: start addr * @end: end addr + * @arg: function argument for the callback @func + * @func: callback function that is called for each qualifying resource area * * NOTE: For a new descriptor search, define a new IORES_DESC in * <linux/ioport.h> and set it in 'desc' of a target resource entry. -- cgit From ee474b81fe5aa5dc0faae920bf66240fbf55f891 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu <mhiramat@kernel.org> Date: Thu, 1 Nov 2018 23:29:28 +0900 Subject: tracing/kprobes: Fix strpbrk() argument order Fix strpbrk()'s argument order, it must pass acceptable string in 2nd argument. Note that this can cause a kernel panic where it recovers backup character to code->data. 
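For reference, strpbrk(cs, ct) scans the string cs and returns a pointer to the first character of cs that also appears in the accept set ct, so the accept set must be the second argument. A toy illustration of the difference (not the probe code itself):

  const char *arg = "mod:func+0x10";
  char *tmp;

  tmp = strpbrk(arg, "+-");       /* correct: returns a pointer to '+' */

  /*
   * The swapped call, strpbrk("+-", arg), scans the two-character
   * literal "+-" instead, so any match points into that literal;
   * writing the saved character back through such a pointer later is
   * what can corrupt memory and panic the kernel.
   */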
Link: http://lkml.kernel.org/r/154108256792.2604.1816052586385217811.stgit@devbox Fixes: a6682814f371 ("tracing/kprobes: Allow kprobe-events to record module symbol") Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> --- kernel/trace/trace_probe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 3ef15a6683c0..bd30e9398d2a 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -535,7 +535,7 @@ int traceprobe_update_arg(struct probe_arg *arg) if (code[1].op != FETCH_OP_IMM) return -EINVAL; - tmp = strpbrk("+-", code->data); + tmp = strpbrk(code->data, "+-"); if (tmp) c = *tmp; ret = traceprobe_split_symbol_offset(code->data, -- cgit From f26621e60b35369bca9228bc936dc723b3e421af Mon Sep 17 00:00:00 2001 From: Borislav Petkov <bp@suse.de> Date: Mon, 5 Nov 2018 10:33:07 +0100 Subject: resource/docs: Complete kernel-doc style function documentation Add the missing kernel-doc style function parameters documentation. Signed-off-by: Borislav Petkov <bp@suse.de> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: akpm@linux-foundation.org Cc: linux-tip-commits@vger.kernel.org Cc: rdunlap@infradead.org Fixes: b69c2e20f6e4 ("resource: Clean it up a bit") Link: http://lkml.kernel.org/r/20181105093307.GA12445@zn.tnic Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/resource.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 17bcb189d530..b0fbf685c77a 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -318,17 +318,24 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); -/* - * Finds the lowest iomem resource that covers part of [start..end]. The - * caller must specify start, end, flags, and desc (which may be +/** + * Finds the lowest iomem resource that covers part of [@start..@end]. The + * caller must specify @start, @end, @flags, and @desc (which may be * IORES_DESC_NONE). * - * If a resource is found, returns 0 and *res is overwritten with the part - * of the resource that's within [start..end]; if none is found, returns - * -1. Returns -EINVAL for other invalid parameters. + * If a resource is found, returns 0 and @*res is overwritten with the part + * of the resource that's within [@start..@end]; if none is found, returns + * -1 or -EINVAL for other invalid parameters. * * This function walks the whole tree and not just first level children * unless @first_lvl is true. + * + * @start: start address of the resource searched for + * @end: end address of same resource + * @flags: flags which the resource must have + * @desc: descriptor the resource must have + * @first_lvl: walk only the first level children, if set + * @res: return ptr, if resource found */ static int find_next_iomem_res(resource_size_t start, resource_size_t end, unsigned long flags, unsigned long desc, @@ -390,9 +397,7 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, } /** - * walk_iomem_res_desc - walk through iomem resources - * - * Walks through iomem resources and calls @func() with matching resource + * Walks through iomem resources and calls func() with matching resource * ranges. This walks through whole tree and not just first level children. 
* All the memory ranges which overlap start,end and also match flags and * desc are valid candidates. -- cgit From d2f007dbe7e4c9583eea6eb04d60001e85c6f1bd Mon Sep 17 00:00:00 2001 From: Jann Horn <jannh@google.com> Date: Mon, 5 Nov 2018 20:55:09 +0100 Subject: userns: also map extents in the reverse map to kernel IDs The current logic first clones the extent array and sorts both copies, then maps the lower IDs of the forward mapping into the lower namespace, but doesn't map the lower IDs of the reverse mapping. This means that code in a nested user namespace with >5 extents will see incorrect IDs. It also breaks some access checks, like inode_owner_or_capable() and privileged_wrt_inode_uidgid(), so a process can incorrectly appear to be capable relative to an inode. To fix it, we have to make sure that the "lower_first" members of extents in both arrays are translated; and we have to make sure that the reverse map is sorted *after* the translation (since otherwise the translation can break the sorting). This is CVE-2018-18955. Fixes: 6397fac4915a ("userns: bump idmap limits to 340") Cc: stable@vger.kernel.org Signed-off-by: Jann Horn <jannh@google.com> Tested-by: Eric W. Biederman <ebiederm@xmission.com> Reviewed-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> --- kernel/user_namespace.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index e5222b5fb4fe..923414a246e9 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -974,10 +974,6 @@ static ssize_t map_write(struct file *file, const char __user *buf, if (!new_idmap_permitted(file, ns, cap_setid, &new_map)) goto out; - ret = sort_idmaps(&new_map); - if (ret < 0) - goto out; - ret = -EPERM; /* Map the lower ids from the parent user namespace to the * kernel global id space. @@ -1004,6 +1000,14 @@ static ssize_t map_write(struct file *file, const char __user *buf, e->lower_first = lower_first; } + /* + * If we want to use binary search for lookup, this clones the extent + * array and sorts both copies. + */ + ret = sort_idmaps(&new_map); + if (ret < 0) + goto out; + /* Install the map */ if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { memcpy(map->extent, new_map.extent, -- cgit From e6a2d72c10405b30ddba5af2e44a9d3d925a56d3 Mon Sep 17 00:00:00 2001 From: Juri Lelli <juri.lelli@redhat.com> Date: Wed, 7 Nov 2018 12:10:32 +0100 Subject: posix-cpu-timers: Remove useless call to check_dl_overrun() check_dl_overrun() is used to send a SIGXCPU to users that asked to be informed when a SCHED_DEADLINE runtime overruns occur. The function is called by check_thread_timers() already, so the call in check_process_timers() is redundant/wrong (even though harmless). Remove it. 
Fixes: 34be39305a77 ("sched/deadline: Implement "runtime overrun signal" support") Signed-off-by: Juri Lelli <juri.lelli@redhat.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com> Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org> Cc: linux-rt-users@vger.kernel.org Cc: mtk.manpages@gmail.com Cc: Mathieu Poirier <mathieu.poirier@linaro.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Luca Abeni <luca.abeni@santannapisa.it> Cc: Claudio Scordino <claudio@evidence.eu.com> Link: https://lkml.kernel.org/r/20181107111032.32291-1-juri.lelli@redhat.com --- kernel/time/posix-cpu-timers.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index ce32cf741b25..8f0644af40be 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -917,9 +917,6 @@ static void check_process_timers(struct task_struct *tsk, struct task_cputime cputime; unsigned long soft; - if (dl_task(tsk)) - check_dl_overrun(tsk); - /* * If cputimer is not running, then there are no active * process wide timers (POSIX 1.b, itimers, RLIMIT_CPU). -- cgit From c469933e772132aad040bd6a2adc8edf9ad6f825 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi <patrick.bellasi@arm.com> Date: Mon, 5 Nov 2018 14:53:58 +0000 Subject: sched/fair: Fix cpu_util_wake() for 'execl' type workloads A ~10% regression has been reported for UnixBench's execl throughput test by Aaron Lu and Ye Xiaolong: https://lkml.org/lkml/2018/10/30/765 That test is pretty simple, it does a "recursive" execve() syscall on the same binary. Starting from the syscall, this sequence is possible: do_execve() do_execveat_common() __do_execve_file() sched_exec() select_task_rq_fair() <==| Task already enqueued find_idlest_cpu() find_idlest_group() capacity_spare_wake() <==| Functions not called from cpu_util_wake() | the wakeup path which means we can end up calling cpu_util_wake() not only from the "wakeup path", as its name would suggest. Indeed, the task doing an execve() syscall is already enqueued on the CPU we want to get the cpu_util_wake() for. The estimated utilization for a CPU computed in cpu_util_wake() was written under the assumption that function can be called only from the wakeup path. If instead the task is already enqueued, we end up with a utilization which does not remove the current task's contribution from the estimated utilization of the CPU. This will wrongly assume a reduced spare capacity on the current CPU and increase the chances to migrate the task on execve. The regression is tracked down to: commit d519329f72a6 ("sched/fair: Update util_est only on util_avg updates") because in that patch we turn on by default the UTIL_EST sched feature. However, the real issue is introduced by: commit f9be3e5961c5 ("sched/fair: Use util_est in LB and WU paths") Let's fix this by ensuring to always discount the task estimated utilization from the CPU's estimated utilization when the task is also the current one. The same benchmark of the bug report, executed on a dual socket 40 CPUs Intel(R) Xeon(R) CPU E5-2690 v2 @ 3.00GHz machine, reports these "Execl Throughput" figures (higher the better): mainline : 48136.5 lps mainline+fix : 55376.5 lps which correspond to a 15% speedup. Moreover, since {cpu_util,capacity_spare}_wake() are not really only used from the wakeup path, let's remove this ambiguity by using a better matching name: {cpu_util,capacity_spare}_without(). 
Since we are at that, let's also improve the existing documentation. Reported-by: Aaron Lu <aaron.lu@intel.com> Reported-by: Ye Xiaolong <xiaolong.ye@intel.com> Tested-by: Aaron Lu <aaron.lu@intel.com> Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Dietmar Eggemann <dietmar.eggemann@arm.com> Cc: Juri Lelli <juri.lelli@redhat.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Morten Rasmussen <morten.rasmussen@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Quentin Perret <quentin.perret@arm.com> Cc: Steve Muckle <smuckle@google.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Todd Kjos <tkjos@google.com> Cc: Vincent Guittot <vincent.guittot@linaro.org> Fixes: f9be3e5961c5 (sched/fair: Use util_est in LB and WU paths) Link: https://lore.kernel.org/lkml/20181025093100.GB13236@e110439-lin/ Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/sched/fair.c | 62 +++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 48 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3648d0300fdf..ac855b2f4774 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5674,11 +5674,11 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, return target; } -static unsigned long cpu_util_wake(int cpu, struct task_struct *p); +static unsigned long cpu_util_without(int cpu, struct task_struct *p); -static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) +static unsigned long capacity_spare_without(int cpu, struct task_struct *p) { - return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0); + return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0); } /* @@ -5738,7 +5738,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); - spare_cap = capacity_spare_wake(i, p); + spare_cap = capacity_spare_without(i, p); if (spare_cap > max_spare_cap) max_spare_cap = spare_cap; @@ -5889,8 +5889,8 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p return prev_cpu; /* - * We need task's util for capacity_spare_wake, sync it up to prev_cpu's - * last_update_time. + * We need task's util for capacity_spare_without, sync it up to + * prev_cpu's last_update_time. */ if (!(sd_flag & SD_BALANCE_FORK)) sync_entity_load_avg(&p->se); @@ -6216,10 +6216,19 @@ static inline unsigned long cpu_util(int cpu) } /* - * cpu_util_wake: Compute CPU utilization with any contributions from - * the waking task p removed. + * cpu_util_without: compute cpu utilization without any contributions from *p + * @cpu: the CPU which utilization is requested + * @p: the task which utilization should be discounted + * + * The utilization of a CPU is defined by the utilization of tasks currently + * enqueued on that CPU as well as tasks which are currently sleeping after an + * execution on that CPU. + * + * This method returns the utilization of the specified CPU by discounting the + * utilization of the specified task, whenever the task is currently + * contributing to the CPU utilization. 
*/ -static unsigned long cpu_util_wake(int cpu, struct task_struct *p) +static unsigned long cpu_util_without(int cpu, struct task_struct *p) { struct cfs_rq *cfs_rq; unsigned int util; @@ -6231,7 +6240,7 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p) cfs_rq = &cpu_rq(cpu)->cfs; util = READ_ONCE(cfs_rq->avg.util_avg); - /* Discount task's blocked util from CPU's util */ + /* Discount task's util from CPU's util */ util -= min_t(unsigned int, util, task_util(p)); /* @@ -6240,14 +6249,14 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p) * a) if *p is the only task sleeping on this CPU, then: * cpu_util (== task_util) > util_est (== 0) * and thus we return: - * cpu_util_wake = (cpu_util - task_util) = 0 + * cpu_util_without = (cpu_util - task_util) = 0 * * b) if other tasks are SLEEPING on this CPU, which is now exiting * IDLE, then: * cpu_util >= task_util * cpu_util > util_est (== 0) * and thus we discount *p's blocked utilization to return: - * cpu_util_wake = (cpu_util - task_util) >= 0 + * cpu_util_without = (cpu_util - task_util) >= 0 * * c) if other tasks are RUNNABLE on that CPU and * util_est > cpu_util @@ -6260,8 +6269,33 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p) * covered by the following code when estimated utilization is * enabled. */ - if (sched_feat(UTIL_EST)) - util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued)); + if (sched_feat(UTIL_EST)) { + unsigned int estimated = + READ_ONCE(cfs_rq->avg.util_est.enqueued); + + /* + * Despite the following checks we still have a small window + * for a possible race, when an execl's select_task_rq_fair() + * races with LB's detach_task(): + * + * detach_task() + * p->on_rq = TASK_ON_RQ_MIGRATING; + * ---------------------------------- A + * deactivate_task() \ + * dequeue_task() + RaceTime + * util_est_dequeue() / + * ---------------------------------- B + * + * The additional check on "current == p" it's required to + * properly fix the execl regression and it helps in further + * reducing the chances for the above race. + */ + if (unlikely(task_on_rq_queued(p) || current == p)) { + estimated -= min_t(unsigned int, estimated, + (_task_util_est(p) | UTIL_AVG_UNCHANGED)); + } + util = max(util, estimated); + } /* * Utilization (estimated) can exceed the CPU capacity, thus let's -- cgit From dded2e159208a9edc21dd5c5f583afa28d378d39 Mon Sep 17 00:00:00 2001 From: Christophe Leroy <christophe.leroy@c-s.fr> Date: Thu, 27 Sep 2018 17:17:49 +0000 Subject: kdb: use correct pointer when 'btc' calls 'btt' On a powerpc 8xx, 'btc' fails as follows: Entering kdb (current=0x(ptrval), pid 282) due to Keyboard Entry kdb> btc btc: cpu status: Currently on cpu 0 Available cpus: 0 kdb_getarea: Bad address 0x0 when booting the kernel with 'debug_boot_weak_hash', it fails as well Entering kdb (current=0xba99ad80, pid 284) due to Keyboard Entry kdb> btc btc: cpu status: Currently on cpu 0 Available cpus: 0 kdb_getarea: Bad address 0xba99ad80 On other platforms, Oopses have been observed too, see https://github.com/linuxppc/linux/issues/139 This is due to btc calling 'btt' with %p pointer as an argument. 
This patch replaces %p by %px to get the real pointer value as expected by 'btt' Fixes: ad67b74d2469 ("printk: hash addresses printed with %p") Cc: <stable@vger.kernel.org> Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr> Reviewed-by: Daniel Thompson <daniel.thompson@linaro.org> Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org> --- kernel/debug/kdb/kdb_bt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 6ad4a9fcbd6f..7921ae4fca8d 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -179,14 +179,14 @@ kdb_bt(int argc, const char **argv) kdb_printf("no process for cpu %ld\n", cpu); return 0; } - sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu)); + sprintf(buf, "btt 0x%px\n", KDB_TSK(cpu)); kdb_parse(buf); return 0; } kdb_printf("btc: cpu status: "); kdb_parse("cpu\n"); for_each_online_cpu(cpu) { - sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu)); + sprintf(buf, "btt 0x%px\n", KDB_TSK(cpu)); kdb_parse(buf); touch_nmi_watchdog(); } -- cgit From 568fb6f42ac6851320adaea25f8f1b94de14e40a Mon Sep 17 00:00:00 2001 From: Christophe Leroy <christophe.leroy@c-s.fr> Date: Thu, 27 Sep 2018 17:17:57 +0000 Subject: kdb: print real address of pointers instead of hashed addresses Since commit ad67b74d2469 ("printk: hash addresses printed with %p"), all pointers printed with %p are printed with hashed addresses instead of real addresses in order to avoid leaking addresses in dmesg and syslog. But this applies to kdb too, with is unfortunate: Entering kdb (current=0x(ptrval), pid 329) due to Keyboard Entry kdb> ps 15 sleeping system daemon (state M) processes suppressed, use 'ps A' to see all. Task Addr Pid Parent [*] cpu State Thread Command 0x(ptrval) 329 328 1 0 R 0x(ptrval) *sh 0x(ptrval) 1 0 0 0 S 0x(ptrval) init 0x(ptrval) 3 2 0 0 D 0x(ptrval) rcu_gp 0x(ptrval) 4 2 0 0 D 0x(ptrval) rcu_par_gp 0x(ptrval) 5 2 0 0 D 0x(ptrval) kworker/0:0 0x(ptrval) 6 2 0 0 D 0x(ptrval) kworker/0:0H 0x(ptrval) 7 2 0 0 D 0x(ptrval) kworker/u2:0 0x(ptrval) 8 2 0 0 D 0x(ptrval) mm_percpu_wq 0x(ptrval) 10 2 0 0 D 0x(ptrval) rcu_preempt The whole purpose of kdb is to debug, and for debugging real addresses need to be known. In addition, data displayed by kdb doesn't go into dmesg. This patch replaces all %p by %px in kdb in order to display real addresses. Fixes: ad67b74d2469 ("printk: hash addresses printed with %p") Cc: <stable@vger.kernel.org> Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr> Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org> --- kernel/debug/kdb/kdb_main.c | 14 +++++++------- kernel/debug/kdb/kdb_support.c | 12 ++++++------ 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index bb4fe4e1a601..959242084b40 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1192,7 +1192,7 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, if (reason == KDB_REASON_DEBUG) { /* special case below */ } else { - kdb_printf("\nEntering kdb (current=0x%p, pid %d) ", + kdb_printf("\nEntering kdb (current=0x%px, pid %d) ", kdb_current, kdb_current ? 
kdb_current->pid : 0); #if defined(CONFIG_SMP) kdb_printf("on processor %d ", raw_smp_processor_id()); @@ -1208,7 +1208,7 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, */ switch (db_result) { case KDB_DB_BPT: - kdb_printf("\nEntering kdb (0x%p, pid %d) ", + kdb_printf("\nEntering kdb (0x%px, pid %d) ", kdb_current, kdb_current->pid); #if defined(CONFIG_SMP) kdb_printf("on processor %d ", raw_smp_processor_id()); @@ -2048,7 +2048,7 @@ static int kdb_lsmod(int argc, const char **argv) if (mod->state == MODULE_STATE_UNFORMED) continue; - kdb_printf("%-20s%8u 0x%p ", mod->name, + kdb_printf("%-20s%8u 0x%px ", mod->name, mod->core_layout.size, (void *)mod); #ifdef CONFIG_MODULE_UNLOAD kdb_printf("%4d ", module_refcount(mod)); @@ -2059,7 +2059,7 @@ static int kdb_lsmod(int argc, const char **argv) kdb_printf(" (Loading)"); else kdb_printf(" (Live)"); - kdb_printf(" 0x%p", mod->core_layout.base); + kdb_printf(" 0x%px", mod->core_layout.base); #ifdef CONFIG_MODULE_UNLOAD { @@ -2341,7 +2341,7 @@ void kdb_ps1(const struct task_struct *p) return; cpu = kdb_process_cpu(p); - kdb_printf("0x%p %8d %8d %d %4d %c 0x%p %c%s\n", + kdb_printf("0x%px %8d %8d %d %4d %c 0x%px %c%s\n", (void *)p, p->pid, p->parent->pid, kdb_task_has_cpu(p), kdb_process_cpu(p), kdb_task_state_char(p), @@ -2354,7 +2354,7 @@ void kdb_ps1(const struct task_struct *p) } else { if (KDB_TSK(cpu) != p) kdb_printf(" Error: does not match running " - "process table (0x%p)\n", KDB_TSK(cpu)); + "process table (0x%px)\n", KDB_TSK(cpu)); } } } @@ -2687,7 +2687,7 @@ int kdb_register_flags(char *cmd, for_each_kdbcmd(kp, i) { if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) { kdb_printf("Duplicate kdb command registered: " - "%s, func %p help %s\n", cmd, func, help); + "%s, func %px help %s\n", cmd, func, help); return 1; } } diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 990b3cc526c8..987eb73284d2 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -40,7 +40,7 @@ int kdbgetsymval(const char *symname, kdb_symtab_t *symtab) { if (KDB_DEBUG(AR)) - kdb_printf("kdbgetsymval: symname=%s, symtab=%p\n", symname, + kdb_printf("kdbgetsymval: symname=%s, symtab=%px\n", symname, symtab); memset(symtab, 0, sizeof(*symtab)); symtab->sym_start = kallsyms_lookup_name(symname); @@ -88,7 +88,7 @@ int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab) char *knt1 = NULL; if (KDB_DEBUG(AR)) - kdb_printf("kdbnearsym: addr=0x%lx, symtab=%p\n", addr, symtab); + kdb_printf("kdbnearsym: addr=0x%lx, symtab=%px\n", addr, symtab); memset(symtab, 0, sizeof(*symtab)); if (addr < 4096) @@ -149,7 +149,7 @@ int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab) symtab->mod_name = "kernel"; if (KDB_DEBUG(AR)) kdb_printf("kdbnearsym: returns %d symtab->sym_start=0x%lx, " - "symtab->mod_name=%p, symtab->sym_name=%p (%s)\n", ret, + "symtab->mod_name=%px, symtab->sym_name=%px (%s)\n", ret, symtab->sym_start, symtab->mod_name, symtab->sym_name, symtab->sym_name); @@ -887,13 +887,13 @@ void debug_kusage(void) __func__, dah_first); if (dah_first) { h_used = (struct debug_alloc_header *)debug_alloc_pool; - kdb_printf("%s: h_used %p size %d\n", __func__, h_used, + kdb_printf("%s: h_used %px size %d\n", __func__, h_used, h_used->size); } do { h_used = (struct debug_alloc_header *) ((char *)h_free + dah_overhead + h_free->size); - kdb_printf("%s: h_used %p size %d caller %p\n", + kdb_printf("%s: h_used %px size %d caller %px\n", __func__, h_used, h_used->size, h_used->caller); 
h_free = (struct debug_alloc_header *) (debug_alloc_pool + h_free->next); @@ -902,7 +902,7 @@ void debug_kusage(void) ((char *)h_free + dah_overhead + h_free->size); if ((char *)h_used - debug_alloc_pool != sizeof(debug_alloc_pool_aligned)) - kdb_printf("%s: h_used %p size %d caller %p\n", + kdb_printf("%s: h_used %px size %d caller %px\n", __func__, h_used, h_used->size, h_used->caller); out: spin_unlock(&dap_lock); -- cgit From c2b94c72d93d0929f48157eef128c4f9d2e603ce Mon Sep 17 00:00:00 2001 From: Prarit Bhargava <prarit@redhat.com> Date: Thu, 20 Sep 2018 08:59:14 -0400 Subject: kdb: Use strscpy with destination buffer size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc 8.1.0 warns with: kernel/debug/kdb/kdb_support.c: In function ‘kallsyms_symbol_next’: kernel/debug/kdb/kdb_support.c:239:4: warning: ‘strncpy’ specified bound depends on the length of the source argument [-Wstringop-overflow=] strncpy(prefix_name, name, strlen(name)+1); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ kernel/debug/kdb/kdb_support.c:239:31: note: length computed here Use strscpy() with the destination buffer size, and use ellipses when displaying truncated symbols. v2: Use strscpy() Signed-off-by: Prarit Bhargava <prarit@redhat.com> Cc: Jonathan Toppins <jtoppins@redhat.com> Cc: Jason Wessel <jason.wessel@windriver.com> Cc: Daniel Thompson <daniel.thompson@linaro.org> Cc: kgdb-bugreport@lists.sourceforge.net Reviewed-by: Daniel Thompson <daniel.thompson@linaro.org> Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org> --- kernel/debug/kdb/kdb_io.c | 15 +++++++++------ kernel/debug/kdb/kdb_private.h | 2 +- kernel/debug/kdb/kdb_support.c | 10 +++++----- 3 files changed, 15 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index ed5d34925ad0..6a4b41484afe 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -216,7 +216,7 @@ static char *kdb_read(char *buffer, size_t bufsize) int count; int i; int diag, dtab_count; - int key; + int key, buf_size, ret; diag = kdbgetintenv("DTABCOUNT", &dtab_count); @@ -336,9 +336,8 @@ poll_again: else p_tmp = tmpbuffer; len = strlen(p_tmp); - count = kallsyms_symbol_complete(p_tmp, - sizeof(tmpbuffer) - - (p_tmp - tmpbuffer)); + buf_size = sizeof(tmpbuffer) - (p_tmp - tmpbuffer); + count = kallsyms_symbol_complete(p_tmp, buf_size); if (tab == 2 && count > 0) { kdb_printf("\n%d symbols are found.", count); if (count > dtab_count) { @@ -350,9 +349,13 @@ poll_again: } kdb_printf("\n"); for (i = 0; i < count; i++) { - if (WARN_ON(!kallsyms_symbol_next(p_tmp, i))) + ret = kallsyms_symbol_next(p_tmp, i, buf_size); + if (WARN_ON(!ret)) break; - kdb_printf("%s ", p_tmp); + if (ret != -E2BIG) + kdb_printf("%s ", p_tmp); + else + kdb_printf("%s... ", p_tmp); *(p_tmp + len) = '\0'; } if (i >= dtab_count) diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 1e5a502ba4a7..2118d8258b7c 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -83,7 +83,7 @@ typedef struct __ksymtab { unsigned long sym_start; unsigned long sym_end; } kdb_symtab_t; -extern int kallsyms_symbol_next(char *prefix_name, int flag); +extern int kallsyms_symbol_next(char *prefix_name, int flag, int buf_size); extern int kallsyms_symbol_complete(char *prefix_name, int max_len); /* Exported Symbols for kernel loadable modules to use. 
*/ diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 987eb73284d2..b14b0925c184 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -221,11 +221,13 @@ int kallsyms_symbol_complete(char *prefix_name, int max_len) * Parameters: * prefix_name prefix of a symbol name to lookup * flag 0 means search from the head, 1 means continue search. + * buf_size maximum length that can be written to prefix_name + * buffer * Returns: * 1 if a symbol matches the given prefix. * 0 if no string found */ -int kallsyms_symbol_next(char *prefix_name, int flag) +int kallsyms_symbol_next(char *prefix_name, int flag, int buf_size) { int prefix_len = strlen(prefix_name); static loff_t pos; @@ -235,10 +237,8 @@ int kallsyms_symbol_next(char *prefix_name, int flag) pos = 0; while ((name = kdb_walk_kallsyms(&pos))) { - if (strncmp(name, prefix_name, prefix_len) == 0) { - strncpy(prefix_name, name, strlen(name)+1); - return 1; - } + if (!strncmp(name, prefix_name, prefix_len)) + return strscpy(prefix_name, name, buf_size); } return 0; } -- cgit From 9eb62f0e1bc70ebc9b15837a0c4e8f12a7b910cb Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" <gustavo@embeddedor.com> Date: Thu, 16 Aug 2018 09:01:41 -0500 Subject: kdb: kdb_main: refactor code in kdb_md_line Replace the whole switch statement with a for loop. This makes the code clearer and easy to read. This also addresses the following Coverity warnings: Addresses-Coverity-ID: 115090 ("Missing break in switch") Addresses-Coverity-ID: 115091 ("Missing break in switch") Addresses-Coverity-ID: 114700 ("Missing break in switch") Suggested-by: Daniel Thompson <daniel.thompson@linaro.org> Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com> Reviewed-by: Daniel Thompson <daniel.thompson@linaro.org> [daniel.thompson@linaro.org: Tiny grammar change in description] Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org> --- kernel/debug/kdb/kdb_main.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 959242084b40..d72b32c66f7d 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1493,6 +1493,7 @@ static void kdb_md_line(const char *fmtstr, unsigned long addr, char cbuf[32]; char *c = cbuf; int i; + int j; unsigned long word; memset(cbuf, '\0', sizeof(cbuf)); @@ -1538,25 +1539,9 @@ static void kdb_md_line(const char *fmtstr, unsigned long addr, wc.word = word; #define printable_char(c) \ ({unsigned char __c = c; isascii(__c) && isprint(__c) ? __c : '.'; }) - switch (bytesperword) { - case 8: + for (j = 0; j < bytesperword; j++) *c++ = printable_char(*cp++); - *c++ = printable_char(*cp++); - *c++ = printable_char(*cp++); - *c++ = printable_char(*cp++); - addr += 4; - case 4: - *c++ = printable_char(*cp++); - *c++ = printable_char(*cp++); - addr += 2; - case 2: - *c++ = printable_char(*cp++); - addr++; - case 1: - *c++ = printable_char(*cp++); - addr++; - break; - } + addr += bytesperword; #undef printable_char } } -- cgit From 01cb37351bafc1b44b962842926210115e231f0a Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" <gustavo@embeddedor.com> Date: Sat, 4 Aug 2018 23:18:25 -0500 Subject: kdb: kdb_keyboard: mark expected switch fall-throughs In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. 
Notice that in this particular case, I replaced the code comments with a proper "fall through" annotation, which is what GCC is expecting to find. Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com> Reviewed-by: Daniel Thompson <daniel.thompson@linaro.org> Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org> --- kernel/debug/kdb/kdb_keyboard.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c index 118527aa60ea..750497b0003a 100644 --- a/kernel/debug/kdb/kdb_keyboard.c +++ b/kernel/debug/kdb/kdb_keyboard.c @@ -173,11 +173,11 @@ int kdb_get_kbd_char(void) case KT_LATIN: if (isprint(keychar)) break; /* printable characters */ - /* drop through */ + /* fall through */ case KT_SPEC: if (keychar == K_ENTER) break; - /* drop through */ + /* fall through */ default: return -1; /* ignore unprintables */ } -- cgit From 646558ff1643467d3b941b47f519867cbca462c3 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" <gustavo@embeddedor.com> Date: Sat, 4 Aug 2018 21:48:44 -0500 Subject: kdb: kdb_support: mark expected switch fall-throughs In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Notice that in this particular case, I replaced the code comments with a proper "fall through" annotation, which is what GCC is expecting to find. Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com> Reviewed-by: Daniel Thompson <daniel.thompson@linaro.org> Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org> --- kernel/debug/kdb/kdb_support.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index b14b0925c184..50bf9b119bad 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -432,7 +432,7 @@ int kdb_getphysword(unsigned long *word, unsigned long addr, size_t size) *word = w8; break; } - /* drop through */ + /* fall through */ default: diag = KDB_BADWIDTH; kdb_printf("kdb_getphysword: bad width %ld\n", (long) size); @@ -481,7 +481,7 @@ int kdb_getword(unsigned long *word, unsigned long addr, size_t size) *word = w8; break; } - /* drop through */ + /* fall through */ default: diag = KDB_BADWIDTH; kdb_printf("kdb_getword: bad width %ld\n", (long) size); @@ -525,7 +525,7 @@ int kdb_putword(unsigned long addr, unsigned long word, size_t size) diag = kdb_putarea(addr, w8); break; } - /* drop through */ + /* fall through */ default: diag = KDB_BADWIDTH; kdb_printf("kdb_putword: bad width %ld\n", (long) size); -- cgit From afd594240806acc138cf696c09f2f4829d55d02f Mon Sep 17 00:00:00 2001 From: Edward Cree <ecree@solarflare.com> Date: Fri, 16 Nov 2018 12:00:07 +0000 Subject: bpf: fix off-by-one error in adjust_subprog_starts When patching in a new sequence for the first insn of a subprog, the start of that subprog does not change (it's the first insn of the sequence), so adjust_subprog_starts should check start <= off (rather than < off). Also added a test to test_verifier.c (it's essentially the syz reproducer). 
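A small user-space model of the adjustment shows the off-by-one (hypothetical standalone code, not the verifier itself):

  #include <stdio.h>

  static unsigned int subprog_start[] = { 0, 5, 12 };   /* insn offsets */

  static void adjust_subprog_starts(unsigned int off, unsigned int len)
  {
          int i;

          for (i = 0; i < 3; i++) {
                  if (subprog_start[i] <= off)    /* was "< off" */
                          continue;
                  subprog_start[i] += len - 1;
          }
  }

  int main(void)
  {
          int i;

          /* Patch the first insn of subprog 1 (off == 5) into a 3-insn
           * sequence: subprog 1 must keep start == 5, because its first
           * insn is the first insn of the new sequence, while subprog 2
           * shifts by len - 1.  With the old "< off" check, subprog 1
           * wrongly moved to 7, into the middle of the patched sequence. */
          adjust_subprog_starts(5, 3);
          for (i = 0; i < 3; i++)
                  printf("subprog %d starts at insn %u\n", i, subprog_start[i]);
          return 0;
  }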
Fixes: cc8b0b92a169 ("bpf: introduce function calls (function boundaries)") Reported-by: syzbot+4fc427c7af994b0948be@syzkaller.appspotmail.com Signed-off-by: Edward Cree <ecree@solarflare.com> Acked-by: Yonghong Song <yhs@fb.com> Signed-off-by: Alexei Starovoitov <ast@kernel.org> --- kernel/bpf/verifier.c | 2 +- tools/testing/selftests/bpf/test_verifier.c | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1971ca325fb4..6dd419550aba 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5650,7 +5650,7 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len return; /* NOTE: fake 'exit' subprog should be updated as well. */ for (i = 0; i <= env->subprog_cnt; i++) { - if (env->subprog_info[i].start < off) + if (env->subprog_info[i].start <= off) continue; env->subprog_info[i].start += len - 1; } diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 6f61df62f690..550b7e46bf4a 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -13896,6 +13896,25 @@ static struct bpf_test tests[] = { .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = ACCEPT, }, + { + "calls: ctx read at start of subprog", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 5), + BPF_JMP_REG(BPF_JSGT, BPF_REG_0, BPF_REG_0, 0), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_B, BPF_REG_9, BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, + .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .result_unpriv = REJECT, + .result = ACCEPT, + }, }; static int probe_filter_length(const struct bpf_insn *fp) -- cgit From 569a933b03f3c48b392fe67c0086b3a6b9306b5a Mon Sep 17 00:00:00 2001 From: Roman Gushchin <guroan@gmail.com> Date: Wed, 14 Nov 2018 10:00:34 -0800 Subject: bpf: allocate local storage buffers using GFP_ATOMIC Naresh reported an issue with the non-atomic memory allocation of cgroup local storage buffers: [ 73.047526] BUG: sleeping function called from invalid context at /srv/oe/build/tmp-rpb-glibc/work-shared/intel-corei7-64/kernel-source/mm/slab.h:421 [ 73.060915] in_atomic(): 1, irqs_disabled(): 0, pid: 3157, name: test_cgroup_sto [ 73.068342] INFO: lockdep is turned off. [ 73.072293] CPU: 2 PID: 3157 Comm: test_cgroup_sto Not tainted 4.20.0-rc2-next-20181113 #1 [ 73.080548] Hardware name: Supermicro SYS-5019S-ML/X11SSH-F, BIOS 2.0b 07/27/2017 [ 73.088018] Call Trace: [ 73.090463] dump_stack+0x70/0xa5 [ 73.093783] ___might_sleep+0x152/0x240 [ 73.097619] __might_sleep+0x4a/0x80 [ 73.101191] __kmalloc_node+0x1cf/0x2f0 [ 73.105031] ? cgroup_storage_update_elem+0x46/0x90 [ 73.109909] cgroup_storage_update_elem+0x46/0x90 cgroup_storage_update_elem() (as well as other update map update callbacks) is called with disabled preemption, so GFP_ATOMIC allocation should be used: e.g. alloc_htab_elem() in hashtab.c. 
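The general rule being applied, as a generic sketch (hypothetical struct and function, not the map code): any allocation reached with preemption disabled or a spinlock held must not sleep, so GFP_KERNEL/GFP_USER are out and GFP_ATOMIC is required:

  #include <linux/slab.h>
  #include <linux/spinlock.h>

  struct ctx {                    /* hypothetical example structure */
          spinlock_t lock;
          void *buf;
  };

  static int update_under_lock(struct ctx *ctx, size_t size)
  {
          void *new;

          spin_lock(&ctx->lock);  /* non-sleepable from here on, just like
                                   * a preempt-disabled map update callback */
          new = kmalloc(size, __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN);
          if (!new) {
                  spin_unlock(&ctx->lock);
                  return -ENOMEM;
          }
          kfree(ctx->buf);
          ctx->buf = new;
          spin_unlock(&ctx->lock);
          return 0;
  }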
Reported-by: Naresh Kamboju <naresh.kamboju@linaro.org> Tested-by: Naresh Kamboju <naresh.kamboju@linaro.org> Signed-off-by: Roman Gushchin <guro@fb.com> Cc: Alexei Starovoitov <ast@kernel.org> Cc: Daniel Borkmann <daniel@iogearbox.net> Signed-off-by: Alexei Starovoitov <ast@kernel.org> --- kernel/bpf/local_storage.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index c97a8f968638..bed9d48a7ae9 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -139,7 +139,8 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, return -ENOENT; new = kmalloc_node(sizeof(struct bpf_storage_buffer) + - map->value_size, __GFP_ZERO | GFP_USER, + map->value_size, + __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN, map->numa_node); if (!new) return -ENOMEM; -- cgit From 8fcb2312d1e3300e81aa871aad00d4c038cfc184 Mon Sep 17 00:00:00 2001 From: Olof Johansson <olof@lixom.net> Date: Fri, 16 Nov 2018 15:08:00 -0800 Subject: kernel/sched/psi.c: simplify cgroup_move_task() The existing code triggered an invalid warning about 'rq' possibly being used uninitialized. Instead of doing the silly warning suppression by initializa it to NULL, refactor the code to bail out early instead. Warning was: kernel/sched/psi.c: In function `cgroup_move_task': kernel/sched/psi.c:639:13: warning: `rq' may be used uninitialized in this function [-Wmaybe-uninitialized] Link: http://lkml.kernel.org/r/20181103183339.8669-1-olof@lixom.net Fixes: 2ce7135adc9ad ("psi: cgroup support") Signed-off-by: Olof Johansson <olof@lixom.net> Reviewed-by: Andrew Morton <akpm@linux-foundation.org> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- kernel/sched/psi.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 7cdecfc010af..3d7355d7c3e3 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -633,38 +633,39 @@ void psi_cgroup_free(struct cgroup *cgroup) */ void cgroup_move_task(struct task_struct *task, struct css_set *to) { - bool move_psi = !psi_disabled; unsigned int task_flags = 0; struct rq_flags rf; struct rq *rq; - if (move_psi) { - rq = task_rq_lock(task, &rf); + if (psi_disabled) { + /* + * Lame to do this here, but the scheduler cannot be locked + * from the outside, so we move cgroups from inside sched/. + */ + rcu_assign_pointer(task->cgroups, to); + return; + } - if (task_on_rq_queued(task)) - task_flags = TSK_RUNNING; - else if (task->in_iowait) - task_flags = TSK_IOWAIT; + rq = task_rq_lock(task, &rf); - if (task->flags & PF_MEMSTALL) - task_flags |= TSK_MEMSTALL; + if (task_on_rq_queued(task)) + task_flags = TSK_RUNNING; + else if (task->in_iowait) + task_flags = TSK_IOWAIT; - if (task_flags) - psi_task_change(task, task_flags, 0); - } + if (task->flags & PF_MEMSTALL) + task_flags |= TSK_MEMSTALL; - /* - * Lame to do this here, but the scheduler cannot be locked - * from the outside, so we move cgroups from inside sched/. 
- */ + if (task_flags) + psi_task_change(task, task_flags, 0); + + /* See comment above */ rcu_assign_pointer(task->cgroups, to); - if (move_psi) { - if (task_flags) - psi_task_change(task, 0, task_flags); + if (task_flags) + psi_task_change(task, 0, task_flags); - task_rq_unlock(rq, task, &rf); - } + task_rq_unlock(rq, task, &rf); } #endif /* CONFIG_CGROUPS */ -- cgit From cb216b84d6ea24fa10f1e7aac35de77246841041 Mon Sep 17 00:00:00 2001 From: Robin Murphy <robin.murphy@arm.com> Date: Wed, 21 Nov 2018 16:00:51 +0000 Subject: swiotlb: Skip cache maintenance on map error If swiotlb_bounce_page() failed, calling arch_sync_dma_for_device() may lead to such delights as performing cache maintenance on whatever address phys_to_virt(SWIOTLB_MAP_ERROR) looks like, which is typically outside the kernel memory map and goes about as well as expected. Don't do that. Fixes: a4a4330db46a ("swiotlb: add support for non-coherent DMA") Tested-by: John Stultz <john.stultz@linaro.org> Signed-off-by: Robin Murphy <robin.murphy@arm.com> Signed-off-by: Christoph Hellwig <hch@lst.de> --- kernel/dma/swiotlb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 5731daa09a32..045930e32c0e 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -679,7 +679,8 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, } if (!dev_is_dma_coherent(dev) && - (attrs & DMA_ATTR_SKIP_CPU_SYNC) == 0) + (attrs & DMA_ATTR_SKIP_CPU_SYNC) == 0 && + dev_addr != DIRECT_MAPPING_ERROR) arch_sync_dma_for_device(dev, phys, size, dir); return dev_addr; -- cgit From 813961de3ee6474dd5703e883471fd941d6c8f69 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov <ast@kernel.org> Date: Thu, 22 Nov 2018 10:49:56 -0800 Subject: bpf: fix integer overflow in queue_stack_map Fix the following issues: - allow queue_stack_map for root only - fix u32 max_entries overflow - disallow value_size == 0 Fixes: f1a2e44a3aec ("bpf: add queue and stack maps") Reported-by: Wei Wu <ww9210@gmail.com> Signed-off-by: Alexei Starovoitov <ast@kernel.org> Cc: Mauricio Vasquez B <mauricio.vasquez@polito.it> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> --- kernel/bpf/queue_stack_maps.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 8bbd72d3a121..b384ea9f3254 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -7,6 +7,7 @@ #include <linux/bpf.h> #include <linux/list.h> #include <linux/slab.h> +#include <linux/capability.h> #include "percpu_freelist.h" #define QUEUE_STACK_CREATE_FLAG_MASK \ @@ -45,8 +46,12 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs) /* Called from syscall */ static int queue_stack_map_alloc_check(union bpf_attr *attr) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 0 || + attr->value_size == 0 || attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK) return -EINVAL; @@ -63,15 +68,10 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) { int ret, numa_node = bpf_map_attr_numa_node(attr); struct bpf_queue_stack *qs; - u32 size, value_size; - u64 queue_size, cost; - - size = attr->max_entries + 1; - value_size = attr->value_size; - - queue_size = sizeof(*qs) + (u64) value_size * size; + u64 size, queue_size, cost; - cost = queue_size; + size = (u64) attr->max_entries + 1; + 
cost = queue_size = sizeof(*qs) + size * attr->value_size; if (cost >= U32_MAX - PAGE_SIZE) return ERR_PTR(-E2BIG); -- cgit From 09d3f015d1e1b4fee7e9bbdcf54201d239393391 Mon Sep 17 00:00:00 2001 From: Andrea Parri <andrea.parri@amarulasolutions.com> Date: Thu, 22 Nov 2018 17:10:31 +0100 Subject: uprobes: Fix handle_swbp() vs. unregister() + register() race once more Commit: 142b18ddc8143 ("uprobes: Fix handle_swbp() vs unregister() + register() race") added the UPROBE_COPY_INSN flag, and corresponding smp_wmb() and smp_rmb() memory barriers, to ensure that handle_swbp() uses fully-initialized uprobes only. However, the smp_rmb() is mis-placed: this barrier should be placed after handle_swbp() has tested for the flag, thus guaranteeing that (program-order) subsequent loads from the uprobe can see the initial stores performed by prepare_uprobe(). Move the smp_rmb() accordingly. Also amend the comments associated to the two memory barriers to indicate their actual locations. Signed-off-by: Andrea Parri <andrea.parri@amarulasolutions.com> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Vince Weaver <vincent.weaver@maine.edu> Cc: stable@kernel.org Fixes: 142b18ddc8143 ("uprobes: Fix handle_swbp() vs unregister() + register() race") Link: http://lkml.kernel.org/r/20181122161031.15179-1-andrea.parri@amarulasolutions.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/events/uprobes.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 96d4bee83489..322e97bbb437 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -829,7 +829,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, BUG_ON((uprobe->offset & ~PAGE_MASK) + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); - smp_wmb(); /* pairs with rmb() in find_active_uprobe() */ + smp_wmb(); /* pairs with the smp_rmb() in handle_swbp() */ set_bit(UPROBE_COPY_INSN, &uprobe->flags); out: @@ -2178,10 +2178,18 @@ static void handle_swbp(struct pt_regs *regs) * After we hit the bp, _unregister + _register can install the * new and not-yet-analyzed uprobe at the same address, restart. */ - smp_rmb(); /* pairs with wmb() in install_breakpoint() */ if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) goto out; + /* + * Pairs with the smp_wmb() in prepare_uprobe(). + * + * Guarantees that if we see the UPROBE_COPY_INSN bit set, then + * we must also see the stores to &uprobe->arch performed by the + * prepare_uprobe() call. + */ + smp_rmb(); + /* Tracing handlers use ->utask to communicate with fetch methods */ if (!get_utask()) goto out; -- cgit From 1efb6ee3edea57f57f9fb05dba8dcb3f7333f61f Mon Sep 17 00:00:00 2001 From: Martynas Pumputis <m@lambda.lt> Date: Fri, 23 Nov 2018 17:43:26 +0100 Subject: bpf: fix check of allowed specifiers in bpf_trace_printk A format string consisting of "%p" or "%s" followed by an invalid specifier (e.g. "%p%\n" or "%s%") could pass the check which would make format_decode (lib/vsprintf.c) to warn. 
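For a sense of what the tightened rule accepts and rejects, a stand-alone userspace sketch of the idea follows; it is an illustration written for this note rather than the kernel code, the helper name fmt_ok() and the reduced specifier set are made up. After a "%p" or "%s" the next character may only be NUL, whitespace or punctuation, and because the scan does not skip that character, a trailing "%" is examined again by the loop and rejected as an unknown specifier.

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>

/* Simplified model of the check after the fix; not the real
 * bpf_trace_printk() code, and only a few specifiers are modelled. */
static bool fmt_ok(const char *fmt)
{
        for (int i = 0; fmt[i]; i++) {
                if (fmt[i] != '%')
                        continue;
                i++;                            /* look at the specifier */
                if (fmt[i] == 'd' || fmt[i] == 'u' || fmt[i] == 'x')
                        continue;
                if (fmt[i] == 'p' || fmt[i] == 's') {
                        unsigned char next = fmt[i + 1];

                        /* disallow any further format extensions */
                        if (next != 0 && !isspace(next) && !ispunct(next))
                                return false;
                        /* 'next' is not skipped: if it is another '%', the
                         * loop sees it again and rejects it below */
                        continue;
                }
                return false;                   /* unknown specifier */
        }
        return true;
}

int main(void)
{
        printf("%d\n", fmt_ok("pid %d comm %s\n"));    /* 1: accepted */
        printf("%d\n", fmt_ok("%s%"));                  /* 0: rejected */
        printf("%d\n", fmt_ok("%p%\n"));                /* 0: rejected */
        return 0;
}
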
Fixes: 9c959c863f82 ("tracing: Allow BPF programs to call bpf_trace_printk()") Reported-by: syzbot+1ec5c5ec949c4adaa0c4@syzkaller.appspotmail.com Signed-off-by: Martynas Pumputis <m@lambda.lt> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> --- kernel/trace/bpf_trace.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 08fcfe440c63..9864a35c8bb5 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -196,11 +196,13 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, i++; } else if (fmt[i] == 'p' || fmt[i] == 's') { mod[fmt_cnt]++; - i++; - if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0) + /* disallow any further format extensions */ + if (fmt[i + 1] != 0 && + !isspace(fmt[i + 1]) && + !ispunct(fmt[i + 1])) return -EINVAL; fmt_cnt++; - if (fmt[i - 1] == 's') { + if (fmt[i] == 's') { if (str_seen) /* allow only one '%s' per fmt string */ return -EINVAL; -- cgit From 8114865ff82e200b383e46821c25cb0625b842b5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" <rostedt@goodmis.org> Date: Sun, 18 Nov 2018 17:10:15 -0500 Subject: function_graph: Create function_graph_enter() to consolidate architecture code Currently all the architectures do basically the same thing in preparing the function graph tracer on entry to a function. This code can be pulled into a generic location and then this will allow the function graph tracer to be fixed, as well as extended. Create a new function graph helper function_graph_enter() that will call the hook function (ftrace_graph_entry) and the shadow stack operation (ftrace_push_return_trace), and remove the need of the architecture code to manage the shadow stack. This is needed to prepare for a fix of a design bug on how the curr_ret_stack is used. 
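To make the consolidation concrete, this is roughly what an architecture's entry hook can shrink to once it relies on the new helper. The sketch is illustrative only, modelled on the typical prepare_ftrace_return() shape with the arch-specific return-address patching reduced to a plain pointer write; it is not a diff of any particular architecture.

/* Illustrative arch-side hook after the consolidation (sketch only). */
void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
                           unsigned long frame_pointer)
{
        unsigned long return_hooker = (unsigned long)&return_to_handler;
        unsigned long old;

        if (unlikely(atomic_read(&current->tracing_graph_pause)))
                return;

        old = *parent;

        /* The entry-hook call and the shadow-stack push that every
         * architecture used to open-code are now one generic call. */
        if (!function_graph_enter(old, self_addr, frame_pointer, parent))
                *parent = return_hooker;        /* divert the return path */
}
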
Cc: stable@kernel.org Fixes: 03274a3ffb449 ("tracing/fgraph: Adjust fgraph depth before calling trace return callback") Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> --- include/linux/ftrace.h | 3 +++ kernel/trace/trace_functions_graph.c | 16 ++++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index a397907e8d72..5717e8f81c59 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -779,6 +779,9 @@ extern void return_to_handler(void); extern int ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, unsigned long frame_pointer, unsigned long *retp); +extern int +function_graph_enter(unsigned long ret, unsigned long func, + unsigned long frame_pointer, unsigned long *retp); unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, unsigned long ret, unsigned long *retp); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 169b3c44ee97..28f2602435d0 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -182,6 +182,22 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, return 0; } +int function_graph_enter(unsigned long ret, unsigned long func, + unsigned long frame_pointer, unsigned long *retp) +{ + struct ftrace_graph_ent trace; + + trace.func = func; + trace.depth = current->curr_ret_stack + 1; + + /* Only trace if the calling function expects to */ + if (!ftrace_graph_entry(&trace)) + return -EBUSY; + + return ftrace_push_return_trace(ret, func, &trace.depth, + frame_pointer, retp); +} + /* Retrieve a function return address to the trace stack on thread info.*/ static void ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, -- cgit From e2c95a61656d29ceaac97b6a975c8a1f26e26f15 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann <daniel@iogearbox.net> Date: Mon, 26 Nov 2018 14:05:38 +0100 Subject: bpf, ppc64: generalize fetching subprog into bpf_jit_get_func_addr Make fetching of the BPF call address from ppc64 JIT generic. ppc64 was using a slightly different variant rather than through the insns' imm field encoding as the target address would not fit into that space. Therefore, the target subprog number was encoded into the insns' offset and fetched through fp->aux->func[off]->bpf_func instead. Given there are other JITs with this issue and the mechanism of fetching the address is JIT-generic, move it into the core as a helper instead. On the JIT side, we get information on whether the retrieved address is a fixed one, that is, not changing through JIT passes, or a dynamic one. For the former, JITs can optimize their imm emission because this doesn't change jump offsets throughout JIT process. 
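As a usage sketch for other JITs, the per-call handling boils down to the helper below; this is illustrative only, and my_jit_ctx, emit_call_abs() and emit_call_rel() are made-up placeholders for whatever context structure and call-emission primitives a given architecture actually has.

/* Hypothetical call-emission path in some architecture's JIT loop. */
static int jit_emit_bpf_call(u32 *image, struct my_jit_ctx *ctx,
                             struct bpf_prog *fp, const struct bpf_insn *insn,
                             bool extra_pass)
{
        bool func_addr_fixed;
        u64 func_addr;
        int ret;

        ret = bpf_jit_get_func_addr(fp, insn, extra_pass,
                                    &func_addr, &func_addr_fixed);
        if (ret < 0)
                return ret;

        if (func_addr_fixed)
                /* Kernel helper: the address is stable across passes,
                 * so the immediate encoding can be optimized. */
                emit_call_abs(image, ctx, func_addr);
        else
                /* BPF subprog: the address is only final in the extra
                 * pass, so emit a call sequence of fixed length. */
                emit_call_rel(image, ctx, func_addr);

        return 0;
}
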
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Reviewed-by: Sandipan Das <sandipan@linux.ibm.com> Tested-by: Sandipan Das <sandipan@linux.ibm.com> Signed-off-by: Alexei Starovoitov <ast@kernel.org> --- arch/powerpc/net/bpf_jit_comp64.c | 57 ++++++++++++++++++++++++++------------- include/linux/filter.h | 4 +++ kernel/bpf/core.c | 34 +++++++++++++++++++++++ 3 files changed, 76 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 50b129785aee..17482f5de3e2 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -166,7 +166,33 @@ static void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx) PPC_BLR(); } -static void bpf_jit_emit_func_call(u32 *image, struct codegen_context *ctx, u64 func) +static void bpf_jit_emit_func_call_hlp(u32 *image, struct codegen_context *ctx, + u64 func) +{ +#ifdef PPC64_ELF_ABI_v1 + /* func points to the function descriptor */ + PPC_LI64(b2p[TMP_REG_2], func); + /* Load actual entry point from function descriptor */ + PPC_BPF_LL(b2p[TMP_REG_1], b2p[TMP_REG_2], 0); + /* ... and move it to LR */ + PPC_MTLR(b2p[TMP_REG_1]); + /* + * Load TOC from function descriptor at offset 8. + * We can clobber r2 since we get called through a + * function pointer (so caller will save/restore r2) + * and since we don't use a TOC ourself. + */ + PPC_BPF_LL(2, b2p[TMP_REG_2], 8); +#else + /* We can clobber r12 */ + PPC_FUNC_ADDR(12, func); + PPC_MTLR(12); +#endif + PPC_BLRL(); +} + +static void bpf_jit_emit_func_call_rel(u32 *image, struct codegen_context *ctx, + u64 func) { unsigned int i, ctx_idx = ctx->idx; @@ -273,7 +299,7 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, { const struct bpf_insn *insn = fp->insnsi; int flen = fp->len; - int i; + int i, ret; /* Start of epilogue code - will only be valid 2nd pass onwards */ u32 exit_addr = addrs[flen]; @@ -284,8 +310,9 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 src_reg = b2p[insn[i].src_reg]; s16 off = insn[i].off; s32 imm = insn[i].imm; + bool func_addr_fixed; + u64 func_addr; u64 imm64; - u8 *func; u32 true_cond; u32 tmp_idx; @@ -711,23 +738,15 @@ emit_clear: case BPF_JMP | BPF_CALL: ctx->seen |= SEEN_FUNC; - /* bpf function call */ - if (insn[i].src_reg == BPF_PSEUDO_CALL) - if (!extra_pass) - func = NULL; - else if (fp->aux->func && off < fp->aux->func_cnt) - /* use the subprog id from the off - * field to lookup the callee address - */ - func = (u8 *) fp->aux->func[off]->bpf_func; - else - return -EINVAL; - /* kernel helper call */ - else - func = (u8 *) __bpf_call_base + imm; - - bpf_jit_emit_func_call(image, ctx, (u64)func); + ret = bpf_jit_get_func_addr(fp, &insn[i], extra_pass, + &func_addr, &func_addr_fixed); + if (ret < 0) + return ret; + if (func_addr_fixed) + bpf_jit_emit_func_call_hlp(image, ctx, func_addr); + else + bpf_jit_emit_func_call_rel(image, ctx, func_addr); /* move return value from r3 to BPF_REG_0 */ PPC_MR(b2p[BPF_REG_0], 3); break; diff --git a/include/linux/filter.h b/include/linux/filter.h index de629b706d1d..448dcc448f1f 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -866,6 +866,10 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr); void bpf_jit_free(struct bpf_prog *fp); +int bpf_jit_get_func_addr(const struct bpf_prog *prog, + const struct bpf_insn *insn, bool extra_pass, + u64 *func_addr, bool *func_addr_fixed); + struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *fp); void 
bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 1a796e0799ec..b1a3545d0ec8 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -672,6 +672,40 @@ void __weak bpf_jit_free(struct bpf_prog *fp) bpf_prog_unlock_free(fp); } +int bpf_jit_get_func_addr(const struct bpf_prog *prog, + const struct bpf_insn *insn, bool extra_pass, + u64 *func_addr, bool *func_addr_fixed) +{ + s16 off = insn->off; + s32 imm = insn->imm; + u8 *addr; + + *func_addr_fixed = insn->src_reg != BPF_PSEUDO_CALL; + if (!*func_addr_fixed) { + /* Place-holder address till the last pass has collected + * all addresses for JITed subprograms in which case we + * can pick them up from prog->aux. + */ + if (!extra_pass) + addr = NULL; + else if (prog->aux->func && + off >= 0 && off < prog->aux->func_cnt) + addr = (u8 *)prog->aux->func[off]->bpf_func; + else + return -EINVAL; + } else { + /* Address of a BPF helper call. Since part of the core + * kernel, it's always at a fixed location. __bpf_call_base + * and the helper with imm relative to it are both in core + * kernel. + */ + addr = (u8 *)__bpf_call_base + imm; + } + + *func_addr = (unsigned long)addr; + return 0; +} + static int bpf_jit_blind_insn(const struct bpf_insn *from, const struct bpf_insn *aux, struct bpf_insn *to_buff) -- cgit From d125f3f866df88da5a85df00291f88f0baa89f7c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" <rostedt@goodmis.org> Date: Mon, 19 Nov 2018 07:40:39 -0500 Subject: function_graph: Make ftrace_push_return_trace() static As all architectures now call function_graph_enter() to do the entry work, no architecture should ever call ftrace_push_return_trace(). Make it static. This is needed to prepare for a fix of a design bug on how the curr_ret_stack is used. 
Cc: stable@kernel.org Fixes: 03274a3ffb449 ("tracing/fgraph: Adjust fgraph depth before calling trace return callback") Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> --- include/linux/ftrace.h | 3 --- kernel/trace/trace_functions_graph.c | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 5717e8f81c59..dd16e8218db3 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -776,9 +776,6 @@ struct ftrace_ret_stack { */ extern void return_to_handler(void); -extern int -ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, - unsigned long frame_pointer, unsigned long *retp); extern int function_graph_enter(unsigned long ret, unsigned long func, unsigned long frame_pointer, unsigned long *retp); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 28f2602435d0..88ca787a1cdc 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -118,7 +118,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration, struct trace_seq *s, u32 flags); /* Add a function return address to the trace stack on thread info.*/ -int +static int ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, unsigned long frame_pointer, unsigned long *retp) { -- cgit From 39eb456dacb543de90d3bc6a8e0ac5cf51ac475e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" <rostedt@goodmis.org> Date: Mon, 19 Nov 2018 08:07:12 -0500 Subject: function_graph: Use new curr_ret_depth to manage depth instead of curr_ret_stack Currently, the depth of the ret_stack is determined by curr_ret_stack index. The issue is that there's a race between setting of the curr_ret_stack and calling of the callback attached to the return of the function. Commit 03274a3ffb44 ("tracing/fgraph: Adjust fgraph depth before calling trace return callback") moved the calling of the callback to after the setting of the curr_ret_stack, even stating that it was safe to do so, when in fact, it was the reason there was a barrier() there (yes, I should have commented that barrier()). Not only does the curr_ret_stack keep track of the current call graph depth, it also keeps the ret_stack content from being overwritten by new data. The function profiler, uses the "subtime" variable of ret_stack structure and by moving the curr_ret_stack, it allows for interrupts to use the same structure it was using, corrupting the data, and breaking the profiler. To fix this, there needs to be two variables to handle the call stack depth and the pointer to where the ret_stack is being used, as they need to change at two different locations. 
Cc: stable@kernel.org Fixes: 03274a3ffb449 ("tracing/fgraph: Adjust fgraph depth before calling trace return callback") Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> --- include/linux/sched.h | 1 + kernel/trace/ftrace.c | 3 +++ kernel/trace/trace_functions_graph.c | 21 +++++++++++++-------- 3 files changed, 17 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index a51c13c2b1a0..d6183a55e8eb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1116,6 +1116,7 @@ struct task_struct { #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* Index of current stored address in ret_stack: */ int curr_ret_stack; + int curr_ret_depth; /* Stack of return addresses for return function tracing: */ struct ftrace_ret_stack *ret_stack; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f536f601bd46..48513954713c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6814,6 +6814,7 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) atomic_set(&t->tracing_graph_pause, 0); atomic_set(&t->trace_overrun, 0); t->curr_ret_stack = -1; + t->curr_ret_depth = -1; /* Make sure the tasks see the -1 first: */ smp_wmb(); t->ret_stack = ret_stack_list[start++]; @@ -7038,6 +7039,7 @@ graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack) void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) { t->curr_ret_stack = -1; + t->curr_ret_depth = -1; /* * The idle task has no parent, it either has its own * stack or no stack at all. @@ -7068,6 +7070,7 @@ void ftrace_graph_init_task(struct task_struct *t) /* Make sure we do not use the parent ret_stack */ t->ret_stack = NULL; t->curr_ret_stack = -1; + t->curr_ret_depth = -1; if (ftrace_graph_active) { struct ftrace_ret_stack *ret_stack; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 88ca787a1cdc..02d4081a7f5a 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -119,7 +119,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration, /* Add a function return address to the trace stack on thread info.*/ static int -ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, +ftrace_push_return_trace(unsigned long ret, unsigned long func, unsigned long frame_pointer, unsigned long *retp) { unsigned long long calltime; @@ -177,8 +177,6 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, #ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR current->ret_stack[index].retp = retp; #endif - *depth = current->curr_ret_stack; - return 0; } @@ -188,14 +186,20 @@ int function_graph_enter(unsigned long ret, unsigned long func, struct ftrace_graph_ent trace; trace.func = func; - trace.depth = current->curr_ret_stack + 1; + trace.depth = ++current->curr_ret_depth; /* Only trace if the calling function expects to */ if (!ftrace_graph_entry(&trace)) - return -EBUSY; + goto out; - return ftrace_push_return_trace(ret, func, &trace.depth, - frame_pointer, retp); + if (ftrace_push_return_trace(ret, func, + frame_pointer, retp)) + goto out; + + return 0; + out: + current->curr_ret_depth--; + return -EBUSY; } /* Retrieve a function return address to the trace stack on thread info.*/ @@ -257,7 +261,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, trace->func = current->ret_stack[index].func; trace->calltime = 
current->ret_stack[index].calltime; trace->overrun = atomic_read(¤t->trace_overrun); - trace->depth = index; + trace->depth = current->curr_ret_depth; } /* @@ -273,6 +277,7 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) trace.rettime = trace_clock_local(); barrier(); current->curr_ret_stack--; + current->curr_ret_depth--; /* * The curr_ret_stack can be less than -1 only if it was * filtered out and it's about to return from the function. -- cgit From 552701dd0fa7c3d448142e87210590ba424694a0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" <rostedt@goodmis.org> Date: Mon, 19 Nov 2018 15:18:40 -0500 Subject: function_graph: Move return callback before update of curr_ret_stack In the past, curr_ret_stack had two functions. One was to denote the depth of the call graph, the other is to keep track of where on the ret_stack the data is used. Although they may be slightly related, there are two cases where they need to be used differently. The one case is that it keeps the ret_stack data from being corrupted by an interrupt coming in and overwriting the data still in use. The other is just to know where the depth of the stack currently is. The function profiler uses the ret_stack to save a "subtime" variable that is part of the data on the ret_stack. If curr_ret_stack is modified too early, then this variable can be corrupted. The "max_depth" option, when set to 1, will record the first functions going into the kernel. To see all top functions (when dealing with timings), the depth variable needs to be lowered before calling the return hook. But by lowering the curr_ret_stack, it makes the data on the ret_stack still being used by the return hook susceptible to being overwritten. Now that there's two variables to handle both cases (curr_ret_depth), we can move them to the locations where they can handle both cases. Cc: stable@kernel.org Fixes: 03274a3ffb449 ("tracing/fgraph: Adjust fgraph depth before calling trace return callback") Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> --- kernel/trace/trace_functions_graph.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 02d4081a7f5a..4f0d72ae6362 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -261,7 +261,13 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, trace->func = current->ret_stack[index].func; trace->calltime = current->ret_stack[index].calltime; trace->overrun = atomic_read(¤t->trace_overrun); - trace->depth = current->curr_ret_depth; + trace->depth = current->curr_ret_depth--; + /* + * We still want to trace interrupts coming in if + * max_depth is set to 1. Make sure the decrement is + * seen before ftrace_graph_return. + */ + barrier(); } /* @@ -275,9 +281,14 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) ftrace_pop_return_trace(&trace, &ret, frame_pointer); trace.rettime = trace_clock_local(); + ftrace_graph_return(&trace); + /* + * The ftrace_graph_return() may still access the current + * ret_stack structure, we need to make sure the update of + * curr_ret_stack is after that. + */ barrier(); current->curr_ret_stack--; - current->curr_ret_depth--; /* * The curr_ret_stack can be less than -1 only if it was * filtered out and it's about to return from the function. 
@@ -288,13 +299,6 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) return ret; } - /* - * The trace should run after decrementing the ret counter - * in case an interrupt were to come in. We don't want to - * lose the interrupt if max_depth is set. - */ - ftrace_graph_return(&trace); - if (unlikely(!ret)) { ftrace_graph_stop(); WARN_ON(1); -- cgit From 7c6ea35ef50810aa12ab26f21cb858d980881576 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" <rostedt@goodmis.org> Date: Tue, 20 Nov 2018 12:40:25 -0500 Subject: function_graph: Reverse the order of pushing the ret_stack and the callback The function graph profiler uses the ret_stack to store the "subtime" and reuse it by nested functions and also on the return. But the current logic has the profiler callback called before the ret_stack is updated, and it is just modifying the ret_stack that will later be allocated (it's just lucky that the "subtime" is not touched when it is allocated). This could also cause a crash if we are at the end of the ret_stack when this happens. By reversing the order of the allocating the ret_stack and then calling the callbacks attached to a function being traced, the ret_stack entry is no longer used before it is allocated. Cc: stable@kernel.org Fixes: 03274a3ffb449 ("tracing/fgraph: Adjust fgraph depth before calling trace return callback") Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> --- kernel/trace/trace_functions_graph.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 4f0d72ae6362..2561460d7baf 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -188,15 +188,17 @@ int function_graph_enter(unsigned long ret, unsigned long func, trace.func = func; trace.depth = ++current->curr_ret_depth; - /* Only trace if the calling function expects to */ - if (!ftrace_graph_entry(&trace)) - goto out; - if (ftrace_push_return_trace(ret, func, frame_pointer, retp)) goto out; + /* Only trace if the calling function expects to */ + if (!ftrace_graph_entry(&trace)) + goto out_ret; + return 0; + out_ret: + current->curr_ret_stack--; out: current->curr_ret_depth--; return -EBUSY; -- cgit From b1b35f2e218a5b57d03bbc3b0667d5064570dc60 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" <rostedt@goodmis.org> Date: Tue, 20 Nov 2018 12:51:07 -0500 Subject: function_graph: Have profiler use curr_ret_stack and not depth The profiler uses trace->depth to find its entry on the ret_stack, but the depth may not match the actual location of where its entry is (if an interrupt were to preempt the processing of the profiler for another function, the depth and the curr_ret_stack will be different). Have it use the curr_ret_stack as the index to find its ret_stack entry instead of using the depth variable, as that is no longer guaranteed to be the same. 
Cc: stable@kernel.org Fixes: 03274a3ffb449 ("tracing/fgraph: Adjust fgraph depth before calling trace return callback") Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> --- kernel/trace/ftrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 48513954713c..77734451cb05 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -817,7 +817,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip, #ifdef CONFIG_FUNCTION_GRAPH_TRACER static int profile_graph_entry(struct ftrace_graph_ent *trace) { - int index = trace->depth; + int index = current->curr_ret_stack; function_profile_call(trace->func, 0, NULL, NULL); @@ -852,7 +852,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace) if (!fgraph_graph_time) { int index; - index = trace->depth; + index = current->curr_ret_stack; /* Append this call time to the parent time to subtract */ if (index) -- cgit From c5511d03ec090980732e929c318a7a6374b5550e Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" <peterz@infradead.org> Date: Sun, 25 Nov 2018 19:33:36 +0100 Subject: sched/smt: Make sched_smt_present track topology Currently the 'sched_smt_present' static key is enabled when at CPU bringup SMT topology is observed, but it is never disabled. However there is demand to also disable the key when the topology changes such that there is no SMT present anymore. Implement this by making the key count the number of cores that have SMT enabled. In particular, the SMT topology bits are set before interrrupts are enabled and similarly, are cleared after interrupts are disabled for the last time and the CPU dies. Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Reviewed-by: Ingo Molnar <mingo@kernel.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Jiri Kosina <jkosina@suse.cz> Cc: Tom Lendacky <thomas.lendacky@amd.com> Cc: Josh Poimboeuf <jpoimboe@redhat.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: David Woodhouse <dwmw@amazon.co.uk> Cc: Tim Chen <tim.c.chen@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Casey Schaufler <casey.schaufler@intel.com> Cc: Asit Mallick <asit.k.mallick@intel.com> Cc: Arjan van de Ven <arjan@linux.intel.com> Cc: Jon Masters <jcm@redhat.com> Cc: Waiman Long <longman9394@gmail.com> Cc: Greg KH <gregkh@linuxfoundation.org> Cc: Dave Stewart <david.c.stewart@intel.com> Cc: Kees Cook <keescook@chromium.org> Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185004.246110444@linutronix.de --- kernel/sched/core.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 091e089063be..6fedf3a98581 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5738,15 +5738,10 @@ int sched_cpu_activate(unsigned int cpu) #ifdef CONFIG_SCHED_SMT /* - * The sched_smt_present static key needs to be evaluated on every - * hotplug event because at boot time SMT might be disabled when - * the number of booted CPUs is limited. - * - * If then later a sibling gets hotplugged, then the key would stay - * off and SMT scheduling would never be functional. + * When going up, increment the number of cores with SMT present. 
*/ - if (cpumask_weight(cpu_smt_mask(cpu)) > 1) - static_branch_enable_cpuslocked(&sched_smt_present); + if (cpumask_weight(cpu_smt_mask(cpu)) == 2) + static_branch_inc_cpuslocked(&sched_smt_present); #endif set_cpu_active(cpu, true); @@ -5790,6 +5785,14 @@ int sched_cpu_deactivate(unsigned int cpu) */ synchronize_rcu_mult(call_rcu, call_rcu_sched); +#ifdef CONFIG_SCHED_SMT + /* + * When going down, decrement the number of cores with SMT present. + */ + if (cpumask_weight(cpu_smt_mask(cpu)) == 2) + static_branch_dec_cpuslocked(&sched_smt_present); +#endif + if (!sched_smp_initialized) return 0; -- cgit From 321a874a7ef85655e93b3206d0f36b4a6097f948 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Sun, 25 Nov 2018 19:33:38 +0100 Subject: sched/smt: Expose sched_smt_present static key Make the scheduler's 'sched_smt_present' static key globaly available, so it can be used in the x86 speculation control code. Provide a query function and a stub for the CONFIG_SMP=n case. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Reviewed-by: Ingo Molnar <mingo@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Jiri Kosina <jkosina@suse.cz> Cc: Tom Lendacky <thomas.lendacky@amd.com> Cc: Josh Poimboeuf <jpoimboe@redhat.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: David Woodhouse <dwmw@amazon.co.uk> Cc: Tim Chen <tim.c.chen@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Casey Schaufler <casey.schaufler@intel.com> Cc: Asit Mallick <asit.k.mallick@intel.com> Cc: Arjan van de Ven <arjan@linux.intel.com> Cc: Jon Masters <jcm@redhat.com> Cc: Waiman Long <longman9394@gmail.com> Cc: Greg KH <gregkh@linuxfoundation.org> Cc: Dave Stewart <david.c.stewart@intel.com> Cc: Kees Cook <keescook@chromium.org> Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185004.430168326@linutronix.de --- include/linux/sched/smt.h | 18 ++++++++++++++++++ kernel/sched/sched.h | 4 +--- 2 files changed, 19 insertions(+), 3 deletions(-) create mode 100644 include/linux/sched/smt.h (limited to 'kernel') diff --git a/include/linux/sched/smt.h b/include/linux/sched/smt.h new file mode 100644 index 000000000000..c9e0be514110 --- /dev/null +++ b/include/linux/sched/smt.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_SMT_H +#define _LINUX_SCHED_SMT_H + +#include <linux/static_key.h> + +#ifdef CONFIG_SCHED_SMT +extern struct static_key_false sched_smt_present; + +static __always_inline bool sched_smt_active(void) +{ + return static_branch_likely(&sched_smt_present); +} +#else +static inline bool sched_smt_active(void) { return false; } +#endif + +#endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 618577fc9aa8..4e524ab589c9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -23,6 +23,7 @@ #include <linux/sched/prio.h> #include <linux/sched/rt.h> #include <linux/sched/signal.h> +#include <linux/sched/smt.h> #include <linux/sched/stat.h> #include <linux/sched/sysctl.h> #include <linux/sched/task.h> @@ -936,9 +937,6 @@ static inline int cpu_of(struct rq *rq) #ifdef CONFIG_SCHED_SMT - -extern struct static_key_false sched_smt_present; - extern void __update_idle_core(struct rq *rq); static inline void update_idle_core(struct rq *rq) -- cgit From a74cfffb03b73d41e08f84c2e5c87dec0ce3db9f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Sun, 25 Nov 2018 19:33:39 +0100 
Subject: x86/speculation: Rework SMT state change arch_smt_update() is only called when the sysfs SMT control knob is changed. This means that when SMT is enabled in the sysfs control knob the system is considered to have SMT active even if all siblings are offline. To allow finegrained control of the speculation mitigations, the actual SMT state is more interesting than the fact that siblings could be enabled. Rework the code, so arch_smt_update() is invoked from each individual CPU hotplug function, and simplify the update function while at it. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Reviewed-by: Ingo Molnar <mingo@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Jiri Kosina <jkosina@suse.cz> Cc: Tom Lendacky <thomas.lendacky@amd.com> Cc: Josh Poimboeuf <jpoimboe@redhat.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: David Woodhouse <dwmw@amazon.co.uk> Cc: Tim Chen <tim.c.chen@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Casey Schaufler <casey.schaufler@intel.com> Cc: Asit Mallick <asit.k.mallick@intel.com> Cc: Arjan van de Ven <arjan@linux.intel.com> Cc: Jon Masters <jcm@redhat.com> Cc: Waiman Long <longman9394@gmail.com> Cc: Greg KH <gregkh@linuxfoundation.org> Cc: Dave Stewart <david.c.stewart@intel.com> Cc: Kees Cook <keescook@chromium.org> Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185004.521974984@linutronix.de --- arch/x86/kernel/cpu/bugs.c | 11 +++++------ include/linux/sched/smt.h | 2 ++ kernel/cpu.c | 15 +++++++++------ 3 files changed, 16 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index a723af0c4400..5625b323ff32 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -14,6 +14,7 @@ #include <linux/module.h> #include <linux/nospec.h> #include <linux/prctl.h> +#include <linux/sched/smt.h> #include <asm/spec-ctrl.h> #include <asm/cmdline.h> @@ -344,16 +345,14 @@ void arch_smt_update(void) return; mutex_lock(&spec_ctrl_mutex); - mask = x86_spec_ctrl_base; - if (cpu_smt_control == CPU_SMT_ENABLED) + + mask = x86_spec_ctrl_base & ~SPEC_CTRL_STIBP; + if (sched_smt_active()) mask |= SPEC_CTRL_STIBP; - else - mask &= ~SPEC_CTRL_STIBP; if (mask != x86_spec_ctrl_base) { pr_info("Spectre v2 cross-process SMT mitigation: %s STIBP\n", - cpu_smt_control == CPU_SMT_ENABLED ? - "Enabling" : "Disabling"); + mask & SPEC_CTRL_STIBP ? "Enabling" : "Disabling"); x86_spec_ctrl_base = mask; on_each_cpu(update_stibp_msr, NULL, 1); } diff --git a/include/linux/sched/smt.h b/include/linux/sched/smt.h index c9e0be514110..59d3736c454c 100644 --- a/include/linux/sched/smt.h +++ b/include/linux/sched/smt.h @@ -15,4 +15,6 @@ static __always_inline bool sched_smt_active(void) static inline bool sched_smt_active(void) { return false; } #endif +void arch_smt_update(void); + #endif diff --git a/kernel/cpu.c b/kernel/cpu.c index 3c7f3b4c453c..91d5c38eb7e5 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -10,6 +10,7 @@ #include <linux/sched/signal.h> #include <linux/sched/hotplug.h> #include <linux/sched/task.h> +#include <linux/sched/smt.h> #include <linux/unistd.h> #include <linux/cpu.h> #include <linux/oom.h> @@ -367,6 +368,12 @@ static void lockdep_release_cpus_lock(void) #endif /* CONFIG_HOTPLUG_CPU */ +/* + * Architectures that need SMT-specific errata handling during SMT hotplug + * should override this. 
+ */ +void __weak arch_smt_update(void) { } + #ifdef CONFIG_HOTPLUG_SMT enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; EXPORT_SYMBOL_GPL(cpu_smt_control); @@ -1011,6 +1018,7 @@ out: * concurrent CPU hotplug via cpu_add_remove_lock. */ lockup_detector_cleanup(); + arch_smt_update(); return ret; } @@ -1139,6 +1147,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) ret = cpuhp_up_callbacks(cpu, st, target); out: cpus_write_unlock(); + arch_smt_update(); return ret; } @@ -2055,12 +2064,6 @@ static void cpuhp_online_cpu_device(unsigned int cpu) kobject_uevent(&dev->kobj, KOBJ_ONLINE); } -/* - * Architectures that need SMT-specific errata handling during SMT hotplug - * should override this. - */ -void __weak arch_smt_update(void) { }; - static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { int cpu, ret = 0; -- cgit From 46f7ecb1e7359f183f5bbd1e08b90e10e52164f9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Sun, 25 Nov 2018 19:33:50 +0100 Subject: ptrace: Remove unused ptrace_may_access_sched() and MODE_IBRS The IBPB control code in x86 removed the usage. Remove the functionality which was introduced for this. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Reviewed-by: Ingo Molnar <mingo@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Jiri Kosina <jkosina@suse.cz> Cc: Tom Lendacky <thomas.lendacky@amd.com> Cc: Josh Poimboeuf <jpoimboe@redhat.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: David Woodhouse <dwmw@amazon.co.uk> Cc: Tim Chen <tim.c.chen@linux.intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Casey Schaufler <casey.schaufler@intel.com> Cc: Asit Mallick <asit.k.mallick@intel.com> Cc: Arjan van de Ven <arjan@linux.intel.com> Cc: Jon Masters <jcm@redhat.com> Cc: Waiman Long <longman9394@gmail.com> Cc: Greg KH <gregkh@linuxfoundation.org> Cc: Dave Stewart <david.c.stewart@intel.com> Cc: Kees Cook <keescook@chromium.org> Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185005.559149393@linutronix.de --- include/linux/ptrace.h | 17 ----------------- kernel/ptrace.c | 10 ---------- 2 files changed, 27 deletions(-) (limited to 'kernel') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 6c2ffed907f5..de20ede2c5c8 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -64,15 +64,12 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); #define PTRACE_MODE_NOAUDIT 0x04 #define PTRACE_MODE_FSCREDS 0x08 #define PTRACE_MODE_REALCREDS 0x10 -#define PTRACE_MODE_SCHED 0x20 -#define PTRACE_MODE_IBPB 0x40 /* shorthands for READ/ATTACH and FSCREDS/REALCREDS combinations */ #define PTRACE_MODE_READ_FSCREDS (PTRACE_MODE_READ | PTRACE_MODE_FSCREDS) #define PTRACE_MODE_READ_REALCREDS (PTRACE_MODE_READ | PTRACE_MODE_REALCREDS) #define PTRACE_MODE_ATTACH_FSCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_FSCREDS) #define PTRACE_MODE_ATTACH_REALCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_REALCREDS) -#define PTRACE_MODE_SPEC_IBPB (PTRACE_MODE_ATTACH_REALCREDS | PTRACE_MODE_IBPB) /** * ptrace_may_access - check whether the caller is permitted to access @@ -90,20 +87,6 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); */ extern bool ptrace_may_access(struct task_struct *task, unsigned int mode); -/** - * ptrace_may_access - check whether the caller is permitted to access - * a target 
task. - * @task: target task - * @mode: selects type of access and caller credentials - * - * Returns true on success, false on denial. - * - * Similar to ptrace_may_access(). Only to be called from context switch - * code. Does not call into audit and the regular LSM hooks due to locking - * constraints. - */ -extern bool ptrace_may_access_sched(struct task_struct *task, unsigned int mode); - static inline int ptrace_reparented(struct task_struct *child) { return !same_thread_group(child->real_parent, child->parent); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 80b34dffdfb9..c2cee9db5204 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -261,9 +261,6 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state) static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) { - if (mode & PTRACE_MODE_SCHED) - return false; - if (mode & PTRACE_MODE_NOAUDIT) return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE); else @@ -331,16 +328,9 @@ ok: !ptrace_has_cap(mm->user_ns, mode))) return -EPERM; - if (mode & PTRACE_MODE_SCHED) - return 0; return security_ptrace_access_check(task, mode); } -bool ptrace_may_access_sched(struct task_struct *task, unsigned int mode) -{ - return __ptrace_may_access(task, mode | PTRACE_MODE_SCHED); -} - bool ptrace_may_access(struct task_struct *task, unsigned int mode) { int err; -- cgit From cdbb096adddb3f42584cecb5ec2e07c26815b71f Mon Sep 17 00:00:00 2001 From: Yonghong Song <yhs@fb.com> Date: Tue, 27 Nov 2018 13:23:27 -0800 Subject: bpf: btf: implement btf_name_valid_identifier() Function btf_name_valid_identifier() have been implemented in bpf-next commit 2667a2626f4d ("bpf: btf: Add BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO"). Backport this function so later patch can use it. Fixes: 69b693f0aefa ("bpf: btf: Introduce BPF Type Format (BTF)") Signed-off-by: Yonghong Song <yhs@fb.com> Signed-off-by: Alexei Starovoitov <ast@kernel.org> --- kernel/bpf/btf.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ee4c82667d65..93c233ab2db6 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5,6 +5,7 @@ #include <uapi/linux/types.h> #include <linux/seq_file.h> #include <linux/compiler.h> +#include <linux/ctype.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/anon_inodes.h> @@ -426,6 +427,30 @@ static bool btf_name_offset_valid(const struct btf *btf, u32 offset) offset < btf->hdr.str_len; } +/* Only C-style identifier is permitted. This can be relaxed if + * necessary. + */ +static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) +{ + /* offset must be valid */ + const char *src = &btf->strings[offset]; + const char *src_limit; + + if (!isalpha(*src) && *src != '_') + return false; + + /* set a limit on identifier length */ + src_limit = src + KSYM_NAME_LEN; + src++; + while (*src && src < src_limit) { + if (!isalnum(*src) && *src != '_') + return false; + src++; + } + + return !*src; +} + static const char *btf_name_by_offset(const struct btf *btf, u32 offset) { if (!offset) -- cgit From eb04bbb608e683f8fd3ef7f716e2fa32dd90861f Mon Sep 17 00:00:00 2001 From: Yonghong Song <yhs@fb.com> Date: Tue, 27 Nov 2018 13:23:28 -0800 Subject: bpf: btf: check name validity for various types This patch added name checking for the following types: . BTF_KIND_PTR, BTF_KIND_ARRAY, BTF_KIND_VOLATILE, BTF_KIND_CONST, BTF_KIND_RESTRICT: the name must be null . 
BTF_KIND_STRUCT, BTF_KIND_UNION: the struct/member name is either null or a valid identifier . BTF_KIND_ENUM: the enum type name is either null or a valid identifier; the enumerator name must be a valid identifier. . BTF_KIND_FWD: the name must be a valid identifier . BTF_KIND_TYPEDEF: the name must be a valid identifier For those places a valid name is required, the name must be a valid C identifier. This can be relaxed later if we found use cases for a different (non-C) frontend. Fixes: 69b693f0aefa ("bpf: btf: Introduce BPF Type Format (BTF)") Acked-by: Martin KaFai Lau <kafai@fb.com> Signed-off-by: Yonghong Song <yhs@fb.com> Signed-off-by: Alexei Starovoitov <ast@kernel.org> --- kernel/bpf/btf.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 93c233ab2db6..4da543d6bea2 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1168,6 +1168,22 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* typedef type must have a valid name, and other ref types, + * volatile, const, restrict, should have a null name. + */ + if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF) { + if (!t->name_off || + !btf_name_valid_identifier(env->btf, t->name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + } else { + if (t->name_off) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + } + btf_verifier_log_type(env, t, NULL); return 0; @@ -1325,6 +1341,13 @@ static s32 btf_fwd_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* fwd type must have a valid name */ + if (!t->name_off || + !btf_name_valid_identifier(env->btf, t->name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + btf_verifier_log_type(env, t, NULL); return 0; @@ -1381,6 +1404,12 @@ static s32 btf_array_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* array type should not have a name */ + if (t->name_off) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + if (btf_type_vlen(t)) { btf_verifier_log_type(env, t, "vlen != 0"); return -EINVAL; @@ -1557,6 +1586,13 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* struct type either no name or a valid one */ + if (t->name_off && + !btf_name_valid_identifier(env->btf, t->name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + btf_verifier_log_type(env, t, NULL); last_offset = 0; @@ -1568,6 +1604,12 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* struct member either no name or a valid one */ + if (member->name_off && + !btf_name_valid_identifier(btf, member->name_off)) { + btf_verifier_log_member(env, t, member, "Invalid name"); + return -EINVAL; + } /* A member cannot be in type void */ if (!member->type || !BTF_TYPE_ID_VALID(member->type)) { btf_verifier_log_member(env, t, member, @@ -1755,6 +1797,13 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* enum type either no name or a valid one */ + if (t->name_off && + !btf_name_valid_identifier(env->btf, t->name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + btf_verifier_log_type(env, t, NULL); for (i = 0; i < nr_enums; i++) { @@ -1764,6 +1813,14 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* enum member must have a valid name */ + if 
(!enums[i].name_off || + !btf_name_valid_identifier(btf, enums[i].name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + + btf_verifier_log(env, "\t%s val=%d\n", btf_name_by_offset(btf, enums[i].name_off), enums[i].val); -- cgit From 5cf99a0f3161bc3ae2391269d134d6bf7e26f00e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" <rostedt@goodmis.org> Date: Thu, 29 Nov 2018 08:50:27 -0500 Subject: tracing/fgraph: Fix set_graph_function from showing interrupts The tracefs file set_graph_function is used to only function graph functions that are listed in that file (or all functions if the file is empty). The way this is implemented is that the function graph tracer looks at every function, and if the current depth is zero and the function matches something in the file then it will trace that function. When other functions are called, the depth will be greater than zero (because the original function will be at depth zero), and all functions will be traced where the depth is greater than zero. The issue is that when a function is first entered, and the handler that checks this logic is called, the depth is set to zero. If an interrupt comes in and a function in the interrupt handler is traced, its depth will be greater than zero and it will automatically be traced, even if the original function was not. But because the logic only looks at depth it may trace interrupts when it should not be. The recent design change of the function graph tracer to fix other bugs caused the depth to be zero while the function graph callback handler is being called for a longer time, widening the race of this happening. This bug was actually there for a longer time, but because the race window was so small it seldom happened. The Fixes tag below is for the commit that widen the race window, because that commit belongs to a series that will also help fix the original bug. Cc: stable@kernel.org Fixes: 39eb456dacb5 ("function_graph: Use new curr_ret_depth to manage depth instead of curr_ret_stack") Reported-by: Joe Lawrence <joe.lawrence@redhat.com> Tested-by: Joe Lawrence <joe.lawrence@redhat.com> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> --- kernel/trace/trace.h | 57 ++++++++++++++++++++++++++++++++++-- kernel/trace/trace_functions_graph.c | 4 +++ kernel/trace/trace_irqsoff.c | 2 ++ kernel/trace/trace_sched_wakeup.c | 2 ++ 4 files changed, 62 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3b8c0e24ab30..447bd96ee658 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -512,12 +512,44 @@ enum { * can only be modified by current, we can reuse trace_recursion. */ TRACE_IRQ_BIT, + + /* Set if the function is in the set_graph_function file */ + TRACE_GRAPH_BIT, + + /* + * In the very unlikely case that an interrupt came in + * at a start of graph tracing, and we want to trace + * the function in that interrupt, the depth can be greater + * than zero, because of the preempted start of a previous + * trace. In an even more unlikely case, depth could be 2 + * if a softirq interrupted the start of graph tracing, + * followed by an interrupt preempting a start of graph + * tracing in the softirq, and depth can even be 3 + * if an NMI came in at the start of an interrupt function + * that preempted a softirq start of a function that + * preempted normal context!!!! 
Luckily, it can't be + * greater than 3, so the next two bits are a mask + * of what the depth is when we set TRACE_GRAPH_BIT + */ + + TRACE_GRAPH_DEPTH_START_BIT, + TRACE_GRAPH_DEPTH_END_BIT, }; #define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0) #define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0) #define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit))) +#define trace_recursion_depth() \ + (((current)->trace_recursion >> TRACE_GRAPH_DEPTH_START_BIT) & 3) +#define trace_recursion_set_depth(depth) \ + do { \ + current->trace_recursion &= \ + ~(3 << TRACE_GRAPH_DEPTH_START_BIT); \ + current->trace_recursion |= \ + ((depth) & 3) << TRACE_GRAPH_DEPTH_START_BIT; \ + } while (0) + #define TRACE_CONTEXT_BITS 4 #define TRACE_FTRACE_START TRACE_FTRACE_BIT @@ -843,8 +875,9 @@ extern void __trace_graph_return(struct trace_array *tr, extern struct ftrace_hash *ftrace_graph_hash; extern struct ftrace_hash *ftrace_graph_notrace_hash; -static inline int ftrace_graph_addr(unsigned long addr) +static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace) { + unsigned long addr = trace->func; int ret = 0; preempt_disable_notrace(); @@ -855,6 +888,14 @@ static inline int ftrace_graph_addr(unsigned long addr) } if (ftrace_lookup_ip(ftrace_graph_hash, addr)) { + + /* + * This needs to be cleared on the return functions + * when the depth is zero. + */ + trace_recursion_set(TRACE_GRAPH_BIT); + trace_recursion_set_depth(trace->depth); + /* * If no irqs are to be traced, but a set_graph_function * is set, and called by an interrupt handler, we still @@ -872,6 +913,13 @@ out: return ret; } +static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace) +{ + if (trace_recursion_test(TRACE_GRAPH_BIT) && + trace->depth == trace_recursion_depth()) + trace_recursion_clear(TRACE_GRAPH_BIT); +} + static inline int ftrace_graph_notrace_addr(unsigned long addr) { int ret = 0; @@ -885,7 +933,7 @@ static inline int ftrace_graph_notrace_addr(unsigned long addr) return ret; } #else -static inline int ftrace_graph_addr(unsigned long addr) +static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace) { return 1; } @@ -894,6 +942,8 @@ static inline int ftrace_graph_notrace_addr(unsigned long addr) { return 0; } +static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace) +{ } #endif /* CONFIG_DYNAMIC_FTRACE */ extern unsigned int fgraph_max_depth; @@ -901,7 +951,8 @@ extern unsigned int fgraph_max_depth; static inline bool ftrace_graph_ignore_func(struct ftrace_graph_ent *trace) { /* trace it when it is-nested-in or is a function enabled. 
*/ - return !(trace->depth || ftrace_graph_addr(trace->func)) || + return !(trace_recursion_test(TRACE_GRAPH_BIT) || + ftrace_graph_addr(trace)) || (trace->depth < 0) || (fgraph_max_depth && trace->depth >= fgraph_max_depth); } diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 2561460d7baf..086af4f5c3e8 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -509,6 +509,8 @@ void trace_graph_return(struct ftrace_graph_ret *trace) int cpu; int pc; + ftrace_graph_addr_finish(trace); + local_irq_save(flags); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->trace_buffer.data, cpu); @@ -532,6 +534,8 @@ void set_graph_array(struct trace_array *tr) static void trace_graph_thresh_return(struct ftrace_graph_ret *trace) { + ftrace_graph_addr_finish(trace); + if (tracing_thresh && (trace->rettime - trace->calltime < tracing_thresh)) return; diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index b7357f9f82a3..98ea6d28df15 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -208,6 +208,8 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace) unsigned long flags; int pc; + ftrace_graph_addr_finish(trace); + if (!func_prolog_dec(tr, &data, &flags)) return; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index a86b303e6c67..7d04b9890755 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -270,6 +270,8 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace) unsigned long flags; int pc; + ftrace_graph_addr_finish(trace); + if (!func_prolog_preempt_disable(tr, &data, &pc)) return; -- cgit From ef1a8409348966f0b25ff97a170d6d0367710ea9 Mon Sep 17 00:00:00 2001 From: Alexander Popov <alex.popov@linux.com> Date: Tue, 13 Nov 2018 00:08:48 +0300 Subject: stackleak: Disable function tracing and kprobes for stackleak_erase() The stackleak_erase() function is called on the trampoline stack at the end of syscall. This stack is not big enough for ftrace and kprobes operations, e.g. it can be exhausted if we use kprobe_events for stackleak_erase(). So let's disable function tracing and kprobes of stackleak_erase(). 
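The combination used here is the general pattern for keeping instrumentation out of a function that runs in a fragile context; a minimal sketch of that pattern, where fragile_helper() is a made-up example and not part of this patch:

#include <linux/kprobes.h>

/* notrace suppresses the mcount/fentry profiling call, and
 * NOKPROBE_SYMBOL() blacklists the function for kprobes, keeping
 * tracer frames off a stack that cannot afford them. */
asmlinkage void notrace fragile_helper(void)
{
        /* work that must not pick up ftrace or kprobes frames */
}
NOKPROBE_SYMBOL(fragile_helper);
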
Reported-by: kernel test robot <lkp@intel.com> Fixes: 10e9ae9fabaf ("gcc-plugins: Add STACKLEAK plugin for tracking the kernel stack") Signed-off-by: Alexander Popov <alex.popov@linux.com> Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org> Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Kees Cook <keescook@chromium.org> --- kernel/stackleak.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/stackleak.c b/kernel/stackleak.c index e42892926244..08cb57eed389 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -11,6 +11,7 @@ */ #include <linux/stackleak.h> +#include <linux/kprobes.h> #ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE #include <linux/jump_label.h> @@ -47,7 +48,7 @@ int stack_erasing_sysctl(struct ctl_table *table, int write, #define skip_erasing() false #endif /* CONFIG_STACKLEAK_RUNTIME_DISABLE */ -asmlinkage void stackleak_erase(void) +asmlinkage void notrace stackleak_erase(void) { /* It would be nice not to have 'kstack_ptr' and 'boundary' on stack */ unsigned long kstack_ptr = current->lowest_stack; @@ -101,6 +102,7 @@ asmlinkage void stackleak_erase(void) /* Reset the 'lowest_stack' value for the next syscall */ current->lowest_stack = current_top_of_stack() - THREAD_SIZE/64; } +NOKPROBE_SYMBOL(stackleak_erase); void __used stackleak_track_stack(void) { -- cgit From e0c274472d5d27f277af722e017525e0b33784cd Mon Sep 17 00:00:00 2001 From: Johannes Weiner <hannes@cmpxchg.org> Date: Fri, 30 Nov 2018 14:09:58 -0800 Subject: psi: make disabling/enabling easier for vendor kernels Mel Gorman reports a hackbench regression with psi that would prohibit shipping the suse kernel with it default-enabled, but he'd still like users to be able to opt in at little to no cost to others. With the current combination of CONFIG_PSI and the psi_disabled bool set from the commandline, this is a challenge. Do the following things to make it easier: 1. Add a config option CONFIG_PSI_DEFAULT_DISABLED that allows distros to enable CONFIG_PSI in their kernel but leave the feature disabled unless a user requests it at boot-time. To avoid double negatives, rename psi_disabled= to psi=. 2. Make psi_disabled a static branch to eliminate any branch costs when the feature is disabled. 
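A generic sketch of that combination (a compile-time default, a boot-time override, and a static key so the disabled case costs only a patched-out branch) looks like the following; every name here, myfeat, CONFIG_MYFEAT_DEFAULT_ENABLED and the hook, is invented for illustration and is not part of this patch.

#include <linux/init.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/string.h>

/* Default-off key: a disabled feature costs one patched-out branch. */
DEFINE_STATIC_KEY_FALSE(myfeat_disabled);

static bool myfeat_enable = IS_ENABLED(CONFIG_MYFEAT_DEFAULT_ENABLED);

static int __init setup_myfeat(char *str)
{
        return kstrtobool(str, &myfeat_enable) == 0;
}
__setup("myfeat=", setup_myfeat);

void __init myfeat_init(void)
{
        if (!myfeat_enable) {
                static_branch_enable(&myfeat_disabled);
                return;
        }
        /* ... real initialization ... */
}

static inline void myfeat_hook(void)
{
        if (static_branch_likely(&myfeat_disabled))
                return;
        /* ... accounting work on the hot path ... */
}
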
In terms of numbers before and after this patch, Mel says: : The following is a comparision using CONFIG_PSI=n as a baseline against : your patch and a vanilla kernel : : 4.20.0-rc4 4.20.0-rc4 4.20.0-rc4 : kconfigdisable-v1r1 vanilla psidisable-v1r1 : Amean 1 1.3100 ( 0.00%) 1.3923 ( -6.28%) 1.3427 ( -2.49%) : Amean 3 3.8860 ( 0.00%) 4.1230 * -6.10%* 3.8860 ( -0.00%) : Amean 5 6.8847 ( 0.00%) 8.0390 * -16.77%* 6.7727 ( 1.63%) : Amean 7 9.9310 ( 0.00%) 10.8367 * -9.12%* 9.9910 ( -0.60%) : Amean 12 16.6577 ( 0.00%) 18.2363 * -9.48%* 17.1083 ( -2.71%) : Amean 18 26.5133 ( 0.00%) 27.8833 * -5.17%* 25.7663 ( 2.82%) : Amean 24 34.3003 ( 0.00%) 34.6830 ( -1.12%) 32.0450 ( 6.58%) : Amean 30 40.0063 ( 0.00%) 40.5800 ( -1.43%) 41.5087 ( -3.76%) : Amean 32 40.1407 ( 0.00%) 41.2273 ( -2.71%) 39.9417 ( 0.50%) : : It's showing that the vanilla kernel takes a hit (as the bisection : indicated it would) and that disabling PSI by default is reasonably : close in terms of performance for this particular workload on this : particular machine so; Link: http://lkml.kernel.org/r/20181127165329.GA29728@cmpxchg.org Signed-off-by: Johannes Weiner <hannes@cmpxchg.org> Tested-by: Mel Gorman <mgorman@techsingularity.net> Reported-by: Mel Gorman <mgorman@techsingularity.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- Documentation/admin-guide/kernel-parameters.txt | 4 ++++ include/linux/psi.h | 3 ++- init/Kconfig | 9 ++++++++ kernel/sched/psi.c | 30 +++++++++++++++++-------- kernel/sched/stats.h | 8 +++---- 5 files changed, 40 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 675170c36078..5d6ba930d4f4 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3505,6 +3505,10 @@ before loading. See Documentation/blockdev/ramdisk.txt. + psi= [KNL] Enable or disable pressure stall information + tracking. + Format: <bool> + psmouse.proto= [HW,MOUSE] Highest PS2 mouse protocol extension to probe for; one of (bare|imps|exps|lifebook|any). psmouse.rate= [HW,MOUSE] Set desired mouse report rate, in reports diff --git a/include/linux/psi.h b/include/linux/psi.h index 8e0725aac0aa..7006008d5b72 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -1,6 +1,7 @@ #ifndef _LINUX_PSI_H #define _LINUX_PSI_H +#include <linux/jump_label.h> #include <linux/psi_types.h> #include <linux/sched.h> @@ -9,7 +10,7 @@ struct css_set; #ifdef CONFIG_PSI -extern bool psi_disabled; +extern struct static_key_false psi_disabled; void psi_init(void); diff --git a/init/Kconfig b/init/Kconfig index a4112e95724a..cf5b5a0dcbc2 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -509,6 +509,15 @@ config PSI Say N if unsure. +config PSI_DEFAULT_DISABLED + bool "Require boot parameter to enable pressure stall information tracking" + default n + depends on PSI + help + If set, pressure stall information tracking will be disabled + per default but can be enabled through passing psi_enable=1 + on the kernel commandline during boot. 
+ endmenu # "CPU/Task time and stats accounting" config CPU_ISOLATION diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 3d7355d7c3e3..fe24de3fbc93 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -136,8 +136,18 @@ static int psi_bug __read_mostly; -bool psi_disabled __read_mostly; -core_param(psi_disabled, psi_disabled, bool, 0644); +DEFINE_STATIC_KEY_FALSE(psi_disabled); + +#ifdef CONFIG_PSI_DEFAULT_DISABLED +bool psi_enable; +#else +bool psi_enable = true; +#endif +static int __init setup_psi(char *str) +{ + return kstrtobool(str, &psi_enable) == 0; +} +__setup("psi=", setup_psi); /* Running averages - we need to be higher-res than loadavg */ #define PSI_FREQ (2*HZ+1) /* 2 sec intervals */ @@ -169,8 +179,10 @@ static void group_init(struct psi_group *group) void __init psi_init(void) { - if (psi_disabled) + if (!psi_enable) { + static_branch_enable(&psi_disabled); return; + } psi_period = jiffies_to_nsecs(PSI_FREQ); group_init(&psi_system); @@ -549,7 +561,7 @@ void psi_memstall_enter(unsigned long *flags) struct rq_flags rf; struct rq *rq; - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return; *flags = current->flags & PF_MEMSTALL; @@ -579,7 +591,7 @@ void psi_memstall_leave(unsigned long *flags) struct rq_flags rf; struct rq *rq; - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return; if (*flags) @@ -600,7 +612,7 @@ void psi_memstall_leave(unsigned long *flags) #ifdef CONFIG_CGROUPS int psi_cgroup_alloc(struct cgroup *cgroup) { - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return 0; cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu); @@ -612,7 +624,7 @@ int psi_cgroup_alloc(struct cgroup *cgroup) void psi_cgroup_free(struct cgroup *cgroup) { - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return; cancel_delayed_work_sync(&cgroup->psi.clock_work); @@ -637,7 +649,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) struct rq_flags rf; struct rq *rq; - if (psi_disabled) { + if (static_branch_likely(&psi_disabled)) { /* * Lame to do this here, but the scheduler cannot be locked * from the outside, so we move cgroups from inside sched/. @@ -673,7 +685,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) { int full; - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return -EOPNOTSUPP; update_stats(group); diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 4904c4677000..aa0de240fb41 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -66,7 +66,7 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup) { int clear = 0, set = TSK_RUNNING; - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return; if (!wakeup || p->sched_psi_wake_requeue) { @@ -86,7 +86,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep) { int clear = TSK_RUNNING, set = 0; - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return; if (!sleep) { @@ -102,7 +102,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep) static inline void psi_ttwu_dequeue(struct task_struct *p) { - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return; /* * Is the task being migrated during a wakeup? 
Make sure to @@ -128,7 +128,7 @@ static inline void psi_task_tick(struct rq *rq) { - if (psi_disabled) + if (static_branch_likely(&psi_disabled)) return; if (unlikely(rq->curr->flags & PF_MEMSTALL)) -- cgit From 903e8ff86753e6f327bb92166a0665e4ecb8e2e7 Mon Sep 17 00:00:00 2001 From: Anders Roxell <anders.roxell@linaro.org> Date: Fri, 30 Nov 2018 14:10:05 -0800 Subject: kernel/kcov.c: mark funcs in __sanitizer_cov_trace_pc() as notrace Since __sanitizer_cov_trace_pc() is marked as notrace, function calls in __sanitizer_cov_trace_pc() shouldn't be traced either. ftrace_graph_caller() gets called for each function that isn't marked 'notrace', like canonicalize_ip(). This is the call trace from a run: [ 139.644550] ftrace_graph_caller+0x1c/0x24 [ 139.648352] canonicalize_ip+0x18/0x28 [ 139.652313] __sanitizer_cov_trace_pc+0x14/0x58 [ 139.656184] sched_clock+0x34/0x1e8 [ 139.659759] trace_clock_local+0x40/0x88 [ 139.663722] ftrace_push_return_trace+0x8c/0x1f0 [ 139.667767] prepare_ftrace_return+0xa8/0x100 [ 139.671709] ftrace_graph_caller+0x1c/0x24 Rework so that check_kcov_mode() and canonicalize_ip(), which are called from __sanitizer_cov_trace_pc(), are also marked as notrace. Link: http://lkml.kernel.org/r/20181128081239.18317-1-anders.roxell@linaro.org Signed-off-by: Arnd Bergmann <arnd@arndb.de> Signed-off-by: Anders Roxell <anders.roxell@linaro.org> Co-developed-by: Arnd Bergmann <arnd@arndb.de> Acked-by: Steven Rostedt (VMware) <rostedt@goodmis.org> Cc: Dmitry Vyukov <dvyukov@google.com> Cc: Ingo Molnar <mingo@elte.hu> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- kernel/kcov.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 3ebd09efe72a..97959d7b77e2 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -56,7 +56,7 @@ struct kcov { struct task_struct *t; }; -static bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t) +static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t) { unsigned int mode; @@ -78,7 +78,7 @@ static bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t) return mode == needed_mode; } -static unsigned long canonicalize_ip(unsigned long ip) +static notrace unsigned long canonicalize_ip(unsigned long ip) { #ifdef CONFIG_RANDOMIZE_BASE ip -= kaslr_offset(); -- cgit From c3494801cd1785e2c25f1a5735fa19ddcf9665da Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov <ast@kernel.org> Date: Mon, 3 Dec 2018 22:46:04 -0800 Subject: bpf: check pending signals while verifying programs Malicious user space may try to force the verifier to use as much cpu time and memory as possible. Hence check for pending signals while verifying the program. Note that suspend of sys_bpf(PROG_LOAD) syscall will lead to EAGAIN, since the kernel has to release the resources used for program verification.
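From the loader's point of view, here is a hedged userspace sketch (not from the kernel tree; the function name is made up and a prepared union bpf_attr is assumed): after this change, a signal that arrives while the verifier is running can surface as EAGAIN from BPF_PROG_LOAD, so a simple bounded retry is enough.

  #include <errno.h>
  #include <unistd.h>
  #include <sys/syscall.h>
  #include <linux/bpf.h>

  /* Retry BPF_PROG_LOAD a few times if verification was interrupted. */
  static int bpf_prog_load_retry(union bpf_attr *attr)
  {
  	int fd, tries = 5;

  	do {
  		fd = syscall(__NR_bpf, BPF_PROG_LOAD, attr, sizeof(*attr));
  	} while (fd < 0 && errno == EAGAIN && --tries > 0);

  	return fd;
  }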
Reported-by: Anatoly Trosinenko <anatoly.trosinenko@gmail.com> Signed-off-by: Alexei Starovoitov <ast@kernel.org> Acked-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Edward Cree <ecree@solarflare.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> --- kernel/bpf/verifier.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6dd419550aba..751bb30b7c5c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5148,6 +5148,9 @@ static int do_check(struct bpf_verifier_env *env) goto process_bpf_exit; } + if (signal_pending(current)) + return -EAGAIN; + if (need_resched()) cond_resched(); -- cgit From 4f7b3e82589e0de723780198ec7983e427144c0a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov <ast@kernel.org> Date: Mon, 3 Dec 2018 22:46:05 -0800 Subject: bpf: improve verifier branch analysis pathological bpf programs may try to force verifier to explode in the number of branch states: 20: (d5) if r1 s<= 0x24000028 goto pc+0 21: (b5) if r0 <= 0xe1fa20 goto pc+2 22: (d5) if r1 s<= 0x7e goto pc+0 23: (b5) if r0 <= 0xe880e000 goto pc+0 24: (c5) if r0 s< 0x2100ecf4 goto pc+0 25: (d5) if r1 s<= 0xe880e000 goto pc+1 26: (c5) if r0 s< 0xf4041810 goto pc+0 27: (d5) if r1 s<= 0x1e007e goto pc+0 28: (b5) if r0 <= 0xe86be000 goto pc+0 29: (07) r0 += 16614 30: (c5) if r0 s< 0x6d0020da goto pc+0 31: (35) if r0 >= 0x2100ecf4 goto pc+0 Teach verifier to recognize always taken and always not taken branches. This analysis is already done for == and != comparison. Expand it to all other branches. It also helps real bpf programs to be verified faster: before after bpf_lb-DLB_L3.o 2003 1940 bpf_lb-DLB_L4.o 3173 3089 bpf_lb-DUNKNOWN.o 1080 1065 bpf_lxc-DDROP_ALL.o 29584 28052 bpf_lxc-DUNKNOWN.o 36916 35487 bpf_netdev.o 11188 10864 bpf_overlay.o 6679 6643 bpf_lcx_jit.o 39555 38437 Reported-by: Anatoly Trosinenko <anatoly.trosinenko@gmail.com> Signed-off-by: Alexei Starovoitov <ast@kernel.org> Acked-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Edward Cree <ecree@solarflare.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> --- kernel/bpf/verifier.c | 93 +++++++++++++++++++++++++---- tools/testing/selftests/bpf/test_verifier.c | 4 +- 2 files changed, 82 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 751bb30b7c5c..55a49703f423 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3751,6 +3751,79 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, } } +/* compute branch direction of the expression "if (reg opcode val) goto target;" + * and return: + * 1 - branch will be taken and "goto target" will be executed + * 0 - branch will not be taken and fall-through to next insn + * -1 - unknown. 
Example: "if (reg < 5)" is unknown when register value range [0,10] + */ +static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) +{ + if (__is_pointer_value(false, reg)) + return -1; + + switch (opcode) { + case BPF_JEQ: + if (tnum_is_const(reg->var_off)) + return !!tnum_equals_const(reg->var_off, val); + break; + case BPF_JNE: + if (tnum_is_const(reg->var_off)) + return !tnum_equals_const(reg->var_off, val); + break; + case BPF_JGT: + if (reg->umin_value > val) + return 1; + else if (reg->umax_value <= val) + return 0; + break; + case BPF_JSGT: + if (reg->smin_value > (s64)val) + return 1; + else if (reg->smax_value < (s64)val) + return 0; + break; + case BPF_JLT: + if (reg->umax_value < val) + return 1; + else if (reg->umin_value >= val) + return 0; + break; + case BPF_JSLT: + if (reg->smax_value < (s64)val) + return 1; + else if (reg->smin_value >= (s64)val) + return 0; + break; + case BPF_JGE: + if (reg->umin_value >= val) + return 1; + else if (reg->umax_value < val) + return 0; + break; + case BPF_JSGE: + if (reg->smin_value >= (s64)val) + return 1; + else if (reg->smax_value < (s64)val) + return 0; + break; + case BPF_JLE: + if (reg->umax_value <= val) + return 1; + else if (reg->umin_value > val) + return 0; + break; + case BPF_JSLE: + if (reg->smax_value <= (s64)val) + return 1; + else if (reg->smin_value > (s64)val) + return 0; + break; + } + + return -1; +} + /* Adjusts the register min/max values in the case that the dst_reg is the * variable register that we are working on, and src_reg is a constant or we're * simply doing a BPF_K check. @@ -4152,21 +4225,15 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, dst_reg = ®s[insn->dst_reg]; - /* detect if R == 0 where R was initialized to zero earlier */ - if (BPF_SRC(insn->code) == BPF_K && - (opcode == BPF_JEQ || opcode == BPF_JNE) && - dst_reg->type == SCALAR_VALUE && - tnum_is_const(dst_reg->var_off)) { - if ((opcode == BPF_JEQ && dst_reg->var_off.value == insn->imm) || - (opcode == BPF_JNE && dst_reg->var_off.value != insn->imm)) { - /* if (imm == imm) goto pc+off; - * only follow the goto, ignore fall-through - */ + if (BPF_SRC(insn->code) == BPF_K) { + int pred = is_branch_taken(dst_reg, insn->imm, opcode); + + if (pred == 1) { + /* only follow the goto, ignore fall-through */ *insn_idx += insn->off; return 0; - } else { - /* if (imm != imm) goto pc+off; - * only follow fall-through branch, since + } else if (pred == 0) { + /* only follow fall-through branch, since * that's where the program will go */ return 0; diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 5dd4410a716c..df6f751cc1e8 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -8576,7 +8576,7 @@ static struct bpf_test tests[] = { BPF_JMP_IMM(BPF_JA, 0, 0, -7), }, .fixup_map_hash_8b = { 4 }, - .errstr = "R0 invalid mem access 'inv'", + .errstr = "unbounded min value", .result = REJECT, }, { @@ -10547,7 +10547,7 @@ static struct bpf_test tests[] = { "check deducing bounds from const, 5", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 1), + BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 1, 1), BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, -- cgit From ceefbc96fa5c5b975d87bf8e89ba8416f6b764d9 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov <ast@kernel.org> Date: Mon, 3 Dec 2018 22:46:06 -0800 Subject: bpf: add per-insn complexity limit malicious bpf program may try to force the 
verifier to remember a lot of distinct verifier states. Put a limit on the number of per-insn 'struct bpf_verifier_state' entries. Note that hitting the limit doesn't reject the program. It potentially makes the verifier do more steps to analyze the program. It means that malicious programs will hit BPF_COMPLEXITY_LIMIT_INSNS sooner instead of spending cpu time walking a long linked list. The limit of BPF_COMPLEXITY_LIMIT_STATES==64 affects cilium progs with a slight increase in the number of "steps" it takes to successfully verify the programs: before after bpf_lb-DLB_L3.o 1940 1940 bpf_lb-DLB_L4.o 3089 3089 bpf_lb-DUNKNOWN.o 1065 1065 bpf_lxc-DDROP_ALL.o 28052 | 28162 bpf_lxc-DUNKNOWN.o 35487 | 35541 bpf_netdev.o 10864 10864 bpf_overlay.o 6643 6643 bpf_lcx_jit.o 38437 38437 But it also makes a malicious program be rejected in 0.4 seconds vs 6.5. Hence apply this limit to unprivileged programs only. Signed-off-by: Alexei Starovoitov <ast@kernel.org> Acked-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Edward Cree <ecree@solarflare.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> --- kernel/bpf/verifier.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 55a49703f423..fc760d00a38c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -175,6 +175,7 @@ struct bpf_verifier_stack_elem { #define BPF_COMPLEXITY_LIMIT_INSNS 131072 #define BPF_COMPLEXITY_LIMIT_STACK 1024 +#define BPF_COMPLEXITY_LIMIT_STATES 64 #define BPF_MAP_PTR_UNPRIV 1UL #define BPF_MAP_PTR_POISON ((void *)((0xeB9FUL << 1) + \ @@ -5047,7 +5048,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl; struct bpf_verifier_state *cur = env->cur_state, *new; - int i, j, err; + int i, j, err, states_cnt = 0; sl = env->explored_states[insn_idx]; if (!sl) @@ -5074,8 +5075,12 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return 1; } sl = sl->next; + states_cnt++; } + if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) + return 0; + /* there were no equivalent states, remember current one. * technically the current state is not proven to be safe yet, * but it will either reach outer most bpf_exit (which means it's safe) -- cgit From e9c7d656610e3fff7c96b199d418add2528aea6e Mon Sep 17 00:00:00 2001 From: Anders Roxell <anders.roxell@linaro.org> Date: Fri, 30 Nov 2018 16:08:59 +0100 Subject: stackleak: Mark stackleak_track_stack() as notrace Function graph tracing recurses into itself when stackleak is enabled, causing the ftrace graph selftest to run for up to 90 seconds and trigger the softlockup watchdog.
Breakpoint 2, ftrace_graph_caller () at ../arch/arm64/kernel/entry-ftrace.S:200 200 mcount_get_lr_addr x0 // pointer to function's saved lr (gdb) bt \#0 ftrace_graph_caller () at ../arch/arm64/kernel/entry-ftrace.S:200 \#1 0xffffff80081d5280 in ftrace_caller () at ../arch/arm64/kernel/entry-ftrace.S:153 \#2 0xffffff8008555484 in stackleak_track_stack () at ../kernel/stackleak.c:106 \#3 0xffffff8008421ff8 in ftrace_ops_test (ops=0xffffff8009eaa840 <graph_ops>, ip=18446743524091297036, regs=<optimized out>) at ../kernel/trace/ftrace.c:1507 \#4 0xffffff8008428770 in __ftrace_ops_list_func (regs=<optimized out>, ignored=<optimized out>, parent_ip=<optimized out>, ip=<optimized out>) at ../kernel/trace/ftrace.c:6286 \#5 ftrace_ops_no_ops (ip=18446743524091297036, parent_ip=18446743524091242824) at ../kernel/trace/ftrace.c:6321 \#6 0xffffff80081d5280 in ftrace_caller () at ../arch/arm64/kernel/entry-ftrace.S:153 \#7 0xffffff800832fd10 in irq_find_mapping (domain=0xffffffc03fc4bc80, hwirq=27) at ../kernel/irq/irqdomain.c:876 \#8 0xffffff800832294c in __handle_domain_irq (domain=0xffffffc03fc4bc80, hwirq=27, lookup=true, regs=0xffffff800814b840) at ../kernel/irq/irqdesc.c:650 \#9 0xffffff80081d52b4 in ftrace_graph_caller () at ../arch/arm64/kernel/entry-ftrace.S:205 Rework so we mark stackleak_track_stack as notrace Co-developed-by: Arnd Bergmann <arnd@arndb.de> Signed-off-by: Arnd Bergmann <arnd@arndb.de> Signed-off-by: Anders Roxell <anders.roxell@linaro.org> Acked-by: Steven Rostedt (VMware) <rostedt@goodmis.org> Signed-off-by: Kees Cook <keescook@chromium.org> --- kernel/stackleak.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/stackleak.c b/kernel/stackleak.c index 08cb57eed389..b193a59fc05b 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -104,7 +104,7 @@ asmlinkage void notrace stackleak_erase(void) } NOKPROBE_SYMBOL(stackleak_erase); -void __used stackleak_track_stack(void) +void __used notrace stackleak_track_stack(void) { /* * N.B. stackleak_erase() fills the kernel stack with the poison value, -- cgit From 1aed58e67a6ec1e7a18bfabe8ba6ec2d27c15636 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria <ravi.bangoria@linux.ibm.com> Date: Wed, 5 Dec 2018 09:04:23 +0530 Subject: Uprobes: Fix kernel oops with delayed_uprobe_remove() There could be a race between task exit and probe unregister: exit_mm() mmput() __mmput() uprobe_unregister() uprobe_clear_state() put_uprobe() delayed_uprobe_remove() delayed_uprobe_remove() put_uprobe() is calling delayed_uprobe_remove() without taking delayed_uprobe_lock and thus the race sometimes results in a kernel crash. Fix this by taking delayed_uprobe_lock before calling delayed_uprobe_remove() from put_uprobe(). 
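As a generalized, hypothetical sketch of the bug class (the names below are made up; this is not the uprobes code): every path that walks or edits a shared list has to take the same lock, including release paths that look like they cannot race.

  #include <linux/list.h>
  #include <linux/mutex.h>
  #include <linux/slab.h>

  static DEFINE_MUTEX(pending_lock);
  static LIST_HEAD(pending_list);

  struct pending_item {
  	struct list_head node;
  	void *owner;
  };

  /* Must be called with pending_lock held. */
  static void pending_remove(void *owner)
  {
  	struct pending_item *it, *tmp;

  	list_for_each_entry_safe(it, tmp, &pending_list, node) {
  		if (!owner || it->owner == owner) {
  			list_del(&it->node);
  			kfree(it);
  		}
  	}
  }

  /* The fix: the release path serializes like every other caller. */
  static void pending_release(void *owner)
  {
  	mutex_lock(&pending_lock);
  	pending_remove(owner);
  	mutex_unlock(&pending_lock);
  }

Before the patch, put_uprobe() was effectively the one caller of delayed_uprobe_remove() that skipped delayed_uprobe_lock, which is the window the crash report hit.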
Detailed crash log can be found at: Link: http://lkml.kernel.org/r/000000000000140c370577db5ece@google.com Link: http://lkml.kernel.org/r/20181205033423.26242-1-ravi.bangoria@linux.ibm.com Acked-by: Oleg Nesterov <oleg@redhat.com> Reviewed-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com> Reported-by: syzbot+cb1fb754b771caca0a88@syzkaller.appspotmail.com Fixes: 1cc33161a83d ("uprobes: Support SDT markers having reference count (semaphore)") Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> --- kernel/events/uprobes.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 96d4bee83489..98b9312ce6b2 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -572,7 +572,9 @@ static void put_uprobe(struct uprobe *uprobe) * gets called, we don't get a chance to remove uprobe from * delayed_uprobe_list from remove_breakpoint(). Do it here. */ + mutex_lock(&delayed_uprobe_lock); delayed_uprobe_remove(uprobe, NULL); + mutex_unlock(&delayed_uprobe_lock); kfree(uprobe); } } -- cgit