From 5f4126327494c189767ca64b222abadb07c55e3d Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 28 Apr 2018 22:28:07 -0700 Subject: bpf: change prototype for stack_map_get_build_id_offset This patch didn't incur functionality change. The function prototype got changed so that the same function can be reused later. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- kernel/bpf/stackmap.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel/bpf/stackmap.c') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 57eeb1234b67..04f6ec1679f0 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -262,16 +262,11 @@ out: return ret; } -static void stack_map_get_build_id_offset(struct bpf_map *map, - struct stack_map_bucket *bucket, +static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, u64 *ips, u32 trace_nr, bool user) { int i; struct vm_area_struct *vma; - struct bpf_stack_build_id *id_offs; - - bucket->nr = trace_nr; - id_offs = (struct bpf_stack_build_id *)bucket->data; /* * We cannot do up_read() in nmi context, so build_id lookup is @@ -361,8 +356,10 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, pcpu_freelist_pop(&smap->freelist); if (unlikely(!new_bucket)) return -ENOMEM; - stack_map_get_build_id_offset(map, new_bucket, ips, - trace_nr, user); + new_bucket->nr = trace_nr; + stack_map_get_build_id_offset( + (struct bpf_stack_build_id *)new_bucket->data, + ips, trace_nr, user); trace_len = trace_nr * sizeof(struct bpf_stack_build_id); if (hash_matches && bucket->nr == trace_nr && memcmp(bucket->data, new_bucket->data, trace_len) == 0) { -- cgit From c195651e565ae7f41a68acb7d4aa7390ad215de1 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 28 Apr 2018 22:28:08 -0700 Subject: bpf: add bpf_get_stack helper Currently, stackmap and bpf_get_stackid helper are provided for bpf program to get the stack trace. This approach has a limitation though. If two stack traces have the same hash, only one will get stored in the stackmap table, so some stack traces are missing from user perspective. This patch implements a new helper, bpf_get_stack, will send stack traces directly to bpf program. The bpf program is able to see all stack traces, and then can do in-kernel processing or send stack traces to user space through shared map or bpf_perf_event_output. Acked-by: Alexei Starovoitov Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/linux/filter.h | 3 ++- include/uapi/linux/bpf.h | 42 ++++++++++++++++++++++++++++-- kernel/bpf/core.c | 5 ++++ kernel/bpf/stackmap.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 19 ++++++++++++++ kernel/trace/bpf_trace.c | 50 +++++++++++++++++++++++++++++++++++- 7 files changed, 183 insertions(+), 4 deletions(-) (limited to 'kernel/bpf/stackmap.c') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 38ebbc61ed99..c553f6f9c6b0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -692,6 +692,7 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto; extern const struct bpf_func_proto bpf_skb_vlan_push_proto; extern const struct bpf_func_proto bpf_skb_vlan_pop_proto; extern const struct bpf_func_proto bpf_get_stackid_proto; +extern const struct bpf_func_proto bpf_get_stack_proto; extern const struct bpf_func_proto bpf_sock_map_update_proto; /* Shared helpers among cBPF and eBPF. */ diff --git a/include/linux/filter.h b/include/linux/filter.h index 4da8b2308174..64899c04c1a6 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -468,7 +468,8 @@ struct bpf_prog { dst_needed:1, /* Do we need dst entry? */ blinded:1, /* Was blinded */ is_func:1, /* program is a bpf function */ - kprobe_override:1; /* Do we override a kprobe? */ + kprobe_override:1, /* Do we override a kprobe? */ + has_callchain_buf:1; /* callchain buffer allocated? */ enum bpf_prog_type type; /* Type of BPF program */ enum bpf_attach_type expected_attach_type; /* For some prog types */ u32 len; /* Number of filter blocks */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index da77a9388947..1afb606a18b9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1767,6 +1767,40 @@ union bpf_attr { * **CONFIG_XFRM** configuration option. * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_get_stack(struct pt_regs *regs, void *buf, u32 size, u64 flags) + * Description + * Return a user or a kernel stack in bpf program provided buffer. + * To achieve this, the helper needs *ctx*, which is a pointer + * to the context on which the tracing program is executed. + * To store the stacktrace, the bpf program provides *buf* with + * a nonnegative *size*. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_USER_BUILD_ID** + * Collect buildid+offset instead of ips for user stack, + * only valid if **BPF_F_USER_STACK** is also specified. + * + * **bpf_get_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * + * Return + * a non-negative value equal to or less than size on success, or + * a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -1835,7 +1869,8 @@ union bpf_attr { FN(msg_pull_data), \ FN(bind), \ FN(xdp_adjust_tail), \ - FN(skb_get_xfrm_state), + FN(skb_get_xfrm_state), \ + FN(get_stack), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -1869,11 +1904,14 @@ enum bpf_func_id { /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ #define BPF_F_TUNINFO_IPV6 (1ULL << 0) -/* BPF_FUNC_get_stackid flags. */ +/* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */ #define BPF_F_SKIP_FIELD_MASK 0xffULL #define BPF_F_USER_STACK (1ULL << 8) +/* flags used by BPF_FUNC_get_stackid only. */ #define BPF_F_FAST_STACK_CMP (1ULL << 9) #define BPF_F_REUSE_STACKID (1ULL << 10) +/* flags used by BPF_FUNC_get_stack only. */ +#define BPF_F_USER_BUILD_ID (1ULL << 11) /* BPF_FUNC_skb_set_tunnel_key flags. */ #define BPF_F_ZERO_CSUM_TX (1ULL << 1) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ba03ec39efb3..9349a5db3cf2 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -31,6 +31,7 @@ #include #include #include +#include #include @@ -1722,6 +1723,10 @@ static void bpf_prog_free_deferred(struct work_struct *work) aux = container_of(work, struct bpf_prog_aux, work); if (bpf_prog_is_dev_bound(aux)) bpf_prog_offload_destroy(aux->prog); +#ifdef CONFIG_PERF_EVENTS + if (aux->prog->has_callchain_buf) + put_callchain_buffers(); +#endif for (i = 0; i < aux->func_cnt; i++) bpf_jit_free(aux->func[i]); if (aux->func_cnt) { diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 04f6ec1679f0..3ba102b41512 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -402,6 +402,73 @@ const struct bpf_func_proto bpf_get_stackid_proto = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, + u64, flags) +{ + u32 init_nr, trace_nr, copy_len, elem_size, num_elem; + bool user_build_id = flags & BPF_F_USER_BUILD_ID; + u32 skip = flags & BPF_F_SKIP_FIELD_MASK; + bool user = flags & BPF_F_USER_STACK; + struct perf_callchain_entry *trace; + bool kernel = !user; + int err = -EINVAL; + u64 *ips; + + if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | + BPF_F_USER_BUILD_ID))) + goto clear; + if (kernel && user_build_id) + goto clear; + + elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id) + : sizeof(u64); + if (unlikely(size % elem_size)) + goto clear; + + num_elem = size / elem_size; + if (sysctl_perf_event_max_stack < num_elem) + init_nr = 0; + else + init_nr = sysctl_perf_event_max_stack - num_elem; + trace = get_perf_callchain(regs, init_nr, kernel, user, + sysctl_perf_event_max_stack, false, false); + if (unlikely(!trace)) + goto err_fault; + + trace_nr = trace->nr - init_nr; + if (trace_nr < skip) + goto err_fault; + + trace_nr -= skip; + trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem; + copy_len = trace_nr * elem_size; + ips = trace->ip + skip + init_nr; + if (user && user_build_id) + stack_map_get_build_id_offset(buf, ips, trace_nr, user); + else + memcpy(buf, ips, copy_len); + + if (size > copy_len) + memset(buf + copy_len, 0, size - copy_len); + return copy_len; + +err_fault: + err = -EFAULT; +clear: + memset(buf, 0, size); + return err; +} + +const struct bpf_func_proto bpf_get_stack_proto = { + .func = bpf_get_stack, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + /* Called from eBPF program */ static void *stack_map_lookup_elem(struct bpf_map *map, void *key) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index eb1a596aebd3..253f6bdb9117 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "disasm.h" @@ -2450,6 +2451,24 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn if (err) return err; + if (func_id == BPF_FUNC_get_stack && !env->prog->has_callchain_buf) { + const char *err_str; + +#ifdef CONFIG_PERF_EVENTS + err = get_callchain_buffers(sysctl_perf_event_max_stack); + err_str = "cannot get callchain buffer for func %s#%d\n"; +#else + err = -ENOTSUPP; + err_str = "func %s#%d not supported without CONFIG_PERF_EVENTS\n"; +#endif + if (err) { + verbose(env, err_str, func_id_name(func_id), func_id); + return err; + } + + env->prog->has_callchain_buf = true; + } + if (changes_data) clear_all_pkt_pointers(env); return 0; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 56ba0f2a01db..46d866e9937c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -20,6 +20,7 @@ #include "trace.h" u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); +u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); /** * trace_call_bpf - invoke BPF program @@ -577,6 +578,8 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_perf_event_output_proto; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto; + case BPF_FUNC_get_stack: + return &bpf_get_stack_proto; case BPF_FUNC_perf_event_read_value: return &bpf_perf_event_read_value_proto; #ifdef CONFIG_BPF_KPROBE_OVERRIDE @@ -664,6 +667,25 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_get_stack_tp, void *, tp_buff, void *, buf, u32, size, + u64, flags) +{ + struct pt_regs *regs = *(struct pt_regs **)tp_buff; + + return bpf_get_stack((unsigned long) regs, (unsigned long) buf, + (unsigned long) size, flags, 0); +} + +static const struct bpf_func_proto bpf_get_stack_proto_tp = { + .func = bpf_get_stack_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -672,6 +694,8 @@ tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_perf_event_output_proto_tp; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_tp; + case BPF_FUNC_get_stack: + return &bpf_get_stack_proto_tp; default: return tracing_func_proto(func_id, prog); } @@ -734,6 +758,8 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_perf_event_output_proto_tp; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_tp; + case BPF_FUNC_get_stack: + return &bpf_get_stack_proto_tp; case BPF_FUNC_perf_prog_read_value: return &bpf_perf_prog_read_value_proto; default: @@ -744,7 +770,7 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) /* * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp * to avoid potential recursive reuse issue when/if tracepoints are added - * inside bpf_*_event_output and/or bpf_get_stack_id + * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack */ static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs); BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args, @@ -787,6 +813,26 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args, + void *, buf, u32, size, u64, flags) +{ + struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); + + perf_fetch_caller_regs(regs); + return bpf_get_stack((unsigned long) regs, (unsigned long) buf, + (unsigned long) size, flags, 0); +} + +static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = { + .func = bpf_get_stack_raw_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -795,6 +841,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_perf_event_output_proto_raw_tp; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_raw_tp; + case BPF_FUNC_get_stack: + return &bpf_get_stack_proto_raw_tp; default: return tracing_func_proto(func_id, prog); } -- cgit From bae77c5eb5b2107e300fb02da2311f2aa0d8ee3c Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 7 May 2018 10:50:48 -0700 Subject: bpf: enable stackmap with build_id in nmi context Currently, we cannot parse build_id in nmi context because of up_read(¤t->mm->mmap_sem), this makes stackmap with build_id less useful. This patch enables parsing build_id in nmi by putting the up_read() call in irq_work. To avoid memory allocation in nmi context, we use per cpu variable for the irq_work. As a result, only one irq_work per cpu is allowed. If the irq_work is in-use, we fallback to only report ips. Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Peter Zijlstra Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann --- init/Kconfig | 1 + kernel/bpf/stackmap.c | 59 +++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 54 insertions(+), 6 deletions(-) (limited to 'kernel/bpf/stackmap.c') diff --git a/init/Kconfig b/init/Kconfig index f013afc74b11..480a4f2713d9 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1391,6 +1391,7 @@ config BPF_SYSCALL bool "Enable bpf() system call" select ANON_INODES select BPF + select IRQ_WORK default n help Enable the bpf() system call that allows to manipulate eBPF diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 3ba102b41512..b59ace0f0f09 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "percpu_freelist.h" #define STACK_CREATE_FLAG_MASK \ @@ -32,6 +33,23 @@ struct bpf_stack_map { struct stack_map_bucket *buckets[]; }; +/* irq_work to run up_read() for build_id lookup in nmi context */ +struct stack_map_irq_work { + struct irq_work irq_work; + struct rw_semaphore *sem; +}; + +static void do_up_read(struct irq_work *entry) +{ + struct stack_map_irq_work *work; + + work = container_of(entry, struct stack_map_irq_work, irq_work); + up_read(work->sem); + work->sem = NULL; +} + +static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work); + static inline bool stack_map_use_build_id(struct bpf_map *map) { return (map->map_flags & BPF_F_STACK_BUILD_ID); @@ -267,17 +285,27 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, { int i; struct vm_area_struct *vma; + bool in_nmi_ctx = in_nmi(); + bool irq_work_busy = false; + struct stack_map_irq_work *work; + + if (in_nmi_ctx) { + work = this_cpu_ptr(&up_read_work); + if (work->irq_work.flags & IRQ_WORK_BUSY) + /* cannot queue more up_read, fallback */ + irq_work_busy = true; + } /* - * We cannot do up_read() in nmi context, so build_id lookup is - * only supported for non-nmi events. If at some point, it is - * possible to run find_vma() without taking the semaphore, we - * would like to allow build_id lookup in nmi context. + * We cannot do up_read() in nmi context. To do build_id lookup + * in nmi context, we need to run up_read() in irq_work. We use + * a percpu variable to do the irq_work. If the irq_work is + * already used by another lookup, we fall back to report ips. * * Same fallback is used for kernel stack (!user) on a stackmap * with build_id. */ - if (!user || !current || !current->mm || in_nmi() || + if (!user || !current || !current->mm || irq_work_busy || down_read_trylock(¤t->mm->mmap_sem) == 0) { /* cannot access current->mm, fall back to ips */ for (i = 0; i < trace_nr; i++) { @@ -299,7 +327,13 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, - vma->vm_start; id_offs[i].status = BPF_STACK_BUILD_ID_VALID; } - up_read(¤t->mm->mmap_sem); + + if (!in_nmi_ctx) { + up_read(¤t->mm->mmap_sem); + } else { + work->sem = ¤t->mm->mmap_sem; + irq_work_queue(&work->irq_work); + } } BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, @@ -575,3 +609,16 @@ const struct bpf_map_ops stack_map_ops = { .map_update_elem = stack_map_update_elem, .map_delete_elem = stack_map_delete_elem, }; + +static int __init stack_map_init(void) +{ + int cpu; + struct stack_map_irq_work *work; + + for_each_possible_cpu(cpu) { + work = per_cpu_ptr(&up_read_work, cpu); + init_irq_work(&work->irq_work, do_up_read); + } + return 0; +} +subsys_initcall(stack_map_init); -- cgit From dc3b8ae9d271897e09b27fa4e4e0000de98590d1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 25 May 2018 23:33:20 +0200 Subject: bpf: avoid -Wmaybe-uninitialized warning The stack_map_get_build_id_offset() function is too long for gcc to track whether 'work' may or may not be initialized at the end of it, leading to a false-positive warning: kernel/bpf/stackmap.c: In function 'stack_map_get_build_id_offset': kernel/bpf/stackmap.c:334:13: error: 'work' may be used uninitialized in this function [-Werror=maybe-uninitialized] This removes the 'in_nmi_ctx' flag and uses the state of that variable itself to see if it got initialized. Fixes: bae77c5eb5b2 ("bpf: enable stackmap with build_id in nmi context") Signed-off-by: Arnd Bergmann Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/stackmap.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel/bpf/stackmap.c') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index b59ace0f0f09..b675a3f3d141 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -285,11 +285,10 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, { int i; struct vm_area_struct *vma; - bool in_nmi_ctx = in_nmi(); bool irq_work_busy = false; - struct stack_map_irq_work *work; + struct stack_map_irq_work *work = NULL; - if (in_nmi_ctx) { + if (in_nmi()) { work = this_cpu_ptr(&up_read_work); if (work->irq_work.flags & IRQ_WORK_BUSY) /* cannot queue more up_read, fallback */ @@ -328,7 +327,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, id_offs[i].status = BPF_STACK_BUILD_ID_VALID; } - if (!in_nmi_ctx) { + if (!work) { up_read(¤t->mm->mmap_sem); } else { work->sem = ¤t->mm->mmap_sem; -- cgit