diff options
Diffstat (limited to 'kernel')
41 files changed, 3078 insertions, 1420 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 897daa005b23..e597daae6120 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -2,8 +2,10 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o +obj-$(CONFIG_BPF_SYSCALL) += disasm.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o +obj-$(CONFIG_BPF_SYSCALL) += cpumap.o ifeq ($(CONFIG_STREAM_PARSER),y) obj-$(CONFIG_BPF_SYSCALL) += sockmap.o endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 98c0f00c3f5e..68d866628be0 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -492,7 +492,7 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, ee = ERR_PTR(-EOPNOTSUPP); event = perf_file->private_data; - if (perf_event_read_local(event, &value) == -EOPNOTSUPP) + if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP) goto err_out; ee = bpf_event_entry_gen(perf_file, map_file); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 546113430049..3db5a17fcfe8 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -27,129 +27,405 @@ void cgroup_bpf_put(struct cgroup *cgrp) { unsigned int type; - for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) { - struct bpf_prog *prog = cgrp->bpf.prog[type]; - - if (prog) { - bpf_prog_put(prog); + for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { + struct list_head *progs = &cgrp->bpf.progs[type]; + struct bpf_prog_list *pl, *tmp; + + list_for_each_entry_safe(pl, tmp, progs, node) { + list_del(&pl->node); + bpf_prog_put(pl->prog); + kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key); } + bpf_prog_array_free(cgrp->bpf.effective[type]); + } +} + +/* count number of elements in the list. + * it's slow but the list cannot be long + */ +static u32 prog_list_length(struct list_head *head) +{ + struct bpf_prog_list *pl; + u32 cnt = 0; + + list_for_each_entry(pl, head, node) { + if (!pl->prog) + continue; + cnt++; } + return cnt; +} + +/* if parent has non-overridable prog attached, + * disallow attaching new programs to the descendent cgroup. + * if parent has overridable or multi-prog, allow attaching + */ +static bool hierarchy_allows_attach(struct cgroup *cgrp, + enum bpf_attach_type type, + u32 new_flags) +{ + struct cgroup *p; + + p = cgroup_parent(cgrp); + if (!p) + return true; + do { + u32 flags = p->bpf.flags[type]; + u32 cnt; + + if (flags & BPF_F_ALLOW_MULTI) + return true; + cnt = prog_list_length(&p->bpf.progs[type]); + WARN_ON_ONCE(cnt > 1); + if (cnt == 1) + return !!(flags & BPF_F_ALLOW_OVERRIDE); + p = cgroup_parent(p); + } while (p); + return true; +} + +/* compute a chain of effective programs for a given cgroup: + * start from the list of programs in this cgroup and add + * all parent programs. + * Note that parent's F_ALLOW_OVERRIDE-type program is yielding + * to programs in this cgroup + */ +static int compute_effective_progs(struct cgroup *cgrp, + enum bpf_attach_type type, + struct bpf_prog_array __rcu **array) +{ + struct bpf_prog_array __rcu *progs; + struct bpf_prog_list *pl; + struct cgroup *p = cgrp; + int cnt = 0; + + /* count number of effective programs by walking parents */ + do { + if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) + cnt += prog_list_length(&p->bpf.progs[type]); + p = cgroup_parent(p); + } while (p); + + progs = bpf_prog_array_alloc(cnt, GFP_KERNEL); + if (!progs) + return -ENOMEM; + + /* populate the array with effective progs */ + cnt = 0; + p = cgrp; + do { + if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) + list_for_each_entry(pl, + &p->bpf.progs[type], node) { + if (!pl->prog) + continue; + rcu_dereference_protected(progs, 1)-> + progs[cnt++] = pl->prog; + } + p = cgroup_parent(p); + } while (p); + + *array = progs; + return 0; +} + +static void activate_effective_progs(struct cgroup *cgrp, + enum bpf_attach_type type, + struct bpf_prog_array __rcu *array) +{ + struct bpf_prog_array __rcu *old_array; + + old_array = xchg(&cgrp->bpf.effective[type], array); + /* free prog array after grace period, since __cgroup_bpf_run_*() + * might be still walking the array + */ + bpf_prog_array_free(old_array); } /** * cgroup_bpf_inherit() - inherit effective programs from parent * @cgrp: the cgroup to modify - * @parent: the parent to inherit from */ -void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent) +int cgroup_bpf_inherit(struct cgroup *cgrp) { - unsigned int type; +/* has to use marco instead of const int, since compiler thinks + * that array below is variable length + */ +#define NR ARRAY_SIZE(cgrp->bpf.effective) + struct bpf_prog_array __rcu *arrays[NR] = {}; + int i; - for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) { - struct bpf_prog *e; + for (i = 0; i < NR; i++) + INIT_LIST_HEAD(&cgrp->bpf.progs[i]); - e = rcu_dereference_protected(parent->bpf.effective[type], - lockdep_is_held(&cgroup_mutex)); - rcu_assign_pointer(cgrp->bpf.effective[type], e); - cgrp->bpf.disallow_override[type] = parent->bpf.disallow_override[type]; - } + for (i = 0; i < NR; i++) + if (compute_effective_progs(cgrp, i, &arrays[i])) + goto cleanup; + + for (i = 0; i < NR; i++) + activate_effective_progs(cgrp, i, arrays[i]); + + return 0; +cleanup: + for (i = 0; i < NR; i++) + bpf_prog_array_free(arrays[i]); + return -ENOMEM; } +#define BPF_CGROUP_MAX_PROGS 64 + /** - * __cgroup_bpf_update() - Update the pinned program of a cgroup, and + * __cgroup_bpf_attach() - Attach the program to a cgroup, and * propagate the change to descendants * @cgrp: The cgroup which descendants to traverse - * @parent: The parent of @cgrp, or %NULL if @cgrp is the root - * @prog: A new program to pin - * @type: Type of pinning operation (ingress/egress) - * - * Each cgroup has a set of two pointers for bpf programs; one for eBPF - * programs it owns, and which is effective for execution. - * - * If @prog is not %NULL, this function attaches a new program to the cgroup - * and releases the one that is currently attached, if any. @prog is then made - * the effective program of type @type in that cgroup. - * - * If @prog is %NULL, the currently attached program of type @type is released, - * and the effective program of the parent cgroup (if any) is inherited to - * @cgrp. - * - * Then, the descendants of @cgrp are walked and the effective program for - * each of them is set to the effective program of @cgrp unless the - * descendant has its own program attached, in which case the subbranch is - * skipped. This ensures that delegated subcgroups with own programs are left - * untouched. + * @prog: A program to attach + * @type: Type of attach operation * * Must be called with cgroup_mutex held. */ -int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent, - struct bpf_prog *prog, enum bpf_attach_type type, - bool new_overridable) +int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags) { - struct bpf_prog *old_prog, *effective = NULL; - struct cgroup_subsys_state *pos; - bool overridable = true; - - if (parent) { - overridable = !parent->bpf.disallow_override[type]; - effective = rcu_dereference_protected(parent->bpf.effective[type], - lockdep_is_held(&cgroup_mutex)); - } - - if (prog && effective && !overridable) - /* if parent has non-overridable prog attached, disallow - * attaching new programs to descendent cgroup - */ + struct list_head *progs = &cgrp->bpf.progs[type]; + struct bpf_prog *old_prog = NULL; + struct cgroup_subsys_state *css; + struct bpf_prog_list *pl; + bool pl_was_allocated; + int err; + + if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) + /* invalid combination */ + return -EINVAL; + + if (!hierarchy_allows_attach(cgrp, type, flags)) return -EPERM; - if (prog && effective && overridable != new_overridable) - /* if parent has overridable prog attached, only - * allow overridable programs in descendent cgroup + if (!list_empty(progs) && cgrp->bpf.flags[type] != flags) + /* Disallow attaching non-overridable on top + * of existing overridable in this cgroup. + * Disallow attaching multi-prog if overridable or none */ return -EPERM; - old_prog = cgrp->bpf.prog[type]; - - if (prog) { - overridable = new_overridable; - effective = prog; - if (old_prog && - cgrp->bpf.disallow_override[type] == new_overridable) - /* disallow attaching non-overridable on top - * of existing overridable in this cgroup - * and vice versa - */ - return -EPERM; + if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) + return -E2BIG; + + if (flags & BPF_F_ALLOW_MULTI) { + list_for_each_entry(pl, progs, node) + if (pl->prog == prog) + /* disallow attaching the same prog twice */ + return -EINVAL; + + pl = kmalloc(sizeof(*pl), GFP_KERNEL); + if (!pl) + return -ENOMEM; + pl_was_allocated = true; + pl->prog = prog; + list_add_tail(&pl->node, progs); + } else { + if (list_empty(progs)) { + pl = kmalloc(sizeof(*pl), GFP_KERNEL); + if (!pl) + return -ENOMEM; + pl_was_allocated = true; + list_add_tail(&pl->node, progs); + } else { + pl = list_first_entry(progs, typeof(*pl), node); + old_prog = pl->prog; + pl_was_allocated = false; + } + pl->prog = prog; } - if (!prog && !old_prog) - /* report error when trying to detach and nothing is attached */ - return -ENOENT; - - cgrp->bpf.prog[type] = prog; + cgrp->bpf.flags[type] = flags; - css_for_each_descendant_pre(pos, &cgrp->self) { - struct cgroup *desc = container_of(pos, struct cgroup, self); + /* allocate and recompute effective prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); - /* skip the subtree if the descendant has its own program */ - if (desc->bpf.prog[type] && desc != cgrp) { - pos = css_rightmost_descendant(pos); - } else { - rcu_assign_pointer(desc->bpf.effective[type], - effective); - desc->bpf.disallow_override[type] = !overridable; - } + err = compute_effective_progs(desc, type, &desc->bpf.inactive); + if (err) + goto cleanup; } - if (prog) - static_branch_inc(&cgroup_bpf_enabled_key); + /* all allocations were successful. Activate all prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + activate_effective_progs(desc, type, desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + static_branch_inc(&cgroup_bpf_enabled_key); if (old_prog) { bpf_prog_put(old_prog); static_branch_dec(&cgroup_bpf_enabled_key); } return 0; + +cleanup: + /* oom while computing effective. Free all computed effective arrays + * since they were not activated + */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + bpf_prog_array_free(desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + /* and cleanup the prog list */ + pl->prog = old_prog; + if (pl_was_allocated) { + list_del(&pl->node); + kfree(pl); + } + return err; +} + +/** + * __cgroup_bpf_detach() - Detach the program from a cgroup, and + * propagate the change to descendants + * @cgrp: The cgroup which descendants to traverse + * @prog: A program to detach or NULL + * @type: Type of detach operation + * + * Must be called with cgroup_mutex held. + */ +int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 unused_flags) +{ + struct list_head *progs = &cgrp->bpf.progs[type]; + u32 flags = cgrp->bpf.flags[type]; + struct bpf_prog *old_prog = NULL; + struct cgroup_subsys_state *css; + struct bpf_prog_list *pl; + int err; + + if (flags & BPF_F_ALLOW_MULTI) { + if (!prog) + /* to detach MULTI prog the user has to specify valid FD + * of the program to be detached + */ + return -EINVAL; + } else { + if (list_empty(progs)) + /* report error when trying to detach and nothing is attached */ + return -ENOENT; + } + + if (flags & BPF_F_ALLOW_MULTI) { + /* find the prog and detach it */ + list_for_each_entry(pl, progs, node) { + if (pl->prog != prog) + continue; + old_prog = prog; + /* mark it deleted, so it's ignored while + * recomputing effective + */ + pl->prog = NULL; + break; + } + if (!old_prog) + return -ENOENT; + } else { + /* to maintain backward compatibility NONE and OVERRIDE cgroups + * allow detaching with invalid FD (prog==NULL) + */ + pl = list_first_entry(progs, typeof(*pl), node); + old_prog = pl->prog; + pl->prog = NULL; + } + + /* allocate and recompute effective prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + err = compute_effective_progs(desc, type, &desc->bpf.inactive); + if (err) + goto cleanup; + } + + /* all allocations were successful. Activate all prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + activate_effective_progs(desc, type, desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + /* now can actually delete it from this cgroup list */ + list_del(&pl->node); + kfree(pl); + if (list_empty(progs)) + /* last program was detached, reset flags to zero */ + cgrp->bpf.flags[type] = 0; + + bpf_prog_put(old_prog); + static_branch_dec(&cgroup_bpf_enabled_key); + return 0; + +cleanup: + /* oom while computing effective. Free all computed effective arrays + * since they were not activated + */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + bpf_prog_array_free(desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + /* and restore back old_prog */ + pl->prog = old_prog; + return err; +} + +/* Must be called with cgroup_mutex held to avoid races. */ +int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); + enum bpf_attach_type type = attr->query.attach_type; + struct list_head *progs = &cgrp->bpf.progs[type]; + u32 flags = cgrp->bpf.flags[type]; + int cnt, ret = 0, i; + + if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) + cnt = bpf_prog_array_length(cgrp->bpf.effective[type]); + else + cnt = prog_list_length(progs); + + if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) + return -EFAULT; + if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt))) + return -EFAULT; + if (attr->query.prog_cnt == 0 || !prog_ids || !cnt) + /* return early if user requested only program count + flags */ + return 0; + if (attr->query.prog_cnt < cnt) { + cnt = attr->query.prog_cnt; + ret = -ENOSPC; + } + + if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { + return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type], + prog_ids, cnt); + } else { + struct bpf_prog_list *pl; + u32 id; + + i = 0; + list_for_each_entry(pl, progs, node) { + id = pl->prog->aux->id; + if (copy_to_user(prog_ids + i, &id, sizeof(id))) + return -EFAULT; + if (++i == cnt) + break; + } + } + return ret; } /** @@ -171,36 +447,26 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, enum bpf_attach_type type) { - struct bpf_prog *prog; + unsigned int offset = skb->data - skb_network_header(skb); + struct sock *save_sk; struct cgroup *cgrp; - int ret = 0; + int ret; if (!sk || !sk_fullsock(sk)) return 0; - if (sk->sk_family != AF_INET && - sk->sk_family != AF_INET6) + if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) return 0; cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - - rcu_read_lock(); - - prog = rcu_dereference(cgrp->bpf.effective[type]); - if (prog) { - unsigned int offset = skb->data - skb_network_header(skb); - struct sock *save_sk = skb->sk; - - skb->sk = sk; - __skb_push(skb, offset); - ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM; - __skb_pull(skb, offset); - skb->sk = save_sk; - } - - rcu_read_unlock(); - - return ret; + save_sk = skb->sk; + skb->sk = sk; + __skb_push(skb, offset); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, + bpf_prog_run_save_cb); + __skb_pull(skb, offset); + skb->sk = save_sk; + return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); @@ -221,19 +487,10 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, enum bpf_attach_type type) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - struct bpf_prog *prog; - int ret = 0; - - - rcu_read_lock(); - - prog = rcu_dereference(cgrp->bpf.effective[type]); - if (prog) - ret = BPF_PROG_RUN(prog, sk) == 1 ? 0 : -EPERM; - - rcu_read_unlock(); + int ret; - return ret; + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN); + return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); @@ -258,18 +515,10 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, enum bpf_attach_type type) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - struct bpf_prog *prog; - int ret = 0; - - - rcu_read_lock(); + int ret; - prog = rcu_dereference(cgrp->bpf.effective[type]); - if (prog) - ret = BPF_PROG_RUN(prog, sock_ops) == 1 ? 0 : -EPERM; - - rcu_read_unlock(); - - return ret; + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops, + BPF_PROG_RUN); + return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 917cc04a0a94..8e7c8bf2b687 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -309,12 +309,25 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog, static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) { + const char *end = sym + KSYM_NAME_LEN; + BUILD_BUG_ON(sizeof("bpf_prog_") + - sizeof(prog->tag) * 2 + 1 > KSYM_NAME_LEN); + sizeof(prog->tag) * 2 + + /* name has been null terminated. + * We should need +1 for the '_' preceding + * the name. However, the null character + * is double counted between the name and the + * sizeof("bpf_prog_") above, so we omit + * the +1 here. + */ + sizeof(prog->aux->name) > KSYM_NAME_LEN); sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_"); sym = bin2hex(sym, prog->tag, sizeof(prog->tag)); - *sym = 0; + if (prog->aux->name[0]) + snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name); + else + *sym = 0; } static __always_inline unsigned long @@ -1022,7 +1035,7 @@ select_insn: struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2; struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_prog *prog; - u64 index = BPF_R3; + u32 index = BPF_R3; if (unlikely(index >= array->map.max_entries)) goto out; @@ -1381,6 +1394,75 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); +/* to avoid allocating empty bpf_prog_array for cgroups that + * don't have bpf program attached use one global 'empty_prog_array' + * It will not be modified the caller of bpf_prog_array_alloc() + * (since caller requested prog_cnt == 0) + * that pointer should be 'freed' by bpf_prog_array_free() + */ +static struct { + struct bpf_prog_array hdr; + struct bpf_prog *null_prog; +} empty_prog_array = { + .null_prog = NULL, +}; + +struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) +{ + if (prog_cnt) + return kzalloc(sizeof(struct bpf_prog_array) + + sizeof(struct bpf_prog *) * (prog_cnt + 1), + flags); + + return &empty_prog_array.hdr; +} + +void bpf_prog_array_free(struct bpf_prog_array __rcu *progs) +{ + if (!progs || + progs == (struct bpf_prog_array __rcu *)&empty_prog_array.hdr) + return; + kfree_rcu(progs, rcu); +} + +int bpf_prog_array_length(struct bpf_prog_array __rcu *progs) +{ + struct bpf_prog **prog; + u32 cnt = 0; + + rcu_read_lock(); + prog = rcu_dereference(progs)->progs; + for (; *prog; prog++) + cnt++; + rcu_read_unlock(); + return cnt; +} + +int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, + __u32 __user *prog_ids, u32 cnt) +{ + struct bpf_prog **prog; + u32 i = 0, id; + + rcu_read_lock(); + prog = rcu_dereference(progs)->progs; + for (; *prog; prog++) { + id = (*prog)->aux->id; + if (copy_to_user(prog_ids + i, &id, sizeof(id))) { + rcu_read_unlock(); + return -EFAULT; + } + if (++i == cnt) { + prog++; + break; + } + } + rcu_read_unlock(); + if (*prog) + return -ENOSPC; + return 0; +} + static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; @@ -1498,5 +1580,8 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); +/* These are only used within the BPF_SYSCALL code */ +#ifdef CONFIG_BPF_SYSCALL EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_get_type); EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_put_rcu); +#endif diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c new file mode 100644 index 000000000000..b4358d84ddf1 --- /dev/null +++ b/kernel/bpf/cpumap.c @@ -0,0 +1,702 @@ +/* bpf/cpumap.c + * + * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. + * Released under terms in GPL version 2. See COPYING. + */ + +/* The 'cpumap' is primarily used as a backend map for XDP BPF helper + * call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'. + * + * Unlike devmap which redirects XDP frames out another NIC device, + * this map type redirects raw XDP frames to another CPU. The remote + * CPU will do SKB-allocation and call the normal network stack. + * + * This is a scalability and isolation mechanism, that allow + * separating the early driver network XDP layer, from the rest of the + * netstack, and assigning dedicated CPUs for this stage. This + * basically allows for 10G wirespeed pre-filtering via bpf. + */ +#include <linux/bpf.h> +#include <linux/filter.h> +#include <linux/ptr_ring.h> + +#include <linux/sched.h> +#include <linux/workqueue.h> +#include <linux/kthread.h> +#include <linux/capability.h> +#include <trace/events/xdp.h> + +#include <linux/netdevice.h> /* netif_receive_skb_core */ +#include <linux/etherdevice.h> /* eth_type_trans */ + +/* General idea: XDP packets getting XDP redirected to another CPU, + * will maximum be stored/queued for one driver ->poll() call. It is + * guaranteed that setting flush bit and flush operation happen on + * same CPU. Thus, cpu_map_flush operation can deduct via this_cpu_ptr() + * which queue in bpf_cpu_map_entry contains packets. + */ + +#define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */ +struct xdp_bulk_queue { + void *q[CPU_MAP_BULK_SIZE]; + unsigned int count; +}; + +/* Struct for every remote "destination" CPU in map */ +struct bpf_cpu_map_entry { + u32 cpu; /* kthread CPU and map index */ + int map_id; /* Back reference to map */ + u32 qsize; /* Queue size placeholder for map lookup */ + + /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ + struct xdp_bulk_queue __percpu *bulkq; + + /* Queue with potential multi-producers, and single-consumer kthread */ + struct ptr_ring *queue; + struct task_struct *kthread; + struct work_struct kthread_stop_wq; + + atomic_t refcnt; /* Control when this struct can be free'ed */ + struct rcu_head rcu; +}; + +struct bpf_cpu_map { + struct bpf_map map; + /* Below members specific for map type */ + struct bpf_cpu_map_entry **cpu_map; + unsigned long __percpu *flush_needed; +}; + +static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, + struct xdp_bulk_queue *bq); + +static u64 cpu_map_bitmap_size(const union bpf_attr *attr) +{ + return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long); +} + +static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) +{ + struct bpf_cpu_map *cmap; + int err = -ENOMEM; + u64 cost; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->key_size != 4 || + attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) + return ERR_PTR(-EINVAL); + + cmap = kzalloc(sizeof(*cmap), GFP_USER); + if (!cmap) + return ERR_PTR(-ENOMEM); + + /* mandatory map attributes */ + cmap->map.map_type = attr->map_type; + cmap->map.key_size = attr->key_size; + cmap->map.value_size = attr->value_size; + cmap->map.max_entries = attr->max_entries; + cmap->map.map_flags = attr->map_flags; + cmap->map.numa_node = bpf_map_attr_numa_node(attr); + + /* Pre-limit array size based on NR_CPUS, not final CPU check */ + if (cmap->map.max_entries > NR_CPUS) { + err = -E2BIG; + goto free_cmap; + } + + /* make sure page count doesn't overflow */ + cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); + cost += cpu_map_bitmap_size(attr) * num_possible_cpus(); + if (cost >= U32_MAX - PAGE_SIZE) + goto free_cmap; + cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + /* Notice returns -EPERM on if map size is larger than memlock limit */ + ret = bpf_map_precharge_memlock(cmap->map.pages); + if (ret) { + err = ret; + goto free_cmap; + } + + /* A per cpu bitfield with a bit per possible CPU in map */ + cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr), + __alignof__(unsigned long)); + if (!cmap->flush_needed) + goto free_cmap; + + /* Alloc array for possible remote "destination" CPUs */ + cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * + sizeof(struct bpf_cpu_map_entry *), + cmap->map.numa_node); + if (!cmap->cpu_map) + goto free_percpu; + + return &cmap->map; +free_percpu: + free_percpu(cmap->flush_needed); +free_cmap: + kfree(cmap); + return ERR_PTR(err); +} + +void __cpu_map_queue_destructor(void *ptr) +{ + /* The tear-down procedure should have made sure that queue is + * empty. See __cpu_map_entry_replace() and work-queue + * invoked cpu_map_kthread_stop(). Catch any broken behaviour + * gracefully and warn once. + */ + if (WARN_ON_ONCE(ptr)) + page_frag_free(ptr); +} + +static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) +{ + if (atomic_dec_and_test(&rcpu->refcnt)) { + /* The queue should be empty at this point */ + ptr_ring_cleanup(rcpu->queue, __cpu_map_queue_destructor); + kfree(rcpu->queue); + kfree(rcpu); + } +} + +static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) +{ + atomic_inc(&rcpu->refcnt); +} + +/* called from workqueue, to workaround syscall using preempt_disable */ +static void cpu_map_kthread_stop(struct work_struct *work) +{ + struct bpf_cpu_map_entry *rcpu; + + rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq); + + /* Wait for flush in __cpu_map_entry_free(), via full RCU barrier, + * as it waits until all in-flight call_rcu() callbacks complete. + */ + rcu_barrier(); + + /* kthread_stop will wake_up_process and wait for it to complete */ + kthread_stop(rcpu->kthread); +} + +/* For now, xdp_pkt is a cpumap internal data structure, with info + * carried between enqueue to dequeue. It is mapped into the top + * headroom of the packet, to avoid allocating separate mem. + */ +struct xdp_pkt { + void *data; + u16 len; + u16 headroom; + u16 metasize; + struct net_device *dev_rx; +}; + +/* Convert xdp_buff to xdp_pkt */ +static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp) +{ + struct xdp_pkt *xdp_pkt; + int metasize; + int headroom; + + /* Assure headroom is available for storing info */ + headroom = xdp->data - xdp->data_hard_start; + metasize = xdp->data - xdp->data_meta; + metasize = metasize > 0 ? metasize : 0; + if ((headroom - metasize) < sizeof(*xdp_pkt)) + return NULL; + + /* Store info in top of packet */ + xdp_pkt = xdp->data_hard_start; + + xdp_pkt->data = xdp->data; + xdp_pkt->len = xdp->data_end - xdp->data; + xdp_pkt->headroom = headroom - sizeof(*xdp_pkt); + xdp_pkt->metasize = metasize; + + return xdp_pkt; +} + +struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, + struct xdp_pkt *xdp_pkt) +{ + unsigned int frame_size; + void *pkt_data_start; + struct sk_buff *skb; + + /* build_skb need to place skb_shared_info after SKB end, and + * also want to know the memory "truesize". Thus, need to + * know the memory frame size backing xdp_buff. + * + * XDP was designed to have PAGE_SIZE frames, but this + * assumption is not longer true with ixgbe and i40e. It + * would be preferred to set frame_size to 2048 or 4096 + * depending on the driver. + * frame_size = 2048; + * frame_len = frame_size - sizeof(*xdp_pkt); + * + * Instead, with info avail, skb_shared_info in placed after + * packet len. This, unfortunately fakes the truesize. + * Another disadvantage of this approach, the skb_shared_info + * is not at a fixed memory location, with mixed length + * packets, which is bad for cache-line hotness. + */ + frame_size = SKB_DATA_ALIGN(xdp_pkt->len) + xdp_pkt->headroom + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + + pkt_data_start = xdp_pkt->data - xdp_pkt->headroom; + skb = build_skb(pkt_data_start, frame_size); + if (!skb) + return NULL; + + skb_reserve(skb, xdp_pkt->headroom); + __skb_put(skb, xdp_pkt->len); + if (xdp_pkt->metasize) + skb_metadata_set(skb, xdp_pkt->metasize); + + /* Essential SKB info: protocol and skb->dev */ + skb->protocol = eth_type_trans(skb, xdp_pkt->dev_rx); + + /* Optional SKB info, currently missing: + * - HW checksum info (skb->ip_summed) + * - HW RX hash (skb_set_hash) + * - RX ring dev queue index (skb_record_rx_queue) + */ + + return skb; +} + +static int cpu_map_kthread_run(void *data) +{ + struct bpf_cpu_map_entry *rcpu = data; + + set_current_state(TASK_INTERRUPTIBLE); + + /* When kthread gives stop order, then rcpu have been disconnected + * from map, thus no new packets can enter. Remaining in-flight + * per CPU stored packets are flushed to this queue. Wait honoring + * kthread_stop signal until queue is empty. + */ + while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { + unsigned int processed = 0, drops = 0, sched = 0; + struct xdp_pkt *xdp_pkt; + + /* Release CPU reschedule checks */ + if (__ptr_ring_empty(rcpu->queue)) { + __set_current_state(TASK_INTERRUPTIBLE); + schedule(); + sched = 1; + } else { + sched = cond_resched(); + } + __set_current_state(TASK_RUNNING); + + /* Process packets in rcpu->queue */ + local_bh_disable(); + /* + * The bpf_cpu_map_entry is single consumer, with this + * kthread CPU pinned. Lockless access to ptr_ring + * consume side valid as no-resize allowed of queue. + */ + while ((xdp_pkt = __ptr_ring_consume(rcpu->queue))) { + struct sk_buff *skb; + int ret; + + skb = cpu_map_build_skb(rcpu, xdp_pkt); + if (!skb) { + page_frag_free(xdp_pkt); + continue; + } + + /* Inject into network stack */ + ret = netif_receive_skb_core(skb); + if (ret == NET_RX_DROP) + drops++; + + /* Limit BH-disable period */ + if (++processed == 8) + break; + } + /* Feedback loop via tracepoint */ + trace_xdp_cpumap_kthread(rcpu->map_id, processed, drops, sched); + + local_bh_enable(); /* resched point, may call do_softirq() */ + } + __set_current_state(TASK_RUNNING); + + put_cpu_map_entry(rcpu); + return 0; +} + +struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id) +{ + gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN; + struct bpf_cpu_map_entry *rcpu; + int numa, err; + + /* Have map->numa_node, but choose node of redirect target CPU */ + numa = cpu_to_node(cpu); + + rcpu = kzalloc_node(sizeof(*rcpu), gfp, numa); + if (!rcpu) + return NULL; + + /* Alloc percpu bulkq */ + rcpu->bulkq = __alloc_percpu_gfp(sizeof(*rcpu->bulkq), + sizeof(void *), gfp); + if (!rcpu->bulkq) + goto free_rcu; + + /* Alloc queue */ + rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa); + if (!rcpu->queue) + goto free_bulkq; + + err = ptr_ring_init(rcpu->queue, qsize, gfp); + if (err) + goto free_queue; + + rcpu->cpu = cpu; + rcpu->map_id = map_id; + rcpu->qsize = qsize; + + /* Setup kthread */ + rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa, + "cpumap/%d/map:%d", cpu, map_id); + if (IS_ERR(rcpu->kthread)) + goto free_ptr_ring; + + get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */ + get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */ + + /* Make sure kthread runs on a single CPU */ + kthread_bind(rcpu->kthread, cpu); + wake_up_process(rcpu->kthread); + + return rcpu; + +free_ptr_ring: + ptr_ring_cleanup(rcpu->queue, NULL); +free_queue: + kfree(rcpu->queue); +free_bulkq: + free_percpu(rcpu->bulkq); +free_rcu: + kfree(rcpu); + return NULL; +} + +void __cpu_map_entry_free(struct rcu_head *rcu) +{ + struct bpf_cpu_map_entry *rcpu; + int cpu; + + /* This cpu_map_entry have been disconnected from map and one + * RCU graze-period have elapsed. Thus, XDP cannot queue any + * new packets and cannot change/set flush_needed that can + * find this entry. + */ + rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu); + + /* Flush remaining packets in percpu bulkq */ + for_each_online_cpu(cpu) { + struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu); + + /* No concurrent bq_enqueue can run at this point */ + bq_flush_to_queue(rcpu, bq); + } + free_percpu(rcpu->bulkq); + /* Cannot kthread_stop() here, last put free rcpu resources */ + put_cpu_map_entry(rcpu); +} + +/* After xchg pointer to bpf_cpu_map_entry, use the call_rcu() to + * ensure any driver rcu critical sections have completed, but this + * does not guarantee a flush has happened yet. Because driver side + * rcu_read_lock/unlock only protects the running XDP program. The + * atomic xchg and NULL-ptr check in __cpu_map_flush() makes sure a + * pending flush op doesn't fail. + * + * The bpf_cpu_map_entry is still used by the kthread, and there can + * still be pending packets (in queue and percpu bulkq). A refcnt + * makes sure to last user (kthread_stop vs. call_rcu) free memory + * resources. + * + * The rcu callback __cpu_map_entry_free flush remaining packets in + * percpu bulkq to queue. Due to caller map_delete_elem() disable + * preemption, cannot call kthread_stop() to make sure queue is empty. + * Instead a work_queue is started for stopping kthread, + * cpu_map_kthread_stop, which waits for an RCU graze period before + * stopping kthread, emptying the queue. + */ +void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, + u32 key_cpu, struct bpf_cpu_map_entry *rcpu) +{ + struct bpf_cpu_map_entry *old_rcpu; + + old_rcpu = xchg(&cmap->cpu_map[key_cpu], rcpu); + if (old_rcpu) { + call_rcu(&old_rcpu->rcu, __cpu_map_entry_free); + INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop); + schedule_work(&old_rcpu->kthread_stop_wq); + } +} + +int cpu_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + u32 key_cpu = *(u32 *)key; + + if (key_cpu >= map->max_entries) + return -EINVAL; + + /* notice caller map_delete_elem() use preempt_disable() */ + __cpu_map_entry_replace(cmap, key_cpu, NULL); + return 0; +} + +int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + struct bpf_cpu_map_entry *rcpu; + + /* Array index key correspond to CPU number */ + u32 key_cpu = *(u32 *)key; + /* Value is the queue size */ + u32 qsize = *(u32 *)value; + + if (unlikely(map_flags > BPF_EXIST)) + return -EINVAL; + if (unlikely(key_cpu >= cmap->map.max_entries)) + return -E2BIG; + if (unlikely(map_flags == BPF_NOEXIST)) + return -EEXIST; + if (unlikely(qsize > 16384)) /* sanity limit on qsize */ + return -EOVERFLOW; + + /* Make sure CPU is a valid possible cpu */ + if (!cpu_possible(key_cpu)) + return -ENODEV; + + if (qsize == 0) { + rcpu = NULL; /* Same as deleting */ + } else { + /* Updating qsize cause re-allocation of bpf_cpu_map_entry */ + rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id); + if (!rcpu) + return -ENOMEM; + } + rcu_read_lock(); + __cpu_map_entry_replace(cmap, key_cpu, rcpu); + rcu_read_unlock(); + return 0; +} + +void cpu_map_free(struct bpf_map *map) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + int cpu; + u32 i; + + /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, + * so the bpf programs (can be more than one that used this map) were + * disconnected from events. Wait for outstanding critical sections in + * these programs to complete. The rcu critical section only guarantees + * no further "XDP/bpf-side" reads against bpf_cpu_map->cpu_map. + * It does __not__ ensure pending flush operations (if any) are + * complete. + */ + synchronize_rcu(); + + /* To ensure all pending flush operations have completed wait for flush + * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. + * Because the above synchronize_rcu() ensures the map is disconnected + * from the program we can assume no new bits will be set. + */ + for_each_online_cpu(cpu) { + unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu); + + while (!bitmap_empty(bitmap, cmap->map.max_entries)) + cond_resched(); + } + + /* For cpu_map the remote CPUs can still be using the entries + * (struct bpf_cpu_map_entry). + */ + for (i = 0; i < cmap->map.max_entries; i++) { + struct bpf_cpu_map_entry *rcpu; + + rcpu = READ_ONCE(cmap->cpu_map[i]); + if (!rcpu) + continue; + + /* bq flush and cleanup happens after RCU graze-period */ + __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ + } + free_percpu(cmap->flush_needed); + bpf_map_area_free(cmap->cpu_map); + kfree(cmap); +} + +struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + struct bpf_cpu_map_entry *rcpu; + + if (key >= map->max_entries) + return NULL; + + rcpu = READ_ONCE(cmap->cpu_map[key]); + return rcpu; +} + +static void *cpu_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_cpu_map_entry *rcpu = + __cpu_map_lookup_elem(map, *(u32 *)key); + + return rcpu ? &rcpu->qsize : NULL; +} + +static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + u32 index = key ? *(u32 *)key : U32_MAX; + u32 *next = next_key; + + if (index >= cmap->map.max_entries) { + *next = 0; + return 0; + } + + if (index == cmap->map.max_entries - 1) + return -ENOENT; + *next = index + 1; + return 0; +} + +const struct bpf_map_ops cpu_map_ops = { + .map_alloc = cpu_map_alloc, + .map_free = cpu_map_free, + .map_delete_elem = cpu_map_delete_elem, + .map_update_elem = cpu_map_update_elem, + .map_lookup_elem = cpu_map_lookup_elem, + .map_get_next_key = cpu_map_get_next_key, +}; + +static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, + struct xdp_bulk_queue *bq) +{ + unsigned int processed = 0, drops = 0; + const int to_cpu = rcpu->cpu; + struct ptr_ring *q; + int i; + + if (unlikely(!bq->count)) + return 0; + + q = rcpu->queue; + spin_lock(&q->producer_lock); + + for (i = 0; i < bq->count; i++) { + void *xdp_pkt = bq->q[i]; + int err; + + err = __ptr_ring_produce(q, xdp_pkt); + if (err) { + drops++; + page_frag_free(xdp_pkt); /* Free xdp_pkt */ + } + processed++; + } + bq->count = 0; + spin_unlock(&q->producer_lock); + + /* Feedback loop via tracepoints */ + trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu); + return 0; +} + +/* Runs under RCU-read-side, plus in softirq under NAPI protection. + * Thus, safe percpu variable access. + */ +static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) +{ + struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); + + if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) + bq_flush_to_queue(rcpu, bq); + + /* Notice, xdp_buff/page MUST be queued here, long enough for + * driver to code invoking us to finished, due to driver + * (e.g. ixgbe) recycle tricks based on page-refcnt. + * + * Thus, incoming xdp_pkt is always queued here (else we race + * with another CPU on page-refcnt and remaining driver code). + * Queue time is very short, as driver will invoke flush + * operation, when completing napi->poll call. + */ + bq->q[bq->count++] = xdp_pkt; + return 0; +} + +int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, + struct net_device *dev_rx) +{ + struct xdp_pkt *xdp_pkt; + + xdp_pkt = convert_to_xdp_pkt(xdp); + if (!xdp_pkt) + return -EOVERFLOW; + + /* Info needed when constructing SKB on remote CPU */ + xdp_pkt->dev_rx = dev_rx; + + bq_enqueue(rcpu, xdp_pkt); + return 0; +} + +void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); + + __set_bit(bit, bitmap); +} + +void __cpu_map_flush(struct bpf_map *map) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); + u32 bit; + + /* The napi->poll softirq makes sure __cpu_map_insert_ctx() + * and __cpu_map_flush() happen on same CPU. Thus, the percpu + * bitmap indicate which percpu bulkq have packets. + */ + for_each_set_bit(bit, bitmap, map->max_entries) { + struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]); + struct xdp_bulk_queue *bq; + + /* This is possible if entry is removed by user space + * between xdp redirect and flush op. + */ + if (unlikely(!rcpu)) + continue; + + __clear_bit(bit, bitmap); + + /* Flush all frames in bulkq to real queue */ + bq = this_cpu_ptr(rcpu->bulkq); + bq_flush_to_queue(rcpu, bq); + + /* If already running, costs spin_lock_irqsave + smb_mb */ + wake_up_process(rcpu->kthread); + } +} diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c new file mode 100644 index 000000000000..e682850c9715 --- /dev/null +++ b/kernel/bpf/disasm.c @@ -0,0 +1,214 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include <linux/bpf.h> + +#include "disasm.h" + +#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x) +static const char * const func_id_str[] = { + __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN) +}; +#undef __BPF_FUNC_STR_FN + +const char *func_id_name(int id) +{ + BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); + + if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id]) + return func_id_str[id]; + else + return "unknown"; +} + +const char *const bpf_class_string[8] = { + [BPF_LD] = "ld", + [BPF_LDX] = "ldx", + [BPF_ST] = "st", + [BPF_STX] = "stx", + [BPF_ALU] = "alu", + [BPF_JMP] = "jmp", + [BPF_RET] = "BUG", + [BPF_ALU64] = "alu64", +}; + +const char *const bpf_alu_string[16] = { + [BPF_ADD >> 4] = "+=", + [BPF_SUB >> 4] = "-=", + [BPF_MUL >> 4] = "*=", + [BPF_DIV >> 4] = "/=", + [BPF_OR >> 4] = "|=", + [BPF_AND >> 4] = "&=", + [BPF_LSH >> 4] = "<<=", + [BPF_RSH >> 4] = ">>=", + [BPF_NEG >> 4] = "neg", + [BPF_MOD >> 4] = "%=", + [BPF_XOR >> 4] = "^=", + [BPF_MOV >> 4] = "=", + [BPF_ARSH >> 4] = "s>>=", + [BPF_END >> 4] = "endian", +}; + +static const char *const bpf_ldst_string[] = { + [BPF_W >> 3] = "u32", + [BPF_H >> 3] = "u16", + [BPF_B >> 3] = "u8", + [BPF_DW >> 3] = "u64", +}; + +static const char *const bpf_jmp_string[16] = { + [BPF_JA >> 4] = "jmp", + [BPF_JEQ >> 4] = "==", + [BPF_JGT >> 4] = ">", + [BPF_JLT >> 4] = "<", + [BPF_JGE >> 4] = ">=", + [BPF_JLE >> 4] = "<=", + [BPF_JSET >> 4] = "&", + [BPF_JNE >> 4] = "!=", + [BPF_JSGT >> 4] = "s>", + [BPF_JSLT >> 4] = "s<", + [BPF_JSGE >> 4] = "s>=", + [BPF_JSLE >> 4] = "s<=", + [BPF_CALL >> 4] = "call", + [BPF_EXIT >> 4] = "exit", +}; + +static void print_bpf_end_insn(bpf_insn_print_cb verbose, + struct bpf_verifier_env *env, + const struct bpf_insn *insn) +{ + verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg, + BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le", + insn->imm, insn->dst_reg); +} + +void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, + const struct bpf_insn *insn, bool allow_ptr_leaks) +{ + u8 class = BPF_CLASS(insn->code); + + if (class == BPF_ALU || class == BPF_ALU64) { + if (BPF_OP(insn->code) == BPF_END) { + if (class == BPF_ALU64) + verbose(env, "BUG_alu64_%02x\n", insn->code); + else + print_bpf_end_insn(verbose, env, insn); + } else if (BPF_OP(insn->code) == BPF_NEG) { + verbose(env, "(%02x) r%d = %s-r%d\n", + insn->code, insn->dst_reg, + class == BPF_ALU ? "(u32) " : "", + insn->dst_reg); + } else if (BPF_SRC(insn->code) == BPF_X) { + verbose(env, "(%02x) %sr%d %s %sr%d\n", + insn->code, class == BPF_ALU ? "(u32) " : "", + insn->dst_reg, + bpf_alu_string[BPF_OP(insn->code) >> 4], + class == BPF_ALU ? "(u32) " : "", + insn->src_reg); + } else { + verbose(env, "(%02x) %sr%d %s %s%d\n", + insn->code, class == BPF_ALU ? "(u32) " : "", + insn->dst_reg, + bpf_alu_string[BPF_OP(insn->code) >> 4], + class == BPF_ALU ? "(u32) " : "", + insn->imm); + } + } else if (class == BPF_STX) { + if (BPF_MODE(insn->code) == BPF_MEM) + verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, + insn->off, insn->src_reg); + else if (BPF_MODE(insn->code) == BPF_XADD) + verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, + insn->src_reg); + else + verbose(env, "BUG_%02x\n", insn->code); + } else if (class == BPF_ST) { + if (BPF_MODE(insn->code) != BPF_MEM) { + verbose(env, "BUG_st_%02x\n", insn->code); + return; + } + verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, + insn->off, insn->imm); + } else if (class == BPF_LDX) { + if (BPF_MODE(insn->code) != BPF_MEM) { + verbose(env, "BUG_ldx_%02x\n", insn->code); + return; + } + verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n", + insn->code, insn->dst_reg, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->src_reg, insn->off); + } else if (class == BPF_LD) { + if (BPF_MODE(insn->code) == BPF_ABS) { + verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IND) { + verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->src_reg, insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IMM && + BPF_SIZE(insn->code) == BPF_DW) { + /* At this point, we already made sure that the second + * part of the ldimm64 insn is accessible. + */ + u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; + bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; + + if (map_ptr && !allow_ptr_leaks) + imm = 0; + + verbose(env, "(%02x) r%d = 0x%llx\n", insn->code, + insn->dst_reg, (unsigned long long)imm); + } else { + verbose(env, "BUG_ld_%02x\n", insn->code); + return; + } + } else if (class == BPF_JMP) { + u8 opcode = BPF_OP(insn->code); + + if (opcode == BPF_CALL) { + verbose(env, "(%02x) call %s#%d\n", insn->code, + func_id_name(insn->imm), insn->imm); + } else if (insn->code == (BPF_JMP | BPF_JA)) { + verbose(env, "(%02x) goto pc%+d\n", + insn->code, insn->off); + } else if (insn->code == (BPF_JMP | BPF_EXIT)) { + verbose(env, "(%02x) exit\n", insn->code); + } else if (BPF_SRC(insn->code) == BPF_X) { + verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n", + insn->code, insn->dst_reg, + bpf_jmp_string[BPF_OP(insn->code) >> 4], + insn->src_reg, insn->off); + } else { + verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n", + insn->code, insn->dst_reg, + bpf_jmp_string[BPF_OP(insn->code) >> 4], + insn->imm, insn->off); + } + } else { + verbose(env, "(%02x) %s\n", + insn->code, bpf_class_string[class]); + } +} diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h new file mode 100644 index 000000000000..8de977e420b6 --- /dev/null +++ b/kernel/bpf/disasm.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef __BPF_DISASM_H__ +#define __BPF_DISASM_H__ + +#include <linux/bpf.h> +#include <linux/kernel.h> +#include <linux/stringify.h> + +extern const char *const bpf_alu_string[16]; +extern const char *const bpf_class_string[8]; + +const char *func_id_name(int id); + +struct bpf_verifier_env; +typedef void (*bpf_insn_print_cb)(struct bpf_verifier_env *env, + const char *, ...); +void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, + const struct bpf_insn *insn, bool allow_ptr_leaks); + +#endif diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index e833ed914358..be1dde967208 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -363,6 +363,7 @@ out: putname(pname); return ret; } +EXPORT_SYMBOL_GPL(bpf_obj_get_user); static void bpf_evict_inode(struct inode *inode) { diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 6424ce0e4969..a298d6666698 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -102,7 +102,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) skb_orphan(skb); skb->sk = psock->sock; - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); rc = (*prog->bpf_func)(skb, prog->insnsi); skb->sk = NULL; @@ -369,7 +369,7 @@ static int smap_parse_func_strparser(struct strparser *strp, * any socket yet. */ skb->sk = psock->sock; - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); rc = (*prog->bpf_func)(skb, prog->insnsi); skb->sk = NULL; rcu_read_unlock(); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 25d074920a00..0e893cac6795 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -23,6 +23,9 @@ #include <linux/version.h> #include <linux/kernel.h> #include <linux/idr.h> +#include <linux/cred.h> +#include <linux/timekeeping.h> +#include <linux/ctype.h> #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ @@ -312,7 +315,30 @@ int bpf_map_new_fd(struct bpf_map *map) offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL -#define BPF_MAP_CREATE_LAST_FIELD numa_node +/* dst and src must have at least BPF_OBJ_NAME_LEN number of bytes. + * Return 0 on success and < 0 on error. + */ +static int bpf_obj_name_cpy(char *dst, const char *src) +{ + const char *end = src + BPF_OBJ_NAME_LEN; + + memset(dst, 0, BPF_OBJ_NAME_LEN); + + /* Copy all isalnum() and '_' char */ + while (src < end && *src) { + if (!isalnum(*src) && *src != '_') + return -EINVAL; + *dst++ = *src++; + } + + /* No '\0' found in BPF_OBJ_NAME_LEN number of bytes */ + if (src == end) + return -EINVAL; + + return 0; +} + +#define BPF_MAP_CREATE_LAST_FIELD map_name /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -334,6 +360,10 @@ static int map_create(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + err = bpf_obj_name_cpy(map->name, attr->map_name); + if (err) + goto free_map_nouncharge; + atomic_set(&map->refcnt, 1); atomic_set(&map->usercnt, 1); @@ -562,6 +592,12 @@ static int map_update_elem(union bpf_attr *attr) if (copy_from_user(value, uvalue, value_size) != 0) goto free_value; + /* Need to create a kthread, thus must support schedule */ + if (map->map_type == BPF_MAP_TYPE_CPUMAP) { + err = map->ops->map_update_elem(map, key, value, attr->flags); + goto out; + } + /* must increment bpf_prog_active to avoid kprobe+bpf triggering from * inside bpf map update or delete otherwise deadlocks are possible */ @@ -592,7 +628,7 @@ static int map_update_elem(union bpf_attr *attr) } __this_cpu_dec(bpf_prog_active); preempt_enable(); - +out: if (!err) trace_bpf_map_update_elem(map, ufd, key, value); free_value: @@ -703,9 +739,9 @@ err_put: return err; } -static const struct bpf_verifier_ops * const bpf_prog_types[] = { -#define BPF_PROG_TYPE(_id, _ops) \ - [_id] = &_ops, +static const struct bpf_prog_ops * const bpf_prog_types[] = { +#define BPF_PROG_TYPE(_id, _name) \ + [_id] = & _name ## _prog_ops, #define BPF_MAP_TYPE(_id, _ops) #include <linux/bpf_types.h> #undef BPF_PROG_TYPE @@ -973,7 +1009,7 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) EXPORT_SYMBOL_GPL(bpf_prog_get_type); /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD prog_flags +#define BPF_PROG_LOAD_LAST_FIELD prog_name static int bpf_prog_load(union bpf_attr *attr) { @@ -1037,6 +1073,11 @@ static int bpf_prog_load(union bpf_attr *attr) if (err < 0) goto free_prog; + prog->aux->load_time = ktime_get_boot_ns(); + err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name); + if (err) + goto free_prog; + /* run eBPF verifier */ err = bpf_check(&prog, attr); if (err < 0) @@ -1132,6 +1173,9 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach) return 0; } +#define BPF_F_ATTACH_MASK \ + (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI) + static int bpf_prog_attach(const union bpf_attr *attr) { enum bpf_prog_type ptype; @@ -1145,7 +1189,7 @@ static int bpf_prog_attach(const union bpf_attr *attr) if (CHECK_ATTR(BPF_PROG_ATTACH)) return -EINVAL; - if (attr->attach_flags & ~BPF_F_ALLOW_OVERRIDE) + if (attr->attach_flags & ~BPF_F_ATTACH_MASK) return -EINVAL; switch (attr->attach_type) { @@ -1176,8 +1220,8 @@ static int bpf_prog_attach(const union bpf_attr *attr) return PTR_ERR(cgrp); } - ret = cgroup_bpf_update(cgrp, prog, attr->attach_type, - attr->attach_flags & BPF_F_ALLOW_OVERRIDE); + ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type, + attr->attach_flags); if (ret) bpf_prog_put(prog); cgroup_put(cgrp); @@ -1189,6 +1233,8 @@ static int bpf_prog_attach(const union bpf_attr *attr) static int bpf_prog_detach(const union bpf_attr *attr) { + enum bpf_prog_type ptype; + struct bpf_prog *prog; struct cgroup *cgrp; int ret; @@ -1201,26 +1247,67 @@ static int bpf_prog_detach(const union bpf_attr *attr) switch (attr->attach_type) { case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: + ptype = BPF_PROG_TYPE_CGROUP_SKB; + break; case BPF_CGROUP_INET_SOCK_CREATE: + ptype = BPF_PROG_TYPE_CGROUP_SOCK; + break; case BPF_CGROUP_SOCK_OPS: - cgrp = cgroup_get_from_fd(attr->target_fd); - if (IS_ERR(cgrp)) - return PTR_ERR(cgrp); - - ret = cgroup_bpf_update(cgrp, NULL, attr->attach_type, false); - cgroup_put(cgrp); + ptype = BPF_PROG_TYPE_SOCK_OPS; break; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - ret = sockmap_get_from_fd(attr, false); - break; + return sockmap_get_from_fd(attr, false); default: return -EINVAL; } + cgrp = cgroup_get_from_fd(attr->target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); + if (IS_ERR(prog)) + prog = NULL; + + ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0); + if (prog) + bpf_prog_put(prog); + cgroup_put(cgrp); return ret; } +#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt + +static int bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct cgroup *cgrp; + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (CHECK_ATTR(BPF_PROG_QUERY)) + return -EINVAL; + if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE) + return -EINVAL; + + switch (attr->query.attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_SOCK_OPS: + break; + default: + return -EINVAL; + } + cgrp = cgroup_get_from_fd(attr->query.target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + ret = cgroup_bpf_query(cgrp, attr, uattr); + cgroup_put(cgrp); + return ret; +} #endif /* CONFIG_CGROUP_BPF */ #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration @@ -1358,8 +1445,25 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info.type = prog->type; info.id = prog->aux->id; + info.load_time = prog->aux->load_time; + info.created_by_uid = from_kuid_munged(current_user_ns(), + prog->aux->user->uid); memcpy(info.tag, prog->tag, sizeof(prog->tag)); + memcpy(info.name, prog->aux->name, sizeof(prog->aux->name)); + + ulen = info.nr_map_ids; + info.nr_map_ids = prog->aux->used_map_cnt; + ulen = min_t(u32, info.nr_map_ids, ulen); + if (ulen) { + u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids); + u32 i; + + for (i = 0; i < ulen; i++) + if (put_user(prog->aux->used_maps[i]->id, + &user_map_ids[i])) + return -EFAULT; + } if (!capable(CAP_SYS_ADMIN)) { info.jited_prog_len = 0; @@ -1413,6 +1517,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, info.value_size = map->value_size; info.max_entries = map->max_entries; info.map_flags = map->map_flags; + memcpy(info.name, map->name, sizeof(map->name)); if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) @@ -1499,6 +1604,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_PROG_DETACH: err = bpf_prog_detach(&attr); break; + case BPF_PROG_QUERY: + err = bpf_prog_query(&attr, uattr); + break; #endif case BPF_PROG_TEST_RUN: err = bpf_prog_test_run(&attr, uattr); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b914fbe1383e..545b8c45a578 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21,6 +21,17 @@ #include <linux/vmalloc.h> #include <linux/stringify.h> +#include "disasm.h" + +static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { +#define BPF_PROG_TYPE(_id, _name) \ + [_id] = & _name ## _verifier_ops, +#define BPF_MAP_TYPE(_id, _ops) +#include <linux/bpf_types.h> +#undef BPF_PROG_TYPE +#undef BPF_MAP_TYPE +}; + /* bpf_check() is a static code analyzer that walks eBPF program * instruction by instruction and updates register/stack state. * All paths of conditional branches are analyzed until 'bpf_exit' insn. @@ -153,28 +164,42 @@ struct bpf_call_arg_meta { int access_size; }; -/* verbose verifier prints what it's seeing - * bpf_check() is called under lock, so no race to access these global vars - */ -static u32 log_level, log_size, log_len; -static char *log_buf; - static DEFINE_MUTEX(bpf_verifier_lock); /* log_level controls verbosity level of eBPF verifier. * verbose() is used to dump the verification trace to the log, so the user * can figure out what's wrong with the program */ -static __printf(1, 2) void verbose(const char *fmt, ...) +static __printf(2, 3) void verbose(struct bpf_verifier_env *env, + const char *fmt, ...) { + struct bpf_verifer_log *log = &env->log; + unsigned int n; va_list args; - if (log_level == 0 || log_len >= log_size - 1) + if (!log->level || !log->ubuf || bpf_verifier_log_full(log)) return; va_start(args, fmt); - log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args); + n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args); va_end(args); + + WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1, + "verifier log line truncated - local buffer too short\n"); + + n = min(log->len_total - log->len_used - 1, n); + log->kbuf[n] = '\0'; + + if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1)) + log->len_used += n; + else + log->ubuf = NULL; +} + +static bool type_is_pkt_pointer(enum bpf_reg_type type) +{ + return type == PTR_TO_PACKET || + type == PTR_TO_PACKET_META; } /* string representation of 'enum bpf_reg_type' */ @@ -187,26 +212,12 @@ static const char * const reg_type_str[] = { [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", [PTR_TO_STACK] = "fp", [PTR_TO_PACKET] = "pkt", + [PTR_TO_PACKET_META] = "pkt_meta", [PTR_TO_PACKET_END] = "pkt_end", }; -#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x) -static const char * const func_id_str[] = { - __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN) -}; -#undef __BPF_FUNC_STR_FN - -static const char *func_id_name(int id) -{ - BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); - - if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id]) - return func_id_str[id]; - else - return "unknown"; -} - -static void print_verifier_state(struct bpf_verifier_state *state) +static void print_verifier_state(struct bpf_verifier_env *env, + struct bpf_verifier_state *state) { struct bpf_reg_state *reg; enum bpf_reg_type t; @@ -217,21 +228,21 @@ static void print_verifier_state(struct bpf_verifier_state *state) t = reg->type; if (t == NOT_INIT) continue; - verbose(" R%d=%s", i, reg_type_str[t]); + verbose(env, " R%d=%s", i, reg_type_str[t]); if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && tnum_is_const(reg->var_off)) { /* reg->off should be 0 for SCALAR_VALUE */ - verbose("%lld", reg->var_off.value + reg->off); + verbose(env, "%lld", reg->var_off.value + reg->off); } else { - verbose("(id=%d", reg->id); + verbose(env, "(id=%d", reg->id); if (t != SCALAR_VALUE) - verbose(",off=%d", reg->off); - if (t == PTR_TO_PACKET) - verbose(",r=%d", reg->range); + verbose(env, ",off=%d", reg->off); + if (type_is_pkt_pointer(t)) + verbose(env, ",r=%d", reg->range); else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || t == PTR_TO_MAP_VALUE_OR_NULL) - verbose(",ks=%d,vs=%d", + verbose(env, ",ks=%d,vs=%d", reg->map_ptr->key_size, reg->map_ptr->value_size); if (tnum_is_const(reg->var_off)) { @@ -239,199 +250,38 @@ static void print_verifier_state(struct bpf_verifier_state *state) * could be a pointer whose offset is too big * for reg->off */ - verbose(",imm=%llx", reg->var_off.value); + verbose(env, ",imm=%llx", reg->var_off.value); } else { if (reg->smin_value != reg->umin_value && reg->smin_value != S64_MIN) - verbose(",smin_value=%lld", + verbose(env, ",smin_value=%lld", (long long)reg->smin_value); if (reg->smax_value != reg->umax_value && reg->smax_value != S64_MAX) - verbose(",smax_value=%lld", + verbose(env, ",smax_value=%lld", (long long)reg->smax_value); if (reg->umin_value != 0) - verbose(",umin_value=%llu", + verbose(env, ",umin_value=%llu", (unsigned long long)reg->umin_value); if (reg->umax_value != U64_MAX) - verbose(",umax_value=%llu", + verbose(env, ",umax_value=%llu", (unsigned long long)reg->umax_value); if (!tnum_is_unknown(reg->var_off)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(",var_off=%s", tn_buf); + verbose(env, ",var_off=%s", tn_buf); } } - verbose(")"); + verbose(env, ")"); } } for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] == STACK_SPILL) - verbose(" fp%d=%s", -MAX_BPF_STACK + i, + verbose(env, " fp%d=%s", -MAX_BPF_STACK + i, reg_type_str[state->spilled_regs[i / BPF_REG_SIZE].type]); } - verbose("\n"); -} - -static const char *const bpf_class_string[] = { - [BPF_LD] = "ld", - [BPF_LDX] = "ldx", - [BPF_ST] = "st", - [BPF_STX] = "stx", - [BPF_ALU] = "alu", - [BPF_JMP] = "jmp", - [BPF_RET] = "BUG", - [BPF_ALU64] = "alu64", -}; - -static const char *const bpf_alu_string[16] = { - [BPF_ADD >> 4] = "+=", - [BPF_SUB >> 4] = "-=", - [BPF_MUL >> 4] = "*=", - [BPF_DIV >> 4] = "/=", - [BPF_OR >> 4] = "|=", - [BPF_AND >> 4] = "&=", - [BPF_LSH >> 4] = "<<=", - [BPF_RSH >> 4] = ">>=", - [BPF_NEG >> 4] = "neg", - [BPF_MOD >> 4] = "%=", - [BPF_XOR >> 4] = "^=", - [BPF_MOV >> 4] = "=", - [BPF_ARSH >> 4] = "s>>=", - [BPF_END >> 4] = "endian", -}; - -static const char *const bpf_ldst_string[] = { - [BPF_W >> 3] = "u32", - [BPF_H >> 3] = "u16", - [BPF_B >> 3] = "u8", - [BPF_DW >> 3] = "u64", -}; - -static const char *const bpf_jmp_string[16] = { - [BPF_JA >> 4] = "jmp", - [BPF_JEQ >> 4] = "==", - [BPF_JGT >> 4] = ">", - [BPF_JLT >> 4] = "<", - [BPF_JGE >> 4] = ">=", - [BPF_JLE >> 4] = "<=", - [BPF_JSET >> 4] = "&", - [BPF_JNE >> 4] = "!=", - [BPF_JSGT >> 4] = "s>", - [BPF_JSLT >> 4] = "s<", - [BPF_JSGE >> 4] = "s>=", - [BPF_JSLE >> 4] = "s<=", - [BPF_CALL >> 4] = "call", - [BPF_EXIT >> 4] = "exit", -}; - -static void print_bpf_insn(const struct bpf_verifier_env *env, - const struct bpf_insn *insn) -{ - u8 class = BPF_CLASS(insn->code); - - if (class == BPF_ALU || class == BPF_ALU64) { - if (BPF_SRC(insn->code) == BPF_X) - verbose("(%02x) %sr%d %s %sr%d\n", - insn->code, class == BPF_ALU ? "(u32) " : "", - insn->dst_reg, - bpf_alu_string[BPF_OP(insn->code) >> 4], - class == BPF_ALU ? "(u32) " : "", - insn->src_reg); - else - verbose("(%02x) %sr%d %s %s%d\n", - insn->code, class == BPF_ALU ? "(u32) " : "", - insn->dst_reg, - bpf_alu_string[BPF_OP(insn->code) >> 4], - class == BPF_ALU ? "(u32) " : "", - insn->imm); - } else if (class == BPF_STX) { - if (BPF_MODE(insn->code) == BPF_MEM) - verbose("(%02x) *(%s *)(r%d %+d) = r%d\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->dst_reg, - insn->off, insn->src_reg); - else if (BPF_MODE(insn->code) == BPF_XADD) - verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->dst_reg, insn->off, - insn->src_reg); - else - verbose("BUG_%02x\n", insn->code); - } else if (class == BPF_ST) { - if (BPF_MODE(insn->code) != BPF_MEM) { - verbose("BUG_st_%02x\n", insn->code); - return; - } - verbose("(%02x) *(%s *)(r%d %+d) = %d\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->dst_reg, - insn->off, insn->imm); - } else if (class == BPF_LDX) { - if (BPF_MODE(insn->code) != BPF_MEM) { - verbose("BUG_ldx_%02x\n", insn->code); - return; - } - verbose("(%02x) r%d = *(%s *)(r%d %+d)\n", - insn->code, insn->dst_reg, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->src_reg, insn->off); - } else if (class == BPF_LD) { - if (BPF_MODE(insn->code) == BPF_ABS) { - verbose("(%02x) r0 = *(%s *)skb[%d]\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->imm); - } else if (BPF_MODE(insn->code) == BPF_IND) { - verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->src_reg, insn->imm); - } else if (BPF_MODE(insn->code) == BPF_IMM && - BPF_SIZE(insn->code) == BPF_DW) { - /* At this point, we already made sure that the second - * part of the ldimm64 insn is accessible. - */ - u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; - bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; - - if (map_ptr && !env->allow_ptr_leaks) - imm = 0; - - verbose("(%02x) r%d = 0x%llx\n", insn->code, - insn->dst_reg, (unsigned long long)imm); - } else { - verbose("BUG_ld_%02x\n", insn->code); - return; - } - } else if (class == BPF_JMP) { - u8 opcode = BPF_OP(insn->code); - - if (opcode == BPF_CALL) { - verbose("(%02x) call %s#%d\n", insn->code, - func_id_name(insn->imm), insn->imm); - } else if (insn->code == (BPF_JMP | BPF_JA)) { - verbose("(%02x) goto pc%+d\n", - insn->code, insn->off); - } else if (insn->code == (BPF_JMP | BPF_EXIT)) { - verbose("(%02x) exit\n", insn->code); - } else if (BPF_SRC(insn->code) == BPF_X) { - verbose("(%02x) if r%d %s r%d goto pc%+d\n", - insn->code, insn->dst_reg, - bpf_jmp_string[BPF_OP(insn->code) >> 4], - insn->src_reg, insn->off); - } else { - verbose("(%02x) if r%d %s 0x%x goto pc%+d\n", - insn->code, insn->dst_reg, - bpf_jmp_string[BPF_OP(insn->code) >> 4], - insn->imm, insn->off); - } - } else { - verbose("(%02x) %s\n", insn->code, bpf_class_string[class]); - } + verbose(env, "\n"); } static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx) @@ -469,7 +319,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, env->head = elem; env->stack_size++; if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) { - verbose("BPF program is too complex\n"); + verbose(env, "BPF program is too complex\n"); goto err; } return &elem->st; @@ -507,10 +357,11 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg) __mark_reg_known(reg, 0); } -static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno) +static void mark_reg_known_zero(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, u32 regno) { if (WARN_ON(regno >= MAX_BPF_REG)) { - verbose("mark_reg_known_zero(regs, %u)\n", regno); + verbose(env, "mark_reg_known_zero(regs, %u)\n", regno); /* Something bad happened, let's kill all regs */ for (regno = 0; regno < MAX_BPF_REG; regno++) __mark_reg_not_init(regs + regno); @@ -519,6 +370,31 @@ static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno) __mark_reg_known_zero(regs + regno); } +static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg) +{ + return type_is_pkt_pointer(reg->type); +} + +static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg) +{ + return reg_is_pkt_pointer(reg) || + reg->type == PTR_TO_PACKET_END; +} + +/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */ +static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg, + enum bpf_reg_type which) +{ + /* The register can already have a range from prior markings. + * This is fine as long as it hasn't been advanced from its + * origin. + */ + return reg->type == which && + reg->id == 0 && + reg->off == 0 && + tnum_equals_const(reg->var_off, 0); +} + /* Attempts to improve min/max values based on var_off information */ static void __update_reg_bounds(struct bpf_reg_state *reg) { @@ -595,10 +471,11 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg) __mark_reg_unbounded(reg); } -static void mark_reg_unknown(struct bpf_reg_state *regs, u32 regno) +static void mark_reg_unknown(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, u32 regno) { if (WARN_ON(regno >= MAX_BPF_REG)) { - verbose("mark_reg_unknown(regs, %u)\n", regno); + verbose(env, "mark_reg_unknown(regs, %u)\n", regno); /* Something bad happened, let's kill all regs */ for (regno = 0; regno < MAX_BPF_REG; regno++) __mark_reg_not_init(regs + regno); @@ -613,10 +490,11 @@ static void __mark_reg_not_init(struct bpf_reg_state *reg) reg->type = NOT_INIT; } -static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno) +static void mark_reg_not_init(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, u32 regno) { if (WARN_ON(regno >= MAX_BPF_REG)) { - verbose("mark_reg_not_init(regs, %u)\n", regno); + verbose(env, "mark_reg_not_init(regs, %u)\n", regno); /* Something bad happened, let's kill all regs */ for (regno = 0; regno < MAX_BPF_REG; regno++) __mark_reg_not_init(regs + regno); @@ -625,22 +503,23 @@ static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno) __mark_reg_not_init(regs + regno); } -static void init_reg_state(struct bpf_reg_state *regs) +static void init_reg_state(struct bpf_verifier_env *env, + struct bpf_reg_state *regs) { int i; for (i = 0; i < MAX_BPF_REG; i++) { - mark_reg_not_init(regs, i); + mark_reg_not_init(env, regs, i); regs[i].live = REG_LIVE_NONE; } /* frame pointer */ regs[BPF_REG_FP].type = PTR_TO_STACK; - mark_reg_known_zero(regs, BPF_REG_FP); + mark_reg_known_zero(env, regs, BPF_REG_FP); /* 1st arg to a function */ regs[BPF_REG_1].type = PTR_TO_CTX; - mark_reg_known_zero(regs, BPF_REG_1); + mark_reg_known_zero(env, regs, BPF_REG_1); } enum reg_arg_type { @@ -653,6 +532,10 @@ static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno) { struct bpf_verifier_state *parent = state->parent; + if (regno == BPF_REG_FP) + /* We don't need to worry about FP liveness because it's read-only */ + return; + while (parent) { /* if read wasn't screened by an earlier write ... */ if (state->regs[regno].live & REG_LIVE_WRITTEN) @@ -670,26 +553,26 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, struct bpf_reg_state *regs = env->cur_state.regs; if (regno >= MAX_BPF_REG) { - verbose("R%d is invalid\n", regno); + verbose(env, "R%d is invalid\n", regno); return -EINVAL; } if (t == SRC_OP) { /* check whether register used as source operand can be read */ if (regs[regno].type == NOT_INIT) { - verbose("R%d !read_ok\n", regno); + verbose(env, "R%d !read_ok\n", regno); return -EACCES; } mark_reg_read(&env->cur_state, regno); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { - verbose("frame pointer is read only\n"); + verbose(env, "frame pointer is read only\n"); return -EACCES; } regs[regno].live |= REG_LIVE_WRITTEN; if (t == DST_OP) - mark_reg_unknown(regs, regno); + mark_reg_unknown(env, regs, regno); } return 0; } @@ -702,6 +585,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_STACK: case PTR_TO_CTX: case PTR_TO_PACKET: + case PTR_TO_PACKET_META: case PTR_TO_PACKET_END: case CONST_PTR_TO_MAP: return true; @@ -713,7 +597,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) /* check_stack_read/write functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ -static int check_stack_write(struct bpf_verifier_state *state, int off, +static int check_stack_write(struct bpf_verifier_env *env, + struct bpf_verifier_state *state, int off, int size, int value_regno) { int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE; @@ -726,7 +611,7 @@ static int check_stack_write(struct bpf_verifier_state *state, int off, /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { - verbose("invalid size of register spill\n"); + verbose(env, "invalid size of register spill\n"); return -EACCES; } @@ -761,7 +646,8 @@ static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slo } } -static int check_stack_read(struct bpf_verifier_state *state, int off, int size, +static int check_stack_read(struct bpf_verifier_env *env, + struct bpf_verifier_state *state, int off, int size, int value_regno) { u8 *slot_type; @@ -771,12 +657,12 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size, if (slot_type[0] == STACK_SPILL) { if (size != BPF_REG_SIZE) { - verbose("invalid size of register spill\n"); + verbose(env, "invalid size of register spill\n"); return -EACCES; } for (i = 1; i < BPF_REG_SIZE; i++) { if (slot_type[i] != STACK_SPILL) { - verbose("corrupted spill memory\n"); + verbose(env, "corrupted spill memory\n"); return -EACCES; } } @@ -792,14 +678,14 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size, } else { for (i = 0; i < size; i++) { if (slot_type[i] != STACK_MISC) { - verbose("invalid read from stack off %d+%d size %d\n", + verbose(env, "invalid read from stack off %d+%d size %d\n", off, i, size); return -EACCES; } } if (value_regno >= 0) /* have read misc data from the stack */ - mark_reg_unknown(state->regs, value_regno); + mark_reg_unknown(env, state->regs, value_regno); return 0; } } @@ -811,7 +697,7 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, struct bpf_map *map = env->cur_state.regs[regno].map_ptr; if (off < 0 || size <= 0 || off + size > map->value_size) { - verbose("invalid access to map value, value_size=%d off=%d size=%d\n", + verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n", map->value_size, off, size); return -EACCES; } @@ -830,8 +716,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * need to try adding each of min_value and max_value to off * to make sure our theoretical access will be safe. */ - if (log_level) - print_verifier_state(state); + if (env->log.level) + print_verifier_state(env, state); /* The minimum value is only important with signed * comparisons where we can't assume the floor of a * value is 0. If we are using signed variables for our @@ -839,13 +725,14 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * will have a set floor within our range. */ if (reg->smin_value < 0) { - verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", + verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", regno); return -EACCES; } err = __check_map_access(env, regno, reg->smin_value + off, size); if (err) { - verbose("R%d min value is outside of the array range\n", regno); + verbose(env, "R%d min value is outside of the array range\n", + regno); return err; } @@ -854,13 +741,14 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * If reg->umax_value + off could overflow, treat that as unbounded too. */ if (reg->umax_value >= BPF_MAX_VAR_OFF) { - verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n", + verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n", regno); return -EACCES; } err = __check_map_access(env, regno, reg->umax_value + off, size); if (err) - verbose("R%d max value is outside of the array range\n", regno); + verbose(env, "R%d max value is outside of the array range\n", + regno); return err; } @@ -899,7 +787,7 @@ static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, struct bpf_reg_state *reg = ®s[regno]; if (off < 0 || size <= 0 || (u64)off + size > reg->range) { - verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", + verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", off, size, regno, reg->id, reg->off, reg->range); return -EACCES; } @@ -922,13 +810,13 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, * detail to prove they're safe. */ if (reg->smin_value < 0) { - verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", + verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", regno); return -EACCES; } err = __check_packet_access(env, regno, off, size); if (err) { - verbose("R%d offset is outside of the packet\n", regno); + verbose(env, "R%d offset is outside of the packet\n", regno); return err; } return err; @@ -942,12 +830,8 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, .reg_type = *reg_type, }; - /* for analyzer ctx accesses are already validated and converted */ - if (env->analyzer_ops) - return 0; - - if (env->prog->aux->ops->is_valid_access && - env->prog->aux->ops->is_valid_access(off, size, t, &info)) { + if (env->ops->is_valid_access && + env->ops->is_valid_access(off, size, t, &info)) { /* A non zero info.ctx_field_size indicates that this field is a * candidate for later verifier transformation to load the whole * field and then apply a mask when accessed with a narrower @@ -955,16 +839,19 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, * will only allow for whole field access and rejects any other * type of narrower access. */ - env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; *reg_type = info.reg_type; + if (env->analyzer_ops) + return 0; + + env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; /* remember the offset of last byte accessed in ctx */ if (env->prog->aux->max_ctx_offset < off + size) env->prog->aux->max_ctx_offset = off + size; return 0; } - verbose("invalid bpf_context access off=%d size=%d\n", off, size); + verbose(env, "invalid bpf_context access off=%d size=%d\n", off, size); return -EACCES; } @@ -982,7 +869,8 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno) return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]); } -static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, +static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int off, int size, bool strict) { struct tnum reg_off; @@ -1007,7 +895,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("misaligned packet access off %d+%s+%d+%d size %d\n", + verbose(env, + "misaligned packet access off %d+%s+%d+%d size %d\n", ip_align, tn_buf, reg->off, off, size); return -EACCES; } @@ -1015,7 +904,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, return 0; } -static int check_generic_ptr_alignment(const struct bpf_reg_state *reg, +static int check_generic_ptr_alignment(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, const char *pointer_desc, int off, int size, bool strict) { @@ -1030,7 +920,7 @@ static int check_generic_ptr_alignment(const struct bpf_reg_state *reg, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("misaligned %saccess off %s+%d+%d size %d\n", + verbose(env, "misaligned %saccess off %s+%d+%d size %d\n", pointer_desc, tn_buf, reg->off, off, size); return -EACCES; } @@ -1047,8 +937,11 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, switch (reg->type) { case PTR_TO_PACKET: - /* special case, because of NET_IP_ALIGN */ - return check_pkt_ptr_alignment(reg, off, size, strict); + case PTR_TO_PACKET_META: + /* Special case, because of NET_IP_ALIGN. Given metadata sits + * right in front, treat it the very same way. + */ + return check_pkt_ptr_alignment(env, reg, off, size, strict); case PTR_TO_MAP_VALUE: pointer_desc = "value "; break; @@ -1061,7 +954,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, default: break; } - return check_generic_ptr_alignment(reg, pointer_desc, off, size, strict); + return check_generic_ptr_alignment(env, reg, pointer_desc, off, size, + strict); } /* check whether memory at (regno + off) is accessible for t = (read | write) @@ -1093,20 +987,20 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (reg->type == PTR_TO_MAP_VALUE) { if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { - verbose("R%d leaks addr into map\n", value_regno); + verbose(env, "R%d leaks addr into map\n", value_regno); return -EACCES; } err = check_map_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown(state->regs, value_regno); + mark_reg_unknown(env, state->regs, value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { - verbose("R%d leaks addr into ctx\n", value_regno); + verbose(env, "R%d leaks addr into ctx\n", value_regno); return -EACCES; } /* ctx accesses must be at a fixed offset, so that we can @@ -1116,7 +1010,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("variable ctx access var_off=%s off=%d size=%d", + verbose(env, + "variable ctx access var_off=%s off=%d size=%d", tn_buf, off, size); return -EACCES; } @@ -1124,13 +1019,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_ctx_access(env, insn_idx, off, size, t, ®_type); if (!err && t == BPF_READ && value_regno >= 0) { /* ctx access returns either a scalar, or a - * PTR_TO_PACKET[_END]. In the latter case, we know - * the offset is zero. + * PTR_TO_PACKET[_META,_END]. In the latter + * case, we know the offset is zero. */ if (reg_type == SCALAR_VALUE) - mark_reg_unknown(state->regs, value_regno); + mark_reg_unknown(env, state->regs, value_regno); else - mark_reg_known_zero(state->regs, value_regno); + mark_reg_known_zero(env, state->regs, + value_regno); state->regs[value_regno].id = 0; state->regs[value_regno].off = 0; state->regs[value_regno].range = 0; @@ -1146,13 +1042,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("variable stack access var_off=%s off=%d size=%d", + verbose(env, "variable stack access var_off=%s off=%d size=%d", tn_buf, off, size); return -EACCES; } off += reg->var_off.value; if (off >= 0 || off < -MAX_BPF_STACK) { - verbose("invalid stack off=%d size=%d\n", off, size); + verbose(env, "invalid stack off=%d size=%d\n", off, + size); return -EACCES; } @@ -1163,29 +1060,32 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (!env->allow_ptr_leaks && state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL && size != BPF_REG_SIZE) { - verbose("attempt to corrupt spilled pointer on stack\n"); + verbose(env, "attempt to corrupt spilled pointer on stack\n"); return -EACCES; } - err = check_stack_write(state, off, size, value_regno); + err = check_stack_write(env, state, off, size, + value_regno); } else { - err = check_stack_read(state, off, size, value_regno); + err = check_stack_read(env, state, off, size, + value_regno); } - } else if (reg->type == PTR_TO_PACKET) { + } else if (reg_is_pkt_pointer(reg)) { if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { - verbose("cannot write into packet\n"); + verbose(env, "cannot write into packet\n"); return -EACCES; } if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { - verbose("R%d leaks addr into packet\n", value_regno); + verbose(env, "R%d leaks addr into packet\n", + value_regno); return -EACCES; } err = check_packet_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown(state->regs, value_regno); + mark_reg_unknown(env, state->regs, value_regno); } else { - verbose("R%d invalid mem access '%s'\n", - regno, reg_type_str[reg->type]); + verbose(env, "R%d invalid mem access '%s'\n", regno, + reg_type_str[reg->type]); return -EACCES; } @@ -1205,7 +1105,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || insn->imm != 0) { - verbose("BPF_XADD uses reserved fields\n"); + verbose(env, "BPF_XADD uses reserved fields\n"); return -EINVAL; } @@ -1220,7 +1120,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins return err; if (is_pointer_value(env, insn->src_reg)) { - verbose("R%d leaks addr into mem\n", insn->src_reg); + verbose(env, "R%d leaks addr into mem\n", insn->src_reg); return -EACCES; } @@ -1261,7 +1161,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, register_is_null(regs[regno])) return 0; - verbose("R%d type=%s expected=%s\n", regno, + verbose(env, "R%d type=%s expected=%s\n", regno, reg_type_str[regs[regno].type], reg_type_str[PTR_TO_STACK]); return -EACCES; @@ -1272,13 +1172,13 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off); - verbose("invalid variable stack read R%d var_off=%s\n", + verbose(env, "invalid variable stack read R%d var_off=%s\n", regno, tn_buf); } off = regs[regno].off + regs[regno].var_off.value; if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || access_size <= 0) { - verbose("invalid stack type R%d off=%d access_size=%d\n", + verbose(env, "invalid stack type R%d off=%d access_size=%d\n", regno, off, access_size); return -EACCES; } @@ -1294,7 +1194,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, for (i = 0; i < access_size; i++) { if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) { - verbose("invalid indirect read from stack off %d+%d size %d\n", + verbose(env, "invalid indirect read from stack off %d+%d size %d\n", off, i, access_size); return -EACCES; } @@ -1310,6 +1210,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, switch (reg->type) { case PTR_TO_PACKET: + case PTR_TO_PACKET_META: return check_packet_access(env, regno, reg->off, access_size); case PTR_TO_MAP_VALUE: return check_map_access(env, regno, reg->off, access_size); @@ -1336,22 +1237,24 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (arg_type == ARG_ANYTHING) { if (is_pointer_value(env, regno)) { - verbose("R%d leaks addr into helper function\n", regno); + verbose(env, "R%d leaks addr into helper function\n", + regno); return -EACCES; } return 0; } - if (type == PTR_TO_PACKET && + if (type_is_pkt_pointer(type) && !may_access_direct_pkt_data(env, meta, BPF_READ)) { - verbose("helper access to the packet is not allowed\n"); + verbose(env, "helper access to the packet is not allowed\n"); return -EACCES; } if (arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE) { expected_type = PTR_TO_STACK; - if (type != PTR_TO_PACKET && type != expected_type) + if (!type_is_pkt_pointer(type) && + type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_SIZE || arg_type == ARG_CONST_SIZE_OR_ZERO) { @@ -1375,12 +1278,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (register_is_null(*reg)) /* final test in check_stack_boundary() */; - else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE && + else if (!type_is_pkt_pointer(type) && + type != PTR_TO_MAP_VALUE && type != expected_type) goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; } else { - verbose("unsupported arg_type %d\n", arg_type); + verbose(env, "unsupported arg_type %d\n", arg_type); return -EFAULT; } @@ -1398,10 +1302,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, * we have to check map_key here. Otherwise it means * that kernel subsystem misconfigured verifier */ - verbose("invalid map_ptr to access map->key\n"); + verbose(env, "invalid map_ptr to access map->key\n"); return -EACCES; } - if (type == PTR_TO_PACKET) + if (type_is_pkt_pointer(type)) err = check_packet_access(env, regno, reg->off, meta->map_ptr->key_size); else @@ -1414,10 +1318,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (!meta->map_ptr) { /* kernel subsystem misconfigured verifier */ - verbose("invalid map_ptr to access map->value\n"); + verbose(env, "invalid map_ptr to access map->value\n"); return -EACCES; } - if (type == PTR_TO_PACKET) + if (type_is_pkt_pointer(type)) err = check_packet_access(env, regno, reg->off, meta->map_ptr->value_size); else @@ -1434,7 +1338,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (regno == 0) { /* kernel subsystem misconfigured verifier */ - verbose("ARG_CONST_SIZE cannot be first argument\n"); + verbose(env, + "ARG_CONST_SIZE cannot be first argument\n"); return -EACCES; } @@ -1451,7 +1356,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, meta = NULL; if (reg->smin_value < 0) { - verbose("R%d min value is negative, either use unsigned or 'var &= const'\n", + verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n", regno); return -EACCES; } @@ -1465,7 +1370,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, } if (reg->umax_value >= BPF_MAX_VAR_SIZ) { - verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", + verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", regno); return -EACCES; } @@ -1476,12 +1381,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, return err; err_type: - verbose("R%d type=%s expected=%s\n", regno, + verbose(env, "R%d type=%s expected=%s\n", regno, reg_type_str[type], reg_type_str[expected_type]); return -EACCES; } -static int check_map_func_compatibility(struct bpf_map *map, int func_id) +static int check_map_func_compatibility(struct bpf_verifier_env *env, + struct bpf_map *map, int func_id) { if (!map) return 0; @@ -1494,7 +1400,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) break; case BPF_MAP_TYPE_PERF_EVENT_ARRAY: if (func_id != BPF_FUNC_perf_event_read && - func_id != BPF_FUNC_perf_event_output) + func_id != BPF_FUNC_perf_event_output && + func_id != BPF_FUNC_perf_event_read_value) goto error; break; case BPF_MAP_TYPE_STACK_TRACE: @@ -1514,6 +1421,11 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) if (func_id != BPF_FUNC_redirect_map) goto error; break; + /* Restrict bpf side of cpumap, open when use-cases appear */ + case BPF_MAP_TYPE_CPUMAP: + if (func_id != BPF_FUNC_redirect_map) + goto error; + break; case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH_OF_MAPS: if (func_id != BPF_FUNC_map_lookup_elem) @@ -1537,6 +1449,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) break; case BPF_FUNC_perf_event_read: case BPF_FUNC_perf_event_output: + case BPF_FUNC_perf_event_read_value: if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) goto error; break; @@ -1550,7 +1463,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) goto error; break; case BPF_FUNC_redirect_map: - if (map->map_type != BPF_MAP_TYPE_DEVMAP) + if (map->map_type != BPF_MAP_TYPE_DEVMAP && + map->map_type != BPF_MAP_TYPE_CPUMAP) goto error; break; case BPF_FUNC_sk_redirect_map: @@ -1567,7 +1481,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) return 0; error: - verbose("cannot pass map_type %d into func %s#%d\n", + verbose(env, "cannot pass map_type %d into func %s#%d\n", map->map_type, func_id_name(func_id), func_id); return -EINVAL; } @@ -1590,8 +1504,8 @@ static int check_raw_mode(const struct bpf_func_proto *fn) return count > 1 ? -EINVAL : 0; } -/* Packet data might have moved, any old PTR_TO_PACKET[_END] are now invalid, - * so turn them into unknown SCALAR_VALUE. +/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] + * are now invalid, so turn them into unknown SCALAR_VALUE. */ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) { @@ -1600,18 +1514,15 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) int i; for (i = 0; i < MAX_BPF_REG; i++) - if (regs[i].type == PTR_TO_PACKET || - regs[i].type == PTR_TO_PACKET_END) - mark_reg_unknown(regs, i); + if (reg_is_pkt_pointer_any(®s[i])) + mark_reg_unknown(env, regs, i); for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] != STACK_SPILL) continue; reg = &state->spilled_regs[i / BPF_REG_SIZE]; - if (reg->type != PTR_TO_PACKET && - reg->type != PTR_TO_PACKET_END) - continue; - __mark_reg_unknown(reg); + if (reg_is_pkt_pointer_any(reg)) + __mark_reg_unknown(reg); } } @@ -1626,21 +1537,23 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) /* find function prototype */ if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { - verbose("invalid func %s#%d\n", func_id_name(func_id), func_id); + verbose(env, "invalid func %s#%d\n", func_id_name(func_id), + func_id); return -EINVAL; } - if (env->prog->aux->ops->get_func_proto) - fn = env->prog->aux->ops->get_func_proto(func_id); + if (env->ops->get_func_proto) + fn = env->ops->get_func_proto(func_id); if (!fn) { - verbose("unknown func %s#%d\n", func_id_name(func_id), func_id); + verbose(env, "unknown func %s#%d\n", func_id_name(func_id), + func_id); return -EINVAL; } /* eBPF programs must be GPL compatible to use GPL-ed functions */ if (!env->prog->gpl_compatible && fn->gpl_only) { - verbose("cannot call GPL only function from proprietary program\n"); + verbose(env, "cannot call GPL only function from proprietary program\n"); return -EINVAL; } @@ -1654,7 +1567,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) */ err = check_raw_mode(fn); if (err) { - verbose("kernel subsystem misconfigured func %s#%d\n", + verbose(env, "kernel subsystem misconfigured func %s#%d\n", func_id_name(func_id), func_id); return err; } @@ -1687,14 +1600,14 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { - mark_reg_not_init(regs, caller_saved[i]); + mark_reg_not_init(env, regs, caller_saved[i]); check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); } /* update return register (already marked as written above) */ if (fn->ret_type == RET_INTEGER) { /* sets type to SCALAR_VALUE */ - mark_reg_unknown(regs, BPF_REG_0); + mark_reg_unknown(env, regs, BPF_REG_0); } else if (fn->ret_type == RET_VOID) { regs[BPF_REG_0].type = NOT_INIT; } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { @@ -1702,14 +1615,15 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; /* There is no offset yet applied, variable or fixed */ - mark_reg_known_zero(regs, BPF_REG_0); + mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].off = 0; /* remember map_ptr, so that check_map_access() * can check 'value_size' boundary of memory access * to map element returned from bpf_map_lookup_elem() */ if (meta.map_ptr == NULL) { - verbose("kernel subsystem misconfigured verifier\n"); + verbose(env, + "kernel subsystem misconfigured verifier\n"); return -EINVAL; } regs[BPF_REG_0].map_ptr = meta.map_ptr; @@ -1720,12 +1634,12 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) else if (insn_aux->map_ptr != meta.map_ptr) insn_aux->map_ptr = BPF_MAP_PTR_POISON; } else { - verbose("unknown return type %d of func %s#%d\n", + verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); return -EINVAL; } - err = check_map_func_compatibility(meta.map_ptr, func_id); + err = check_map_func_compatibility(env, meta.map_ptr, func_id); if (err) return err; @@ -1784,39 +1698,42 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg = ®s[dst]; if (WARN_ON_ONCE(known && (smin_val != smax_val))) { - print_verifier_state(&env->cur_state); - verbose("verifier internal error: known but bad sbounds\n"); + print_verifier_state(env, &env->cur_state); + verbose(env, + "verifier internal error: known but bad sbounds\n"); return -EINVAL; } if (WARN_ON_ONCE(known && (umin_val != umax_val))) { - print_verifier_state(&env->cur_state); - verbose("verifier internal error: known but bad ubounds\n"); + print_verifier_state(env, &env->cur_state); + verbose(env, + "verifier internal error: known but bad ubounds\n"); return -EINVAL; } if (BPF_CLASS(insn->code) != BPF_ALU64) { /* 32-bit ALU ops on pointers produce (meaningless) scalars */ if (!env->allow_ptr_leaks) - verbose("R%d 32-bit pointer arithmetic prohibited\n", + verbose(env, + "R%d 32-bit pointer arithmetic prohibited\n", dst); return -EACCES; } if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { if (!env->allow_ptr_leaks) - verbose("R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", + verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", dst); return -EACCES; } if (ptr_reg->type == CONST_PTR_TO_MAP) { if (!env->allow_ptr_leaks) - verbose("R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", + verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", dst); return -EACCES; } if (ptr_reg->type == PTR_TO_PACKET_END) { if (!env->allow_ptr_leaks) - verbose("R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", + verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", dst); return -EACCES; } @@ -1871,7 +1788,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off); dst_reg->off = ptr_reg->off; - if (ptr_reg->type == PTR_TO_PACKET) { + if (reg_is_pkt_pointer(ptr_reg)) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ dst_reg->range = 0; @@ -1881,7 +1798,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, if (dst_reg == off_reg) { /* scalar -= pointer. Creates an unknown scalar */ if (!env->allow_ptr_leaks) - verbose("R%d tried to subtract pointer from scalar\n", + verbose(env, "R%d tried to subtract pointer from scalar\n", dst); return -EACCES; } @@ -1891,7 +1808,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, */ if (ptr_reg->type == PTR_TO_STACK) { if (!env->allow_ptr_leaks) - verbose("R%d subtraction from stack pointer prohibited\n", + verbose(env, "R%d subtraction from stack pointer prohibited\n", dst); return -EACCES; } @@ -1931,7 +1848,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off); dst_reg->off = ptr_reg->off; - if (ptr_reg->type == PTR_TO_PACKET) { + if (reg_is_pkt_pointer(ptr_reg)) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ if (smin_val < 0) @@ -1946,13 +1863,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, * ptr &= ~3 which would reduce min_value by 3.) */ if (!env->allow_ptr_leaks) - verbose("R%d bitwise operator %s on pointer prohibited\n", + verbose(env, "R%d bitwise operator %s on pointer prohibited\n", dst, bpf_alu_string[opcode >> 4]); return -EACCES; default: /* other operators (e.g. MUL,LSH) produce non-pointer results */ if (!env->allow_ptr_leaks) - verbose("R%d pointer arithmetic with %s operator prohibited\n", + verbose(env, "R%d pointer arithmetic with %s operator prohibited\n", dst, bpf_alu_string[opcode >> 4]); return -EACCES; } @@ -2118,7 +2035,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* Shifts greater than 63 are undefined. This includes * shifts by a negative number. */ - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); break; } /* We lose all sign bit information (except what we can pick @@ -2146,7 +2063,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* Shifts greater than 63 are undefined. This includes * shifts by a negative number. */ - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); break; } /* BPF_RSH is an unsigned shift, so make the appropriate casts */ @@ -2174,7 +2091,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, __update_reg_bounds(dst_reg); break; default: - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); break; } @@ -2206,12 +2123,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * an arbitrary scalar. */ if (!env->allow_ptr_leaks) { - verbose("R%d pointer %s pointer prohibited\n", + verbose(env, "R%d pointer %s pointer prohibited\n", insn->dst_reg, bpf_alu_string[opcode >> 4]); return -EACCES; } - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); return 0; } else { /* scalar += pointer @@ -2263,13 +2180,13 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, /* Got here implies adding two SCALAR_VALUEs */ if (WARN_ON_ONCE(ptr_reg)) { - print_verifier_state(&env->cur_state); - verbose("verifier internal error: unexpected ptr_reg\n"); + print_verifier_state(env, &env->cur_state); + verbose(env, "verifier internal error: unexpected ptr_reg\n"); return -EINVAL; } if (WARN_ON(!src_reg)) { - print_verifier_state(&env->cur_state); - verbose("verifier internal error: no src_reg\n"); + print_verifier_state(env, &env->cur_state); + verbose(env, "verifier internal error: no src_reg\n"); return -EINVAL; } return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg); @@ -2287,14 +2204,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (BPF_SRC(insn->code) != 0 || insn->src_reg != BPF_REG_0 || insn->off != 0 || insn->imm != 0) { - verbose("BPF_NEG uses reserved fields\n"); + verbose(env, "BPF_NEG uses reserved fields\n"); return -EINVAL; } } else { if (insn->src_reg != BPF_REG_0 || insn->off != 0 || (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) || BPF_CLASS(insn->code) == BPF_ALU64) { - verbose("BPF_END uses reserved fields\n"); + verbose(env, "BPF_END uses reserved fields\n"); return -EINVAL; } } @@ -2305,7 +2222,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; if (is_pointer_value(env, insn->dst_reg)) { - verbose("R%d pointer arithmetic prohibited\n", + verbose(env, "R%d pointer arithmetic prohibited\n", insn->dst_reg); return -EACCES; } @@ -2319,7 +2236,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0 || insn->off != 0) { - verbose("BPF_MOV uses reserved fields\n"); + verbose(env, "BPF_MOV uses reserved fields\n"); return -EINVAL; } @@ -2329,7 +2246,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } else { if (insn->src_reg != BPF_REG_0 || insn->off != 0) { - verbose("BPF_MOV uses reserved fields\n"); + verbose(env, "BPF_MOV uses reserved fields\n"); return -EINVAL; } } @@ -2345,14 +2262,16 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) * copy register state to dest reg */ regs[insn->dst_reg] = regs[insn->src_reg]; + regs[insn->dst_reg].live |= REG_LIVE_WRITTEN; } else { /* R1 = (u32) R2 */ if (is_pointer_value(env, insn->src_reg)) { - verbose("R%d partial copy of pointer\n", + verbose(env, + "R%d partial copy of pointer\n", insn->src_reg); return -EACCES; } - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); /* high 32 bits are known zero. */ regs[insn->dst_reg].var_off = tnum_cast( regs[insn->dst_reg].var_off, 4); @@ -2367,14 +2286,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } } else if (opcode > BPF_END) { - verbose("invalid BPF_ALU opcode %x\n", opcode); + verbose(env, "invalid BPF_ALU opcode %x\n", opcode); return -EINVAL; } else { /* all other ALU ops: and, sub, xor, add, ... */ if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0 || insn->off != 0) { - verbose("BPF_ALU uses reserved fields\n"); + verbose(env, "BPF_ALU uses reserved fields\n"); return -EINVAL; } /* check src1 operand */ @@ -2383,7 +2302,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } else { if (insn->src_reg != BPF_REG_0 || insn->off != 0) { - verbose("BPF_ALU uses reserved fields\n"); + verbose(env, "BPF_ALU uses reserved fields\n"); return -EINVAL; } } @@ -2395,7 +2314,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if ((opcode == BPF_MOD || opcode == BPF_DIV) && BPF_SRC(insn->code) == BPF_K && insn->imm == 0) { - verbose("div by zero\n"); + verbose(env, "div by zero\n"); return -EINVAL; } @@ -2404,7 +2323,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32; if (insn->imm < 0 || insn->imm >= size) { - verbose("invalid shift %d\n", insn->imm); + verbose(env, "invalid shift %d\n", insn->imm); return -EINVAL; } } @@ -2421,7 +2340,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } static void find_good_pkt_pointers(struct bpf_verifier_state *state, - struct bpf_reg_state *dst_reg) + struct bpf_reg_state *dst_reg, + enum bpf_reg_type type) { struct bpf_reg_state *regs = state->regs, *reg; int i; @@ -2483,7 +2403,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16. */ for (i = 0; i < MAX_BPF_REG; i++) - if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id) + if (regs[i].type == type && regs[i].id == dst_reg->id) /* keep the maximum range already checked */ regs[i].range = max_t(u16, regs[i].range, dst_reg->off); @@ -2491,7 +2411,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, if (state->stack_slot_type[i] != STACK_SPILL) continue; reg = &state->spilled_regs[i / BPF_REG_SIZE]; - if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id) + if (reg->type == type && reg->id == dst_reg->id) reg->range = max_t(u16, reg->range, dst_reg->off); } } @@ -2756,13 +2676,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, int err; if (opcode > BPF_JSLE) { - verbose("invalid BPF_JMP opcode %x\n", opcode); + verbose(env, "invalid BPF_JMP opcode %x\n", opcode); return -EINVAL; } if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0) { - verbose("BPF_JMP uses reserved fields\n"); + verbose(env, "BPF_JMP uses reserved fields\n"); return -EINVAL; } @@ -2772,13 +2692,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return err; if (is_pointer_value(env, insn->src_reg)) { - verbose("R%d pointer comparison prohibited\n", + verbose(env, "R%d pointer comparison prohibited\n", insn->src_reg); return -EACCES; } } else { if (insn->src_reg != BPF_REG_0) { - verbose("BPF_JMP uses reserved fields\n"); + verbose(env, "BPF_JMP uses reserved fields\n"); return -EINVAL; } } @@ -2856,25 +2776,46 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && dst_reg->type == PTR_TO_PACKET && regs[insn->src_reg].type == PTR_TO_PACKET_END) { - find_good_pkt_pointers(this_branch, dst_reg); + find_good_pkt_pointers(this_branch, dst_reg, PTR_TO_PACKET); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && dst_reg->type == PTR_TO_PACKET && regs[insn->src_reg].type == PTR_TO_PACKET_END) { - find_good_pkt_pointers(other_branch, dst_reg); + find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && dst_reg->type == PTR_TO_PACKET_END && regs[insn->src_reg].type == PTR_TO_PACKET) { - find_good_pkt_pointers(other_branch, ®s[insn->src_reg]); + find_good_pkt_pointers(other_branch, ®s[insn->src_reg], + PTR_TO_PACKET); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && dst_reg->type == PTR_TO_PACKET_END && regs[insn->src_reg].type == PTR_TO_PACKET) { - find_good_pkt_pointers(this_branch, ®s[insn->src_reg]); + find_good_pkt_pointers(this_branch, ®s[insn->src_reg], + PTR_TO_PACKET); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && + dst_reg->type == PTR_TO_PACKET_META && + reg_is_init_pkt_pointer(®s[insn->src_reg], PTR_TO_PACKET)) { + find_good_pkt_pointers(this_branch, dst_reg, PTR_TO_PACKET_META); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && + dst_reg->type == PTR_TO_PACKET_META && + reg_is_init_pkt_pointer(®s[insn->src_reg], PTR_TO_PACKET)) { + find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET_META); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && + reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && + regs[insn->src_reg].type == PTR_TO_PACKET_META) { + find_good_pkt_pointers(other_branch, ®s[insn->src_reg], + PTR_TO_PACKET_META); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && + reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && + regs[insn->src_reg].type == PTR_TO_PACKET_META) { + find_good_pkt_pointers(this_branch, ®s[insn->src_reg], + PTR_TO_PACKET_META); } else if (is_pointer_value(env, insn->dst_reg)) { - verbose("R%d pointer comparison prohibited\n", insn->dst_reg); + verbose(env, "R%d pointer comparison prohibited\n", + insn->dst_reg); return -EACCES; } - if (log_level) - print_verifier_state(this_branch); + if (env->log.level) + print_verifier_state(env, this_branch); return 0; } @@ -2893,11 +2834,11 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) int err; if (BPF_SIZE(insn->code) != BPF_DW) { - verbose("invalid BPF_LD_IMM insn\n"); + verbose(env, "invalid BPF_LD_IMM insn\n"); return -EINVAL; } if (insn->off != 0) { - verbose("BPF_LD_IMM64 uses reserved fields\n"); + verbose(env, "BPF_LD_IMM64 uses reserved fields\n"); return -EINVAL; } @@ -2955,14 +2896,14 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) int i, err; if (!may_access_skb(env->prog->type)) { - verbose("BPF_LD_[ABS|IND] instructions not allowed for this program type\n"); + verbose(env, "BPF_LD_[ABS|IND] instructions not allowed for this program type\n"); return -EINVAL; } if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || BPF_SIZE(insn->code) == BPF_DW || (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { - verbose("BPF_LD_[ABS|IND] uses reserved fields\n"); + verbose(env, "BPF_LD_[ABS|IND] uses reserved fields\n"); return -EINVAL; } @@ -2972,7 +2913,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; if (regs[BPF_REG_6].type != PTR_TO_CTX) { - verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); + verbose(env, + "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); return -EINVAL; } @@ -2985,7 +2927,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) /* reset caller saved regs to unreadable */ for (i = 0; i < CALLER_SAVED_REGS; i++) { - mark_reg_not_init(regs, caller_saved[i]); + mark_reg_not_init(env, regs, caller_saved[i]); check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); } @@ -2993,7 +2935,44 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) * the value fetched from the packet. * Already marked as written above. */ - mark_reg_unknown(regs, BPF_REG_0); + mark_reg_unknown(env, regs, BPF_REG_0); + return 0; +} + +static int check_return_code(struct bpf_verifier_env *env) +{ + struct bpf_reg_state *reg; + struct tnum range = tnum_range(0, 1); + + switch (env->prog->type) { + case BPF_PROG_TYPE_CGROUP_SKB: + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_SOCK_OPS: + break; + default: + return 0; + } + + reg = &env->cur_state.regs[BPF_REG_0]; + if (reg->type != SCALAR_VALUE) { + verbose(env, "At program exit the register R0 is not a known value (%s)\n", + reg_type_str[reg->type]); + return -EINVAL; + } + + if (!tnum_in(range, reg->var_off)) { + verbose(env, "At program exit the register R0 "); + if (!tnum_is_unknown(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "has value %s", tn_buf); + } else { + verbose(env, "has unknown scalar value"); + } + verbose(env, " should have been 0 or 1\n"); + return -EINVAL; + } return 0; } @@ -3057,7 +3036,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) return 0; if (w < 0 || w >= env->prog->len) { - verbose("jump out of range from insn %d to %d\n", t, w); + verbose(env, "jump out of range from insn %d to %d\n", t, w); return -EINVAL; } @@ -3074,13 +3053,13 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) insn_stack[cur_stack++] = w; return 1; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { - verbose("back-edge from insn %d to %d\n", t, w); + verbose(env, "back-edge from insn %d to %d\n", t, w); return -EINVAL; } else if (insn_state[w] == EXPLORED) { /* forward- or cross-edge */ insn_state[t] = DISCOVERED | e; } else { - verbose("insn state internal bug\n"); + verbose(env, "insn state internal bug\n"); return -EFAULT; } return 0; @@ -3174,7 +3153,7 @@ peek_stack: mark_explored: insn_state[t] = EXPLORED; if (cur_stack-- <= 0) { - verbose("pop stack internal bug\n"); + verbose(env, "pop stack internal bug\n"); ret = -EFAULT; goto err_free; } @@ -3183,7 +3162,7 @@ mark_explored: check_state: for (i = 0; i < insn_cnt; i++) { if (insn_state[i] != EXPLORED) { - verbose("unreachable insn %d\n", i); + verbose(env, "unreachable insn %d\n", i); ret = -EINVAL; goto err_free; } @@ -3298,8 +3277,9 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return false; /* Check our ids match any regs they're supposed to */ return check_ids(rold->id, rcur->id, idmap); + case PTR_TO_PACKET_META: case PTR_TO_PACKET: - if (rcur->type != PTR_TO_PACKET) + if (rcur->type != rold->type) return false; /* We must have at least as much range as the old ptr * did, so that any accesses which were safe before are @@ -3563,7 +3543,7 @@ static int do_check(struct bpf_verifier_env *env) int insn_processed = 0; bool do_print_state = false; - init_reg_state(regs); + init_reg_state(env, regs); state->parent = NULL; insn_idx = 0; for (;;) { @@ -3572,7 +3552,7 @@ static int do_check(struct bpf_verifier_env *env) int err; if (insn_idx >= insn_cnt) { - verbose("invalid insn idx %d insn_cnt %d\n", + verbose(env, "invalid insn idx %d insn_cnt %d\n", insn_idx, insn_cnt); return -EFAULT; } @@ -3581,7 +3561,8 @@ static int do_check(struct bpf_verifier_env *env) class = BPF_CLASS(insn->code); if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { - verbose("BPF program is too large. Processed %d insn\n", + verbose(env, + "BPF program is too large. Processed %d insn\n", insn_processed); return -E2BIG; } @@ -3591,12 +3572,12 @@ static int do_check(struct bpf_verifier_env *env) return err; if (err == 1) { /* found equivalent state, can prune the search */ - if (log_level) { + if (env->log.level) { if (do_print_state) - verbose("\nfrom %d to %d: safe\n", + verbose(env, "\nfrom %d to %d: safe\n", prev_insn_idx, insn_idx); else - verbose("%d: safe\n", insn_idx); + verbose(env, "%d: safe\n", insn_idx); } goto process_bpf_exit; } @@ -3604,19 +3585,20 @@ static int do_check(struct bpf_verifier_env *env) if (need_resched()) cond_resched(); - if (log_level > 1 || (log_level && do_print_state)) { - if (log_level > 1) - verbose("%d:", insn_idx); + if (env->log.level > 1 || (env->log.level && do_print_state)) { + if (env->log.level > 1) + verbose(env, "%d:", insn_idx); else - verbose("\nfrom %d to %d:", + verbose(env, "\nfrom %d to %d:", prev_insn_idx, insn_idx); - print_verifier_state(&env->cur_state); + print_verifier_state(env, &env->cur_state); do_print_state = false; } - if (log_level) { - verbose("%d: ", insn_idx); - print_bpf_insn(env, insn); + if (env->log.level) { + verbose(env, "%d: ", insn_idx); + print_bpf_insn(verbose, env, insn, + env->allow_ptr_leaks); } err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx); @@ -3672,7 +3654,7 @@ static int do_check(struct bpf_verifier_env *env) * src_reg == stack|map in some other branch. * Reject it. */ - verbose("same insn cannot be used with different pointers\n"); + verbose(env, "same insn cannot be used with different pointers\n"); return -EINVAL; } @@ -3712,14 +3694,14 @@ static int do_check(struct bpf_verifier_env *env) } else if (dst_reg_type != *prev_dst_type && (dst_reg_type == PTR_TO_CTX || *prev_dst_type == PTR_TO_CTX)) { - verbose("same insn cannot be used with different pointers\n"); + verbose(env, "same insn cannot be used with different pointers\n"); return -EINVAL; } } else if (class == BPF_ST) { if (BPF_MODE(insn->code) != BPF_MEM || insn->src_reg != BPF_REG_0) { - verbose("BPF_ST uses reserved fields\n"); + verbose(env, "BPF_ST uses reserved fields\n"); return -EINVAL; } /* check src operand */ @@ -3742,7 +3724,7 @@ static int do_check(struct bpf_verifier_env *env) insn->off != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0) { - verbose("BPF_CALL uses reserved fields\n"); + verbose(env, "BPF_CALL uses reserved fields\n"); return -EINVAL; } @@ -3755,7 +3737,7 @@ static int do_check(struct bpf_verifier_env *env) insn->imm != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0) { - verbose("BPF_JA uses reserved fields\n"); + verbose(env, "BPF_JA uses reserved fields\n"); return -EINVAL; } @@ -3767,7 +3749,7 @@ static int do_check(struct bpf_verifier_env *env) insn->imm != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0) { - verbose("BPF_EXIT uses reserved fields\n"); + verbose(env, "BPF_EXIT uses reserved fields\n"); return -EINVAL; } @@ -3782,10 +3764,13 @@ static int do_check(struct bpf_verifier_env *env) return err; if (is_pointer_value(env, BPF_REG_0)) { - verbose("R0 leaks addr as return value\n"); + verbose(env, "R0 leaks addr as return value\n"); return -EACCES; } + err = check_return_code(env); + if (err) + return err; process_bpf_exit: insn_idx = pop_stack(env, &prev_insn_idx); if (insn_idx < 0) { @@ -3814,19 +3799,19 @@ process_bpf_exit: insn_idx++; } else { - verbose("invalid BPF_LD mode\n"); + verbose(env, "invalid BPF_LD mode\n"); return -EINVAL; } } else { - verbose("unknown insn class %d\n", class); + verbose(env, "unknown insn class %d\n", class); return -EINVAL; } insn_idx++; } - verbose("processed %d insns, stack depth %d\n", - insn_processed, env->prog->aux->stack_depth); + verbose(env, "processed %d insns, stack depth %d\n", insn_processed, + env->prog->aux->stack_depth); return 0; } @@ -3838,7 +3823,8 @@ static int check_map_prealloc(struct bpf_map *map) !(map->map_flags & BPF_F_NO_PREALLOC); } -static int check_map_prog_compatibility(struct bpf_map *map, +static int check_map_prog_compatibility(struct bpf_verifier_env *env, + struct bpf_map *map, struct bpf_prog *prog) { @@ -3849,12 +3835,12 @@ static int check_map_prog_compatibility(struct bpf_map *map, */ if (prog->type == BPF_PROG_TYPE_PERF_EVENT) { if (!check_map_prealloc(map)) { - verbose("perf_event programs can only use preallocated hash map\n"); + verbose(env, "perf_event programs can only use preallocated hash map\n"); return -EINVAL; } if (map->inner_map_meta && !check_map_prealloc(map->inner_map_meta)) { - verbose("perf_event programs can only use preallocated inner hash map\n"); + verbose(env, "perf_event programs can only use preallocated inner hash map\n"); return -EINVAL; } } @@ -3877,14 +3863,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++, insn++) { if (BPF_CLASS(insn->code) == BPF_LDX && (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) { - verbose("BPF_LDX uses reserved fields\n"); + verbose(env, "BPF_LDX uses reserved fields\n"); return -EINVAL; } if (BPF_CLASS(insn->code) == BPF_STX && ((BPF_MODE(insn->code) != BPF_MEM && BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) { - verbose("BPF_STX uses reserved fields\n"); + verbose(env, "BPF_STX uses reserved fields\n"); return -EINVAL; } @@ -3895,7 +3881,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) if (i == insn_cnt - 1 || insn[1].code != 0 || insn[1].dst_reg != 0 || insn[1].src_reg != 0 || insn[1].off != 0) { - verbose("invalid bpf_ld_imm64 insn\n"); + verbose(env, "invalid bpf_ld_imm64 insn\n"); return -EINVAL; } @@ -3904,19 +3890,20 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) goto next_insn; if (insn->src_reg != BPF_PSEUDO_MAP_FD) { - verbose("unrecognized bpf_ld_imm64 insn\n"); + verbose(env, + "unrecognized bpf_ld_imm64 insn\n"); return -EINVAL; } f = fdget(insn->imm); map = __bpf_map_get(f); if (IS_ERR(map)) { - verbose("fd %d is not pointing to valid bpf_map\n", + verbose(env, "fd %d is not pointing to valid bpf_map\n", insn->imm); return PTR_ERR(map); } - err = check_map_prog_compatibility(map, env->prog); + err = check_map_prog_compatibility(env, map, env->prog); if (err) { fdput(f); return err; @@ -4025,7 +4012,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of */ static int convert_ctx_accesses(struct bpf_verifier_env *env) { - const struct bpf_verifier_ops *ops = env->prog->aux->ops; + const struct bpf_verifier_ops *ops = env->ops; int i, cnt, size, ctx_field_size, delta = 0; const int insn_cnt = env->prog->len; struct bpf_insn insn_buf[16], *insn; @@ -4038,7 +4025,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, env->prog); if (cnt >= ARRAY_SIZE(insn_buf)) { - verbose("bpf verifier is misconfigured\n"); + verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } else if (cnt) { new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt); @@ -4086,7 +4073,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) u8 size_code; if (type == BPF_WRITE) { - verbose("bpf verifier narrow ctx access misconfigured\n"); + verbose(env, "bpf verifier narrow ctx access misconfigured\n"); return -EINVAL; } @@ -4105,7 +4092,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) &target_size); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) || (ctx_field_size && !target_size)) { - verbose("bpf verifier is misconfigured\n"); + verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } @@ -4187,7 +4174,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { - verbose("bpf verifier is misconfigured\n"); + verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } @@ -4226,12 +4213,13 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn = new_prog->insnsi + i + delta; } patch_call_imm: - fn = prog->aux->ops->get_func_proto(insn->imm); + fn = env->ops->get_func_proto(insn->imm); /* all functions that have prototype and verifier allowed * programs to call them, must be real in-kernel functions */ if (!fn->func) { - verbose("kernel subsystem misconfigured func %s#%d\n", + verbose(env, + "kernel subsystem misconfigured func %s#%d\n", func_id_name(insn->imm), insn->imm); return -EFAULT; } @@ -4265,8 +4253,8 @@ static void free_states(struct bpf_verifier_env *env) int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) { - char __user *log_ubuf = NULL; struct bpf_verifier_env *env; + struct bpf_verifer_log *log; int ret = -EINVAL; /* 'struct bpf_verifier_env' can be global, but since it's not small, @@ -4275,6 +4263,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); if (!env) return -ENOMEM; + log = &env->log; env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) * (*prog)->len); @@ -4282,6 +4271,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (!env->insn_aux_data) goto err_free_env; env->prog = *prog; + env->ops = bpf_verifier_ops[env->prog->type]; /* grab the mutex to protect few globals used by verifier */ mutex_lock(&bpf_verifier_lock); @@ -4290,23 +4280,15 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) /* user requested verbose verifier output * and supplied buffer to store the verification trace */ - log_level = attr->log_level; - log_ubuf = (char __user *) (unsigned long) attr->log_buf; - log_size = attr->log_size; - log_len = 0; + log->level = attr->log_level; + log->ubuf = (char __user *) (unsigned long) attr->log_buf; + log->len_total = attr->log_size; ret = -EINVAL; - /* log_* values have to be sane */ - if (log_size < 128 || log_size > UINT_MAX >> 8 || - log_level == 0 || log_ubuf == NULL) + /* log attributes have to be sane */ + if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || + !log->level || !log->ubuf) goto err_unlock; - - ret = -ENOMEM; - log_buf = vmalloc(log_size); - if (!log_buf) - goto err_unlock; - } else { - log_level = 0; } env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); @@ -4343,17 +4325,11 @@ skip_full_check: if (ret == 0) ret = fixup_bpf_calls(env); - if (log_level && log_len >= log_size - 1) { - BUG_ON(log_len >= log_size); - /* verifier log exceeded user supplied buffer */ + if (log->level && bpf_verifier_log_full(log)) ret = -ENOSPC; - /* fall through to return what was recorded */ - } - - /* copy verifier log back to user space including trailing zero */ - if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) { + if (log->level && !log->ubuf) { ret = -EFAULT; - goto free_log_buf; + goto err_release_maps; } if (ret == 0 && env->used_map_cnt) { @@ -4364,7 +4340,7 @@ skip_full_check: if (!env->prog->aux->used_maps) { ret = -ENOMEM; - goto free_log_buf; + goto err_release_maps; } memcpy(env->prog->aux->used_maps, env->used_maps, @@ -4377,9 +4353,7 @@ skip_full_check: convert_pseudo_ld_imm64(env); } -free_log_buf: - if (log_level) - vfree(log_buf); +err_release_maps: if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release * them now. Otherwise free_bpf_prog_info() will release them. @@ -4394,12 +4368,21 @@ err_free_env: return ret; } +static const struct bpf_verifier_ops * const bpf_analyzer_ops[] = { + [BPF_PROG_TYPE_XDP] = &xdp_analyzer_ops, + [BPF_PROG_TYPE_SCHED_CLS] = &tc_cls_act_analyzer_ops, +}; + int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, void *priv) { struct bpf_verifier_env *env; int ret; + if (prog->type >= ARRAY_SIZE(bpf_analyzer_ops) || + !bpf_analyzer_ops[prog->type]) + return -EOPNOTSUPP; + env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); if (!env) return -ENOMEM; @@ -4410,14 +4393,13 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, if (!env->insn_aux_data) goto err_free_env; env->prog = prog; + env->ops = bpf_analyzer_ops[env->prog->type]; env->analyzer_ops = ops; env->analyzer_priv = priv; /* grab the mutex to protect few globals used by verifier */ mutex_lock(&bpf_verifier_lock); - log_level = 0; - env->strict_alignment = false; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d6551cd45238..00f5b358aeac 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1896,6 +1896,9 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags) if (ret) goto destroy_root; + ret = cgroup_bpf_inherit(root_cgrp); + WARN_ON_ONCE(ret); + trace_cgroup_setup_root(root); /* @@ -2311,6 +2314,14 @@ out_release_tset: list_del_init(&cset->mg_node); } spin_unlock_irq(&css_set_lock); + + /* + * Re-initialize the cgroup_taskset structure in case it is reused + * again in another cgroup_migrate_add_task()/cgroup_migrate_execute() + * iteration. + */ + tset->nr_tasks = 0; + tset->csets = &tset->src_csets; return ret; } @@ -4713,6 +4724,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent) cgrp->self.parent = &parent->self; cgrp->root = root; cgrp->level = level; + ret = cgroup_bpf_inherit(cgrp); + if (ret) + goto out_idr_free; for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; @@ -4747,13 +4761,12 @@ static struct cgroup *cgroup_create(struct cgroup *parent) if (!cgroup_on_dfl(cgrp)) cgrp->subtree_control = cgroup_control(cgrp); - if (parent) - cgroup_bpf_inherit(cgrp, parent); - cgroup_propagate_control(cgrp); return cgrp; +out_idr_free: + cgroup_idr_remove(&root->cgroup_idr, cgrp->id); out_cancel_ref: percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: @@ -5736,14 +5749,33 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) #endif /* CONFIG_SOCK_CGROUP_DATA */ #ifdef CONFIG_CGROUP_BPF -int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type, bool overridable) +int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags) +{ + int ret; + + mutex_lock(&cgroup_mutex); + ret = __cgroup_bpf_attach(cgrp, prog, type, flags); + mutex_unlock(&cgroup_mutex); + return ret; +} +int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags) +{ + int ret; + + mutex_lock(&cgroup_mutex); + ret = __cgroup_bpf_detach(cgrp, prog, type, flags); + mutex_unlock(&cgroup_mutex); + return ret; +} +int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, + union bpf_attr __user *uattr) { - struct cgroup *parent = cgroup_parent(cgrp); int ret; mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_update(cgrp, parent, prog, type, overridable); + ret = __cgroup_bpf_query(cgrp, attr, uattr); mutex_unlock(&cgroup_mutex); return ret; } diff --git a/kernel/cpu.c b/kernel/cpu.c index acf5308fad51..d851df22f5c5 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -24,6 +24,7 @@ #include <linux/lockdep.h> #include <linux/tick.h> #include <linux/irq.h> +#include <linux/nmi.h> #include <linux/smpboot.h> #include <linux/relay.h> #include <linux/slab.h> @@ -46,11 +47,13 @@ * @bringup: Single callback bringup or teardown selector * @cb_state: The state for a single callback (install/uninstall) * @result: Result of the operation - * @done: Signal completion to the issuer of the task + * @done_up: Signal completion to the issuer of the task for cpu-up + * @done_down: Signal completion to the issuer of the task for cpu-down */ struct cpuhp_cpu_state { enum cpuhp_state state; enum cpuhp_state target; + enum cpuhp_state fail; #ifdef CONFIG_SMP struct task_struct *thread; bool should_run; @@ -58,18 +61,39 @@ struct cpuhp_cpu_state { bool single; bool bringup; struct hlist_node *node; + struct hlist_node *last; enum cpuhp_state cb_state; int result; - struct completion done; + struct completion done_up; + struct completion done_down; #endif }; -static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); +static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = { + .fail = CPUHP_INVALID, +}; #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP) -static struct lock_class_key cpuhp_state_key; -static struct lockdep_map cpuhp_state_lock_map = - STATIC_LOCKDEP_MAP_INIT("cpuhp_state", &cpuhp_state_key); +static struct lockdep_map cpuhp_state_up_map = + STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map); +static struct lockdep_map cpuhp_state_down_map = + STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map); + + +static void inline cpuhp_lock_acquire(bool bringup) +{ + lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map); +} + +static void inline cpuhp_lock_release(bool bringup) +{ + lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map); +} +#else + +static void inline cpuhp_lock_acquire(bool bringup) { } +static void inline cpuhp_lock_release(bool bringup) { } + #endif /** @@ -123,13 +147,16 @@ static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) /** * cpuhp_invoke_callback _ Invoke the callbacks for a given state * @cpu: The cpu for which the callback should be invoked - * @step: The step in the state machine + * @state: The state to do callbacks for * @bringup: True if the bringup callback should be invoked + * @node: For multi-instance, do a single entry callback for install/remove + * @lastp: For multi-instance rollback, remember how far we got * * Called from cpu hotplug and from the state register machinery. */ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, - bool bringup, struct hlist_node *node) + bool bringup, struct hlist_node *node, + struct hlist_node **lastp) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); struct cpuhp_step *step = cpuhp_get_step(state); @@ -137,7 +164,17 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, int (*cb)(unsigned int cpu); int ret, cnt; + if (st->fail == state) { + st->fail = CPUHP_INVALID; + + if (!(bringup ? step->startup.single : step->teardown.single)) + return 0; + + return -EAGAIN; + } + if (!step->multi_instance) { + WARN_ON_ONCE(lastp && *lastp); cb = bringup ? step->startup.single : step->teardown.single; if (!cb) return 0; @@ -152,6 +189,7 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, /* Single invocation for instance add/remove */ if (node) { + WARN_ON_ONCE(lastp && *lastp); trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); ret = cbm(cpu, node); trace_cpuhp_exit(cpu, st->state, state, ret); @@ -161,13 +199,23 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, /* State transition. Invoke on all instances */ cnt = 0; hlist_for_each(node, &step->list) { + if (lastp && node == *lastp) + break; + trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); ret = cbm(cpu, node); trace_cpuhp_exit(cpu, st->state, state, ret); - if (ret) - goto err; + if (ret) { + if (!lastp) + goto err; + + *lastp = node; + return ret; + } cnt++; } + if (lastp) + *lastp = NULL; return 0; err: /* Rollback the instances if one failed */ @@ -178,12 +226,39 @@ err: hlist_for_each(node, &step->list) { if (!cnt--) break; - cbm(cpu, node); + + trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); + ret = cbm(cpu, node); + trace_cpuhp_exit(cpu, st->state, state, ret); + /* + * Rollback must not fail, + */ + WARN_ON_ONCE(ret); } return ret; } #ifdef CONFIG_SMP +static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup) +{ + struct completion *done = bringup ? &st->done_up : &st->done_down; + wait_for_completion(done); +} + +static inline void complete_ap_thread(struct cpuhp_cpu_state *st, bool bringup) +{ + struct completion *done = bringup ? &st->done_up : &st->done_down; + complete(done); +} + +/* + * The former STARTING/DYING states, ran with IRQs disabled and must not fail. + */ +static bool cpuhp_is_atomic_state(enum cpuhp_state state) +{ + return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE; +} + /* Serializes the updates to cpu_online_mask, cpu_present_mask */ static DEFINE_MUTEX(cpu_add_remove_lock); bool cpuhp_tasks_frozen; @@ -271,14 +346,79 @@ void cpu_hotplug_enable(void) EXPORT_SYMBOL_GPL(cpu_hotplug_enable); #endif /* CONFIG_HOTPLUG_CPU */ -static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st); +static inline enum cpuhp_state +cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target) +{ + enum cpuhp_state prev_state = st->state; + + st->rollback = false; + st->last = NULL; + + st->target = target; + st->single = false; + st->bringup = st->state < target; + + return prev_state; +} + +static inline void +cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state) +{ + st->rollback = true; + + /* + * If we have st->last we need to undo partial multi_instance of this + * state first. Otherwise start undo at the previous state. + */ + if (!st->last) { + if (st->bringup) + st->state--; + else + st->state++; + } + + st->target = prev_state; + st->bringup = !st->bringup; +} + +/* Regular hotplug invocation of the AP hotplug thread */ +static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st) +{ + if (!st->single && st->state == st->target) + return; + + st->result = 0; + /* + * Make sure the above stores are visible before should_run becomes + * true. Paired with the mb() above in cpuhp_thread_fun() + */ + smp_mb(); + st->should_run = true; + wake_up_process(st->thread); + wait_for_ap_thread(st, st->bringup); +} + +static int cpuhp_kick_ap(struct cpuhp_cpu_state *st, enum cpuhp_state target) +{ + enum cpuhp_state prev_state; + int ret; + + prev_state = cpuhp_set_state(st, target); + __cpuhp_kick_ap(st); + if ((ret = st->result)) { + cpuhp_reset_state(st, prev_state); + __cpuhp_kick_ap(st); + } + + return ret; +} static int bringup_wait_for_ap(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); /* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */ - wait_for_completion(&st->done); + wait_for_ap_thread(st, true); if (WARN_ON_ONCE((!cpu_online(cpu)))) return -ECANCELED; @@ -286,12 +426,10 @@ static int bringup_wait_for_ap(unsigned int cpu) stop_machine_unpark(cpu); kthread_unpark(st->thread); - /* Should we go further up ? */ - if (st->target > CPUHP_AP_ONLINE_IDLE) { - __cpuhp_kick_ap_work(st); - wait_for_completion(&st->done); - } - return st->result; + if (st->target <= CPUHP_AP_ONLINE_IDLE) + return 0; + + return cpuhp_kick_ap(st, st->target); } static int bringup_cpu(unsigned int cpu) @@ -317,32 +455,6 @@ static int bringup_cpu(unsigned int cpu) /* * Hotplug state machine related functions */ -static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) -{ - for (st->state++; st->state < st->target; st->state++) { - struct cpuhp_step *step = cpuhp_get_step(st->state); - - if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, true, NULL); - } -} - -static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, - enum cpuhp_state target) -{ - enum cpuhp_state prev_state = st->state; - int ret = 0; - - for (; st->state > target; st->state--) { - ret = cpuhp_invoke_callback(cpu, st->state, false, NULL); - if (ret) { - st->target = prev_state; - undo_cpu_down(cpu, st); - break; - } - } - return ret; -} static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) { @@ -350,7 +462,7 @@ static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) struct cpuhp_step *step = cpuhp_get_step(st->state); if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, false, NULL); + cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); } } @@ -362,7 +474,7 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, while (st->state < target) { st->state++; - ret = cpuhp_invoke_callback(cpu, st->state, true, NULL); + ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); if (ret) { st->target = prev_state; undo_cpu_up(cpu, st); @@ -379,7 +491,8 @@ static void cpuhp_create(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); - init_completion(&st->done); + init_completion(&st->done_up); + init_completion(&st->done_down); } static int cpuhp_should_run(unsigned int cpu) @@ -389,69 +502,90 @@ static int cpuhp_should_run(unsigned int cpu) return st->should_run; } -/* Execute the teardown callbacks. Used to be CPU_DOWN_PREPARE */ -static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st) -{ - enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU); - - return cpuhp_down_callbacks(cpu, st, target); -} - -/* Execute the online startup callbacks. Used to be CPU_ONLINE */ -static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st) -{ - return cpuhp_up_callbacks(cpu, st, st->target); -} - /* * Execute teardown/startup callbacks on the plugged cpu. Also used to invoke * callbacks when a state gets [un]installed at runtime. + * + * Each invocation of this function by the smpboot thread does a single AP + * state callback. + * + * It has 3 modes of operation: + * - single: runs st->cb_state + * - up: runs ++st->state, while st->state < st->target + * - down: runs st->state--, while st->state > st->target + * + * When complete or on error, should_run is cleared and the completion is fired. */ static void cpuhp_thread_fun(unsigned int cpu) { struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); - int ret = 0; + bool bringup = st->bringup; + enum cpuhp_state state; /* - * Paired with the mb() in cpuhp_kick_ap_work and - * cpuhp_invoke_ap_callback, so the work set is consistent visible. + * ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures + * that if we see ->should_run we also see the rest of the state. */ smp_mb(); - if (!st->should_run) + + if (WARN_ON_ONCE(!st->should_run)) return; - st->should_run = false; + cpuhp_lock_acquire(bringup); - lock_map_acquire(&cpuhp_state_lock_map); - /* Single callback invocation for [un]install ? */ if (st->single) { - if (st->cb_state < CPUHP_AP_ONLINE) { - local_irq_disable(); - ret = cpuhp_invoke_callback(cpu, st->cb_state, - st->bringup, st->node); - local_irq_enable(); + state = st->cb_state; + st->should_run = false; + } else { + if (bringup) { + st->state++; + state = st->state; + st->should_run = (st->state < st->target); + WARN_ON_ONCE(st->state > st->target); } else { - ret = cpuhp_invoke_callback(cpu, st->cb_state, - st->bringup, st->node); + state = st->state; + st->state--; + st->should_run = (st->state > st->target); + WARN_ON_ONCE(st->state < st->target); } - } else if (st->rollback) { - BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); + } + + WARN_ON_ONCE(!cpuhp_is_ap_state(state)); + + if (st->rollback) { + struct cpuhp_step *step = cpuhp_get_step(state); + if (step->skip_onerr) + goto next; + } + + if (cpuhp_is_atomic_state(state)) { + local_irq_disable(); + st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last); + local_irq_enable(); - undo_cpu_down(cpu, st); - st->rollback = false; + /* + * STARTING/DYING must not fail! + */ + WARN_ON_ONCE(st->result); } else { - /* Cannot happen .... */ - BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); - - /* Regular hotplug work */ - if (st->state < st->target) - ret = cpuhp_ap_online(cpu, st); - else if (st->state > st->target) - ret = cpuhp_ap_offline(cpu, st); + st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last); + } + + if (st->result) { + /* + * If we fail on a rollback, we're up a creek without no + * paddle, no way forward, no way back. We loose, thanks for + * playing. + */ + WARN_ON_ONCE(st->rollback); + st->should_run = false; } - lock_map_release(&cpuhp_state_lock_map); - st->result = ret; - complete(&st->done); + +next: + cpuhp_lock_release(bringup); + + if (!st->should_run) + complete_ap_thread(st, bringup); } /* Invoke a single callback on a remote cpu */ @@ -460,62 +594,64 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, struct hlist_node *node) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int ret; if (!cpu_online(cpu)) return 0; - lock_map_acquire(&cpuhp_state_lock_map); - lock_map_release(&cpuhp_state_lock_map); + cpuhp_lock_acquire(false); + cpuhp_lock_release(false); + + cpuhp_lock_acquire(true); + cpuhp_lock_release(true); /* * If we are up and running, use the hotplug thread. For early calls * we invoke the thread function directly. */ if (!st->thread) - return cpuhp_invoke_callback(cpu, state, bringup, node); + return cpuhp_invoke_callback(cpu, state, bringup, node, NULL); + + st->rollback = false; + st->last = NULL; + st->node = node; + st->bringup = bringup; st->cb_state = state; st->single = true; - st->bringup = bringup; - st->node = node; - /* - * Make sure the above stores are visible before should_run becomes - * true. Paired with the mb() above in cpuhp_thread_fun() - */ - smp_mb(); - st->should_run = true; - wake_up_process(st->thread); - wait_for_completion(&st->done); - return st->result; -} + __cpuhp_kick_ap(st); -/* Regular hotplug invocation of the AP hotplug thread */ -static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st) -{ - st->result = 0; - st->single = false; /* - * Make sure the above stores are visible before should_run becomes - * true. Paired with the mb() above in cpuhp_thread_fun() + * If we failed and did a partial, do a rollback. */ - smp_mb(); - st->should_run = true; - wake_up_process(st->thread); + if ((ret = st->result) && st->last) { + st->rollback = true; + st->bringup = !bringup; + + __cpuhp_kick_ap(st); + } + + return ret; } static int cpuhp_kick_ap_work(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); - enum cpuhp_state state = st->state; + enum cpuhp_state prev_state = st->state; + int ret; + + cpuhp_lock_acquire(false); + cpuhp_lock_release(false); - trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work); - lock_map_acquire(&cpuhp_state_lock_map); - lock_map_release(&cpuhp_state_lock_map); - __cpuhp_kick_ap_work(st); - wait_for_completion(&st->done); - trace_cpuhp_exit(cpu, st->state, state, st->result); - return st->result; + cpuhp_lock_acquire(true); + cpuhp_lock_release(true); + + trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work); + ret = cpuhp_kick_ap(st, st->target); + trace_cpuhp_exit(cpu, st->state, prev_state, ret); + + return ret; } static struct smp_hotplug_thread cpuhp_threads = { @@ -581,6 +717,7 @@ static int take_cpu_down(void *_param) struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE); int err, cpu = smp_processor_id(); + int ret; /* Ensure this CPU doesn't handle any more interrupts. */ err = __cpu_disable(); @@ -594,8 +731,13 @@ static int take_cpu_down(void *_param) WARN_ON(st->state != CPUHP_TEARDOWN_CPU); st->state--; /* Invoke the former CPU_DYING callbacks */ - for (; st->state > target; st->state--) - cpuhp_invoke_callback(cpu, st->state, false, NULL); + for (; st->state > target; st->state--) { + ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); + /* + * DYING must not fail! + */ + WARN_ON_ONCE(ret); + } /* Give up timekeeping duties */ tick_handover_do_timer(); @@ -639,7 +781,7 @@ static int takedown_cpu(unsigned int cpu) * * Wait for the stop thread to go away. */ - wait_for_completion(&st->done); + wait_for_ap_thread(st, false); BUG_ON(st->state != CPUHP_AP_IDLE_DEAD); /* Interrupts are moved away from the dying cpu, reenable alloc/free */ @@ -658,7 +800,7 @@ static void cpuhp_complete_idle_dead(void *arg) { struct cpuhp_cpu_state *st = arg; - complete(&st->done); + complete_ap_thread(st, false); } void cpuhp_report_idle_dead(void) @@ -676,11 +818,32 @@ void cpuhp_report_idle_dead(void) cpuhp_complete_idle_dead, st, 0); } -#else -#define takedown_cpu NULL -#endif +static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) +{ + for (st->state++; st->state < st->target; st->state++) { + struct cpuhp_step *step = cpuhp_get_step(st->state); -#ifdef CONFIG_HOTPLUG_CPU + if (!step->skip_onerr) + cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); + } +} + +static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, + enum cpuhp_state target) +{ + enum cpuhp_state prev_state = st->state; + int ret = 0; + + for (; st->state > target; st->state--) { + ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); + if (ret) { + st->target = prev_state; + undo_cpu_down(cpu, st); + break; + } + } + return ret; +} /* Requires cpu_add_remove_lock to be held */ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, @@ -699,13 +862,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, cpuhp_tasks_frozen = tasks_frozen; - prev_state = st->state; - st->target = target; + prev_state = cpuhp_set_state(st, target); /* * If the current CPU state is in the range of the AP hotplug thread, * then we need to kick the thread. */ if (st->state > CPUHP_TEARDOWN_CPU) { + st->target = max((int)target, CPUHP_TEARDOWN_CPU); ret = cpuhp_kick_ap_work(cpu); /* * The AP side has done the error rollback already. Just @@ -720,6 +883,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, */ if (st->state > CPUHP_TEARDOWN_CPU) goto out; + + st->target = target; } /* * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need @@ -727,13 +892,17 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, */ ret = cpuhp_down_callbacks(cpu, st, target); if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) { - st->target = prev_state; - st->rollback = true; - cpuhp_kick_ap_work(cpu); + cpuhp_reset_state(st, prev_state); + __cpuhp_kick_ap(st); } out: cpus_write_unlock(); + /* + * Do post unplug cleanup. This is still protected against + * concurrent CPU hotplug via cpu_add_remove_lock. + */ + lockup_detector_cleanup(); return ret; } @@ -754,11 +923,15 @@ out: cpu_maps_update_done(); return err; } + int cpu_down(unsigned int cpu) { return do_cpu_down(cpu, CPUHP_OFFLINE); } EXPORT_SYMBOL(cpu_down); + +#else +#define takedown_cpu NULL #endif /*CONFIG_HOTPLUG_CPU*/ /** @@ -772,11 +945,16 @@ void notify_cpu_starting(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); + int ret; rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */ while (st->state < target) { st->state++; - cpuhp_invoke_callback(cpu, st->state, true, NULL); + ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); + /* + * STARTING must not fail! + */ + WARN_ON_ONCE(ret); } } @@ -794,7 +972,7 @@ void cpuhp_online_idle(enum cpuhp_state state) return; st->state = CPUHP_AP_ONLINE_IDLE; - complete(&st->done); + complete_ap_thread(st, true); } /* Requires cpu_add_remove_lock to be held */ @@ -829,7 +1007,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) cpuhp_tasks_frozen = tasks_frozen; - st->target = target; + cpuhp_set_state(st, target); /* * If the current CPU state is in the range of the AP hotplug thread, * then we need to kick the thread once more. @@ -1296,6 +1474,10 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup, struct cpuhp_step *sp = cpuhp_get_step(state); int ret; + /* + * If there's nothing to do, we done. + * Relies on the union for multi_instance. + */ if ((bringup && !sp->startup.single) || (!bringup && !sp->teardown.single)) return 0; @@ -1307,9 +1489,9 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup, if (cpuhp_is_ap_state(state)) ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node); else - ret = cpuhp_invoke_callback(cpu, state, bringup, node); + ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL); #else - ret = cpuhp_invoke_callback(cpu, state, bringup, node); + ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL); #endif BUG_ON(ret && !bringup); return ret; @@ -1641,9 +1823,55 @@ static ssize_t show_cpuhp_target(struct device *dev, } static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target); + +static ssize_t write_cpuhp_fail(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); + struct cpuhp_step *sp; + int fail, ret; + + ret = kstrtoint(buf, 10, &fail); + if (ret) + return ret; + + /* + * Cannot fail STARTING/DYING callbacks. + */ + if (cpuhp_is_atomic_state(fail)) + return -EINVAL; + + /* + * Cannot fail anything that doesn't have callbacks. + */ + mutex_lock(&cpuhp_state_mutex); + sp = cpuhp_get_step(fail); + if (!sp->startup.single && !sp->teardown.single) + ret = -EINVAL; + mutex_unlock(&cpuhp_state_mutex); + if (ret) + return ret; + + st->fail = fail; + + return count; +} + +static ssize_t show_cpuhp_fail(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); + + return sprintf(buf, "%d\n", st->fail); +} + +static DEVICE_ATTR(fail, 0644, show_cpuhp_fail, write_cpuhp_fail); + static struct attribute *cpuhp_cpu_attrs[] = { &dev_attr_state.attr, &dev_attr_target.attr, + &dev_attr_fail.attr, NULL }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 6bc21e202ae4..902149f05381 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3684,10 +3684,12 @@ static inline u64 perf_event_count(struct perf_event *event) * will not be local and we cannot read them atomically * - must not have a pmu::count method */ -int perf_event_read_local(struct perf_event *event, u64 *value) +int perf_event_read_local(struct perf_event *event, u64 *value, + u64 *enabled, u64 *running) { unsigned long flags; int ret = 0; + u64 now; /* * Disabling interrupts avoids all counter scheduling (context @@ -3718,13 +3720,21 @@ int perf_event_read_local(struct perf_event *event, u64 *value) goto out; } + now = event->shadow_ctx_time + perf_clock(); + if (enabled) + *enabled = now - event->tstamp_enabled; /* * If the event is currently on this CPU, its either a per-task event, * or local to this CPU. Furthermore it means its ACTIVE (otherwise * oncpu == -1). */ - if (event->oncpu == smp_processor_id()) + if (event->oncpu == smp_processor_id()) { event->pmu->read(event); + if (running) + *running = now - event->tstamp_running; + } else if (running) { + *running = event->total_time_running; + } *value = local64_read(&event->count); out: @@ -8072,6 +8082,7 @@ static void bpf_overflow_handler(struct perf_event *event, struct bpf_perf_event_data_kern ctx = { .data = data, .regs = regs, + .event = event, }; int ret = 0; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index af71a84e12ee..f684d8e5fa2b 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -412,6 +412,19 @@ err: return NULL; } +static bool __always_inline rb_need_aux_wakeup(struct ring_buffer *rb) +{ + if (rb->aux_overwrite) + return false; + + if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { + rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); + return true; + } + + return false; +} + /* * Commit the data written by hardware into the ring buffer by adjusting * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the @@ -451,10 +464,8 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) } rb->user_page->aux_head = rb->aux_head; - if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { + if (rb_need_aux_wakeup(rb)) wakeup = true; - rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); - } if (wakeup) { if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED) @@ -484,9 +495,8 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) rb->aux_head += size; rb->user_page->aux_head = rb->aux_head; - if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { + if (rb_need_aux_wakeup(rb)) { perf_output_wakeup(handle); - rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); handle->wakeup = rb->aux_wakeup + rb->aux_watermark; } diff --git a/kernel/exit.c b/kernel/exit.c index 3481ababd06a..f2cd53e92147 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1600,12 +1600,10 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, struct waitid_info info = {.status = 0}; long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL); int signo = 0; + if (err > 0) { signo = SIGCHLD; err = 0; - } - - if (!err) { if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) return -EFAULT; } @@ -1723,16 +1721,15 @@ COMPAT_SYSCALL_DEFINE5(waitid, if (err > 0) { signo = SIGCHLD; err = 0; - } - - if (!err && uru) { - /* kernel_waitid() overwrites everything in ru */ - if (COMPAT_USE_64BIT_TIME) - err = copy_to_user(uru, &ru, sizeof(ru)); - else - err = put_compat_rusage(&ru, uru); - if (err) - return -EFAULT; + if (uru) { + /* kernel_waitid() overwrites everything in ru */ + if (COMPAT_USE_64BIT_TIME) + err = copy_to_user(uru, &ru, sizeof(ru)); + else + err = put_compat_rusage(&ru, uru); + if (err) + return -EFAULT; + } } if (!infop) diff --git a/kernel/extable.c b/kernel/extable.c index 38c2412401a1..9aa1cc41ecf7 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -102,15 +102,7 @@ int core_kernel_data(unsigned long addr) int __kernel_text_address(unsigned long addr) { - if (core_kernel_text(addr)) - return 1; - if (is_module_text_address(addr)) - return 1; - if (is_ftrace_trampoline(addr)) - return 1; - if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr)) - return 1; - if (is_bpf_text_address(addr)) + if (kernel_text_address(addr)) return 1; /* * There might be init symbols in saved stacktraces. @@ -127,17 +119,42 @@ int __kernel_text_address(unsigned long addr) int kernel_text_address(unsigned long addr) { + bool no_rcu; + int ret = 1; + if (core_kernel_text(addr)) return 1; + + /* + * If a stack dump happens while RCU is not watching, then + * RCU needs to be notified that it requires to start + * watching again. This can happen either by tracing that + * triggers a stack trace, or a WARN() that happens during + * coming back from idle, or cpu on or offlining. + * + * is_module_text_address() as well as the kprobe slots + * and is_bpf_text_address() require RCU to be watching. + */ + no_rcu = !rcu_is_watching(); + + /* Treat this like an NMI as it can happen anywhere */ + if (no_rcu) + rcu_nmi_enter(); + if (is_module_text_address(addr)) - return 1; + goto out; if (is_ftrace_trampoline(addr)) - return 1; + goto out; if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr)) - return 1; + goto out; if (is_bpf_text_address(addr)) - return 1; - return 0; + goto out; + ret = 0; +out: + if (no_rcu) + rcu_nmi_exit(); + + return ret; } /* diff --git a/kernel/fork.c b/kernel/fork.c index 10646182440f..e702cb9ffbd8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -946,6 +946,24 @@ void mmput(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmput); +#ifdef CONFIG_MMU +static void mmput_async_fn(struct work_struct *work) +{ + struct mm_struct *mm = container_of(work, struct mm_struct, + async_put_work); + + __mmput(mm); +} + +void mmput_async(struct mm_struct *mm) +{ + if (atomic_dec_and_test(&mm->mm_users)) { + INIT_WORK(&mm->async_put_work, mmput_async_fn); + schedule_work(&mm->async_put_work); + } +} +#endif + /** * set_mm_exe_file - change a reference to the mm's executable file * diff --git a/kernel/futex.c b/kernel/futex.c index 3d38eaf05492..0518a0bfc746 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -821,8 +821,6 @@ static void get_pi_state(struct futex_pi_state *pi_state) /* * Drops a reference to the pi_state object and frees or caches it * when the last reference is gone. - * - * Must be called with the hb lock held. */ static void put_pi_state(struct futex_pi_state *pi_state) { @@ -837,16 +835,22 @@ static void put_pi_state(struct futex_pi_state *pi_state) * and has cleaned up the pi_state already */ if (pi_state->owner) { - raw_spin_lock_irq(&pi_state->owner->pi_lock); - list_del_init(&pi_state->list); - raw_spin_unlock_irq(&pi_state->owner->pi_lock); + struct task_struct *owner; - rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + owner = pi_state->owner; + if (owner) { + raw_spin_lock(&owner->pi_lock); + list_del_init(&pi_state->list); + raw_spin_unlock(&owner->pi_lock); + } + rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); } - if (current->pi_state_cache) + if (current->pi_state_cache) { kfree(pi_state); - else { + } else { /* * pi_state->list is already empty. * clear pi_state->owner. @@ -907,13 +911,14 @@ void exit_pi_state_list(struct task_struct *curr) raw_spin_unlock_irq(&curr->pi_lock); spin_lock(&hb->lock); - - raw_spin_lock_irq(&curr->pi_lock); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + raw_spin_lock(&curr->pi_lock); /* * We dropped the pi-lock, so re-check whether this * task still owns the PI-state: */ if (head->next != next) { + raw_spin_unlock(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); continue; } @@ -922,9 +927,10 @@ void exit_pi_state_list(struct task_struct *curr) WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); pi_state->owner = NULL; - raw_spin_unlock_irq(&curr->pi_lock); + raw_spin_unlock(&curr->pi_lock); get_pi_state(pi_state); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); rt_mutex_futex_unlock(&pi_state->pi_mutex); @@ -1208,6 +1214,10 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key, WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &p->pi_state_list); + /* + * Assignment without holding pi_state->pi_mutex.wait_lock is safe + * because there is no concurrency as the object is not published yet. + */ pi_state->owner = p; raw_spin_unlock_irq(&p->pi_lock); @@ -2878,6 +2888,7 @@ retry: raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); + /* drops pi_state->pi_mutex.wait_lock */ ret = wake_futex_pi(uaddr, uval, pi_state); put_pi_state(pi_state); diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index f7086b78ad6e..5270a54b9fa4 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -322,7 +322,6 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, /* Calc pointer to the next generic chip */ tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); } - d->name = name; return 0; } EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index e84b7056bb08..ac4644e92b49 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -945,7 +945,7 @@ static int virq_debug_show(struct seq_file *m, void *private) struct irq_desc *desc; struct irq_domain *domain; struct radix_tree_iter iter; - void **slot; + void __rcu **slot; int i; seq_printf(m, " %-16s %-6s %-10s %-10s %s\n", @@ -1453,7 +1453,7 @@ out_free_desc: /* The irq_data was moved, fix the revmap to refer to the new location */ static void irq_domain_fix_revmap(struct irq_data *d) { - void **slot; + void __rcu **slot; if (d->hwirq < d->domain->revmap_size) return; /* Not using radix tree. */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 573dc52b0806..d00132b5c325 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1643,6 +1643,10 @@ const void *free_irq(unsigned int irq, void *dev_id) #endif action = __free_irq(irq, dev_id); + + if (!action) + return NULL; + devname = action->name; kfree(action); return devname; diff --git a/kernel/kcmp.c b/kernel/kcmp.c index ea34ed8bb952..055bb2962a0b 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -131,7 +131,7 @@ static int kcmp_epoll_target(struct task_struct *task1, if (filp_epoll) { filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff); fput(filp_epoll); - } else + } if (IS_ERR(filp_tgt)) return PTR_ERR(filp_tgt); diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 02f660666ab8..1fefe6dcafd7 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -613,6 +613,33 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) DEFINE_WAKE_Q(wake_q); /* + * __rwsem_down_write_failed_common(sem) + * rwsem_optimistic_spin(sem) + * osq_unlock(sem->osq) + * ... + * atomic_long_add_return(&sem->count) + * + * - VS - + * + * __up_write() + * if (atomic_long_sub_return_release(&sem->count) < 0) + * rwsem_wake(sem) + * osq_is_locked(&sem->osq) + * + * And __up_write() must observe !osq_is_locked() when it observes the + * atomic_long_add_return() in order to not miss a wakeup. + * + * This boils down to: + * + * [S.rel] X = 1 [RmW] r0 = (Y += 0) + * MB RMB + * [RmW] Y += 1 [L] r1 = X + * + * exists (r0=1 /\ r1=0) + */ + smp_rmb(); + + /* * If a spinner is present, it is not necessary to do the wakeup. * Try to do wakeup only if the trylock succeeds to minimize * spinlock contention which may introduce too much delay in the diff --git a/kernel/memremap.c b/kernel/memremap.c index 6bcbfbf1a8fd..403ab9cdb949 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -350,7 +350,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, pgprot_t pgprot = PAGE_KERNEL; struct dev_pagemap *pgmap; struct page_map *page_map; - int error, nid, is_ram; + int error, nid, is_ram, i = 0; align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) @@ -448,6 +448,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, list_del(&page->lru); page->pgmap = pgmap; percpu_ref_get(ref); + if (!(++i % 1024)) + cond_resched(); } devres_add(dev, page_map); return __va(res->start); diff --git a/kernel/params.c b/kernel/params.c index 60b2d8101355..cc9108c2a1fd 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -224,7 +224,7 @@ char *parse_args(const char *doing, } \ int param_get_##name(char *buffer, const struct kernel_param *kp) \ { \ - return scnprintf(buffer, PAGE_SIZE, format, \ + return scnprintf(buffer, PAGE_SIZE, format "\n", \ *((type *)kp->arg)); \ } \ const struct kernel_param_ops param_ops_##name = { \ @@ -236,14 +236,14 @@ char *parse_args(const char *doing, EXPORT_SYMBOL(param_ops_##name) -STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8); -STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16); -STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16); -STANDARD_PARAM_DEF(int, int, "%i", kstrtoint); -STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); -STANDARD_PARAM_DEF(long, long, "%li", kstrtol); -STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); -STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); +STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8); +STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16); +STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16); +STANDARD_PARAM_DEF(int, int, "%i", kstrtoint); +STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); +STANDARD_PARAM_DEF(long, long, "%li", kstrtol); +STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); +STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); int param_set_charp(const char *val, const struct kernel_param *kp) { @@ -270,7 +270,7 @@ EXPORT_SYMBOL(param_set_charp); int param_get_charp(char *buffer, const struct kernel_param *kp) { - return scnprintf(buffer, PAGE_SIZE, "%s", *((char **)kp->arg)); + return scnprintf(buffer, PAGE_SIZE, "%s\n", *((char **)kp->arg)); } EXPORT_SYMBOL(param_get_charp); @@ -301,7 +301,7 @@ EXPORT_SYMBOL(param_set_bool); int param_get_bool(char *buffer, const struct kernel_param *kp) { /* Y and N chosen as being relatively non-coder friendly */ - return sprintf(buffer, "%c", *(bool *)kp->arg ? 'Y' : 'N'); + return sprintf(buffer, "%c\n", *(bool *)kp->arg ? 'Y' : 'N'); } EXPORT_SYMBOL(param_get_bool); @@ -360,7 +360,7 @@ EXPORT_SYMBOL(param_set_invbool); int param_get_invbool(char *buffer, const struct kernel_param *kp) { - return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y'); + return sprintf(buffer, "%c\n", (*(bool *)kp->arg) ? 'N' : 'Y'); } EXPORT_SYMBOL(param_get_invbool); @@ -460,8 +460,9 @@ static int param_array_get(char *buffer, const struct kernel_param *kp) struct kernel_param p = *kp; for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) { + /* Replace \n with comma */ if (i) - buffer[off++] = ','; + buffer[off - 1] = ','; p.arg = arr->elem + arr->elemsize * i; check_kparam_locked(p.mod); ret = arr->ops->get(buffer + off, &p); @@ -507,7 +508,7 @@ EXPORT_SYMBOL(param_set_copystring); int param_get_string(char *buffer, const struct kernel_param *kp) { const struct kparam_string *kps = kp->str; - return strlcpy(buffer, kps->string, kps->maxlen); + return scnprintf(buffer, PAGE_SIZE, "%s\n", kps->string); } EXPORT_SYMBOL(param_get_string); @@ -549,10 +550,6 @@ static ssize_t param_attr_show(struct module_attribute *mattr, kernel_param_lock(mk->mod); count = attribute->param->ops->get(buf, attribute->param); kernel_param_unlock(mk->mod); - if (count > 0) { - strcat(buf, "\n"); - ++count; - } return count; } @@ -600,7 +597,7 @@ EXPORT_SYMBOL(kernel_param_unlock); /* * add_sysfs_param - add a parameter to sysfs * @mk: struct module_kobject - * @kparam: the actual parameter definition to add to sysfs + * @kp: the actual parameter definition to add to sysfs * @name: name of parameter * * Create a kobject if for a (per-module) parameter if mp NULL, and diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 3e2b4f519009..ccd2d20e6b06 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -120,22 +120,26 @@ static void s2idle_loop(void) * frozen processes + suspended devices + idle processors. * Thus s2idle_enter() should be called right after * all devices have been suspended. + * + * Wakeups during the noirq suspend of devices may be spurious, + * so prevent them from terminating the loop right away. */ error = dpm_noirq_suspend_devices(PMSG_SUSPEND); if (!error) s2idle_enter(); + else if (error == -EBUSY && pm_wakeup_pending()) + error = 0; - dpm_noirq_resume_devices(PMSG_RESUME); - if (error && (error != -EBUSY || !pm_wakeup_pending())) { - dpm_noirq_end(); - break; - } - - if (s2idle_ops && s2idle_ops->wake) + if (!error && s2idle_ops && s2idle_ops->wake) s2idle_ops->wake(); + dpm_noirq_resume_devices(PMSG_RESUME); + dpm_noirq_end(); + if (error) + break; + if (s2idle_ops && s2idle_ops->sync) s2idle_ops->sync(); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1250e4bd4b85..b0ad62b0e7b8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -882,6 +882,11 @@ void rcu_irq_exit(void) RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!"); rdtp = this_cpu_ptr(&rcu_dynticks); + + /* Page faults can happen in NMI handlers, so check... */ + if (rdtp->dynticks_nmi_nesting) + return; + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && rdtp->dynticks_nesting < 1); if (rdtp->dynticks_nesting <= 1) { @@ -1015,6 +1020,11 @@ void rcu_irq_enter(void) RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!"); rdtp = this_cpu_ptr(&rcu_dynticks); + + /* Page faults can happen in NMI handlers, so check... */ + if (rdtp->dynticks_nmi_nesting) + return; + oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting++; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 18a6966567da..d17c5da523a0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5166,6 +5166,28 @@ void sched_show_task(struct task_struct *p) put_task_stack(p); } +static inline bool +state_filter_match(unsigned long state_filter, struct task_struct *p) +{ + /* no filter, everything matches */ + if (!state_filter) + return true; + + /* filter, but doesn't match */ + if (!(p->state & state_filter)) + return false; + + /* + * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows + * TASK_KILLABLE). + */ + if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) + return false; + + return true; +} + + void show_state_filter(unsigned long state_filter) { struct task_struct *g, *p; @@ -5188,7 +5210,7 @@ void show_state_filter(unsigned long state_filter) */ touch_nmi_watchdog(); touch_all_softlockup_watchdogs(); - if (!state_filter || (p->state & state_filter)) + if (state_filter_match(state_filter, p)) sched_show_task(p); } diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 01217fb5a5de..2f93e4a2d9f6 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -466,8 +466,6 @@ static char *task_group_path(struct task_group *tg) } #endif -static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; - static void print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { diff --git a/kernel/seccomp.c b/kernel/seccomp.c index c24579dfa7a1..bb3a38005b9c 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -473,14 +473,19 @@ static long seccomp_attach_filter(unsigned int flags, return 0; } +void __get_seccomp_filter(struct seccomp_filter *filter) +{ + /* Reference count is bounded by the number of total processes. */ + refcount_inc(&filter->usage); +} + /* get_seccomp_filter - increments the reference count of the filter on @tsk */ void get_seccomp_filter(struct task_struct *tsk) { struct seccomp_filter *orig = tsk->seccomp.filter; if (!orig) return; - /* Reference count is bounded by the number of total processes. */ - refcount_inc(&orig->usage); + __get_seccomp_filter(orig); } static inline void seccomp_filter_free(struct seccomp_filter *filter) @@ -491,10 +496,8 @@ static inline void seccomp_filter_free(struct seccomp_filter *filter) } } -/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ -void put_seccomp_filter(struct task_struct *tsk) +static void __put_seccomp_filter(struct seccomp_filter *orig) { - struct seccomp_filter *orig = tsk->seccomp.filter; /* Clean up single-reference branches iteratively. */ while (orig && refcount_dec_and_test(&orig->usage)) { struct seccomp_filter *freeme = orig; @@ -503,6 +506,12 @@ void put_seccomp_filter(struct task_struct *tsk) } } +/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ +void put_seccomp_filter(struct task_struct *tsk) +{ + __put_seccomp_filter(tsk->seccomp.filter); +} + static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason) { memset(info, 0, sizeof(*info)); @@ -1025,13 +1034,13 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, if (!data) goto out; - get_seccomp_filter(task); + __get_seccomp_filter(filter); spin_unlock_irq(&task->sighand->siglock); if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog))) ret = -EFAULT; - put_seccomp_filter(task); + __put_seccomp_filter(filter); return ret; out: diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 1d71c051a951..5043e7433f4b 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -344,39 +344,30 @@ EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); * by the client, but only by calling this function. * This function can only be called on a registered smp_hotplug_thread. */ -int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, - const struct cpumask *new) +void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, + const struct cpumask *new) { struct cpumask *old = plug_thread->cpumask; - cpumask_var_t tmp; + static struct cpumask tmp; unsigned int cpu; - if (!alloc_cpumask_var(&tmp, GFP_KERNEL)) - return -ENOMEM; - - get_online_cpus(); + lockdep_assert_cpus_held(); mutex_lock(&smpboot_threads_lock); /* Park threads that were exclusively enabled on the old mask. */ - cpumask_andnot(tmp, old, new); - for_each_cpu_and(cpu, tmp, cpu_online_mask) + cpumask_andnot(&tmp, old, new); + for_each_cpu_and(cpu, &tmp, cpu_online_mask) smpboot_park_thread(plug_thread, cpu); /* Unpark threads that are exclusively enabled on the new mask. */ - cpumask_andnot(tmp, new, old); - for_each_cpu_and(cpu, tmp, cpu_online_mask) + cpumask_andnot(&tmp, new, old); + for_each_cpu_and(cpu, &tmp, cpu_online_mask) smpboot_unpark_thread(plug_thread, cpu); cpumask_copy(old, new); mutex_unlock(&smpboot_threads_lock); - put_online_cpus(); - - free_cpumask_var(tmp); - - return 0; } -EXPORT_SYMBOL_GPL(smpboot_update_cpumask_percpu_thread); static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6648fbbb8157..d9c31bc2eaea 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -367,7 +367,8 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_time_avg, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, }, #ifdef CONFIG_SCHEDSTATS { @@ -871,9 +872,9 @@ static struct ctl_table kern_table[] = { #if defined(CONFIG_LOCKUP_DETECTOR) { .procname = "watchdog", - .data = &watchdog_user_enabled, - .maxlen = sizeof (int), - .mode = 0644, + .data = &watchdog_user_enabled, + .maxlen = sizeof(int), + .mode = 0644, .proc_handler = proc_watchdog, .extra1 = &zero, .extra2 = &one, @@ -889,16 +890,12 @@ static struct ctl_table kern_table[] = { }, { .procname = "nmi_watchdog", - .data = &nmi_watchdog_enabled, - .maxlen = sizeof (int), - .mode = 0644, + .data = &nmi_watchdog_user_enabled, + .maxlen = sizeof(int), + .mode = NMI_WATCHDOG_SYSCTL_PERM, .proc_handler = proc_nmi_watchdog, .extra1 = &zero, -#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) .extra2 = &one, -#else - .extra2 = &zero, -#endif }, { .procname = "watchdog_cpumask", @@ -910,9 +907,9 @@ static struct ctl_table kern_table[] = { #ifdef CONFIG_SOFTLOCKUP_DETECTOR { .procname = "soft_watchdog", - .data = &soft_watchdog_enabled, - .maxlen = sizeof (int), - .mode = 0644, + .data = &soft_watchdog_user_enabled, + .maxlen = sizeof(int), + .mode = 0644, .proc_handler = proc_soft_watchdog, .extra1 = &zero, .extra2 = &one, @@ -2187,8 +2184,6 @@ static int do_proc_douintvec_conv(unsigned long *lvalp, if (write) { if (*lvalp > UINT_MAX) return -EINVAL; - if (*lvalp > UINT_MAX) - return -EINVAL; *valp = *lvalp; } else { unsigned int val = *valp; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2a685b45b73b..45a3928544ce 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -648,6 +648,12 @@ int blk_trace_startstop(struct request_queue *q, int start) } EXPORT_SYMBOL_GPL(blk_trace_startstop); +/* + * When reading or writing the blktrace sysfs files, the references to the + * opened sysfs or device files should prevent the underlying block device + * from being removed. So no further delete protection is really needed. + */ + /** * blk_trace_ioctl: - handle the ioctls associated with tracing * @bdev: the block device @@ -665,7 +671,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) if (!q) return -ENXIO; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&q->blk_trace_mutex); switch (cmd) { case BLKTRACESETUP: @@ -691,7 +697,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) break; } - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&q->blk_trace_mutex); return ret; } @@ -1727,7 +1733,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, if (q == NULL) goto out_bdput; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&q->blk_trace_mutex); if (attr == &dev_attr_enable) { ret = sprintf(buf, "%u\n", !!q->blk_trace); @@ -1746,7 +1752,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); out_unlock_bdev: - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&q->blk_trace_mutex); out_bdput: bdput(bdev); out: @@ -1788,7 +1794,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, if (q == NULL) goto out_bdput; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&q->blk_trace_mutex); if (attr == &dev_attr_enable) { if (value) @@ -1814,7 +1820,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, } out_unlock_bdev: - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&q->blk_trace_mutex); out_bdput: bdput(bdev); out: diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index dc498b605d5d..3126da2f468a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -255,14 +255,14 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) return &bpf_trace_printk_proto; } -BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) +static __always_inline int +get_map_perf_counter(struct bpf_map *map, u64 flags, + u64 *value, u64 *enabled, u64 *running) { struct bpf_array *array = container_of(map, struct bpf_array, map); unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; struct bpf_event_entry *ee; - u64 value = 0; - int err; if (unlikely(flags & ~(BPF_F_INDEX_MASK))) return -EINVAL; @@ -275,7 +275,15 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) if (!ee) return -ENOENT; - err = perf_event_read_local(ee->event, &value); + return perf_event_read_local(ee->event, value, enabled, running); +} + +BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) +{ + u64 value = 0; + int err; + + err = get_map_perf_counter(map, flags, &value, NULL, NULL); /* * this api is ugly since we miss [-22..-2] range of valid * counter values, but that's uapi @@ -293,6 +301,33 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_perf_event_read_value, struct bpf_map *, map, u64, flags, + struct bpf_perf_event_value *, buf, u32, size) +{ + int err = -EINVAL; + + if (unlikely(size != sizeof(struct bpf_perf_event_value))) + goto clear; + err = get_map_perf_counter(map, flags, &buf->counter, &buf->enabled, + &buf->running); + if (unlikely(err)) + goto clear; + return 0; +clear: + memset(buf, 0, size); + return err; +} + +static const struct bpf_func_proto bpf_perf_event_read_value_proto = { + .func = bpf_perf_event_read_value, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; + static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd); static __always_inline u64 @@ -499,6 +534,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_perf_event_output_proto; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto; + case BPF_FUNC_perf_event_read_value: + return &bpf_perf_event_read_value_proto; default: return tracing_func_proto(func_id); } @@ -524,11 +561,14 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type return true; } -const struct bpf_verifier_ops kprobe_prog_ops = { +const struct bpf_verifier_ops kprobe_verifier_ops = { .get_func_proto = kprobe_prog_func_proto, .is_valid_access = kprobe_prog_is_valid_access, }; +const struct bpf_prog_ops kprobe_prog_ops = { +}; + BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map, u64, flags, void *, data, u64, size) { @@ -576,6 +616,32 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_perf_prog_read_value_tp, struct bpf_perf_event_data_kern *, ctx, + struct bpf_perf_event_value *, buf, u32, size) +{ + int err = -EINVAL; + + if (unlikely(size != sizeof(struct bpf_perf_event_value))) + goto clear; + err = perf_event_read_local(ctx->event, &buf->counter, &buf->enabled, + &buf->running); + if (unlikely(err)) + goto clear; + return 0; +clear: + memset(buf, 0, size); + return err; +} + +static const struct bpf_func_proto bpf_perf_prog_read_value_proto_tp = { + .func = bpf_perf_prog_read_value_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -583,6 +649,8 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) return &bpf_perf_event_output_proto_tp; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_tp; + case BPF_FUNC_perf_prog_read_value: + return &bpf_perf_prog_read_value_proto_tp; default: return tracing_func_proto(func_id); } @@ -602,11 +670,14 @@ static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type return true; } -const struct bpf_verifier_ops tracepoint_prog_ops = { +const struct bpf_verifier_ops tracepoint_verifier_ops = { .get_func_proto = tp_prog_func_proto, .is_valid_access = tp_prog_is_valid_access, }; +const struct bpf_prog_ops tracepoint_prog_ops = { +}; + static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { @@ -662,8 +733,11 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } -const struct bpf_verifier_ops perf_event_prog_ops = { +const struct bpf_verifier_ops perf_event_verifier_ops = { .get_func_proto = tp_prog_func_proto, .is_valid_access = pe_prog_is_valid_access, .convert_ctx_access = pe_prog_convert_ctx_access, }; + +const struct bpf_prog_ops perf_event_prog_ops = { +}; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6abfafd7f173..8319e09e15b9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4954,9 +4954,6 @@ static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata; static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer); -static unsigned long save_global_trampoline; -static unsigned long save_global_flags; - static int __init set_graph_function(char *str) { strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); @@ -6808,17 +6805,6 @@ void unregister_ftrace_graph(void) unregister_pm_notifier(&ftrace_suspend_notifier); unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); -#ifdef CONFIG_DYNAMIC_FTRACE - /* - * Function graph does not allocate the trampoline, but - * other global_ops do. We need to reset the ALLOC_TRAMP flag - * if one was used. - */ - global_ops.trampoline = save_global_trampoline; - if (save_global_flags & FTRACE_OPS_FL_ALLOC_TRAMP) - global_ops.flags |= FTRACE_OPS_FL_ALLOC_TRAMP; -#endif - out: mutex_unlock(&ftrace_lock); } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index bac629af2285..c738e764e2a5 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -656,15 +656,6 @@ int trace_print_lat_context(struct trace_iterator *iter) return !trace_seq_has_overflowed(s); } -static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; - -static int task_state_char(unsigned long state) -{ - int bit = state ? __ffs(state) + 1 : 0; - - return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?'; -} - /** * ftrace_find_event - find a registered event * @type: the type of event to look for @@ -930,8 +921,8 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - T = task_state_char(field->next_state); - S = task_state_char(field->prev_state); + T = __task_state_to_char(field->next_state); + S = __task_state_to_char(field->prev_state); trace_find_cmdline(field->next_pid, comm); trace_seq_printf(&iter->seq, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", @@ -966,8 +957,8 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S) trace_assign_type(field, iter->ent); if (!S) - S = task_state_char(field->prev_state); - T = task_state_char(field->next_state); + S = __task_state_to_char(field->prev_state); + T = __task_state_to_char(field->next_state); trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", field->prev_pid, field->prev_prio, @@ -1002,8 +993,8 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S) trace_assign_type(field, iter->ent); if (!S) - S = task_state_char(field->prev_state); - T = task_state_char(field->next_state); + S = __task_state_to_char(field->prev_state); + T = __task_state_to_char(field->next_state); SEQ_PUT_HEX_FIELD(s, field->prev_pid); SEQ_PUT_HEX_FIELD(s, field->prev_prio); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index ddec53b67646..0c331978b1a6 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -397,10 +397,10 @@ tracing_sched_switch_trace(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->prev_pid = prev->pid; entry->prev_prio = prev->prio; - entry->prev_state = prev->state; + entry->prev_state = __get_task_state(prev); entry->next_pid = next->pid; entry->next_prio = next->prio; - entry->next_state = next->state; + entry->next_state = __get_task_state(next); entry->next_cpu = task_cpu(next); if (!call_filter_check_discard(call, entry, buffer, event)) @@ -425,10 +425,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->prev_pid = curr->pid; entry->prev_prio = curr->prio; - entry->prev_state = curr->state; + entry->prev_state = __get_task_state(curr); entry->next_pid = wakee->pid; entry->next_prio = wakee->prio; - entry->next_state = wakee->state; + entry->next_state = __get_task_state(wakee); entry->next_cpu = task_cpu(wakee); if (!call_filter_check_discard(call, entry, buffer, event)) diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index a4df67cbc711..49cb41412eec 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -96,23 +96,9 @@ check_stack(unsigned long ip, unsigned long *stack) if (in_nmi()) return; - /* - * There's a slight chance that we are tracing inside the - * RCU infrastructure, and rcu_irq_enter() will not work - * as expected. - */ - if (unlikely(rcu_irq_enter_disabled())) - return; - local_irq_save(flags); arch_spin_lock(&stack_trace_max_lock); - /* - * RCU may not be watching, make it see us. - * The stack trace code uses rcu_sched. - */ - rcu_irq_enter(); - /* In case another CPU set the tracer_frame on us */ if (unlikely(!frame_size)) this_size -= tracer_frame; @@ -205,7 +191,6 @@ check_stack(unsigned long ip, unsigned long *stack) } out: - rcu_irq_exit(); arch_spin_unlock(&stack_trace_max_lock); local_irq_restore(flags); } diff --git a/kernel/watchdog.c b/kernel/watchdog.c index f5d52024f6b7..6bcb854909c0 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -29,20 +29,29 @@ #include <linux/kvm_para.h> #include <linux/kthread.h> -/* Watchdog configuration */ -static DEFINE_MUTEX(watchdog_proc_mutex); - -int __read_mostly nmi_watchdog_enabled; +static DEFINE_MUTEX(watchdog_mutex); #if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) -unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED | - NMI_WATCHDOG_ENABLED; +# define WATCHDOG_DEFAULT (SOFT_WATCHDOG_ENABLED | NMI_WATCHDOG_ENABLED) +# define NMI_WATCHDOG_DEFAULT 1 #else -unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; +# define WATCHDOG_DEFAULT (SOFT_WATCHDOG_ENABLED) +# define NMI_WATCHDOG_DEFAULT 0 #endif +unsigned long __read_mostly watchdog_enabled; +int __read_mostly watchdog_user_enabled = 1; +int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT; +int __read_mostly soft_watchdog_user_enabled = 1; +int __read_mostly watchdog_thresh = 10; +int __read_mostly nmi_watchdog_available; + +struct cpumask watchdog_allowed_mask __read_mostly; + +struct cpumask watchdog_cpumask __read_mostly; +unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); + #ifdef CONFIG_HARDLOCKUP_DETECTOR -/* boot commands */ /* * Should we panic when a soft-lockup or hard-lockup occurs: */ @@ -56,9 +65,9 @@ unsigned int __read_mostly hardlockup_panic = * kernel command line parameters are parsed, because otherwise it is not * possible to override this in hardlockup_panic_setup(). */ -void hardlockup_detector_disable(void) +void __init hardlockup_detector_disable(void) { - watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; + nmi_watchdog_user_enabled = 0; } static int __init hardlockup_panic_setup(char *str) @@ -68,48 +77,24 @@ static int __init hardlockup_panic_setup(char *str) else if (!strncmp(str, "nopanic", 7)) hardlockup_panic = 0; else if (!strncmp(str, "0", 1)) - watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; + nmi_watchdog_user_enabled = 0; else if (!strncmp(str, "1", 1)) - watchdog_enabled |= NMI_WATCHDOG_ENABLED; + nmi_watchdog_user_enabled = 1; return 1; } __setup("nmi_watchdog=", hardlockup_panic_setup); -#endif - -#ifdef CONFIG_SOFTLOCKUP_DETECTOR -int __read_mostly soft_watchdog_enabled; -#endif - -int __read_mostly watchdog_user_enabled; -int __read_mostly watchdog_thresh = 10; - -#ifdef CONFIG_SMP -int __read_mostly sysctl_softlockup_all_cpu_backtrace; +# ifdef CONFIG_SMP int __read_mostly sysctl_hardlockup_all_cpu_backtrace; -#endif -struct cpumask watchdog_cpumask __read_mostly; -unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); -/* - * The 'watchdog_running' variable is set to 1 when the watchdog threads - * are registered/started and is set to 0 when the watchdog threads are - * unregistered/stopped, so it is an indicator whether the threads exist. - */ -static int __read_mostly watchdog_running; -/* - * If a subsystem has a need to deactivate the watchdog temporarily, it - * can use the suspend/resume interface to achieve this. The content of - * the 'watchdog_suspended' variable reflects this state. Existing threads - * are parked/unparked by the lockup_detector_{suspend|resume} functions - * (see comment blocks pertaining to those functions for further details). - * - * 'watchdog_suspended' also prevents threads from being registered/started - * or unregistered/stopped via parameters in /proc/sys/kernel, so the state - * of 'watchdog_running' cannot change while the watchdog is deactivated - * temporarily (see related code in 'proc' handlers). - */ -int __read_mostly watchdog_suspended; +static int __init hardlockup_all_cpu_backtrace_setup(char *str) +{ + sysctl_hardlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0); + return 1; +} +__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); +# endif /* CONFIG_SMP */ +#endif /* CONFIG_HARDLOCKUP_DETECTOR */ /* * These functions can be overridden if an architecture implements its @@ -121,36 +106,68 @@ int __read_mostly watchdog_suspended; */ int __weak watchdog_nmi_enable(unsigned int cpu) { + hardlockup_detector_perf_enable(); return 0; } + void __weak watchdog_nmi_disable(unsigned int cpu) { + hardlockup_detector_perf_disable(); } -/* - * watchdog_nmi_reconfigure can be implemented to be notified after any - * watchdog configuration change. The arch hardlockup watchdog should - * respond to the following variables: - * - nmi_watchdog_enabled +/* Return 0, if a NMI watchdog is available. Error code otherwise */ +int __weak __init watchdog_nmi_probe(void) +{ + return hardlockup_detector_perf_init(); +} + +/** + * watchdog_nmi_stop - Stop the watchdog for reconfiguration + * + * The reconfiguration steps are: + * watchdog_nmi_stop(); + * update_variables(); + * watchdog_nmi_start(); + */ +void __weak watchdog_nmi_stop(void) { } + +/** + * watchdog_nmi_start - Start the watchdog after reconfiguration + * + * Counterpart to watchdog_nmi_stop(). + * + * The following variables have been updated in update_variables() and + * contain the currently valid configuration: + * - watchdog_enabled * - watchdog_thresh * - watchdog_cpumask - * - sysctl_hardlockup_all_cpu_backtrace - * - hardlockup_panic - * - watchdog_suspended */ -void __weak watchdog_nmi_reconfigure(void) +void __weak watchdog_nmi_start(void) { } + +/** + * lockup_detector_update_enable - Update the sysctl enable bit + * + * Caller needs to make sure that the NMI/perf watchdogs are off, so this + * can't race with watchdog_nmi_disable(). + */ +static void lockup_detector_update_enable(void) { + watchdog_enabled = 0; + if (!watchdog_user_enabled) + return; + if (nmi_watchdog_available && nmi_watchdog_user_enabled) + watchdog_enabled |= NMI_WATCHDOG_ENABLED; + if (soft_watchdog_user_enabled) + watchdog_enabled |= SOFT_WATCHDOG_ENABLED; } - #ifdef CONFIG_SOFTLOCKUP_DETECTOR -/* Helper for online, unparked cpus. */ -#define for_each_watchdog_cpu(cpu) \ - for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) - -atomic_t watchdog_park_in_progress = ATOMIC_INIT(0); +/* Global variables, exported for sysctl */ +unsigned int __read_mostly softlockup_panic = + CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; +static bool softlockup_threads_initialized __read_mostly; static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); @@ -164,50 +181,40 @@ static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static unsigned long soft_lockup_nmi_warn; -unsigned int __read_mostly softlockup_panic = - CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; - static int __init softlockup_panic_setup(char *str) { softlockup_panic = simple_strtoul(str, NULL, 0); - return 1; } __setup("softlockup_panic=", softlockup_panic_setup); static int __init nowatchdog_setup(char *str) { - watchdog_enabled = 0; + watchdog_user_enabled = 0; return 1; } __setup("nowatchdog", nowatchdog_setup); static int __init nosoftlockup_setup(char *str) { - watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED; + soft_watchdog_user_enabled = 0; return 1; } __setup("nosoftlockup", nosoftlockup_setup); #ifdef CONFIG_SMP +int __read_mostly sysctl_softlockup_all_cpu_backtrace; + static int __init softlockup_all_cpu_backtrace_setup(char *str) { - sysctl_softlockup_all_cpu_backtrace = - !!simple_strtol(str, NULL, 0); + sysctl_softlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0); return 1; } __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); -#ifdef CONFIG_HARDLOCKUP_DETECTOR -static int __init hardlockup_all_cpu_backtrace_setup(char *str) -{ - sysctl_hardlockup_all_cpu_backtrace = - !!simple_strtol(str, NULL, 0); - return 1; -} -__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); -#endif #endif +static void __lockup_detector_cleanup(void); + /* * Hard-lockup warnings should be triggered after just a few seconds. Soft- * lockups can have false positives under extreme conditions. So we generally @@ -278,11 +285,15 @@ void touch_all_softlockup_watchdogs(void) int cpu; /* - * this is done lockless - * do we care if a 0 races with a timestamp? - * all it means is the softlock check starts one cycle later + * watchdog_mutex cannpt be taken here, as this might be called + * from (soft)interrupt context, so the access to + * watchdog_allowed_cpumask might race with a concurrent update. + * + * The watchdog time stamp can race against a concurrent real + * update as well, the only side effect might be a cycle delay for + * the softlockup check. */ - for_each_watchdog_cpu(cpu) + for_each_cpu(cpu, &watchdog_allowed_mask) per_cpu(watchdog_touch_ts, cpu) = 0; wq_watchdog_touch(-1); } @@ -322,9 +333,6 @@ static void watchdog_interrupt_count(void) __this_cpu_inc(hrtimer_interrupts); } -static int watchdog_enable_all_cpus(void); -static void watchdog_disable_all_cpus(void); - /* watchdog kicker functions */ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) { @@ -333,7 +341,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) int duration; int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; - if (atomic_read(&watchdog_park_in_progress) != 0) + if (!watchdog_enabled) return HRTIMER_NORESTART; /* kick the hardlockup detector */ @@ -447,32 +455,38 @@ static void watchdog_set_prio(unsigned int policy, unsigned int prio) static void watchdog_enable(unsigned int cpu) { - struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); + struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); - /* kick off the timer for the hardlockup detector */ + /* + * Start the timer first to prevent the NMI watchdog triggering + * before the timer has a chance to fire. + */ hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer->function = watchdog_timer_fn; - - /* Enable the perf event */ - watchdog_nmi_enable(cpu); - - /* done here because hrtimer_start can only pin to smp_processor_id() */ hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL_PINNED); - /* initialize timestamp */ - watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1); + /* Initialize timestamp */ __touch_watchdog(); + /* Enable the perf event */ + if (watchdog_enabled & NMI_WATCHDOG_ENABLED) + watchdog_nmi_enable(cpu); + + watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1); } static void watchdog_disable(unsigned int cpu) { - struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); + struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); watchdog_set_prio(SCHED_NORMAL, 0); - hrtimer_cancel(hrtimer); - /* disable the perf event */ + /* + * Disable the perf event first. That prevents that a large delay + * between disabling the timer and disabling the perf event causes + * the perf NMI to detect a false positive. + */ watchdog_nmi_disable(cpu); + hrtimer_cancel(hrtimer); } static void watchdog_cleanup(unsigned int cpu, bool online) @@ -499,21 +513,6 @@ static void watchdog(unsigned int cpu) __this_cpu_write(soft_lockup_hrtimer_cnt, __this_cpu_read(hrtimer_interrupts)); __touch_watchdog(); - - /* - * watchdog_nmi_enable() clears the NMI_WATCHDOG_ENABLED bit in the - * failure path. Check for failures that can occur asynchronously - - * for example, when CPUs are on-lined - and shut down the hardware - * perf event on each CPU accordingly. - * - * The only non-obvious place this bit can be cleared is through - * watchdog_nmi_enable(), so a pr_info() is placed there. Placing a - * pr_info here would be too noisy as it would result in a message - * every few seconds if the hardlockup was disabled but the softlockup - * enabled. - */ - if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) - watchdog_nmi_disable(cpu); } static struct smp_hotplug_thread watchdog_threads = { @@ -527,295 +526,174 @@ static struct smp_hotplug_thread watchdog_threads = { .unpark = watchdog_enable, }; -/* - * park all watchdog threads that are specified in 'watchdog_cpumask' - * - * This function returns an error if kthread_park() of a watchdog thread - * fails. In this situation, the watchdog threads of some CPUs can already - * be parked and the watchdog threads of other CPUs can still be runnable. - * Callers are expected to handle this special condition as appropriate in - * their context. - * - * This function may only be called in a context that is protected against - * races with CPU hotplug - for example, via get_online_cpus(). - */ -static int watchdog_park_threads(void) +static void softlockup_update_smpboot_threads(void) { - int cpu, ret = 0; + lockdep_assert_held(&watchdog_mutex); - atomic_set(&watchdog_park_in_progress, 1); + if (!softlockup_threads_initialized) + return; - for_each_watchdog_cpu(cpu) { - ret = kthread_park(per_cpu(softlockup_watchdog, cpu)); - if (ret) - break; - } - - atomic_set(&watchdog_park_in_progress, 0); - - return ret; + smpboot_update_cpumask_percpu_thread(&watchdog_threads, + &watchdog_allowed_mask); } -/* - * unpark all watchdog threads that are specified in 'watchdog_cpumask' - * - * This function may only be called in a context that is protected against - * races with CPU hotplug - for example, via get_online_cpus(). - */ -static void watchdog_unpark_threads(void) +/* Temporarily park all watchdog threads */ +static void softlockup_park_all_threads(void) { - int cpu; - - for_each_watchdog_cpu(cpu) - kthread_unpark(per_cpu(softlockup_watchdog, cpu)); + cpumask_clear(&watchdog_allowed_mask); + softlockup_update_smpboot_threads(); } -static int update_watchdog_all_cpus(void) +/* Unpark enabled threads */ +static void softlockup_unpark_threads(void) { - int ret; - - ret = watchdog_park_threads(); - if (ret) - return ret; - - watchdog_unpark_threads(); - - return 0; + cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask); + softlockup_update_smpboot_threads(); } -static int watchdog_enable_all_cpus(void) +static void lockup_detector_reconfigure(void) { - int err = 0; - - if (!watchdog_running) { - err = smpboot_register_percpu_thread_cpumask(&watchdog_threads, - &watchdog_cpumask); - if (err) - pr_err("Failed to create watchdog threads, disabled\n"); - else - watchdog_running = 1; - } else { - /* - * Enable/disable the lockup detectors or - * change the sample period 'on the fly'. - */ - err = update_watchdog_all_cpus(); - - if (err) { - watchdog_disable_all_cpus(); - pr_err("Failed to update lockup detectors, disabled\n"); - } - } - - if (err) - watchdog_enabled = 0; - - return err; + cpus_read_lock(); + watchdog_nmi_stop(); + softlockup_park_all_threads(); + set_sample_period(); + lockup_detector_update_enable(); + if (watchdog_enabled && watchdog_thresh) + softlockup_unpark_threads(); + watchdog_nmi_start(); + cpus_read_unlock(); + /* + * Must be called outside the cpus locked section to prevent + * recursive locking in the perf code. + */ + __lockup_detector_cleanup(); } -static void watchdog_disable_all_cpus(void) +/* + * Create the watchdog thread infrastructure and configure the detector(s). + * + * The threads are not unparked as watchdog_allowed_mask is empty. When + * the threads are sucessfully initialized, take the proper locks and + * unpark the threads in the watchdog_cpumask if the watchdog is enabled. + */ +static __init void lockup_detector_setup(void) { - if (watchdog_running) { - watchdog_running = 0; - smpboot_unregister_percpu_thread(&watchdog_threads); - } -} + int ret; -#ifdef CONFIG_SYSCTL -static int watchdog_update_cpus(void) -{ - return smpboot_update_cpumask_percpu_thread( - &watchdog_threads, &watchdog_cpumask); -} -#endif + /* + * If sysctl is off and watchdog got disabled on the command line, + * nothing to do here. + */ + lockup_detector_update_enable(); -#else /* SOFTLOCKUP */ -static int watchdog_park_threads(void) -{ - return 0; -} + if (!IS_ENABLED(CONFIG_SYSCTL) && + !(watchdog_enabled && watchdog_thresh)) + return; -static void watchdog_unpark_threads(void) -{ -} + ret = smpboot_register_percpu_thread_cpumask(&watchdog_threads, + &watchdog_allowed_mask); + if (ret) { + pr_err("Failed to initialize soft lockup detector threads\n"); + return; + } -static int watchdog_enable_all_cpus(void) -{ - return 0; + mutex_lock(&watchdog_mutex); + softlockup_threads_initialized = true; + lockup_detector_reconfigure(); + mutex_unlock(&watchdog_mutex); } -static void watchdog_disable_all_cpus(void) +#else /* CONFIG_SOFTLOCKUP_DETECTOR */ +static inline int watchdog_park_threads(void) { return 0; } +static inline void watchdog_unpark_threads(void) { } +static inline int watchdog_enable_all_cpus(void) { return 0; } +static inline void watchdog_disable_all_cpus(void) { } +static void lockup_detector_reconfigure(void) { + cpus_read_lock(); + watchdog_nmi_stop(); + lockup_detector_update_enable(); + watchdog_nmi_start(); + cpus_read_unlock(); } - -#ifdef CONFIG_SYSCTL -static int watchdog_update_cpus(void) +static inline void lockup_detector_setup(void) { - return 0; + lockup_detector_reconfigure(); } -#endif +#endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ -static void set_sample_period(void) +static void __lockup_detector_cleanup(void) { + lockdep_assert_held(&watchdog_mutex); + hardlockup_detector_perf_cleanup(); } -#endif /* SOFTLOCKUP */ -/* - * Suspend the hard and soft lockup detector by parking the watchdog threads. +/** + * lockup_detector_cleanup - Cleanup after cpu hotplug or sysctl changes + * + * Caller must not hold the cpu hotplug rwsem. */ -int lockup_detector_suspend(void) +void lockup_detector_cleanup(void) { - int ret = 0; - - get_online_cpus(); - mutex_lock(&watchdog_proc_mutex); - /* - * Multiple suspend requests can be active in parallel (counted by - * the 'watchdog_suspended' variable). If the watchdog threads are - * running, the first caller takes care that they will be parked. - * The state of 'watchdog_running' cannot change while a suspend - * request is active (see related code in 'proc' handlers). - */ - if (watchdog_running && !watchdog_suspended) - ret = watchdog_park_threads(); - - if (ret == 0) - watchdog_suspended++; - else { - watchdog_disable_all_cpus(); - pr_err("Failed to suspend lockup detectors, disabled\n"); - watchdog_enabled = 0; - } - - watchdog_nmi_reconfigure(); - - mutex_unlock(&watchdog_proc_mutex); - - return ret; + mutex_lock(&watchdog_mutex); + __lockup_detector_cleanup(); + mutex_unlock(&watchdog_mutex); } -/* - * Resume the hard and soft lockup detector by unparking the watchdog threads. +/** + * lockup_detector_soft_poweroff - Interface to stop lockup detector(s) + * + * Special interface for parisc. It prevents lockup detector warnings from + * the default pm_poweroff() function which busy loops forever. */ -void lockup_detector_resume(void) +void lockup_detector_soft_poweroff(void) { - mutex_lock(&watchdog_proc_mutex); - - watchdog_suspended--; - /* - * The watchdog threads are unparked if they were previously running - * and if there is no more active suspend request. - */ - if (watchdog_running && !watchdog_suspended) - watchdog_unpark_threads(); - - watchdog_nmi_reconfigure(); - - mutex_unlock(&watchdog_proc_mutex); - put_online_cpus(); + watchdog_enabled = 0; } #ifdef CONFIG_SYSCTL -/* - * Update the run state of the lockup detectors. - */ -static int proc_watchdog_update(void) +/* Propagate any changes to the watchdog threads */ +static void proc_watchdog_update(void) { - int err = 0; - - /* - * Watchdog threads won't be started if they are already active. - * The 'watchdog_running' variable in watchdog_*_all_cpus() takes - * care of this. If those threads are already active, the sample - * period will be updated and the lockup detectors will be enabled - * or disabled 'on the fly'. - */ - if (watchdog_enabled && watchdog_thresh) - err = watchdog_enable_all_cpus(); - else - watchdog_disable_all_cpus(); - - watchdog_nmi_reconfigure(); - - return err; - + /* Remove impossible cpus to keep sysctl output clean. */ + cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); + lockup_detector_reconfigure(); } /* * common function for watchdog, nmi_watchdog and soft_watchdog parameter * - * caller | table->data points to | 'which' contains the flag(s) - * -------------------|-----------------------|----------------------------- - * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED or'ed - * | | with SOFT_WATCHDOG_ENABLED - * -------------------|-----------------------|----------------------------- - * proc_nmi_watchdog | nmi_watchdog_enabled | NMI_WATCHDOG_ENABLED - * -------------------|-----------------------|----------------------------- - * proc_soft_watchdog | soft_watchdog_enabled | SOFT_WATCHDOG_ENABLED + * caller | table->data points to | 'which' + * -------------------|----------------------------|-------------------------- + * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED | + * | | SOFT_WATCHDOG_ENABLED + * -------------------|----------------------------|-------------------------- + * proc_nmi_watchdog | nmi_watchdog_user_enabled | NMI_WATCHDOG_ENABLED + * -------------------|----------------------------|-------------------------- + * proc_soft_watchdog | soft_watchdog_user_enabled | SOFT_WATCHDOG_ENABLED */ static int proc_watchdog_common(int which, struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int err, old, new; - int *watchdog_param = (int *)table->data; + int err, old, *param = table->data; - get_online_cpus(); - mutex_lock(&watchdog_proc_mutex); + mutex_lock(&watchdog_mutex); - if (watchdog_suspended) { - /* no parameter changes allowed while watchdog is suspended */ - err = -EAGAIN; - goto out; - } - - /* - * If the parameter is being read return the state of the corresponding - * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the - * run state of the lockup detectors. - */ if (!write) { - *watchdog_param = (watchdog_enabled & which) != 0; + /* + * On read synchronize the userspace interface. This is a + * racy snapshot. + */ + *param = (watchdog_enabled & which) != 0; err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); } else { + old = READ_ONCE(*param); err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (err) - goto out; - - /* - * There is a race window between fetching the current value - * from 'watchdog_enabled' and storing the new value. During - * this race window, watchdog_nmi_enable() can sneak in and - * clear the NMI_WATCHDOG_ENABLED bit in 'watchdog_enabled'. - * The 'cmpxchg' detects this race and the loop retries. - */ - do { - old = watchdog_enabled; - /* - * If the parameter value is not zero set the - * corresponding bit(s), else clear it(them). - */ - if (*watchdog_param) - new = old | which; - else - new = old & ~which; - } while (cmpxchg(&watchdog_enabled, old, new) != old); - - /* - * Update the run state of the lockup detectors. There is _no_ - * need to check the value returned by proc_watchdog_update() - * and to restore the previous value of 'watchdog_enabled' as - * both lockup detectors are disabled if proc_watchdog_update() - * returns an error. - */ - if (old == new) - goto out; - - err = proc_watchdog_update(); + if (!err && old != READ_ONCE(*param)) + proc_watchdog_update(); } -out: - mutex_unlock(&watchdog_proc_mutex); - put_online_cpus(); + mutex_unlock(&watchdog_mutex); return err; } @@ -835,6 +713,8 @@ int proc_watchdog(struct ctl_table *table, int write, int proc_nmi_watchdog(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + if (!nmi_watchdog_available && write) + return -ENOTSUPP; return proc_watchdog_common(NMI_WATCHDOG_ENABLED, table, write, buffer, lenp, ppos); } @@ -855,39 +735,17 @@ int proc_soft_watchdog(struct ctl_table *table, int write, int proc_watchdog_thresh(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int err, old, new; - - get_online_cpus(); - mutex_lock(&watchdog_proc_mutex); + int err, old; - if (watchdog_suspended) { - /* no parameter changes allowed while watchdog is suspended */ - err = -EAGAIN; - goto out; - } + mutex_lock(&watchdog_mutex); - old = ACCESS_ONCE(watchdog_thresh); + old = READ_ONCE(watchdog_thresh); err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (err || !write) - goto out; - - /* - * Update the sample period. Restore on failure. - */ - new = ACCESS_ONCE(watchdog_thresh); - if (old == new) - goto out; + if (!err && write && old != READ_ONCE(watchdog_thresh)) + proc_watchdog_update(); - set_sample_period(); - err = proc_watchdog_update(); - if (err) { - watchdog_thresh = old; - set_sample_period(); - } -out: - mutex_unlock(&watchdog_proc_mutex); - put_online_cpus(); + mutex_unlock(&watchdog_mutex); return err; } @@ -902,45 +760,19 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, { int err; - get_online_cpus(); - mutex_lock(&watchdog_proc_mutex); - - if (watchdog_suspended) { - /* no parameter changes allowed while watchdog is suspended */ - err = -EAGAIN; - goto out; - } + mutex_lock(&watchdog_mutex); err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); - if (!err && write) { - /* Remove impossible cpus to keep sysctl output cleaner. */ - cpumask_and(&watchdog_cpumask, &watchdog_cpumask, - cpu_possible_mask); - - if (watchdog_running) { - /* - * Failure would be due to being unable to allocate - * a temporary cpumask, so we are likely not in a - * position to do much else to make things better. - */ - if (watchdog_update_cpus() != 0) - pr_err("cpumask update failed\n"); - } + if (!err && write) + proc_watchdog_update(); - watchdog_nmi_reconfigure(); - } -out: - mutex_unlock(&watchdog_proc_mutex); - put_online_cpus(); + mutex_unlock(&watchdog_mutex); return err; } - #endif /* CONFIG_SYSCTL */ void __init lockup_detector_init(void) { - set_sample_period(); - #ifdef CONFIG_NO_HZ_FULL if (tick_nohz_full_enabled()) { pr_info("Disabling watchdog on nohz_full cores by default\n"); @@ -951,6 +783,7 @@ void __init lockup_detector_init(void) cpumask_copy(&watchdog_cpumask, cpu_possible_mask); #endif - if (watchdog_enabled) - watchdog_enable_all_cpus(); + if (!watchdog_nmi_probe()) + nmi_watchdog_available = true; + lockup_detector_setup(); } diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 3a09ea1b1d3d..71a62ceacdc8 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -21,8 +21,10 @@ static DEFINE_PER_CPU(bool, hard_watchdog_warn); static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); +static struct cpumask dead_events_mask; static unsigned long hardlockup_allcpu_dumped; +static unsigned int watchdog_cpus; void arch_touch_nmi_watchdog(void) { @@ -103,15 +105,12 @@ static struct perf_event_attr wd_hw_attr = { /* Callback function for perf event subsystem */ static void watchdog_overflow_callback(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) + struct perf_sample_data *data, + struct pt_regs *regs) { /* Ensure the watchdog never gets throttled */ event->hw.interrupts = 0; - if (atomic_read(&watchdog_park_in_progress) != 0) - return; - if (__this_cpu_read(watchdog_nmi_touch) == true) { __this_cpu_write(watchdog_nmi_touch, false); return; @@ -160,104 +159,131 @@ static void watchdog_overflow_callback(struct perf_event *event, return; } -/* - * People like the simple clean cpu node info on boot. - * Reduce the watchdog noise by only printing messages - * that are different from what cpu0 displayed. - */ -static unsigned long firstcpu_err; -static atomic_t watchdog_cpus; - -int watchdog_nmi_enable(unsigned int cpu) +static int hardlockup_detector_event_create(void) { + unsigned int cpu = smp_processor_id(); struct perf_event_attr *wd_attr; - struct perf_event *event = per_cpu(watchdog_ev, cpu); - int firstcpu = 0; - - /* nothing to do if the hard lockup detector is disabled */ - if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) - goto out; - - /* is it already setup and enabled? */ - if (event && event->state > PERF_EVENT_STATE_OFF) - goto out; - - /* it is setup but not enabled */ - if (event != NULL) - goto out_enable; - - if (atomic_inc_return(&watchdog_cpus) == 1) - firstcpu = 1; + struct perf_event *evt; wd_attr = &wd_hw_attr; wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); /* Try to register using hardware perf events */ - event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); + evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL, + watchdog_overflow_callback, NULL); + if (IS_ERR(evt)) { + pr_info("Perf event create on CPU %d failed with %ld\n", cpu, + PTR_ERR(evt)); + return PTR_ERR(evt); + } + this_cpu_write(watchdog_ev, evt); + return 0; +} - /* save the first cpu's error for future comparision */ - if (firstcpu && IS_ERR(event)) - firstcpu_err = PTR_ERR(event); +/** + * hardlockup_detector_perf_enable - Enable the local event + */ +void hardlockup_detector_perf_enable(void) +{ + if (hardlockup_detector_event_create()) + return; - if (!IS_ERR(event)) { - /* only print for the first cpu initialized */ - if (firstcpu || firstcpu_err) - pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); - goto out_save; - } + if (!watchdog_cpus++) + pr_info("Enabled. Permanently consumes one hw-PMU counter.\n"); - /* - * Disable the hard lockup detector if _any_ CPU fails to set up - * set up the hardware perf event. The watchdog() function checks - * the NMI_WATCHDOG_ENABLED bit periodically. - * - * The barriers are for syncing up watchdog_enabled across all the - * cpus, as clear_bit() does not use barriers. - */ - smp_mb__before_atomic(); - clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled); - smp_mb__after_atomic(); - - /* skip displaying the same error again */ - if (!firstcpu && (PTR_ERR(event) == firstcpu_err)) - return PTR_ERR(event); - - /* vary the KERN level based on the returned errno */ - if (PTR_ERR(event) == -EOPNOTSUPP) - pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); - else if (PTR_ERR(event) == -ENOENT) - pr_warn("disabled (cpu%i): hardware events not enabled\n", - cpu); - else - pr_err("disabled (cpu%i): unable to create perf event: %ld\n", - cpu, PTR_ERR(event)); - - pr_info("Shutting down hard lockup detector on all cpus\n"); - - return PTR_ERR(event); - - /* success path */ -out_save: - per_cpu(watchdog_ev, cpu) = event; -out_enable: - perf_event_enable(per_cpu(watchdog_ev, cpu)); -out: - return 0; + perf_event_enable(this_cpu_read(watchdog_ev)); } -void watchdog_nmi_disable(unsigned int cpu) +/** + * hardlockup_detector_perf_disable - Disable the local event + */ +void hardlockup_detector_perf_disable(void) { - struct perf_event *event = per_cpu(watchdog_ev, cpu); + struct perf_event *event = this_cpu_read(watchdog_ev); if (event) { perf_event_disable(event); + cpumask_set_cpu(smp_processor_id(), &dead_events_mask); + watchdog_cpus--; + } +} + +/** + * hardlockup_detector_perf_cleanup - Cleanup disabled events and destroy them + * + * Called from lockup_detector_cleanup(). Serialized by the caller. + */ +void hardlockup_detector_perf_cleanup(void) +{ + int cpu; + + for_each_cpu(cpu, &dead_events_mask) { + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + /* + * Required because for_each_cpu() reports unconditionally + * CPU0 as set on UP kernels. Sigh. + */ + if (event) + perf_event_release_kernel(event); per_cpu(watchdog_ev, cpu) = NULL; + } + cpumask_clear(&dead_events_mask); +} + +/** + * hardlockup_detector_perf_stop - Globally stop watchdog events + * + * Special interface for x86 to handle the perf HT bug. + */ +void __init hardlockup_detector_perf_stop(void) +{ + int cpu; + + lockdep_assert_cpus_held(); + + for_each_online_cpu(cpu) { + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + if (event) + perf_event_disable(event); + } +} - /* should be in cleanup, but blocks oprofile */ - perf_event_release_kernel(event); +/** + * hardlockup_detector_perf_restart - Globally restart watchdog events + * + * Special interface for x86 to handle the perf HT bug. + */ +void __init hardlockup_detector_perf_restart(void) +{ + int cpu; + + lockdep_assert_cpus_held(); + + if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) + return; + + for_each_online_cpu(cpu) { + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + if (event) + perf_event_enable(event); + } +} + +/** + * hardlockup_detector_perf_init - Probe whether NMI event is available at all + */ +int __init hardlockup_detector_perf_init(void) +{ + int ret = hardlockup_detector_event_create(); - /* watchdog_nmi_enable() expects this to be zero initially. */ - if (atomic_dec_and_test(&watchdog_cpus)) - firstcpu_err = 0; + if (ret) { + pr_info("Perf NMI watchdog permanently disabled\n"); + } else { + perf_event_release_kernel(this_cpu_read(watchdog_ev)); + this_cpu_write(watchdog_ev, NULL); } + return ret; } |