diff options
Diffstat (limited to 'kernel/bpf')
| -rw-r--r-- | kernel/bpf/Makefile | 2 | ||||
| -rw-r--r-- | kernel/bpf/arraymap.c | 193 | ||||
| -rw-r--r-- | kernel/bpf/bpf_lru_list.c | 22 | ||||
| -rw-r--r-- | kernel/bpf/cgroup.c | 101 | ||||
| -rw-r--r-- | kernel/bpf/core.c | 310 | ||||
| -rw-r--r-- | kernel/bpf/hashtab.c | 455 | ||||
| -rw-r--r-- | kernel/bpf/helpers.c | 4 | ||||
| -rw-r--r-- | kernel/bpf/inode.c | 35 | ||||
| -rw-r--r-- | kernel/bpf/lpm_trie.c | 516 | ||||
| -rw-r--r-- | kernel/bpf/map_in_map.c | 102 | ||||
| -rw-r--r-- | kernel/bpf/map_in_map.h | 24 | ||||
| -rw-r--r-- | kernel/bpf/stackmap.c | 15 | ||||
| -rw-r--r-- | kernel/bpf/syscall.c | 720 | ||||
| -rw-r--r-- | kernel/bpf/verifier.c | 1034 |
14 files changed, 2871 insertions, 662 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 1276474ac3cd..e1e5e658f2db 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,7 +1,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o -obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 3d55d95dcf49..d771a3872500 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -1,4 +1,5 @@ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016,2017 Facebook * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -16,6 +17,8 @@ #include <linux/filter.h> #include <linux/perf_event.h> +#include "map_in_map.h" + static void bpf_array_free_percpu(struct bpf_array *array) { int i; @@ -83,6 +86,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) array->map.key_size = attr->key_size; array->map.value_size = attr->value_size; array->map.max_entries = attr->max_entries; + array->map.map_flags = attr->map_flags; array->elem_size = elem_size; if (!percpu) @@ -113,6 +117,30 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) return array->value + array->elem_size * index; } +/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ +static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +{ + struct bpf_insn *insn = insn_buf; + u32 elem_size = round_up(map->value_size, 8); + const int ret = BPF_REG_0; + const int map_ptr = BPF_REG_1; + const int index = BPF_REG_2; + + *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); + *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); + + if (is_power_of_2(elem_size)) { + *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); + } else { + *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size); + } + *insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr); + *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *insn++ = BPF_MOV64_IMM(ret, 0); + return insn - insn_buf; +} + /* Called from eBPF program */ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) { @@ -155,7 +183,7 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct bpf_array *array = container_of(map, struct bpf_array, map); - u32 index = *(u32 *)key; + u32 index = key ? *(u32 *)key : U32_MAX; u32 *next = (u32 *)next_key; if (index >= array->map.max_entries) { @@ -260,21 +288,17 @@ static void array_map_free(struct bpf_map *map) bpf_map_area_free(array); } -static const struct bpf_map_ops array_ops = { +const struct bpf_map_ops array_map_ops = { .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, + .map_gen_lookup = array_map_gen_lookup, }; -static struct bpf_map_type_list array_type __read_mostly = { - .ops = &array_ops, - .type = BPF_MAP_TYPE_ARRAY, -}; - -static const struct bpf_map_ops percpu_array_ops = { +const struct bpf_map_ops percpu_array_map_ops = { .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, @@ -283,19 +307,6 @@ static const struct bpf_map_ops percpu_array_ops = { .map_delete_elem = array_map_delete_elem, }; -static struct bpf_map_type_list percpu_array_type __read_mostly = { - .ops = &percpu_array_ops, - .type = BPF_MAP_TYPE_PERCPU_ARRAY, -}; - -static int __init register_array_map(void) -{ - bpf_register_map_type(&array_type); - bpf_register_map_type(&percpu_array_type); - return 0; -} -late_initcall(register_array_map); - static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr) { /* only file descriptors can be stored in this type of map */ @@ -324,6 +335,26 @@ static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key) } /* only called from syscall */ +int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) +{ + void **elem, *ptr; + int ret = 0; + + if (!map->ops->map_fd_sys_lookup_elem) + return -ENOTSUPP; + + rcu_read_lock(); + elem = array_map_lookup_elem(map, key); + if (elem && (ptr = READ_ONCE(*elem))) + *value = map->ops->map_fd_sys_lookup_elem(ptr); + else + ret = -ENOENT; + rcu_read_unlock(); + + return ret; +} + +/* only called from syscall */ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags) { @@ -389,6 +420,11 @@ static void prog_fd_array_put_ptr(void *ptr) bpf_prog_put(ptr); } +static u32 prog_fd_array_sys_lookup_elem(void *ptr) +{ + return ((struct bpf_prog *)ptr)->aux->id; +} + /* decrement refcnt of all bpf_progs that are stored in this map */ void bpf_fd_array_map_clear(struct bpf_map *map) { @@ -399,7 +435,7 @@ void bpf_fd_array_map_clear(struct bpf_map *map) fd_array_map_delete_elem(map, &i); } -static const struct bpf_map_ops prog_array_ops = { +const struct bpf_map_ops prog_array_map_ops = { .map_alloc = fd_array_map_alloc, .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, @@ -407,20 +443,9 @@ static const struct bpf_map_ops prog_array_ops = { .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = prog_fd_array_get_ptr, .map_fd_put_ptr = prog_fd_array_put_ptr, + .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, }; -static struct bpf_map_type_list prog_array_type __read_mostly = { - .ops = &prog_array_ops, - .type = BPF_MAP_TYPE_PROG_ARRAY, -}; - -static int __init register_prog_array_map(void) -{ - bpf_register_map_type(&prog_array_type); - return 0; -} -late_initcall(register_prog_array_map); - static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, struct file *map_file) { @@ -453,38 +478,24 @@ static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee) static void *perf_event_fd_array_get_ptr(struct bpf_map *map, struct file *map_file, int fd) { - const struct perf_event_attr *attr; struct bpf_event_entry *ee; struct perf_event *event; struct file *perf_file; + u64 value; perf_file = perf_event_get(fd); if (IS_ERR(perf_file)) return perf_file; + ee = ERR_PTR(-EOPNOTSUPP); event = perf_file->private_data; - ee = ERR_PTR(-EINVAL); - - attr = perf_event_attrs(event); - if (IS_ERR(attr) || attr->inherit) + if (perf_event_read_local(event, &value) == -EOPNOTSUPP) goto err_out; - switch (attr->type) { - case PERF_TYPE_SOFTWARE: - if (attr->config != PERF_COUNT_SW_BPF_OUTPUT) - goto err_out; - /* fall-through */ - case PERF_TYPE_RAW: - case PERF_TYPE_HARDWARE: - ee = bpf_event_entry_gen(perf_file, map_file); - if (ee) - return ee; - ee = ERR_PTR(-ENOMEM); - /* fall-through */ - default: - break; - } - + ee = bpf_event_entry_gen(perf_file, map_file); + if (ee) + return ee; + ee = ERR_PTR(-ENOMEM); err_out: fput(perf_file); return ee; @@ -511,7 +522,7 @@ static void perf_event_fd_array_release(struct bpf_map *map, rcu_read_unlock(); } -static const struct bpf_map_ops perf_event_array_ops = { +const struct bpf_map_ops perf_event_array_map_ops = { .map_alloc = fd_array_map_alloc, .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, @@ -522,18 +533,6 @@ static const struct bpf_map_ops perf_event_array_ops = { .map_release = perf_event_fd_array_release, }; -static struct bpf_map_type_list perf_event_array_type __read_mostly = { - .ops = &perf_event_array_ops, - .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, -}; - -static int __init register_perf_event_array_map(void) -{ - bpf_register_map_type(&perf_event_array_type); - return 0; -} -late_initcall(register_perf_event_array_map); - #ifdef CONFIG_CGROUPS static void *cgroup_fd_array_get_ptr(struct bpf_map *map, struct file *map_file /* not used */, @@ -554,7 +553,7 @@ static void cgroup_fd_array_free(struct bpf_map *map) fd_array_map_free(map); } -static const struct bpf_map_ops cgroup_array_ops = { +const struct bpf_map_ops cgroup_array_map_ops = { .map_alloc = fd_array_map_alloc, .map_free = cgroup_fd_array_free, .map_get_next_key = array_map_get_next_key, @@ -563,16 +562,54 @@ static const struct bpf_map_ops cgroup_array_ops = { .map_fd_get_ptr = cgroup_fd_array_get_ptr, .map_fd_put_ptr = cgroup_fd_array_put_ptr, }; +#endif -static struct bpf_map_type_list cgroup_array_type __read_mostly = { - .ops = &cgroup_array_ops, - .type = BPF_MAP_TYPE_CGROUP_ARRAY, -}; +static struct bpf_map *array_of_map_alloc(union bpf_attr *attr) +{ + struct bpf_map *map, *inner_map_meta; + + inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd); + if (IS_ERR(inner_map_meta)) + return inner_map_meta; + + map = fd_array_map_alloc(attr); + if (IS_ERR(map)) { + bpf_map_meta_free(inner_map_meta); + return map; + } + + map->inner_map_meta = inner_map_meta; -static int __init register_cgroup_array_map(void) + return map; +} + +static void array_of_map_free(struct bpf_map *map) { - bpf_register_map_type(&cgroup_array_type); - return 0; + /* map->inner_map_meta is only accessed by syscall which + * is protected by fdget/fdput. + */ + bpf_map_meta_free(map->inner_map_meta); + bpf_fd_array_map_clear(map); + fd_array_map_free(map); } -late_initcall(register_cgroup_array_map); -#endif + +static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_map **inner_map = array_map_lookup_elem(map, key); + + if (!inner_map) + return NULL; + + return READ_ONCE(*inner_map); +} + +const struct bpf_map_ops array_of_maps_map_ops = { + .map_alloc = array_of_map_alloc, + .map_free = array_of_map_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = array_of_map_lookup_elem, + .map_delete_elem = fd_array_map_delete_elem, + .map_fd_get_ptr = bpf_map_fd_get_ptr, + .map_fd_put_ptr = bpf_map_fd_put_ptr, + .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, +}; diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index 89b7ef41c86b..e6ef4401a138 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -13,7 +13,7 @@ #define LOCAL_FREE_TARGET (128) #define LOCAL_NR_SCANS LOCAL_FREE_TARGET -#define PERCPU_FREE_TARGET (16) +#define PERCPU_FREE_TARGET (4) #define PERCPU_NR_SCANS PERCPU_FREE_TARGET /* Helpers to get the local list index */ @@ -213,11 +213,10 @@ __bpf_lru_list_shrink_inactive(struct bpf_lru *lru, enum bpf_lru_list_type tgt_free_type) { struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE]; - struct bpf_lru_node *node, *tmp_node, *first_node; + struct bpf_lru_node *node, *tmp_node; unsigned int nshrinked = 0; unsigned int i = 0; - first_node = list_first_entry(inactive, struct bpf_lru_node, list); list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) { if (bpf_lru_node_is_ref(node)) { __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); @@ -361,7 +360,8 @@ static void __local_list_add_pending(struct bpf_lru *lru, list_add(&node->list, local_pending_list(loc_l)); } -struct bpf_lru_node *__local_list_pop_free(struct bpf_lru_locallist *loc_l) +static struct bpf_lru_node * +__local_list_pop_free(struct bpf_lru_locallist *loc_l) { struct bpf_lru_node *node; @@ -374,8 +374,8 @@ struct bpf_lru_node *__local_list_pop_free(struct bpf_lru_locallist *loc_l) return node; } -struct bpf_lru_node *__local_list_pop_pending(struct bpf_lru *lru, - struct bpf_lru_locallist *loc_l) +static struct bpf_lru_node * +__local_list_pop_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l) { struct bpf_lru_node *node; bool force = false; @@ -558,8 +558,9 @@ void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) bpf_common_lru_push_free(lru, node); } -void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, - u32 elem_size, u32 nr_elems) +static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, + u32 node_offset, u32 elem_size, + u32 nr_elems) { struct bpf_lru_list *l = &lru->common_lru.lru_list; u32 i; @@ -575,8 +576,9 @@ void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, } } -void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, - u32 elem_size, u32 nr_elems) +static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, + u32 node_offset, u32 elem_size, + u32 nr_elems) { u32 i, pcpu_entries; int cpu; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index a515f7b007c6..546113430049 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -52,6 +52,7 @@ void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent) e = rcu_dereference_protected(parent->bpf.effective[type], lockdep_is_held(&cgroup_mutex)); rcu_assign_pointer(cgrp->bpf.effective[type], e); + cgrp->bpf.disallow_override[type] = parent->bpf.disallow_override[type]; } } @@ -82,30 +83,63 @@ void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent) * * Must be called with cgroup_mutex held. */ -void __cgroup_bpf_update(struct cgroup *cgrp, - struct cgroup *parent, - struct bpf_prog *prog, - enum bpf_attach_type type) +int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent, + struct bpf_prog *prog, enum bpf_attach_type type, + bool new_overridable) { - struct bpf_prog *old_prog, *effective; + struct bpf_prog *old_prog, *effective = NULL; struct cgroup_subsys_state *pos; + bool overridable = true; - old_prog = xchg(cgrp->bpf.prog + type, prog); + if (parent) { + overridable = !parent->bpf.disallow_override[type]; + effective = rcu_dereference_protected(parent->bpf.effective[type], + lockdep_is_held(&cgroup_mutex)); + } + + if (prog && effective && !overridable) + /* if parent has non-overridable prog attached, disallow + * attaching new programs to descendent cgroup + */ + return -EPERM; + + if (prog && effective && overridable != new_overridable) + /* if parent has overridable prog attached, only + * allow overridable programs in descendent cgroup + */ + return -EPERM; - effective = (!prog && parent) ? - rcu_dereference_protected(parent->bpf.effective[type], - lockdep_is_held(&cgroup_mutex)) : - prog; + old_prog = cgrp->bpf.prog[type]; + + if (prog) { + overridable = new_overridable; + effective = prog; + if (old_prog && + cgrp->bpf.disallow_override[type] == new_overridable) + /* disallow attaching non-overridable on top + * of existing overridable in this cgroup + * and vice versa + */ + return -EPERM; + } + + if (!prog && !old_prog) + /* report error when trying to detach and nothing is attached */ + return -ENOENT; + + cgrp->bpf.prog[type] = prog; css_for_each_descendant_pre(pos, &cgrp->self) { struct cgroup *desc = container_of(pos, struct cgroup, self); /* skip the subtree if the descendant has its own program */ - if (desc->bpf.prog[type] && desc != cgrp) + if (desc->bpf.prog[type] && desc != cgrp) { pos = css_rightmost_descendant(pos); - else + } else { rcu_assign_pointer(desc->bpf.effective[type], effective); + desc->bpf.disallow_override[type] = !overridable; + } } if (prog) @@ -115,11 +149,12 @@ void __cgroup_bpf_update(struct cgroup *cgrp, bpf_prog_put(old_prog); static_branch_dec(&cgroup_bpf_enabled_key); } + return 0; } /** * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering - * @sk: The socken sending or receiving traffic + * @sk: The socket sending or receiving traffic * @skb: The skb that is being sent or received * @type: The type of program to be exectuted * @@ -154,10 +189,13 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, prog = rcu_dereference(cgrp->bpf.effective[type]); if (prog) { unsigned int offset = skb->data - skb_network_header(skb); + struct sock *save_sk = skb->sk; + skb->sk = sk; __skb_push(skb, offset); ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM; __skb_pull(skb, offset); + skb->sk = save_sk; } rcu_read_unlock(); @@ -198,3 +236,40 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, return ret; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); + +/** + * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock + * @sk: socket to get cgroup from + * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains + * sk with connection information (IP addresses, etc.) May not contain + * cgroup info if it is a req sock. + * @type: The type of program to be exectuted + * + * socket passed is expected to be of type INET or INET6. + * + * The program type passed in via @type must be suitable for sock_ops + * filtering. No further check is performed to assert that. + * + * This function will return %-EPERM if any if an attached program was found + * and if it returned != 1 during execution. In all other cases, 0 is returned. + */ +int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, + struct bpf_sock_ops_kern *sock_ops, + enum bpf_attach_type type) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_prog *prog; + int ret = 0; + + + rcu_read_lock(); + + prog = rcu_dereference(cgrp->bpf.effective[type]); + if (prog) + ret = BPF_PROG_RUN(prog, sock_ops) == 1 ? 0 : -EPERM; + + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 503d4211988a..ad5f55922a13 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -28,6 +28,9 @@ #include <linux/moduleloader.h> #include <linux/bpf.h> #include <linux/frame.h> +#include <linux/rbtree_latch.h> +#include <linux/kallsyms.h> +#include <linux/rcupdate.h> #include <asm/unaligned.h> @@ -73,8 +76,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) { - gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | - gfp_extra_flags; + gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog_aux *aux; struct bpf_prog *fp; @@ -95,6 +97,8 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) fp->aux = aux; fp->aux->prog = fp; + INIT_LIST_HEAD_RCU(&fp->aux->ksym_lnode); + return fp; } EXPORT_SYMBOL_GPL(bpf_prog_alloc); @@ -102,8 +106,7 @@ EXPORT_SYMBOL_GPL(bpf_prog_alloc); struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags) { - gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | - gfp_extra_flags; + gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; u32 pages, delta; int ret; @@ -290,6 +293,202 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, } #ifdef CONFIG_BPF_JIT +static __always_inline void +bpf_get_prog_addr_region(const struct bpf_prog *prog, + unsigned long *symbol_start, + unsigned long *symbol_end) +{ + const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(prog); + unsigned long addr = (unsigned long)hdr; + + WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog)); + + *symbol_start = addr; + *symbol_end = addr + hdr->pages * PAGE_SIZE; +} + +static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) +{ + BUILD_BUG_ON(sizeof("bpf_prog_") + + sizeof(prog->tag) * 2 + 1 > KSYM_NAME_LEN); + + sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_"); + sym = bin2hex(sym, prog->tag, sizeof(prog->tag)); + *sym = 0; +} + +static __always_inline unsigned long +bpf_get_prog_addr_start(struct latch_tree_node *n) +{ + unsigned long symbol_start, symbol_end; + const struct bpf_prog_aux *aux; + + aux = container_of(n, struct bpf_prog_aux, ksym_tnode); + bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); + + return symbol_start; +} + +static __always_inline bool bpf_tree_less(struct latch_tree_node *a, + struct latch_tree_node *b) +{ + return bpf_get_prog_addr_start(a) < bpf_get_prog_addr_start(b); +} + +static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n) +{ + unsigned long val = (unsigned long)key; + unsigned long symbol_start, symbol_end; + const struct bpf_prog_aux *aux; + + aux = container_of(n, struct bpf_prog_aux, ksym_tnode); + bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); + + if (val < symbol_start) + return -1; + if (val >= symbol_end) + return 1; + + return 0; +} + +static const struct latch_tree_ops bpf_tree_ops = { + .less = bpf_tree_less, + .comp = bpf_tree_comp, +}; + +static DEFINE_SPINLOCK(bpf_lock); +static LIST_HEAD(bpf_kallsyms); +static struct latch_tree_root bpf_tree __cacheline_aligned; + +int bpf_jit_kallsyms __read_mostly; + +static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux) +{ + WARN_ON_ONCE(!list_empty(&aux->ksym_lnode)); + list_add_tail_rcu(&aux->ksym_lnode, &bpf_kallsyms); + latch_tree_insert(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops); +} + +static void bpf_prog_ksym_node_del(struct bpf_prog_aux *aux) +{ + if (list_empty(&aux->ksym_lnode)) + return; + + latch_tree_erase(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops); + list_del_rcu(&aux->ksym_lnode); +} + +static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp) +{ + return fp->jited && !bpf_prog_was_classic(fp); +} + +static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) +{ + return list_empty(&fp->aux->ksym_lnode) || + fp->aux->ksym_lnode.prev == LIST_POISON2; +} + +void bpf_prog_kallsyms_add(struct bpf_prog *fp) +{ + if (!bpf_prog_kallsyms_candidate(fp) || + !capable(CAP_SYS_ADMIN)) + return; + + spin_lock_bh(&bpf_lock); + bpf_prog_ksym_node_add(fp->aux); + spin_unlock_bh(&bpf_lock); +} + +void bpf_prog_kallsyms_del(struct bpf_prog *fp) +{ + if (!bpf_prog_kallsyms_candidate(fp)) + return; + + spin_lock_bh(&bpf_lock); + bpf_prog_ksym_node_del(fp->aux); + spin_unlock_bh(&bpf_lock); +} + +static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr) +{ + struct latch_tree_node *n; + + if (!bpf_jit_kallsyms_enabled()) + return NULL; + + n = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops); + return n ? + container_of(n, struct bpf_prog_aux, ksym_tnode)->prog : + NULL; +} + +const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, + unsigned long *off, char *sym) +{ + unsigned long symbol_start, symbol_end; + struct bpf_prog *prog; + char *ret = NULL; + + rcu_read_lock(); + prog = bpf_prog_kallsyms_find(addr); + if (prog) { + bpf_get_prog_addr_region(prog, &symbol_start, &symbol_end); + bpf_get_prog_name(prog, sym); + + ret = sym; + if (size) + *size = symbol_end - symbol_start; + if (off) + *off = addr - symbol_start; + } + rcu_read_unlock(); + + return ret; +} + +bool is_bpf_text_address(unsigned long addr) +{ + bool ret; + + rcu_read_lock(); + ret = bpf_prog_kallsyms_find(addr) != NULL; + rcu_read_unlock(); + + return ret; +} + +int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, + char *sym) +{ + unsigned long symbol_start, symbol_end; + struct bpf_prog_aux *aux; + unsigned int it = 0; + int ret = -ERANGE; + + if (!bpf_jit_kallsyms_enabled()) + return ret; + + rcu_read_lock(); + list_for_each_entry_rcu(aux, &bpf_kallsyms, ksym_lnode) { + if (it++ != symnum) + continue; + + bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); + bpf_get_prog_name(aux->prog, sym); + + *value = symbol_start; + *type = BPF_SYM_ELF_TYPE; + + ret = 0; + break; + } + rcu_read_unlock(); + + return ret; +} + struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, @@ -326,6 +525,24 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr) module_memfree(hdr); } +/* This symbol is only overridden by archs that have different + * requirements than the usual eBPF JITs, f.e. when they only + * implement cBPF JIT, do not set images read-only, etc. + */ +void __weak bpf_jit_free(struct bpf_prog *fp) +{ + if (fp->jited) { + struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); + + bpf_jit_binary_unlock_ro(hdr); + bpf_jit_binary_free(hdr); + + WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); + } + + bpf_prog_unlock_free(fp); +} + int bpf_jit_harden __read_mostly; static int bpf_jit_blind_insn(const struct bpf_insn *from, @@ -436,8 +653,7 @@ out: static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other, gfp_t gfp_extra_flags) { - gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | - gfp_extra_flags; + gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL); @@ -547,10 +763,10 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); * * Decode and execute eBPF instructions. */ -static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) +static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, + u64 *stack) { - u64 stack[MAX_BPF_STACK / sizeof(u64)]; - u64 regs[MAX_BPF_REG], tmp; + u64 tmp; static const void *jumptable[256] = { [0 ... 255] = &&default_label, /* Now overwrite non-defaults ... */ @@ -608,7 +824,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, /* Call instruction */ [BPF_JMP | BPF_CALL] = &&JMP_CALL, - [BPF_JMP | BPF_CALL | BPF_X] = &&JMP_TAIL_CALL, + [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, /* Jumps */ [BPF_JMP | BPF_JA] = &&JMP_JA, [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, @@ -658,9 +874,6 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) #define CONT ({ insn++; goto select_insn; }) #define CONT_JMP ({ insn++; goto select_insn; }) - FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; - ARG1 = (u64) (unsigned long) ctx; - select_insn: goto *jumptable[insn->code]; @@ -939,12 +1152,12 @@ out: LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */ off = IMM; load_word: - /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are - * only appearing in the programs where ctx == - * skb. All programs keep 'ctx' in regs[BPF_REG_CTX] - * == BPF_R6, bpf_convert_filter() saves it in BPF_R6, - * internal BPF verifier will check that BPF_R6 == - * ctx. + /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are only + * appearing in the programs where ctx == skb + * (see may_access_skb() in the verifier). All programs + * keep 'ctx' in regs[BPF_REG_CTX] == BPF_R6, + * bpf_convert_filter() saves it in BPF_R6, internal BPF + * verifier will check that BPF_R6 == ctx. * * BPF_ABS and BPF_IND are wrappers of function calls, * so they scratch BPF_R1-BPF_R5 registers, preserve @@ -1003,7 +1216,39 @@ load_byte: WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); return 0; } -STACK_FRAME_NON_STANDARD(__bpf_prog_run); /* jump table */ +STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */ + +#define PROG_NAME(stack_size) __bpf_prog_run##stack_size +#define DEFINE_BPF_PROG_RUN(stack_size) \ +static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \ +{ \ + u64 stack[stack_size / sizeof(u64)]; \ + u64 regs[MAX_BPF_REG]; \ +\ + FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ + ARG1 = (u64) (unsigned long) ctx; \ + return ___bpf_prog_run(regs, insn, stack); \ +} + +#define EVAL1(FN, X) FN(X) +#define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y) +#define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y) +#define EVAL4(FN, X, Y...) FN(X) EVAL3(FN, Y) +#define EVAL5(FN, X, Y...) FN(X) EVAL4(FN, Y) +#define EVAL6(FN, X, Y...) FN(X) EVAL5(FN, Y) + +EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192); +EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384); +EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512); + +#define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size), + +static unsigned int (*interpreters[])(const void *ctx, + const struct bpf_insn *insn) = { +EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192) +EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) +EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) +}; bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) @@ -1052,7 +1297,9 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) */ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) { - fp->bpf_func = (void *) __bpf_prog_run; + u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); + + fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; /* eBPF JITs can rewrite the program in case constant * blinding is active. However, in case of error during @@ -1154,12 +1401,22 @@ const struct bpf_func_proto bpf_tail_call_proto = { .arg3_type = ARG_ANYTHING, }; -/* For classic BPF JITs that don't implement bpf_int_jit_compile(). */ +/* Stub for JITs that only support cBPF. eBPF programs are interpreted. + * It is encouraged to implement bpf_int_jit_compile() instead, so that + * eBPF and implicitly also cBPF can get JITed! + */ struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog) { return prog; } +/* Stub for JITs that support eBPF. All cBPF code gets transformed into + * eBPF by the kernel and is later compiled by bpf_int_jit_compile(). + */ +void __weak bpf_jit_compile(struct bpf_prog *prog) +{ +} + bool __weak bpf_helper_changes_pkt_data(void *func) { return false; @@ -1173,3 +1430,12 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, { return -EFAULT; } + +/* All definitions of tracepoints related to BPF. */ +#define CREATE_TRACE_POINTS +#include <linux/bpf_trace.h> + +EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_get_type); +EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_put_rcu); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index a753bbe7df0a..4fb463172aa8 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -13,11 +13,13 @@ #include <linux/bpf.h> #include <linux/jhash.h> #include <linux/filter.h> +#include <linux/rculist_nulls.h> #include "percpu_freelist.h" #include "bpf_lru_list.h" +#include "map_in_map.h" struct bucket { - struct hlist_head head; + struct hlist_nulls_head head; raw_spinlock_t lock; }; @@ -29,28 +31,26 @@ struct bpf_htab { struct pcpu_freelist freelist; struct bpf_lru lru; }; - void __percpu *extra_elems; + struct htab_elem *__percpu *extra_elems; atomic_t count; /* number of elements in this hashtable */ u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ }; -enum extra_elem_state { - HTAB_NOT_AN_EXTRA_ELEM = 0, - HTAB_EXTRA_ELEM_FREE, - HTAB_EXTRA_ELEM_USED -}; - /* each htab element is struct htab_elem + key + value */ struct htab_elem { union { - struct hlist_node hash_node; - struct bpf_htab *htab; - struct pcpu_freelist_node fnode; + struct hlist_nulls_node hash_node; + struct { + void *padding; + union { + struct bpf_htab *htab; + struct pcpu_freelist_node fnode; + }; + }; }; union { struct rcu_head rcu; - enum extra_elem_state state; struct bpf_lru_node lru_node; }; u32 hash; @@ -71,6 +71,11 @@ static bool htab_is_percpu(const struct bpf_htab *htab) htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; } +static bool htab_is_prealloc(const struct bpf_htab *htab) +{ + return !(htab->map.map_flags & BPF_F_NO_PREALLOC); +} + static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, void __percpu *pptr) { @@ -82,6 +87,11 @@ static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size return *(void __percpu **)(l->key + key_size); } +static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) +{ + return *(void **)(l->key + roundup(map->key_size, 8)); +} + static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) { return (struct htab_elem *) (htab->elems + i * htab->elem_size); @@ -122,17 +132,20 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, static int prealloc_init(struct bpf_htab *htab) { + u32 num_entries = htab->map.max_entries; int err = -ENOMEM, i; - htab->elems = bpf_map_area_alloc(htab->elem_size * - htab->map.max_entries); + if (!htab_is_percpu(htab) && !htab_is_lru(htab)) + num_entries += num_possible_cpus(); + + htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries); if (!htab->elems) return -ENOMEM; if (!htab_is_percpu(htab)) goto skip_percpu_elems; - for (i = 0; i < htab->map.max_entries; i++) { + for (i = 0; i < num_entries; i++) { u32 size = round_up(htab->map.value_size, 8); void __percpu *pptr; @@ -160,10 +173,11 @@ skip_percpu_elems: if (htab_is_lru(htab)) bpf_lru_populate(&htab->lru, htab->elems, offsetof(struct htab_elem, lru_node), - htab->elem_size, htab->map.max_entries); + htab->elem_size, num_entries); else - pcpu_freelist_populate(&htab->freelist, htab->elems, - htab->elem_size, htab->map.max_entries); + pcpu_freelist_populate(&htab->freelist, + htab->elems + offsetof(struct htab_elem, fnode), + htab->elem_size, num_entries); return 0; @@ -184,16 +198,22 @@ static void prealloc_destroy(struct bpf_htab *htab) static int alloc_extra_elems(struct bpf_htab *htab) { - void __percpu *pptr; + struct htab_elem *__percpu *pptr, *l_new; + struct pcpu_freelist_node *l; int cpu; - pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN); + pptr = __alloc_percpu_gfp(sizeof(struct htab_elem *), 8, + GFP_USER | __GFP_NOWARN); if (!pptr) return -ENOMEM; for_each_possible_cpu(cpu) { - ((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state = - HTAB_EXTRA_ELEM_FREE; + l = pcpu_freelist_pop(&htab->freelist); + /* pop will succeed, since prealloc_init() + * preallocated extra num_possible_cpus elements + */ + l_new = container_of(l, struct htab_elem, fnode); + *per_cpu_ptr(pptr, cpu) = l_new; } htab->extra_elems = pptr; return 0; @@ -217,6 +237,11 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) int err, i; u64 cost; + BUILD_BUG_ON(offsetof(struct htab_elem, htab) != + offsetof(struct htab_elem, hash_node.pprev)); + BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != + offsetof(struct htab_elem, hash_node.pprev)); + if (lru && !capable(CAP_SYS_ADMIN)) /* LRU implementation is much complicated than other * maps. Hence, limit to CAP_SYS_ADMIN for now. @@ -326,29 +351,29 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) goto free_htab; for (i = 0; i < htab->n_buckets; i++) { - INIT_HLIST_HEAD(&htab->buckets[i].head); + INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); raw_spin_lock_init(&htab->buckets[i].lock); } - if (!percpu && !lru) { - /* lru itself can remove the least used element, so - * there is no need for an extra elem during map_update. - */ - err = alloc_extra_elems(htab); - if (err) - goto free_buckets; - } - if (prealloc) { err = prealloc_init(htab); if (err) - goto free_extra_elems; + goto free_buckets; + + if (!percpu && !lru) { + /* lru itself can remove the least used element, so + * there is no need for an extra elem during map_update. + */ + err = alloc_extra_elems(htab); + if (err) + goto free_prealloc; + } } return &htab->map; -free_extra_elems: - free_percpu(htab->extra_elems); +free_prealloc: + prealloc_destroy(htab); free_buckets: bpf_map_area_free(htab->buckets); free_htab: @@ -366,28 +391,56 @@ static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) return &htab->buckets[hash & (htab->n_buckets - 1)]; } -static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash) { return &__select_bucket(htab, hash)->head; } -static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash, +/* this lookup function can only be called with bucket lock taken */ +static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash, void *key, u32 key_size) { + struct hlist_nulls_node *n; struct htab_elem *l; - hlist_for_each_entry_rcu(l, head, hash_node) + hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) if (l->hash == hash && !memcmp(&l->key, key, key_size)) return l; return NULL; } -/* Called from syscall or from eBPF program */ +/* can be called without bucket lock. it will repeat the loop in + * the unlikely event when elements moved from one bucket into another + * while link list is being walked + */ +static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head, + u32 hash, void *key, + u32 key_size, u32 n_buckets) +{ + struct hlist_nulls_node *n; + struct htab_elem *l; + +again: + hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) + if (l->hash == hash && !memcmp(&l->key, key, key_size)) + return l; + + if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1)))) + goto again; + + return NULL; +} + +/* Called from syscall or from eBPF program directly, so + * arguments have to match bpf_map_lookup_elem() exactly. + * The return value is adjusted by BPF instructions + * in htab_map_gen_lookup(). + */ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct hlist_head *head; + struct hlist_nulls_head *head; struct htab_elem *l; u32 hash, key_size; @@ -400,7 +453,7 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) head = select_bucket(htab, hash); - l = lookup_elem_raw(head, hash, key, key_size); + l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets); return l; } @@ -415,6 +468,30 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +/* inline bpf_map_lookup_elem() call. + * Instead of: + * bpf_prog + * bpf_map_lookup_elem + * map->ops->map_lookup_elem + * htab_map_lookup_elem + * __htab_map_lookup_elem + * do: + * bpf_prog + * __htab_map_lookup_elem + */ +static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +{ + struct bpf_insn *insn = insn_buf; + const int ret = BPF_REG_0; + + *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem); + *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1); + *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, + offsetof(struct htab_elem, key) + + round_up(map->key_size, 8)); + return insn - insn_buf; +} + static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) { struct htab_elem *l = __htab_map_lookup_elem(map, key); @@ -433,8 +510,9 @@ static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) { struct bpf_htab *htab = (struct bpf_htab *)arg; - struct htab_elem *l, *tgt_l; - struct hlist_head *head; + struct htab_elem *l = NULL, *tgt_l; + struct hlist_nulls_head *head; + struct hlist_nulls_node *n; unsigned long flags; struct bucket *b; @@ -444,9 +522,9 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) raw_spin_lock_irqsave(&b->lock, flags); - hlist_for_each_entry_rcu(l, head, hash_node) + hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) if (l == tgt_l) { - hlist_del_rcu(&l->hash_node); + hlist_nulls_del_rcu(&l->hash_node); break; } @@ -459,29 +537,30 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct hlist_head *head; + struct hlist_nulls_head *head; struct htab_elem *l, *next_l; u32 hash, key_size; - int i; + int i = 0; WARN_ON_ONCE(!rcu_read_lock_held()); key_size = map->key_size; + if (!key) + goto find_first_elem; + hash = htab_map_hash(key, key_size); head = select_bucket(htab, hash); /* lookup the key */ - l = lookup_elem_raw(head, hash, key, key_size); + l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets); - if (!l) { - i = 0; + if (!l) goto find_first_elem; - } /* key was found, get next key in the same bucket */ - next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)), + next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)), struct htab_elem, hash_node); if (next_l) { @@ -500,7 +579,7 @@ find_first_elem: head = select_bucket(htab, i); /* pick first element in the bucket */ - next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), + next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)), struct htab_elem, hash_node); if (next_l) { /* if it's not empty, just return it */ @@ -538,12 +617,15 @@ static void htab_elem_free_rcu(struct rcu_head *head) static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) { - if (l->state == HTAB_EXTRA_ELEM_USED) { - l->state = HTAB_EXTRA_ELEM_FREE; - return; + struct bpf_map *map = &htab->map; + + if (map->ops->map_fd_put_ptr) { + void *ptr = fd_htab_map_get_ptr(map, l); + + map->ops->map_fd_put_ptr(ptr); } - if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) { + if (htab_is_prealloc(htab)) { pcpu_freelist_push(&htab->freelist, &l->fnode); } else { atomic_dec(&htab->count); @@ -573,43 +655,43 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash, bool percpu, bool onallcpus, - bool old_elem_exists) + struct htab_elem *old_elem) { u32 size = htab->map.value_size; - bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC); - struct htab_elem *l_new; + bool prealloc = htab_is_prealloc(htab); + struct htab_elem *l_new, **pl_new; void __percpu *pptr; - int err = 0; if (prealloc) { - l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist); - if (!l_new) - err = -E2BIG; - } else { - if (atomic_inc_return(&htab->count) > htab->map.max_entries) { - atomic_dec(&htab->count); - err = -E2BIG; + if (old_elem) { + /* if we're updating the existing element, + * use per-cpu extra elems to avoid freelist_pop/push + */ + pl_new = this_cpu_ptr(htab->extra_elems); + l_new = *pl_new; + *pl_new = old_elem; } else { - l_new = kmalloc(htab->elem_size, - GFP_ATOMIC | __GFP_NOWARN); - if (!l_new) - return ERR_PTR(-ENOMEM); - } - } + struct pcpu_freelist_node *l; - if (err) { - if (!old_elem_exists) - return ERR_PTR(err); - - /* if we're updating the existing element and the hash table - * is full, use per-cpu extra elems - */ - l_new = this_cpu_ptr(htab->extra_elems); - if (l_new->state != HTAB_EXTRA_ELEM_FREE) - return ERR_PTR(-E2BIG); - l_new->state = HTAB_EXTRA_ELEM_USED; + l = pcpu_freelist_pop(&htab->freelist); + if (!l) + return ERR_PTR(-E2BIG); + l_new = container_of(l, struct htab_elem, fnode); + } } else { - l_new->state = HTAB_NOT_AN_EXTRA_ELEM; + if (atomic_inc_return(&htab->count) > htab->map.max_entries) + if (!old_elem) { + /* when map is full and update() is replacing + * old element, it's ok to allocate, since + * old element will be freed immediately. + * Otherwise return an error + */ + atomic_dec(&htab->count); + return ERR_PTR(-E2BIG); + } + l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); + if (!l_new) + return ERR_PTR(-ENOMEM); } memcpy(l_new->key, key, key_size); @@ -661,7 +743,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new = NULL, *l_old; - struct hlist_head *head; + struct hlist_nulls_head *head; unsigned long flags; struct bucket *b; u32 key_size, hash; @@ -690,7 +772,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, goto err; l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false, - !!l_old); + l_old); if (IS_ERR(l_new)) { /* all pre-allocated elements are in use or memory exhausted */ ret = PTR_ERR(l_new); @@ -700,10 +782,11 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, /* add new element to the head of the list, so that * concurrent search will find it before old elem */ - hlist_add_head_rcu(&l_new->hash_node, head); + hlist_nulls_add_head_rcu(&l_new->hash_node, head); if (l_old) { - hlist_del_rcu(&l_old->hash_node); - free_htab_elem(htab, l_old); + hlist_nulls_del_rcu(&l_old->hash_node); + if (!htab_is_prealloc(htab)) + free_htab_elem(htab, l_old); } ret = 0; err: @@ -716,7 +799,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new, *l_old = NULL; - struct hlist_head *head; + struct hlist_nulls_head *head; unsigned long flags; struct bucket *b; u32 key_size, hash; @@ -757,10 +840,10 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, /* add new element to the head of the list, so that * concurrent search will find it before old elem */ - hlist_add_head_rcu(&l_new->hash_node, head); + hlist_nulls_add_head_rcu(&l_new->hash_node, head); if (l_old) { bpf_lru_node_set_ref(&l_new->lru_node); - hlist_del_rcu(&l_old->hash_node); + hlist_nulls_del_rcu(&l_old->hash_node); } ret = 0; @@ -781,7 +864,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new = NULL, *l_old; - struct hlist_head *head; + struct hlist_nulls_head *head; unsigned long flags; struct bucket *b; u32 key_size, hash; @@ -815,12 +898,12 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, value, onallcpus); } else { l_new = alloc_htab_elem(htab, key, value, key_size, - hash, true, onallcpus, false); + hash, true, onallcpus, NULL); if (IS_ERR(l_new)) { ret = PTR_ERR(l_new); goto err; } - hlist_add_head_rcu(&l_new->hash_node, head); + hlist_nulls_add_head_rcu(&l_new->hash_node, head); } ret = 0; err: @@ -834,7 +917,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new = NULL, *l_old; - struct hlist_head *head; + struct hlist_nulls_head *head; unsigned long flags; struct bucket *b; u32 key_size, hash; @@ -882,7 +965,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, } else { pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size), value, onallcpus); - hlist_add_head_rcu(&l_new->hash_node, head); + hlist_nulls_add_head_rcu(&l_new->hash_node, head); l_new = NULL; } ret = 0; @@ -910,7 +993,7 @@ static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, static int htab_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct hlist_head *head; + struct hlist_nulls_head *head; struct bucket *b; struct htab_elem *l; unsigned long flags; @@ -930,7 +1013,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) l = lookup_elem_raw(head, hash, key, key_size); if (l) { - hlist_del_rcu(&l->hash_node); + hlist_nulls_del_rcu(&l->hash_node); free_htab_elem(htab, l); ret = 0; } @@ -942,7 +1025,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct hlist_head *head; + struct hlist_nulls_head *head; struct bucket *b; struct htab_elem *l; unsigned long flags; @@ -962,7 +1045,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) l = lookup_elem_raw(head, hash, key, key_size); if (l) { - hlist_del_rcu(&l->hash_node); + hlist_nulls_del_rcu(&l->hash_node); ret = 0; } @@ -977,17 +1060,17 @@ static void delete_all_elements(struct bpf_htab *htab) int i; for (i = 0; i < htab->n_buckets; i++) { - struct hlist_head *head = select_bucket(htab, i); - struct hlist_node *n; + struct hlist_nulls_head *head = select_bucket(htab, i); + struct hlist_nulls_node *n; struct htab_elem *l; - hlist_for_each_entry_safe(l, n, head, hash_node) { - hlist_del_rcu(&l->hash_node); - if (l->state != HTAB_EXTRA_ELEM_USED) - htab_elem_free(htab, l); + hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { + hlist_nulls_del_rcu(&l->hash_node); + htab_elem_free(htab, l); } } } + /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void htab_map_free(struct bpf_map *map) { @@ -1004,7 +1087,7 @@ static void htab_map_free(struct bpf_map *map) * not have executed. Wait for them. */ rcu_barrier(); - if (htab->map.map_flags & BPF_F_NO_PREALLOC) + if (!htab_is_prealloc(htab)) delete_all_elements(htab); else prealloc_destroy(htab); @@ -1014,21 +1097,17 @@ static void htab_map_free(struct bpf_map *map) kfree(htab); } -static const struct bpf_map_ops htab_ops = { +const struct bpf_map_ops htab_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, .map_lookup_elem = htab_map_lookup_elem, .map_update_elem = htab_map_update_elem, .map_delete_elem = htab_map_delete_elem, + .map_gen_lookup = htab_map_gen_lookup, }; -static struct bpf_map_type_list htab_type __read_mostly = { - .ops = &htab_ops, - .type = BPF_MAP_TYPE_HASH, -}; - -static const struct bpf_map_ops htab_lru_ops = { +const struct bpf_map_ops htab_lru_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1037,11 +1116,6 @@ static const struct bpf_map_ops htab_lru_ops = { .map_delete_elem = htab_lru_map_delete_elem, }; -static struct bpf_map_type_list htab_lru_type __read_mostly = { - .ops = &htab_lru_ops, - .type = BPF_MAP_TYPE_LRU_HASH, -}; - /* Called from eBPF program */ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) { @@ -1115,7 +1189,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, return ret; } -static const struct bpf_map_ops htab_percpu_ops = { +const struct bpf_map_ops htab_percpu_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1124,12 +1198,7 @@ static const struct bpf_map_ops htab_percpu_ops = { .map_delete_elem = htab_map_delete_elem, }; -static struct bpf_map_type_list htab_percpu_type __read_mostly = { - .ops = &htab_percpu_ops, - .type = BPF_MAP_TYPE_PERCPU_HASH, -}; - -static const struct bpf_map_ops htab_lru_percpu_ops = { +const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1138,17 +1207,123 @@ static const struct bpf_map_ops htab_lru_percpu_ops = { .map_delete_elem = htab_lru_map_delete_elem, }; -static struct bpf_map_type_list htab_lru_percpu_type __read_mostly = { - .ops = &htab_lru_percpu_ops, - .type = BPF_MAP_TYPE_LRU_PERCPU_HASH, -}; +static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr) +{ + struct bpf_map *map; + + if (attr->value_size != sizeof(u32)) + return ERR_PTR(-EINVAL); + + /* pointer is stored internally */ + attr->value_size = sizeof(void *); + map = htab_map_alloc(attr); + attr->value_size = sizeof(u32); -static int __init register_htab_map(void) + return map; +} + +static void fd_htab_map_free(struct bpf_map *map) { - bpf_register_map_type(&htab_type); - bpf_register_map_type(&htab_percpu_type); - bpf_register_map_type(&htab_lru_type); - bpf_register_map_type(&htab_lru_percpu_type); - return 0; + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_nulls_node *n; + struct hlist_nulls_head *head; + struct htab_elem *l; + int i; + + for (i = 0; i < htab->n_buckets; i++) { + head = select_bucket(htab, i); + + hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { + void *ptr = fd_htab_map_get_ptr(map, l); + + map->ops->map_fd_put_ptr(ptr); + } + } + + htab_map_free(map); +} + +/* only called from syscall */ +int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) +{ + void **ptr; + int ret = 0; + + if (!map->ops->map_fd_sys_lookup_elem) + return -ENOTSUPP; + + rcu_read_lock(); + ptr = htab_map_lookup_elem(map, key); + if (ptr) + *value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr)); + else + ret = -ENOENT; + rcu_read_unlock(); + + return ret; +} + +/* only called from syscall */ +int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, + void *key, void *value, u64 map_flags) +{ + void *ptr; + int ret; + u32 ufd = *(u32 *)value; + + ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + ret = htab_map_update_elem(map, key, &ptr, map_flags); + if (ret) + map->ops->map_fd_put_ptr(ptr); + + return ret; } -late_initcall(register_htab_map); + +static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr) +{ + struct bpf_map *map, *inner_map_meta; + + inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd); + if (IS_ERR(inner_map_meta)) + return inner_map_meta; + + map = fd_htab_map_alloc(attr); + if (IS_ERR(map)) { + bpf_map_meta_free(inner_map_meta); + return map; + } + + map->inner_map_meta = inner_map_meta; + + return map; +} + +static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_map **inner_map = htab_map_lookup_elem(map, key); + + if (!inner_map) + return NULL; + + return READ_ONCE(*inner_map); +} + +static void htab_of_map_free(struct bpf_map *map) +{ + bpf_map_meta_free(map->inner_map_meta); + fd_htab_map_free(map); +} + +const struct bpf_map_ops htab_of_maps_map_ops = { + .map_alloc = htab_of_map_alloc, + .map_free = htab_of_map_free, + .map_get_next_key = htab_map_get_next_key, + .map_lookup_elem = htab_of_map_lookup_elem, + .map_delete_elem = htab_map_delete_elem, + .map_fd_get_ptr = bpf_map_fd_get_ptr, + .map_fd_put_ptr = bpf_map_fd_put_ptr, + .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, +}; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 045cbe673356..3d24e238221e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -176,6 +176,6 @@ const struct bpf_func_proto bpf_get_current_comm_proto = { .func = bpf_get_current_comm, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_RAW_STACK, - .arg2_type = ARG_CONST_STACK_SIZE, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE, }; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 0b030c9126d3..e833ed914358 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -21,6 +21,7 @@ #include <linux/parser.h> #include <linux/filter.h> #include <linux/bpf.h> +#include <linux/bpf_trace.h> enum bpf_type { BPF_TYPE_UNSPEC = 0, @@ -281,6 +282,13 @@ int bpf_obj_pin_user(u32 ufd, const char __user *pathname) ret = bpf_obj_do_pin(pname, raw, type); if (ret != 0) bpf_any_put(raw, type); + if ((trace_bpf_obj_pin_prog_enabled() || + trace_bpf_obj_pin_map_enabled()) && !ret) { + if (type == BPF_TYPE_PROG) + trace_bpf_obj_pin_prog(raw, ufd, pname); + if (type == BPF_TYPE_MAP) + trace_bpf_obj_pin_map(raw, ufd, pname); + } out: putname(pname); return ret; @@ -342,8 +350,15 @@ int bpf_obj_get_user(const char __user *pathname) else goto out; - if (ret < 0) + if (ret < 0) { bpf_any_put(raw, type); + } else if (trace_bpf_obj_get_prog_enabled() || + trace_bpf_obj_get_map_enabled()) { + if (type == BPF_TYPE_PROG) + trace_bpf_obj_get_prog(raw, ret, pname); + if (type == BPF_TYPE_MAP) + trace_bpf_obj_get_map(raw, ret, pname); + } out: putname(pname); return ret; @@ -362,10 +377,22 @@ static void bpf_evict_inode(struct inode *inode) bpf_any_put(inode->i_private, type); } +/* + * Display the mount options in /proc/mounts. + */ +static int bpf_show_options(struct seq_file *m, struct dentry *root) +{ + umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX; + + if (mode != S_IRWXUGO) + seq_printf(m, ",mode=%o", mode); + return 0; +} + static const struct super_operations bpf_super_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, - .show_options = generic_show_options, + .show_options = bpf_show_options, .evict_inode = bpf_evict_inode, }; @@ -414,13 +441,11 @@ static int bpf_parse_options(char *data, struct bpf_mount_opts *opts) static int bpf_fill_super(struct super_block *sb, void *data, int silent) { - static struct tree_descr bpf_rfiles[] = { { "" } }; + static const struct tree_descr bpf_rfiles[] = { { "" } }; struct bpf_mount_opts opts; struct inode *inode; int ret; - save_mount_options(sb, data); - ret = bpf_parse_options(data, &opts); if (ret) return ret; diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c new file mode 100644 index 000000000000..b09185f0f17d --- /dev/null +++ b/kernel/bpf/lpm_trie.c @@ -0,0 +1,516 @@ +/* + * Longest prefix match list implementation + * + * Copyright (c) 2016,2017 Daniel Mack + * Copyright (c) 2016 David Herrmann + * + * This file is subject to the terms and conditions of version 2 of the GNU + * General Public License. See the file COPYING in the main directory of the + * Linux distribution for more details. + */ + +#include <linux/bpf.h> +#include <linux/err.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/vmalloc.h> +#include <net/ipv6.h> + +/* Intermediate node */ +#define LPM_TREE_NODE_FLAG_IM BIT(0) + +struct lpm_trie_node; + +struct lpm_trie_node { + struct rcu_head rcu; + struct lpm_trie_node __rcu *child[2]; + u32 prefixlen; + u32 flags; + u8 data[0]; +}; + +struct lpm_trie { + struct bpf_map map; + struct lpm_trie_node __rcu *root; + size_t n_entries; + size_t max_prefixlen; + size_t data_size; + raw_spinlock_t lock; +}; + +/* This trie implements a longest prefix match algorithm that can be used to + * match IP addresses to a stored set of ranges. + * + * Data stored in @data of struct bpf_lpm_key and struct lpm_trie_node is + * interpreted as big endian, so data[0] stores the most significant byte. + * + * Match ranges are internally stored in instances of struct lpm_trie_node + * which each contain their prefix length as well as two pointers that may + * lead to more nodes containing more specific matches. Each node also stores + * a value that is defined by and returned to userspace via the update_elem + * and lookup functions. + * + * For instance, let's start with a trie that was created with a prefix length + * of 32, so it can be used for IPv4 addresses, and one single element that + * matches 192.168.0.0/16. The data array would hence contain + * [0xc0, 0xa8, 0x00, 0x00] in big-endian notation. This documentation will + * stick to IP-address notation for readability though. + * + * As the trie is empty initially, the new node (1) will be places as root + * node, denoted as (R) in the example below. As there are no other node, both + * child pointers are %NULL. + * + * +----------------+ + * | (1) (R) | + * | 192.168.0.0/16 | + * | value: 1 | + * | [0] [1] | + * +----------------+ + * + * Next, let's add a new node (2) matching 192.168.0.0/24. As there is already + * a node with the same data and a smaller prefix (ie, a less specific one), + * node (2) will become a child of (1). In child index depends on the next bit + * that is outside of what (1) matches, and that bit is 0, so (2) will be + * child[0] of (1): + * + * +----------------+ + * | (1) (R) | + * | 192.168.0.0/16 | + * | value: 1 | + * | [0] [1] | + * +----------------+ + * | + * +----------------+ + * | (2) | + * | 192.168.0.0/24 | + * | value: 2 | + * | [0] [1] | + * +----------------+ + * + * The child[1] slot of (1) could be filled with another node which has bit #17 + * (the next bit after the ones that (1) matches on) set to 1. For instance, + * 192.168.128.0/24: + * + * +----------------+ + * | (1) (R) | + * | 192.168.0.0/16 | + * | value: 1 | + * | [0] [1] | + * +----------------+ + * | | + * +----------------+ +------------------+ + * | (2) | | (3) | + * | 192.168.0.0/24 | | 192.168.128.0/24 | + * | value: 2 | | value: 3 | + * | [0] [1] | | [0] [1] | + * +----------------+ +------------------+ + * + * Let's add another node (4) to the game for 192.168.1.0/24. In order to place + * it, node (1) is looked at first, and because (4) of the semantics laid out + * above (bit #17 is 0), it would normally be attached to (1) as child[0]. + * However, that slot is already allocated, so a new node is needed in between. + * That node does not have a value attached to it and it will never be + * returned to users as result of a lookup. It is only there to differentiate + * the traversal further. It will get a prefix as wide as necessary to + * distinguish its two children: + * + * +----------------+ + * | (1) (R) | + * | 192.168.0.0/16 | + * | value: 1 | + * | [0] [1] | + * +----------------+ + * | | + * +----------------+ +------------------+ + * | (4) (I) | | (3) | + * | 192.168.0.0/23 | | 192.168.128.0/24 | + * | value: --- | | value: 3 | + * | [0] [1] | | [0] [1] | + * +----------------+ +------------------+ + * | | + * +----------------+ +----------------+ + * | (2) | | (5) | + * | 192.168.0.0/24 | | 192.168.1.0/24 | + * | value: 2 | | value: 5 | + * | [0] [1] | | [0] [1] | + * +----------------+ +----------------+ + * + * 192.168.1.1/32 would be a child of (5) etc. + * + * An intermediate node will be turned into a 'real' node on demand. In the + * example above, (4) would be re-used if 192.168.0.0/23 is added to the trie. + * + * A fully populated trie would have a height of 32 nodes, as the trie was + * created with a prefix length of 32. + * + * The lookup starts at the root node. If the current node matches and if there + * is a child that can be used to become more specific, the trie is traversed + * downwards. The last node in the traversal that is a non-intermediate one is + * returned. + */ + +static inline int extract_bit(const u8 *data, size_t index) +{ + return !!(data[index / 8] & (1 << (7 - (index % 8)))); +} + +/** + * longest_prefix_match() - determine the longest prefix + * @trie: The trie to get internal sizes from + * @node: The node to operate on + * @key: The key to compare to @node + * + * Determine the longest prefix of @node that matches the bits in @key. + */ +static size_t longest_prefix_match(const struct lpm_trie *trie, + const struct lpm_trie_node *node, + const struct bpf_lpm_trie_key *key) +{ + size_t prefixlen = 0; + size_t i; + + for (i = 0; i < trie->data_size; i++) { + size_t b; + + b = 8 - fls(node->data[i] ^ key->data[i]); + prefixlen += b; + + if (prefixlen >= node->prefixlen || prefixlen >= key->prefixlen) + return min(node->prefixlen, key->prefixlen); + + if (b < 8) + break; + } + + return prefixlen; +} + +/* Called from syscall or from eBPF program */ +static void *trie_lookup_elem(struct bpf_map *map, void *_key) +{ + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct lpm_trie_node *node, *found = NULL; + struct bpf_lpm_trie_key *key = _key; + + /* Start walking the trie from the root node ... */ + + for (node = rcu_dereference(trie->root); node;) { + unsigned int next_bit; + size_t matchlen; + + /* Determine the longest prefix of @node that matches @key. + * If it's the maximum possible prefix for this trie, we have + * an exact match and can return it directly. + */ + matchlen = longest_prefix_match(trie, node, key); + if (matchlen == trie->max_prefixlen) { + found = node; + break; + } + + /* If the number of bits that match is smaller than the prefix + * length of @node, bail out and return the node we have seen + * last in the traversal (ie, the parent). + */ + if (matchlen < node->prefixlen) + break; + + /* Consider this node as return candidate unless it is an + * artificially added intermediate one. + */ + if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) + found = node; + + /* If the node match is fully satisfied, let's see if we can + * become more specific. Determine the next bit in the key and + * traverse down. + */ + next_bit = extract_bit(key->data, node->prefixlen); + node = rcu_dereference(node->child[next_bit]); + } + + if (!found) + return NULL; + + return found->data + trie->data_size; +} + +static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie, + const void *value) +{ + struct lpm_trie_node *node; + size_t size = sizeof(struct lpm_trie_node) + trie->data_size; + + if (value) + size += trie->map.value_size; + + node = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN); + if (!node) + return NULL; + + node->flags = 0; + + if (value) + memcpy(node->data + trie->data_size, value, + trie->map.value_size); + + return node; +} + +/* Called from syscall or from eBPF program */ +static int trie_update_elem(struct bpf_map *map, + void *_key, void *value, u64 flags) +{ + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL; + struct lpm_trie_node __rcu **slot; + struct bpf_lpm_trie_key *key = _key; + unsigned long irq_flags; + unsigned int next_bit; + size_t matchlen = 0; + int ret = 0; + + if (unlikely(flags > BPF_EXIST)) + return -EINVAL; + + if (key->prefixlen > trie->max_prefixlen) + return -EINVAL; + + raw_spin_lock_irqsave(&trie->lock, irq_flags); + + /* Allocate and fill a new node */ + + if (trie->n_entries == trie->map.max_entries) { + ret = -ENOSPC; + goto out; + } + + new_node = lpm_trie_node_alloc(trie, value); + if (!new_node) { + ret = -ENOMEM; + goto out; + } + + trie->n_entries++; + + new_node->prefixlen = key->prefixlen; + RCU_INIT_POINTER(new_node->child[0], NULL); + RCU_INIT_POINTER(new_node->child[1], NULL); + memcpy(new_node->data, key->data, trie->data_size); + + /* Now find a slot to attach the new node. To do that, walk the tree + * from the root and match as many bits as possible for each node until + * we either find an empty slot or a slot that needs to be replaced by + * an intermediate node. + */ + slot = &trie->root; + + while ((node = rcu_dereference_protected(*slot, + lockdep_is_held(&trie->lock)))) { + matchlen = longest_prefix_match(trie, node, key); + + if (node->prefixlen != matchlen || + node->prefixlen == key->prefixlen || + node->prefixlen == trie->max_prefixlen) + break; + + next_bit = extract_bit(key->data, node->prefixlen); + slot = &node->child[next_bit]; + } + + /* If the slot is empty (a free child pointer or an empty root), + * simply assign the @new_node to that slot and be done. + */ + if (!node) { + rcu_assign_pointer(*slot, new_node); + goto out; + } + + /* If the slot we picked already exists, replace it with @new_node + * which already has the correct data array set. + */ + if (node->prefixlen == matchlen) { + new_node->child[0] = node->child[0]; + new_node->child[1] = node->child[1]; + + if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) + trie->n_entries--; + + rcu_assign_pointer(*slot, new_node); + kfree_rcu(node, rcu); + + goto out; + } + + /* If the new node matches the prefix completely, it must be inserted + * as an ancestor. Simply insert it between @node and *@slot. + */ + if (matchlen == key->prefixlen) { + next_bit = extract_bit(node->data, matchlen); + rcu_assign_pointer(new_node->child[next_bit], node); + rcu_assign_pointer(*slot, new_node); + goto out; + } + + im_node = lpm_trie_node_alloc(trie, NULL); + if (!im_node) { + ret = -ENOMEM; + goto out; + } + + im_node->prefixlen = matchlen; + im_node->flags |= LPM_TREE_NODE_FLAG_IM; + memcpy(im_node->data, node->data, trie->data_size); + + /* Now determine which child to install in which slot */ + if (extract_bit(key->data, matchlen)) { + rcu_assign_pointer(im_node->child[0], node); + rcu_assign_pointer(im_node->child[1], new_node); + } else { + rcu_assign_pointer(im_node->child[0], new_node); + rcu_assign_pointer(im_node->child[1], node); + } + + /* Finally, assign the intermediate node to the determined spot */ + rcu_assign_pointer(*slot, im_node); + +out: + if (ret) { + if (new_node) + trie->n_entries--; + + kfree(new_node); + kfree(im_node); + } + + raw_spin_unlock_irqrestore(&trie->lock, irq_flags); + + return ret; +} + +static int trie_delete_elem(struct bpf_map *map, void *key) +{ + /* TODO */ + return -ENOSYS; +} + +#define LPM_DATA_SIZE_MAX 256 +#define LPM_DATA_SIZE_MIN 1 + +#define LPM_VAL_SIZE_MAX (KMALLOC_MAX_SIZE - LPM_DATA_SIZE_MAX - \ + sizeof(struct lpm_trie_node)) +#define LPM_VAL_SIZE_MIN 1 + +#define LPM_KEY_SIZE(X) (sizeof(struct bpf_lpm_trie_key) + (X)) +#define LPM_KEY_SIZE_MAX LPM_KEY_SIZE(LPM_DATA_SIZE_MAX) +#define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN) + +static struct bpf_map *trie_alloc(union bpf_attr *attr) +{ + struct lpm_trie *trie; + u64 cost = sizeof(*trie), cost_per_node; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + /* check sanity of attributes */ + if (attr->max_entries == 0 || + attr->map_flags != BPF_F_NO_PREALLOC || + attr->key_size < LPM_KEY_SIZE_MIN || + attr->key_size > LPM_KEY_SIZE_MAX || + attr->value_size < LPM_VAL_SIZE_MIN || + attr->value_size > LPM_VAL_SIZE_MAX) + return ERR_PTR(-EINVAL); + + trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN); + if (!trie) + return ERR_PTR(-ENOMEM); + + /* copy mandatory map attributes */ + trie->map.map_type = attr->map_type; + trie->map.key_size = attr->key_size; + trie->map.value_size = attr->value_size; + trie->map.max_entries = attr->max_entries; + trie->map.map_flags = attr->map_flags; + trie->data_size = attr->key_size - + offsetof(struct bpf_lpm_trie_key, data); + trie->max_prefixlen = trie->data_size * 8; + + cost_per_node = sizeof(struct lpm_trie_node) + + attr->value_size + trie->data_size; + cost += (u64) attr->max_entries * cost_per_node; + if (cost >= U32_MAX - PAGE_SIZE) { + ret = -E2BIG; + goto out_err; + } + + trie->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + ret = bpf_map_precharge_memlock(trie->map.pages); + if (ret) + goto out_err; + + raw_spin_lock_init(&trie->lock); + + return &trie->map; +out_err: + kfree(trie); + return ERR_PTR(ret); +} + +static void trie_free(struct bpf_map *map) +{ + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct lpm_trie_node __rcu **slot; + struct lpm_trie_node *node; + + raw_spin_lock(&trie->lock); + + /* Always start at the root and walk down to a node that has no + * children. Then free that node, nullify its reference in the parent + * and start over. + */ + + for (;;) { + slot = &trie->root; + + for (;;) { + node = rcu_dereference_protected(*slot, + lockdep_is_held(&trie->lock)); + if (!node) + goto unlock; + + if (rcu_access_pointer(node->child[0])) { + slot = &node->child[0]; + continue; + } + + if (rcu_access_pointer(node->child[1])) { + slot = &node->child[1]; + continue; + } + + kfree(node); + RCU_INIT_POINTER(*slot, NULL); + break; + } + } + +unlock: + raw_spin_unlock(&trie->lock); +} + +static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + return -ENOTSUPP; +} + +const struct bpf_map_ops trie_map_ops = { + .map_alloc = trie_alloc, + .map_free = trie_free, + .map_get_next_key = trie_get_next_key, + .map_lookup_elem = trie_lookup_elem, + .map_update_elem = trie_update_elem, + .map_delete_elem = trie_delete_elem, +}; diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c new file mode 100644 index 000000000000..1da574612bea --- /dev/null +++ b/kernel/bpf/map_in_map.c @@ -0,0 +1,102 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/slab.h> +#include <linux/bpf.h> + +#include "map_in_map.h" + +struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) +{ + struct bpf_map *inner_map, *inner_map_meta; + struct fd f; + + f = fdget(inner_map_ufd); + inner_map = __bpf_map_get(f); + if (IS_ERR(inner_map)) + return inner_map; + + /* prog_array->owner_prog_type and owner_jited + * is a runtime binding. Doing static check alone + * in the verifier is not enough. + */ + if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { + fdput(f); + return ERR_PTR(-ENOTSUPP); + } + + /* Does not support >1 level map-in-map */ + if (inner_map->inner_map_meta) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + inner_map_meta = kzalloc(sizeof(*inner_map_meta), GFP_USER); + if (!inner_map_meta) { + fdput(f); + return ERR_PTR(-ENOMEM); + } + + inner_map_meta->map_type = inner_map->map_type; + inner_map_meta->key_size = inner_map->key_size; + inner_map_meta->value_size = inner_map->value_size; + inner_map_meta->map_flags = inner_map->map_flags; + inner_map_meta->ops = inner_map->ops; + inner_map_meta->max_entries = inner_map->max_entries; + + fdput(f); + return inner_map_meta; +} + +void bpf_map_meta_free(struct bpf_map *map_meta) +{ + kfree(map_meta); +} + +bool bpf_map_meta_equal(const struct bpf_map *meta0, + const struct bpf_map *meta1) +{ + /* No need to compare ops because it is covered by map_type */ + return meta0->map_type == meta1->map_type && + meta0->key_size == meta1->key_size && + meta0->value_size == meta1->value_size && + meta0->map_flags == meta1->map_flags && + meta0->max_entries == meta1->max_entries; +} + +void *bpf_map_fd_get_ptr(struct bpf_map *map, + struct file *map_file /* not used */, + int ufd) +{ + struct bpf_map *inner_map; + struct fd f; + + f = fdget(ufd); + inner_map = __bpf_map_get(f); + if (IS_ERR(inner_map)) + return inner_map; + + if (bpf_map_meta_equal(map->inner_map_meta, inner_map)) + inner_map = bpf_map_inc(inner_map, false); + else + inner_map = ERR_PTR(-EINVAL); + + fdput(f); + return inner_map; +} + +void bpf_map_fd_put_ptr(void *ptr) +{ + /* ptr->ops->map_free() has to go through one + * rcu grace period by itself. + */ + bpf_map_put(ptr); +} + +u32 bpf_map_fd_sys_lookup_elem(void *ptr) +{ + return ((struct bpf_map *)ptr)->id; +} diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h new file mode 100644 index 000000000000..6183db9ec08c --- /dev/null +++ b/kernel/bpf/map_in_map.h @@ -0,0 +1,24 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#ifndef __MAP_IN_MAP_H__ +#define __MAP_IN_MAP_H__ + +#include <linux/types.h> + +struct file; +struct bpf_map; + +struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd); +void bpf_map_meta_free(struct bpf_map *map_meta); +bool bpf_map_meta_equal(const struct bpf_map *meta0, + const struct bpf_map *meta1); +void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, + int ufd); +void bpf_map_fd_put_ptr(void *ptr); +u32 bpf_map_fd_sys_lookup_elem(void *ptr); + +#endif diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index be8519148c25..31147d730abf 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -88,6 +88,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) smap->map.key_size = attr->key_size; smap->map.value_size = value_size; smap->map.max_entries = attr->max_entries; + smap->map.map_flags = attr->map_flags; smap->n_buckets = n_buckets; smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; @@ -264,7 +265,7 @@ static void stack_map_free(struct bpf_map *map) put_callchain_buffers(); } -static const struct bpf_map_ops stack_map_ops = { +const struct bpf_map_ops stack_map_ops = { .map_alloc = stack_map_alloc, .map_free = stack_map_free, .map_get_next_key = stack_map_get_next_key, @@ -272,15 +273,3 @@ static const struct bpf_map_ops stack_map_ops = { .map_update_elem = stack_map_update_elem, .map_delete_elem = stack_map_delete_elem, }; - -static struct bpf_map_type_list stack_map_type __read_mostly = { - .ops = &stack_map_ops, - .type = BPF_MAP_TYPE_STACK_TRACE, -}; - -static int __init register_stack_map(void) -{ - bpf_register_map_type(&stack_map_type); - return 0; -} -late_initcall(register_stack_map); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 19b6129eab23..045646da97cc 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -10,8 +10,10 @@ * General Public License for more details. */ #include <linux/bpf.h> +#include <linux/bpf_trace.h> #include <linux/syscalls.h> #include <linux/slab.h> +#include <linux/sched/signal.h> #include <linux/vmalloc.h> #include <linux/mmzone.h> #include <linux/anon_inodes.h> @@ -20,35 +22,46 @@ #include <linux/filter.h> #include <linux/version.h> #include <linux/kernel.h> +#include <linux/idr.h> + +#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ + (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ + (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ + (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) +#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) +#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map)) DEFINE_PER_CPU(int, bpf_prog_active); +static DEFINE_IDR(prog_idr); +static DEFINE_SPINLOCK(prog_idr_lock); +static DEFINE_IDR(map_idr); +static DEFINE_SPINLOCK(map_idr_lock); int sysctl_unprivileged_bpf_disabled __read_mostly; -static LIST_HEAD(bpf_map_types); +static const struct bpf_map_ops * const bpf_map_types[] = { +#define BPF_PROG_TYPE(_id, _ops) +#define BPF_MAP_TYPE(_id, _ops) \ + [_id] = &_ops, +#include <linux/bpf_types.h> +#undef BPF_PROG_TYPE +#undef BPF_MAP_TYPE +}; static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) { - struct bpf_map_type_list *tl; struct bpf_map *map; - list_for_each_entry(tl, &bpf_map_types, list_node) { - if (tl->type == attr->map_type) { - map = tl->ops->map_alloc(attr); - if (IS_ERR(map)) - return map; - map->ops = tl->ops; - map->map_type = attr->map_type; - return map; - } - } - return ERR_PTR(-EINVAL); -} + if (attr->map_type >= ARRAY_SIZE(bpf_map_types) || + !bpf_map_types[attr->map_type]) + return ERR_PTR(-EINVAL); -/* boot time registration of different map implementations */ -void bpf_register_map_type(struct bpf_map_type_list *tl) -{ - list_add(&tl->list_node, &bpf_map_types); + map = bpf_map_types[attr->map_type]->map_alloc(attr); + if (IS_ERR(map)) + return map; + map->ops = bpf_map_types[attr->map_type]; + map->map_type = attr->map_type; + return map; } void *bpf_map_area_alloc(size_t size) @@ -66,8 +79,7 @@ void *bpf_map_area_alloc(size_t size) return area; } - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | flags, - PAGE_KERNEL); + return __vmalloc(size, GFP_KERNEL | flags, PAGE_KERNEL); } void bpf_map_area_free(void *area) @@ -114,6 +126,37 @@ static void bpf_map_uncharge_memlock(struct bpf_map *map) free_uid(user); } +static int bpf_map_alloc_id(struct bpf_map *map) +{ + int id; + + spin_lock_bh(&map_idr_lock); + id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC); + if (id > 0) + map->id = id; + spin_unlock_bh(&map_idr_lock); + + if (WARN_ON_ONCE(!id)) + return -ENOSPC; + + return id > 0 ? 0 : id; +} + +static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) +{ + if (do_idr_lock) + spin_lock_bh(&map_idr_lock); + else + __acquire(&map_idr_lock); + + idr_remove(&map_idr, map->id); + + if (do_idr_lock) + spin_unlock_bh(&map_idr_lock); + else + __release(&map_idr_lock); +} + /* called from workqueue */ static void bpf_map_free_deferred(struct work_struct *work) { @@ -135,14 +178,21 @@ static void bpf_map_put_uref(struct bpf_map *map) /* decrement map refcnt and schedule it for freeing via workqueue * (unrelying map implementation ops->map_free() might sleep) */ -void bpf_map_put(struct bpf_map *map) +static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock) { if (atomic_dec_and_test(&map->refcnt)) { + /* bpf_map_free_id() must be called first */ + bpf_map_free_id(map, do_idr_lock); INIT_WORK(&map->work, bpf_map_free_deferred); schedule_work(&map->work); } } +void bpf_map_put(struct bpf_map *map) +{ + __bpf_map_put(map, true); +} + void bpf_map_put_with_uref(struct bpf_map *map) { bpf_map_put_uref(map); @@ -166,10 +216,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) const struct bpf_map *map = filp->private_data; const struct bpf_array *array; u32 owner_prog_type = 0; + u32 owner_jited = 0; if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { array = container_of(map, struct bpf_array, map); owner_prog_type = array->owner_prog_type; + owner_jited = array->owner_jited; } seq_printf(m, @@ -186,9 +238,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) map->map_flags, map->pages * 1ULL << PAGE_SHIFT); - if (owner_prog_type) + if (owner_prog_type) { seq_printf(m, "owner_prog_type:\t%u\n", owner_prog_type); + seq_printf(m, "owner_jited:\t%u\n", + owner_jited); + } } #endif @@ -213,7 +268,7 @@ int bpf_map_new_fd(struct bpf_map *map) offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL -#define BPF_MAP_CREATE_LAST_FIELD map_flags +#define BPF_MAP_CREATE_LAST_FIELD inner_map_fd /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -236,11 +291,23 @@ static int map_create(union bpf_attr *attr) if (err) goto free_map_nouncharge; - err = bpf_map_new_fd(map); - if (err < 0) - /* failed to allocate fd */ + err = bpf_map_alloc_id(map); + if (err) goto free_map; + err = bpf_map_new_fd(map); + if (err < 0) { + /* failed to allocate fd. + * bpf_map_put() is needed because the above + * bpf_map_alloc_id() has published the map + * to the userspace and the userspace may + * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID. + */ + bpf_map_put(map); + return err; + } + + trace_bpf_map_create(map, err); return err; free_map: @@ -294,6 +361,28 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd) return map; } +/* map_idr_lock should have been held */ +static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map, + bool uref) +{ + int refold; + + refold = __atomic_add_unless(&map->refcnt, 1, 0); + + if (refold >= BPF_MAX_REFCNT) { + __bpf_map_put(map, false); + return ERR_PTR(-EBUSY); + } + + if (!refold) + return ERR_PTR(-ENOENT); + + if (uref) + atomic_inc(&map->usercnt); + + return map; +} + int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) { return -ENOTSUPP; @@ -321,19 +410,18 @@ static int map_lookup_elem(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - err = -ENOMEM; - key = kmalloc(map->key_size, GFP_USER); - if (!key) + key = memdup_user(ukey, map->key_size); + if (IS_ERR(key)) { + err = PTR_ERR(key); goto err_put; - - err = -EFAULT; - if (copy_from_user(key, ukey, map->key_size) != 0) - goto free_key; + } if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) value_size = round_up(map->value_size, 8) * num_possible_cpus(); + else if (IS_FD_MAP(map)) + value_size = sizeof(u32); else value_size = map->value_size; @@ -349,6 +437,10 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_percpu_array_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { err = bpf_stackmap_copy(map, key, value); + } else if (IS_FD_ARRAY(map)) { + err = bpf_fd_array_map_lookup_elem(map, key, value); + } else if (IS_FD_HASH(map)) { + err = bpf_fd_htab_map_lookup_elem(map, key, value); } else { rcu_read_lock(); ptr = map->ops->map_lookup_elem(map, key); @@ -365,6 +457,7 @@ static int map_lookup_elem(union bpf_attr *attr) if (copy_to_user(uvalue, value, value_size) != 0) goto free_value; + trace_bpf_map_lookup_elem(map, ufd, key, value); err = 0; free_value: @@ -397,14 +490,11 @@ static int map_update_elem(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - err = -ENOMEM; - key = kmalloc(map->key_size, GFP_USER); - if (!key) + key = memdup_user(ukey, map->key_size); + if (IS_ERR(key)) { + err = PTR_ERR(key); goto err_put; - - err = -EFAULT; - if (copy_from_user(key, ukey, map->key_size) != 0) - goto free_key; + } if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || @@ -434,11 +524,17 @@ static int map_update_elem(union bpf_attr *attr) err = bpf_percpu_array_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || map->map_type == BPF_MAP_TYPE_PROG_ARRAY || - map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY) { + map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || + map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) { rcu_read_lock(); err = bpf_fd_array_map_update_elem(map, f.file, key, value, attr->flags); rcu_read_unlock(); + } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { + rcu_read_lock(); + err = bpf_fd_htab_map_update_elem(map, f.file, key, value, + attr->flags); + rcu_read_unlock(); } else { rcu_read_lock(); err = map->ops->map_update_elem(map, key, value, attr->flags); @@ -447,6 +543,8 @@ static int map_update_elem(union bpf_attr *attr) __this_cpu_dec(bpf_prog_active); preempt_enable(); + if (!err) + trace_bpf_map_update_elem(map, ufd, key, value); free_value: kfree(value); free_key: @@ -475,14 +573,11 @@ static int map_delete_elem(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - err = -ENOMEM; - key = kmalloc(map->key_size, GFP_USER); - if (!key) + key = memdup_user(ukey, map->key_size); + if (IS_ERR(key)) { + err = PTR_ERR(key); goto err_put; - - err = -EFAULT; - if (copy_from_user(key, ukey, map->key_size) != 0) - goto free_key; + } preempt_disable(); __this_cpu_inc(bpf_prog_active); @@ -492,7 +587,8 @@ static int map_delete_elem(union bpf_attr *attr) __this_cpu_dec(bpf_prog_active); preempt_enable(); -free_key: + if (!err) + trace_bpf_map_delete_elem(map, ufd, key); kfree(key); err_put: fdput(f); @@ -520,14 +616,15 @@ static int map_get_next_key(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - err = -ENOMEM; - key = kmalloc(map->key_size, GFP_USER); - if (!key) - goto err_put; - - err = -EFAULT; - if (copy_from_user(key, ukey, map->key_size) != 0) - goto free_key; + if (ukey) { + key = memdup_user(ukey, map->key_size); + if (IS_ERR(key)) { + err = PTR_ERR(key); + goto err_put; + } + } else { + key = NULL; + } err = -ENOMEM; next_key = kmalloc(map->key_size, GFP_USER); @@ -544,6 +641,7 @@ static int map_get_next_key(union bpf_attr *attr) if (copy_to_user(unext_key, next_key, map->key_size) != 0) goto free_next_key; + trace_bpf_map_next_key(map, ufd, key, next_key); err = 0; free_next_key: @@ -555,79 +653,23 @@ err_put: return err; } -static LIST_HEAD(bpf_prog_types); +static const struct bpf_verifier_ops * const bpf_prog_types[] = { +#define BPF_PROG_TYPE(_id, _ops) \ + [_id] = &_ops, +#define BPF_MAP_TYPE(_id, _ops) +#include <linux/bpf_types.h> +#undef BPF_PROG_TYPE +#undef BPF_MAP_TYPE +}; static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) { - struct bpf_prog_type_list *tl; - - list_for_each_entry(tl, &bpf_prog_types, list_node) { - if (tl->type == type) { - prog->aux->ops = tl->ops; - prog->type = type; - return 0; - } - } - - return -EINVAL; -} - -void bpf_register_prog_type(struct bpf_prog_type_list *tl) -{ - list_add(&tl->list_node, &bpf_prog_types); -} - -/* fixup insn->imm field of bpf_call instructions: - * if (insn->imm == BPF_FUNC_map_lookup_elem) - * insn->imm = bpf_map_lookup_elem - __bpf_call_base; - * else if (insn->imm == BPF_FUNC_map_update_elem) - * insn->imm = bpf_map_update_elem - __bpf_call_base; - * else ... - * - * this function is called after eBPF program passed verification - */ -static void fixup_bpf_calls(struct bpf_prog *prog) -{ - const struct bpf_func_proto *fn; - int i; + if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type]) + return -EINVAL; - for (i = 0; i < prog->len; i++) { - struct bpf_insn *insn = &prog->insnsi[i]; - - if (insn->code == (BPF_JMP | BPF_CALL)) { - /* we reach here when program has bpf_call instructions - * and it passed bpf_check(), means that - * ops->get_func_proto must have been supplied, check it - */ - BUG_ON(!prog->aux->ops->get_func_proto); - - if (insn->imm == BPF_FUNC_get_route_realm) - prog->dst_needed = 1; - if (insn->imm == BPF_FUNC_get_prandom_u32) - bpf_user_rnd_init_once(); - if (insn->imm == BPF_FUNC_xdp_adjust_head) - prog->xdp_adjust_head = 1; - if (insn->imm == BPF_FUNC_tail_call) { - /* mark bpf_tail_call as different opcode - * to avoid conditional branch in - * interpeter for every normal call - * and to prevent accidental JITing by - * JIT compiler that doesn't support - * bpf_tail_call yet - */ - insn->imm = 0; - insn->code |= BPF_X; - continue; - } - - fn = prog->aux->ops->get_func_proto(insn->imm); - /* all functions that have prototype and verifier allowed - * programs to call them, must be real in-kernel functions - */ - BUG_ON(!fn->func); - insn->imm = fn->func - __bpf_call_base; - } - } + prog->aux->ops = bpf_prog_types[type]; + prog->type = type; + return 0; } /* drop refcnt on maps used by eBPF program and free auxilary data */ @@ -686,6 +728,42 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog) free_uid(user); } +static int bpf_prog_alloc_id(struct bpf_prog *prog) +{ + int id; + + spin_lock_bh(&prog_idr_lock); + id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC); + if (id > 0) + prog->aux->id = id; + spin_unlock_bh(&prog_idr_lock); + + /* id is in [1, INT_MAX) */ + if (WARN_ON_ONCE(!id)) + return -ENOSPC; + + return id > 0 ? 0 : id; +} + +static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) +{ + /* cBPF to eBPF migrations are currently not in the idr store. */ + if (!prog->aux->id) + return; + + if (do_idr_lock) + spin_lock_bh(&prog_idr_lock); + else + __acquire(&prog_idr_lock); + + idr_remove(&prog_idr, prog->aux->id); + + if (do_idr_lock) + spin_unlock_bh(&prog_idr_lock); + else + __release(&prog_idr_lock); +} + static void __bpf_prog_put_rcu(struct rcu_head *rcu) { struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); @@ -695,10 +773,20 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) bpf_prog_free(aux->prog); } -void bpf_prog_put(struct bpf_prog *prog) +static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { - if (atomic_dec_and_test(&prog->aux->refcnt)) + if (atomic_dec_and_test(&prog->aux->refcnt)) { + trace_bpf_prog_put_rcu(prog); + /* bpf_prog_free_id() must be called first */ + bpf_prog_free_id(prog, do_idr_lock); + bpf_prog_kallsyms_del(prog); call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); + } +} + +void bpf_prog_put(struct bpf_prog *prog) +{ + __bpf_prog_put(prog, true); } EXPORT_SYMBOL_GPL(bpf_prog_put); @@ -781,6 +869,24 @@ struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) } EXPORT_SYMBOL_GPL(bpf_prog_inc); +/* prog_idr_lock should have been held */ +static struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) +{ + int refold; + + refold = __atomic_add_unless(&prog->aux->refcnt, 1, 0); + + if (refold >= BPF_MAX_REFCNT) { + __bpf_prog_put(prog, false); + return ERR_PTR(-EBUSY); + } + + if (!refold) + return ERR_PTR(-ENOENT); + + return prog; +} + static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) { struct fd f = fdget(ufd); @@ -807,12 +913,16 @@ struct bpf_prog *bpf_prog_get(u32 ufd) struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) { - return __bpf_prog_get(ufd, &type); + struct bpf_prog *prog = __bpf_prog_get(ufd, &type); + + if (!IS_ERR(prog)) + trace_bpf_prog_get_type(prog); + return prog; } EXPORT_SYMBOL_GPL(bpf_prog_get_type); /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD kern_version +#define BPF_PROG_LOAD_LAST_FIELD prog_flags static int bpf_prog_load(union bpf_attr *attr) { @@ -825,6 +935,9 @@ static int bpf_prog_load(union bpf_attr *attr) if (CHECK_ATTR(BPF_PROG_LOAD)) return -EINVAL; + if (attr->prog_flags & ~BPF_F_STRICT_ALIGNMENT) + return -EINVAL; + /* copy eBPF program license from user space */ if (strncpy_from_user(license, u64_to_user_ptr(attr->license), sizeof(license) - 1) < 0) @@ -841,7 +954,9 @@ static int bpf_prog_load(union bpf_attr *attr) attr->kern_version != LINUX_VERSION_CODE) return -EINVAL; - if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN)) + if (type != BPF_PROG_TYPE_SOCKET_FILTER && + type != BPF_PROG_TYPE_CGROUP_SKB && + !capable(CAP_SYS_ADMIN)) return -EPERM; /* plain bpf_prog allocation */ @@ -876,19 +991,29 @@ static int bpf_prog_load(union bpf_attr *attr) if (err < 0) goto free_used_maps; - /* fixup BPF_CALL->imm field */ - fixup_bpf_calls(prog); - /* eBPF program is ready to be JITed */ prog = bpf_prog_select_runtime(prog, &err); if (err < 0) goto free_used_maps; - err = bpf_prog_new_fd(prog); - if (err < 0) - /* failed to allocate fd */ + err = bpf_prog_alloc_id(prog); + if (err) goto free_used_maps; + err = bpf_prog_new_fd(prog); + if (err < 0) { + /* failed to allocate fd. + * bpf_prog_put() is needed because the above + * bpf_prog_alloc_id() has published the prog + * to the userspace and the userspace may + * have refcnt-ed it through BPF_PROG_GET_FD_BY_ID. + */ + bpf_prog_put(prog); + return err; + } + + bpf_prog_kallsyms_add(prog); + trace_bpf_prog_load(prog, err); return err; free_used_maps: @@ -920,13 +1045,14 @@ static int bpf_obj_get(const union bpf_attr *attr) #ifdef CONFIG_CGROUP_BPF -#define BPF_PROG_ATTACH_LAST_FIELD attach_type +#define BPF_PROG_ATTACH_LAST_FIELD attach_flags static int bpf_prog_attach(const union bpf_attr *attr) { + enum bpf_prog_type ptype; struct bpf_prog *prog; struct cgroup *cgrp; - enum bpf_prog_type ptype; + int ret; if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -934,6 +1060,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) if (CHECK_ATTR(BPF_PROG_ATTACH)) return -EINVAL; + if (attr->attach_flags & ~BPF_F_ALLOW_OVERRIDE) + return -EINVAL; + switch (attr->attach_type) { case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: @@ -942,6 +1071,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_INET_SOCK_CREATE: ptype = BPF_PROG_TYPE_CGROUP_SOCK; break; + case BPF_CGROUP_SOCK_OPS: + ptype = BPF_PROG_TYPE_SOCK_OPS; + break; default: return -EINVAL; } @@ -956,10 +1088,13 @@ static int bpf_prog_attach(const union bpf_attr *attr) return PTR_ERR(cgrp); } - cgroup_bpf_update(cgrp, prog, attr->attach_type); + ret = cgroup_bpf_update(cgrp, prog, attr->attach_type, + attr->attach_flags & BPF_F_ALLOW_OVERRIDE); + if (ret) + bpf_prog_put(prog); cgroup_put(cgrp); - return 0; + return ret; } #define BPF_PROG_DETACH_LAST_FIELD attach_type @@ -967,6 +1102,7 @@ static int bpf_prog_attach(const union bpf_attr *attr) static int bpf_prog_detach(const union bpf_attr *attr) { struct cgroup *cgrp; + int ret; if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -978,11 +1114,12 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_SOCK_OPS: cgrp = cgroup_get_from_fd(attr->target_fd); if (IS_ERR(cgrp)) return PTR_ERR(cgrp); - cgroup_bpf_update(cgrp, NULL, attr->attach_type); + ret = cgroup_bpf_update(cgrp, NULL, attr->attach_type, false); cgroup_put(cgrp); break; @@ -990,10 +1127,264 @@ static int bpf_prog_detach(const union bpf_attr *attr) return -EINVAL; } - return 0; + return ret; } + #endif /* CONFIG_CGROUP_BPF */ +#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration + +static int bpf_prog_test_run(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_prog *prog; + int ret = -ENOTSUPP; + + if (CHECK_ATTR(BPF_PROG_TEST_RUN)) + return -EINVAL; + + prog = bpf_prog_get(attr->test.prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (prog->aux->ops->test_run) + ret = prog->aux->ops->test_run(prog, attr, uattr); + + bpf_prog_put(prog); + return ret; +} + +#define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id + +static int bpf_obj_get_next_id(const union bpf_attr *attr, + union bpf_attr __user *uattr, + struct idr *idr, + spinlock_t *lock) +{ + u32 next_id = attr->start_id; + int err = 0; + + if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + next_id++; + spin_lock_bh(lock); + if (!idr_get_next(idr, &next_id)) + err = -ENOENT; + spin_unlock_bh(lock); + + if (!err) + err = put_user(next_id, &uattr->next_id); + + return err; +} + +#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id + +static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) +{ + struct bpf_prog *prog; + u32 id = attr->prog_id; + int fd; + + if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + spin_lock_bh(&prog_idr_lock); + prog = idr_find(&prog_idr, id); + if (prog) + prog = bpf_prog_inc_not_zero(prog); + else + prog = ERR_PTR(-ENOENT); + spin_unlock_bh(&prog_idr_lock); + + if (IS_ERR(prog)) + return PTR_ERR(prog); + + fd = bpf_prog_new_fd(prog); + if (fd < 0) + bpf_prog_put(prog); + + return fd; +} + +#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD map_id + +static int bpf_map_get_fd_by_id(const union bpf_attr *attr) +{ + struct bpf_map *map; + u32 id = attr->map_id; + int fd; + + if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + spin_lock_bh(&map_idr_lock); + map = idr_find(&map_idr, id); + if (map) + map = bpf_map_inc_not_zero(map, true); + else + map = ERR_PTR(-ENOENT); + spin_unlock_bh(&map_idr_lock); + + if (IS_ERR(map)) + return PTR_ERR(map); + + fd = bpf_map_new_fd(map); + if (fd < 0) + bpf_map_put(map); + + return fd; +} + +static int check_uarg_tail_zero(void __user *uaddr, + size_t expected_size, + size_t actual_size) +{ + unsigned char __user *addr; + unsigned char __user *end; + unsigned char val; + int err; + + if (actual_size <= expected_size) + return 0; + + addr = uaddr + expected_size; + end = uaddr + actual_size; + + for (; addr < end; addr++) { + err = get_user(val, addr); + if (err) + return err; + if (val) + return -E2BIG; + } + + return 0; +} + +static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); + struct bpf_prog_info info = {}; + u32 info_len = attr->info.info_len; + char __user *uinsns; + u32 ulen; + int err; + + err = check_uarg_tail_zero(uinfo, sizeof(info), info_len); + if (err) + return err; + info_len = min_t(u32, sizeof(info), info_len); + + if (copy_from_user(&info, uinfo, info_len)) + return err; + + info.type = prog->type; + info.id = prog->aux->id; + + memcpy(info.tag, prog->tag, sizeof(prog->tag)); + + if (!capable(CAP_SYS_ADMIN)) { + info.jited_prog_len = 0; + info.xlated_prog_len = 0; + goto done; + } + + ulen = info.jited_prog_len; + info.jited_prog_len = prog->jited_len; + if (info.jited_prog_len && ulen) { + uinsns = u64_to_user_ptr(info.jited_prog_insns); + ulen = min_t(u32, info.jited_prog_len, ulen); + if (copy_to_user(uinsns, prog->bpf_func, ulen)) + return -EFAULT; + } + + ulen = info.xlated_prog_len; + info.xlated_prog_len = bpf_prog_size(prog->len); + if (info.xlated_prog_len && ulen) { + uinsns = u64_to_user_ptr(info.xlated_prog_insns); + ulen = min_t(u32, info.xlated_prog_len, ulen); + if (copy_to_user(uinsns, prog->insnsi, ulen)) + return -EFAULT; + } + +done: + if (copy_to_user(uinfo, &info, info_len) || + put_user(info_len, &uattr->info.info_len)) + return -EFAULT; + + return 0; +} + +static int bpf_map_get_info_by_fd(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info); + struct bpf_map_info info = {}; + u32 info_len = attr->info.info_len; + int err; + + err = check_uarg_tail_zero(uinfo, sizeof(info), info_len); + if (err) + return err; + info_len = min_t(u32, sizeof(info), info_len); + + info.type = map->map_type; + info.id = map->id; + info.key_size = map->key_size; + info.value_size = map->value_size; + info.max_entries = map->max_entries; + info.map_flags = map->map_flags; + + if (copy_to_user(uinfo, &info, info_len) || + put_user(info_len, &uattr->info.info_len)) + return -EFAULT; + + return 0; +} + +#define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info + +static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + int ufd = attr->info.bpf_fd; + struct fd f; + int err; + + if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD)) + return -EINVAL; + + f = fdget(ufd); + if (!f.file) + return -EBADFD; + + if (f.file->f_op == &bpf_prog_fops) + err = bpf_prog_get_info_by_fd(f.file->private_data, attr, + uattr); + else if (f.file->f_op == &bpf_map_fops) + err = bpf_map_get_info_by_fd(f.file->private_data, attr, + uattr); + else + err = -EINVAL; + + fdput(f); + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -1013,23 +1404,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz * user-space does not rely on any kernel feature * extensions we dont know about yet. */ - if (size > sizeof(attr)) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - - addr = (void __user *)uattr + sizeof(attr); - end = (void __user *)uattr + size; - - for (; addr < end; addr++) { - err = get_user(val, addr); - if (err) - return err; - if (val) - return -E2BIG; - } - size = sizeof(attr); - } + err = check_uarg_tail_zero(uattr, sizeof(attr), size); + if (err) + return err; + size = min_t(u32, size, sizeof(attr)); /* copy attributes from user space, may be less than sizeof(bpf_attr) */ if (copy_from_user(&attr, uattr, size) != 0) @@ -1060,7 +1438,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_OBJ_GET: err = bpf_obj_get(&attr); break; - #ifdef CONFIG_CGROUP_BPF case BPF_PROG_ATTACH: err = bpf_prog_attach(&attr); @@ -1069,7 +1446,26 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz err = bpf_prog_detach(&attr); break; #endif - + case BPF_PROG_TEST_RUN: + err = bpf_prog_test_run(&attr, uattr); + break; + case BPF_PROG_GET_NEXT_ID: + err = bpf_obj_get_next_id(&attr, uattr, + &prog_idr, &prog_idr_lock); + break; + case BPF_MAP_GET_NEXT_ID: + err = bpf_obj_get_next_id(&attr, uattr, + &map_idr, &map_idr_lock); + break; + case BPF_PROG_GET_FD_BY_ID: + err = bpf_prog_get_fd_by_id(&attr); + break; + case BPF_MAP_GET_FD_BY_ID: + err = bpf_map_get_fd_by_id(&attr); + break; + case BPF_OBJ_GET_INFO_BY_FD: + err = bpf_obj_get_info_by_fd(&attr, uattr); + break; default: err = -EINVAL; break; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cdc43b899f28..af9e84a4944e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -33,7 +33,7 @@ * - out of bounds or malformed jumps * The second pass is all possible path descent from the 1st insn. * Since it's analyzing all pathes through the program, the length of the - * analysis is limited to 32k insn, which may be hit even if total number of + * analysis is limited to 64k insn, which may be hit even if total number of * insn is less then 4K, but there are too many branches that change stack/regs. * Number of 'branches to be analyzed' is limited to 1k * @@ -140,9 +140,11 @@ struct bpf_verifier_stack_elem { struct bpf_verifier_stack_elem *next; }; -#define BPF_COMPLEXITY_LIMIT_INSNS 65536 +#define BPF_COMPLEXITY_LIMIT_INSNS 98304 #define BPF_COMPLEXITY_LIMIT_STACK 1024 +#define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA) + struct bpf_call_arg_meta { struct bpf_map *map_ptr; bool raw_mode; @@ -239,6 +241,12 @@ static void print_verifier_state(struct bpf_verifier_state *state) if (reg->max_value != BPF_REGISTER_MAX_RANGE) verbose(",max_value=%llu", (unsigned long long)reg->max_value); + if (reg->min_align) + verbose(",min_align=%u", reg->min_align); + if (reg->aux_off) + verbose(",aux_off=%u", reg->aux_off); + if (reg->aux_off_align) + verbose(",aux_off_align=%u", reg->aux_off_align); } for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] == STACK_SPILL) @@ -296,7 +304,8 @@ static const char *const bpf_jmp_string[16] = { [BPF_EXIT >> 4] = "exit", }; -static void print_bpf_insn(struct bpf_insn *insn) +static void print_bpf_insn(const struct bpf_verifier_env *env, + const struct bpf_insn *insn) { u8 class = BPF_CLASS(insn->code); @@ -360,9 +369,19 @@ static void print_bpf_insn(struct bpf_insn *insn) insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->imm); - } else if (BPF_MODE(insn->code) == BPF_IMM) { - verbose("(%02x) r%d = 0x%x\n", - insn->code, insn->dst_reg, insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IMM && + BPF_SIZE(insn->code) == BPF_DW) { + /* At this point, we already made sure that the second + * part of the ldimm64 insn is accessible. + */ + u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; + bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; + + if (map_ptr && !env->allow_ptr_leaks) + imm = 0; + + verbose("(%02x) r%d = 0x%llx\n", insn->code, + insn->dst_reg, (unsigned long long)imm); } else { verbose("BUG_ld_%02x\n", insn->code); return; @@ -444,16 +463,22 @@ static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; +static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno) +{ + BUG_ON(regno >= MAX_BPF_REG); + + memset(®s[regno], 0, sizeof(regs[regno])); + regs[regno].type = NOT_INIT; + regs[regno].min_value = BPF_REGISTER_MIN_RANGE; + regs[regno].max_value = BPF_REGISTER_MAX_RANGE; +} + static void init_reg_state(struct bpf_reg_state *regs) { int i; - for (i = 0; i < MAX_BPF_REG; i++) { - regs[i].type = NOT_INIT; - regs[i].imm = 0; - regs[i].min_value = BPF_REGISTER_MIN_RANGE; - regs[i].max_value = BPF_REGISTER_MAX_RANGE; - } + for (i = 0; i < MAX_BPF_REG; i++) + mark_reg_not_init(regs, i); /* frame pointer */ regs[BPF_REG_FP].type = FRAME_PTR; @@ -479,6 +504,15 @@ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) { regs[regno].min_value = BPF_REGISTER_MIN_RANGE; regs[regno].max_value = BPF_REGISTER_MAX_RANGE; + regs[regno].value_from_signed = false; + regs[regno].min_align = 0; +} + +static void mark_reg_unknown_value_and_range(struct bpf_reg_state *regs, + u32 regno) +{ + mark_reg_unknown_value(regs, regno); + reset_reg_range_values(regs, regno); } enum reg_arg_type { @@ -513,25 +547,12 @@ static int check_reg_arg(struct bpf_reg_state *regs, u32 regno, return 0; } -static int bpf_size_to_bytes(int bpf_size) -{ - if (bpf_size == BPF_W) - return 4; - else if (bpf_size == BPF_H) - return 2; - else if (bpf_size == BPF_B) - return 1; - else if (bpf_size == BPF_DW) - return 8; - else - return -EINVAL; -} - static bool is_spillable_regtype(enum bpf_reg_type type) { switch (type) { case PTR_TO_MAP_VALUE: case PTR_TO_MAP_VALUE_OR_NULL: + case PTR_TO_MAP_VALUE_ADJ: case PTR_TO_STACK: case PTR_TO_CTX: case PTR_TO_PACKET: @@ -616,7 +637,8 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size, } if (value_regno >= 0) /* have read misc data from the stack */ - mark_reg_unknown_value(state->regs, value_regno); + mark_reg_unknown_value_and_range(state->regs, + value_regno); return 0; } } @@ -627,7 +649,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, { struct bpf_map *map = env->cur_state.regs[regno].map_ptr; - if (off < 0 || off + size > map->value_size) { + if (off < 0 || size <= 0 || off + size > map->value_size) { verbose("invalid access to map value, value_size=%d off=%d size=%d\n", map->value_size, off, size); return -EACCES; @@ -635,6 +657,51 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, return 0; } +/* check read/write into an adjusted map element */ +static int check_map_access_adj(struct bpf_verifier_env *env, u32 regno, + int off, int size) +{ + struct bpf_verifier_state *state = &env->cur_state; + struct bpf_reg_state *reg = &state->regs[regno]; + int err; + + /* We adjusted the register to this map value, so we + * need to change off and size to min_value and max_value + * respectively to make sure our theoretical access will be + * safe. + */ + if (log_level) + print_verifier_state(state); + env->varlen_map_value_access = true; + /* The minimum value is only important with signed + * comparisons where we can't assume the floor of a + * value is 0. If we are using signed variables for our + * index'es we need to make sure that whatever we use + * will have a set floor within our range. + */ + if (reg->min_value < 0) { + verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", + regno); + return -EACCES; + } + err = check_map_access(env, regno, reg->min_value + off, size); + if (err) { + verbose("R%d min value is outside of the array range\n", + regno); + return err; + } + + /* If we haven't set a max value then we need to bail + * since we can't be sure we won't do bad things. + */ + if (reg->max_value == BPF_REGISTER_MAX_RANGE) { + verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n", + regno); + return -EACCES; + } + return check_map_access(env, regno, reg->max_value + off, size); +} + #define MAX_PACKET_OFF 0xffff static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, @@ -647,6 +714,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, /* dst_input() and dst_output() can't write for now */ if (t == BPF_WRITE) return false; + /* fallthrough */ case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: case BPF_PROG_TYPE_XDP: @@ -677,15 +745,29 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, } /* check access to 'struct bpf_context' fields */ -static int check_ctx_access(struct bpf_verifier_env *env, int off, int size, +static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type) { + struct bpf_insn_access_aux info = { + .reg_type = *reg_type, + }; + /* for analyzer ctx accesses are already validated and converted */ if (env->analyzer_ops) return 0; if (env->prog->aux->ops->is_valid_access && - env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) { + env->prog->aux->ops->is_valid_access(off, size, t, &info)) { + /* A non zero info.ctx_field_size indicates that this field is a + * candidate for later verifier transformation to load the whole + * field and then apply a mask when accessed with a narrower + * access than actual ctx access size. A zero info.ctx_field_size + * will only allow for whole field access and rejects any other + * type of narrower access. + */ + env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; + *reg_type = info.reg_type; + /* remember the offset of last byte accessed in ctx */ if (env->prog->aux->max_ctx_offset < off + size) env->prog->aux->max_ctx_offset = off + size; @@ -696,12 +778,13 @@ static int check_ctx_access(struct bpf_verifier_env *env, int off, int size, return -EACCES; } -static bool is_pointer_value(struct bpf_verifier_env *env, int regno) +static bool __is_pointer_value(bool allow_ptr_leaks, + const struct bpf_reg_state *reg) { - if (env->allow_ptr_leaks) + if (allow_ptr_leaks) return false; - switch (env->cur_state.regs[regno].type) { + switch (reg->type) { case UNKNOWN_VALUE: case CONST_IMM: return false; @@ -710,45 +793,89 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno) } } -static int check_ptr_alignment(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, int off, int size) +static bool is_pointer_value(struct bpf_verifier_env *env, int regno) { - if (reg->type != PTR_TO_PACKET && reg->type != PTR_TO_MAP_VALUE_ADJ) { - if (off % size != 0) { - verbose("misaligned access off %d size %d\n", - off, size); + return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]); +} + +static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, + int off, int size, bool strict) +{ + int ip_align; + int reg_off; + + /* Byte size accesses are always allowed. */ + if (!strict || size == 1) + return 0; + + reg_off = reg->off; + if (reg->id) { + if (reg->aux_off_align % size) { + verbose("Packet access is only %u byte aligned, %d byte access not allowed\n", + reg->aux_off_align, size); return -EACCES; - } else { - return 0; } + reg_off += reg->aux_off; } - if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) - /* misaligned access to packet is ok on x86,arm,arm64 */ - return 0; - - if (reg->id && size != 1) { - verbose("Unknown packet alignment. Only byte-sized access allowed\n"); + /* For platforms that do not have a Kconfig enabling + * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS the value of + * NET_IP_ALIGN is universally set to '2'. And on platforms + * that do set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS, we get + * to this code only in strict mode where we want to emulate + * the NET_IP_ALIGN==2 checking. Therefore use an + * unconditional IP align value of '2'. + */ + ip_align = 2; + if ((ip_align + reg_off + off) % size != 0) { + verbose("misaligned packet access off %d+%d+%d size %d\n", + ip_align, reg_off, off, size); return -EACCES; } - /* skb->data is NET_IP_ALIGN-ed */ - if (reg->type == PTR_TO_PACKET && - (NET_IP_ALIGN + reg->off + off) % size != 0) { - verbose("misaligned packet access off %d+%d+%d size %d\n", - NET_IP_ALIGN, reg->off, off, size); + return 0; +} + +static int check_val_ptr_alignment(const struct bpf_reg_state *reg, + int size, bool strict) +{ + if (strict && size != 1) { + verbose("Unknown alignment. Only byte-sized access allowed in value access.\n"); return -EACCES; } + return 0; } +static int check_ptr_alignment(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + int off, int size) +{ + bool strict = env->strict_alignment; + + switch (reg->type) { + case PTR_TO_PACKET: + return check_pkt_ptr_alignment(reg, off, size, strict); + case PTR_TO_MAP_VALUE_ADJ: + return check_val_ptr_alignment(reg, size, strict); + default: + if (off % size != 0) { + verbose("misaligned access off %d size %d\n", + off, size); + return -EACCES; + } + + return 0; + } +} + /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory * if t==write && value_regno==-1, some unknown value is stored into memory * if t==read && value_regno==-1, don't care what we read from memory */ -static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, +static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, int off, int bpf_size, enum bpf_access_type t, int value_regno) { @@ -775,47 +902,13 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, return -EACCES; } - /* If we adjusted the register to this map value at all then we - * need to change off and size to min_value and max_value - * respectively to make sure our theoretical access will be - * safe. - */ - if (reg->type == PTR_TO_MAP_VALUE_ADJ) { - if (log_level) - print_verifier_state(state); - env->varlen_map_value_access = true; - /* The minimum value is only important with signed - * comparisons where we can't assume the floor of a - * value is 0. If we are using signed variables for our - * index'es we need to make sure that whatever we use - * will have a set floor within our range. - */ - if (reg->min_value < 0) { - verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", - regno); - return -EACCES; - } - err = check_map_access(env, regno, reg->min_value + off, - size); - if (err) { - verbose("R%d min value is outside of the array range\n", - regno); - return err; - } - - /* If we haven't set a max value then we need to bail - * since we can't be sure we won't do bad things. - */ - if (reg->max_value == BPF_REGISTER_MAX_RANGE) { - verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n", - regno); - return -EACCES; - } - off += reg->max_value; - } - err = check_map_access(env, regno, off, size); + if (reg->type == PTR_TO_MAP_VALUE_ADJ) + err = check_map_access_adj(env, regno, off, size); + else + err = check_map_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown_value(state->regs, value_regno); + mark_reg_unknown_value_and_range(state->regs, + value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = UNKNOWN_VALUE; @@ -825,11 +918,14 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, verbose("R%d leaks addr into ctx\n", value_regno); return -EACCES; } - err = check_ctx_access(env, off, size, t, ®_type); + err = check_ctx_access(env, insn_idx, off, size, t, ®_type); if (!err && t == BPF_READ && value_regno >= 0) { - mark_reg_unknown_value(state->regs, value_regno); + mark_reg_unknown_value_and_range(state->regs, + value_regno); /* note that reg.[id|off|range] == 0 */ state->regs[value_regno].type = reg_type; + state->regs[value_regno].aux_off = 0; + state->regs[value_regno].aux_off_align = 0; } } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) { @@ -837,6 +933,10 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, verbose("invalid stack off=%d size=%d\n", off, size); return -EACCES; } + + if (env->prog->aux->stack_depth < -off) + env->prog->aux->stack_depth = -off; + if (t == BPF_WRITE) { if (!env->allow_ptr_leaks && state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL && @@ -860,7 +960,8 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, } err = check_packet_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown_value(state->regs, value_regno); + mark_reg_unknown_value_and_range(state->regs, + value_regno); } else { verbose("R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); @@ -878,7 +979,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, return err; } -static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn) +static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) { struct bpf_reg_state *regs = env->cur_state.regs; int err; @@ -899,14 +1000,19 @@ static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn) if (err) return err; + if (is_pointer_value(env, insn->src_reg)) { + verbose("R%d leaks addr into mem\n", insn->src_reg); + return -EACCES; + } + /* check whether atomic_add can read the memory */ - err = check_mem_access(env, insn->dst_reg, insn->off, + err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, -1); if (err) return err; /* check whether atomic_add can write into the same memory */ - return check_mem_access(env, insn->dst_reg, insn->off, + return check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1); } @@ -942,6 +1048,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, return -EACCES; } + if (env->prog->aux->stack_depth < -off) + env->prog->aux->stack_depth = -off; + if (meta && meta->raw_mode) { meta->access_size = access_size; meta->regno = regno; @@ -958,6 +1067,25 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, return 0; } +static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, + int access_size, bool zero_size_allowed, + struct bpf_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = env->cur_state.regs; + + switch (regs[regno].type) { + case PTR_TO_PACKET: + return check_packet_access(env, regno, 0, access_size); + case PTR_TO_MAP_VALUE: + return check_map_access(env, regno, 0, access_size); + case PTR_TO_MAP_VALUE_ADJ: + return check_map_access_adj(env, regno, 0, access_size); + default: /* const_imm|ptr_to_stack or invalid ptr */ + return check_stack_boundary(env, regno, access_size, + zero_size_allowed, meta); + } +} + static int check_func_arg(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) @@ -993,10 +1121,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = PTR_TO_STACK; if (type != PTR_TO_PACKET && type != expected_type) goto err_type; - } else if (arg_type == ARG_CONST_STACK_SIZE || - arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { + } else if (arg_type == ARG_CONST_SIZE || + arg_type == ARG_CONST_SIZE_OR_ZERO) { expected_type = CONST_IMM; - if (type != expected_type) + /* One exception. Allow UNKNOWN_VALUE registers when the + * boundaries are known and don't cause unsafe memory accesses + */ + if (type != UNKNOWN_VALUE && type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_MAP_PTR) { expected_type = CONST_PTR_TO_MAP; @@ -1006,8 +1137,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = PTR_TO_CTX; if (type != expected_type) goto err_type; - } else if (arg_type == ARG_PTR_TO_STACK || - arg_type == ARG_PTR_TO_RAW_STACK) { + } else if (arg_type == ARG_PTR_TO_MEM || + arg_type == ARG_PTR_TO_UNINIT_MEM) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be * passed in as argument, it's a CONST_IMM type. Final test @@ -1015,9 +1146,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (type == CONST_IMM && reg->imm == 0) /* final test in check_stack_boundary() */; - else if (type != PTR_TO_PACKET && type != expected_type) + else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE && + type != PTR_TO_MAP_VALUE_ADJ && type != expected_type) goto err_type; - meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK; + meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; } else { verbose("unsupported arg_type %d\n", arg_type); return -EFAULT; @@ -1063,9 +1195,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_stack_boundary(env, regno, meta->map_ptr->value_size, false, NULL); - } else if (arg_type == ARG_CONST_STACK_SIZE || - arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { - bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO); + } else if (arg_type == ARG_CONST_SIZE || + arg_type == ARG_CONST_SIZE_OR_ZERO) { + bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); /* bpf_xxx(..., buf, len) call will access 'len' bytes * from stack pointer 'buf'. Check it @@ -1073,14 +1205,50 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (regno == 0) { /* kernel subsystem misconfigured verifier */ - verbose("ARG_CONST_STACK_SIZE cannot be first argument\n"); + verbose("ARG_CONST_SIZE cannot be first argument\n"); return -EACCES; } - if (regs[regno - 1].type == PTR_TO_PACKET) - err = check_packet_access(env, regno - 1, 0, reg->imm); - else - err = check_stack_boundary(env, regno - 1, reg->imm, - zero_size_allowed, meta); + + /* If the register is UNKNOWN_VALUE, the access check happens + * using its boundaries. Otherwise, just use its imm + */ + if (type == UNKNOWN_VALUE) { + /* For unprivileged variable accesses, disable raw + * mode so that the program is required to + * initialize all the memory that the helper could + * just partially fill up. + */ + meta = NULL; + + if (reg->min_value < 0) { + verbose("R%d min value is negative, either use unsigned or 'var &= const'\n", + regno); + return -EACCES; + } + + if (reg->min_value == 0) { + err = check_helper_mem_access(env, regno - 1, 0, + zero_size_allowed, + meta); + if (err) + return err; + } + + if (reg->max_value == BPF_REGISTER_MAX_RANGE) { + verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", + regno); + return -EACCES; + } + err = check_helper_mem_access(env, regno - 1, + reg->max_value, + zero_size_allowed, meta); + if (err) + return err; + } else { + /* register is CONST_IMM */ + err = check_helper_mem_access(env, regno - 1, reg->imm, + zero_size_allowed, meta); + } } return err; @@ -1115,6 +1283,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) func_id != BPF_FUNC_current_task_under_cgroup) goto error; break; + case BPF_MAP_TYPE_ARRAY_OF_MAPS: + case BPF_MAP_TYPE_HASH_OF_MAPS: + if (func_id != BPF_FUNC_map_lookup_elem) + goto error; default: break; } @@ -1154,15 +1326,15 @@ static int check_raw_mode(const struct bpf_func_proto *fn) { int count = 0; - if (fn->arg1_type == ARG_PTR_TO_RAW_STACK) + if (fn->arg1_type == ARG_PTR_TO_UNINIT_MEM) count++; - if (fn->arg2_type == ARG_PTR_TO_RAW_STACK) + if (fn->arg2_type == ARG_PTR_TO_UNINIT_MEM) count++; - if (fn->arg3_type == ARG_PTR_TO_RAW_STACK) + if (fn->arg3_type == ARG_PTR_TO_UNINIT_MEM) count++; - if (fn->arg4_type == ARG_PTR_TO_RAW_STACK) + if (fn->arg4_type == ARG_PTR_TO_UNINIT_MEM) count++; - if (fn->arg5_type == ARG_PTR_TO_RAW_STACK) + if (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM) count++; return count > 1 ? -EINVAL : 0; @@ -1186,17 +1358,16 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) if (reg->type != PTR_TO_PACKET && reg->type != PTR_TO_PACKET_END) continue; - reg->type = UNKNOWN_VALUE; - reg->imm = 0; + __mark_reg_unknown_value(state->spilled_regs, + i / BPF_REG_SIZE); } } -static int check_call(struct bpf_verifier_env *env, int func_id) +static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) { struct bpf_verifier_state *state = &env->cur_state; const struct bpf_func_proto *fn = NULL; struct bpf_reg_state *regs = state->regs; - struct bpf_reg_state *reg; struct bpf_call_arg_meta meta; bool changes_data; int i, err; @@ -1257,17 +1428,14 @@ static int check_call(struct bpf_verifier_env *env, int func_id) * is inferred from register state. */ for (i = 0; i < meta.access_size; i++) { - err = check_mem_access(env, meta.regno, i, BPF_B, BPF_WRITE, -1); + err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B, BPF_WRITE, -1); if (err) return err; } /* reset caller saved regs */ - for (i = 0; i < CALLER_SAVED_REGS; i++) { - reg = regs + caller_saved[i]; - reg->type = NOT_INIT; - reg->imm = 0; - } + for (i = 0; i < CALLER_SAVED_REGS; i++) + mark_reg_not_init(regs, caller_saved[i]); /* update return register */ if (fn->ret_type == RET_INTEGER) { @@ -1275,6 +1443,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id) } else if (fn->ret_type == RET_VOID) { regs[BPF_REG_0].type = NOT_INIT; } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { + struct bpf_insn_aux_data *insn_aux; + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; regs[BPF_REG_0].max_value = regs[BPF_REG_0].min_value = 0; /* remember map_ptr, so that check_map_access() @@ -1287,6 +1457,11 @@ static int check_call(struct bpf_verifier_env *env, int func_id) } regs[BPF_REG_0].map_ptr = meta.map_ptr; regs[BPF_REG_0].id = ++env->id_gen; + insn_aux = &env->insn_aux_data[insn_idx]; + if (!insn_aux->map_ptr) + insn_aux->map_ptr = meta.map_ptr; + else if (insn_aux->map_ptr != meta.map_ptr) + insn_aux->map_ptr = BPF_MAP_PTR_POISON; } else { verbose("unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); @@ -1316,7 +1491,7 @@ static int check_packet_ptr_add(struct bpf_verifier_env *env, imm = insn->imm; add_imm: - if (imm <= 0) { + if (imm < 0) { verbose("addition of negative constant to packet pointer is not allowed\n"); return -EACCES; } @@ -1331,6 +1506,8 @@ add_imm: */ dst_reg->off += imm; } else { + bool had_id; + if (src_reg->type == PTR_TO_PACKET) { /* R6=pkt(id=0,off=0,r=62) R7=imm22; r7 += r6 */ tmp_reg = *dst_reg; /* save r7 state */ @@ -1364,14 +1541,23 @@ add_imm: src_reg->imm); return -EACCES; } + + had_id = (dst_reg->id != 0); + /* dst_reg stays as pkt_ptr type and since some positive * integer value was added to the pointer, increment its 'id' */ dst_reg->id = ++env->id_gen; - /* something was added to pkt_ptr, set range and off to zero */ + /* something was added to pkt_ptr, set range to zero */ + dst_reg->aux_off += dst_reg->off; dst_reg->off = 0; dst_reg->range = 0; + if (had_id) + dst_reg->aux_off_align = min(dst_reg->aux_off_align, + src_reg->min_align); + else + dst_reg->aux_off_align = src_reg->min_align; } return 0; } @@ -1478,6 +1664,65 @@ static int evaluate_reg_alu(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } +static int evaluate_reg_imm_alu_unknown(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *dst_reg = ®s[insn->dst_reg]; + struct bpf_reg_state *src_reg = ®s[insn->src_reg]; + u8 opcode = BPF_OP(insn->code); + s64 imm_log2 = __ilog2_u64((long long)dst_reg->imm); + + /* BPF_X code with src_reg->type UNKNOWN_VALUE here. */ + if (src_reg->imm > 0 && dst_reg->imm) { + switch (opcode) { + case BPF_ADD: + /* dreg += sreg + * where both have zero upper bits. Adding them + * can only result making one more bit non-zero + * in the larger value. + * Ex. 0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47) + * 0xffff (imm=48) + 0xffff = 0x1fffe (imm=47) + */ + dst_reg->imm = min(src_reg->imm, 63 - imm_log2); + dst_reg->imm--; + break; + case BPF_AND: + /* dreg &= sreg + * AND can not extend zero bits only shrink + * Ex. 0x00..00ffffff + * & 0x0f..ffffffff + * ---------------- + * 0x00..00ffffff + */ + dst_reg->imm = max(src_reg->imm, 63 - imm_log2); + break; + case BPF_OR: + /* dreg |= sreg + * OR can only extend zero bits + * Ex. 0x00..00ffffff + * | 0x0f..ffffffff + * ---------------- + * 0x0f..00ffffff + */ + dst_reg->imm = min(src_reg->imm, 63 - imm_log2); + break; + case BPF_SUB: + case BPF_MUL: + case BPF_RSH: + case BPF_LSH: + /* These may be flushed out later */ + default: + mark_reg_unknown_value(regs, insn->dst_reg); + } + } else { + mark_reg_unknown_value(regs, insn->dst_reg); + } + + dst_reg->type = UNKNOWN_VALUE; + return 0; +} + static int evaluate_reg_imm_alu(struct bpf_verifier_env *env, struct bpf_insn *insn) { @@ -1485,22 +1730,57 @@ static int evaluate_reg_imm_alu(struct bpf_verifier_env *env, struct bpf_reg_state *dst_reg = ®s[insn->dst_reg]; struct bpf_reg_state *src_reg = ®s[insn->src_reg]; u8 opcode = BPF_OP(insn->code); + u64 dst_imm = dst_reg->imm; - /* dst_reg->type == CONST_IMM here, simulate execution of 'add'/'or' - * insn. Don't care about overflow or negative values, just add them + if (BPF_SRC(insn->code) == BPF_X && src_reg->type == UNKNOWN_VALUE) + return evaluate_reg_imm_alu_unknown(env, insn); + + /* dst_reg->type == CONST_IMM here. Simulate execution of insns + * containing ALU ops. Don't care about overflow or negative + * values, just add/sub/... them; registers are in u64. */ - if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K) - dst_reg->imm += insn->imm; - else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X && - src_reg->type == CONST_IMM) - dst_reg->imm += src_reg->imm; - else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_K) - dst_reg->imm |= insn->imm; - else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_X && - src_reg->type == CONST_IMM) - dst_reg->imm |= src_reg->imm; - else + if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K) { + dst_imm += insn->imm; + } else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm += src_reg->imm; + } else if (opcode == BPF_SUB && BPF_SRC(insn->code) == BPF_K) { + dst_imm -= insn->imm; + } else if (opcode == BPF_SUB && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm -= src_reg->imm; + } else if (opcode == BPF_MUL && BPF_SRC(insn->code) == BPF_K) { + dst_imm *= insn->imm; + } else if (opcode == BPF_MUL && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm *= src_reg->imm; + } else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_K) { + dst_imm |= insn->imm; + } else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm |= src_reg->imm; + } else if (opcode == BPF_AND && BPF_SRC(insn->code) == BPF_K) { + dst_imm &= insn->imm; + } else if (opcode == BPF_AND && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm &= src_reg->imm; + } else if (opcode == BPF_RSH && BPF_SRC(insn->code) == BPF_K) { + dst_imm >>= insn->imm; + } else if (opcode == BPF_RSH && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm >>= src_reg->imm; + } else if (opcode == BPF_LSH && BPF_SRC(insn->code) == BPF_K) { + dst_imm <<= insn->imm; + } else if (opcode == BPF_LSH && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm <<= src_reg->imm; + } else { mark_reg_unknown_value(regs, insn->dst_reg); + goto out; + } + + dst_reg->imm = dst_imm; +out: return 0; } @@ -1513,6 +1793,13 @@ static void check_reg_overflow(struct bpf_reg_state *reg) reg->min_value = BPF_REGISTER_MIN_RANGE; } +static u32 calc_align(u32 imm) +{ + if (!imm) + return 1U << 31; + return imm - ((imm - 1) & imm); +} + static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn) { @@ -1520,8 +1807,10 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, s64 min_val = BPF_REGISTER_MIN_RANGE; u64 max_val = BPF_REGISTER_MAX_RANGE; u8 opcode = BPF_OP(insn->code); + u32 dst_align, src_align; dst_reg = ®s[insn->dst_reg]; + src_align = 0; if (BPF_SRC(insn->code) == BPF_X) { check_reg_overflow(®s[insn->src_reg]); min_val = regs[insn->src_reg].min_value; @@ -1537,17 +1826,37 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, regs[insn->src_reg].type != UNKNOWN_VALUE) { min_val = BPF_REGISTER_MIN_RANGE; max_val = BPF_REGISTER_MAX_RANGE; + src_align = 0; + } else { + src_align = regs[insn->src_reg].min_align; } } else if (insn->imm < BPF_REGISTER_MAX_RANGE && (s64)insn->imm > BPF_REGISTER_MIN_RANGE) { min_val = max_val = insn->imm; + src_align = calc_align(insn->imm); } + dst_align = dst_reg->min_align; + /* We don't know anything about what was done to this register, mark it - * as unknown. + * as unknown. Also, if both derived bounds came from signed/unsigned + * mixed compares and one side is unbounded, we cannot really do anything + * with them as boundaries cannot be trusted. Thus, arithmetic of two + * regs of such kind will get invalidated bounds on the dst side. */ - if (min_val == BPF_REGISTER_MIN_RANGE && - max_val == BPF_REGISTER_MAX_RANGE) { + if ((min_val == BPF_REGISTER_MIN_RANGE && + max_val == BPF_REGISTER_MAX_RANGE) || + (BPF_SRC(insn->code) == BPF_X && + ((min_val != BPF_REGISTER_MIN_RANGE && + max_val == BPF_REGISTER_MAX_RANGE) || + (min_val == BPF_REGISTER_MIN_RANGE && + max_val != BPF_REGISTER_MAX_RANGE) || + (dst_reg->min_value != BPF_REGISTER_MIN_RANGE && + dst_reg->max_value == BPF_REGISTER_MAX_RANGE) || + (dst_reg->min_value == BPF_REGISTER_MIN_RANGE && + dst_reg->max_value != BPF_REGISTER_MAX_RANGE)) && + regs[insn->dst_reg].value_from_signed != + regs[insn->src_reg].value_from_signed)) { reset_reg_range_values(regs, insn->dst_reg); return; } @@ -1567,18 +1876,21 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, dst_reg->min_value += min_val; if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) dst_reg->max_value += max_val; + dst_reg->min_align = min(src_align, dst_align); break; case BPF_SUB: if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) dst_reg->min_value -= min_val; if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) dst_reg->max_value -= max_val; + dst_reg->min_align = min(src_align, dst_align); break; case BPF_MUL: if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) dst_reg->min_value *= min_val; if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) dst_reg->max_value *= max_val; + dst_reg->min_align = max(src_align, dst_align); break; case BPF_AND: /* Disallow AND'ing of negative numbers, ain't nobody got time @@ -1590,17 +1902,23 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, else dst_reg->min_value = 0; dst_reg->max_value = max_val; + dst_reg->min_align = max(src_align, dst_align); break; case BPF_LSH: /* Gotta have special overflow logic here, if we're shifting * more than MAX_RANGE then just assume we have an invalid * range. */ - if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) + if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) { dst_reg->min_value = BPF_REGISTER_MIN_RANGE; - else if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) - dst_reg->min_value <<= min_val; - + dst_reg->min_align = 1; + } else { + if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) + dst_reg->min_value <<= min_val; + if (!dst_reg->min_align) + dst_reg->min_align = 1; + dst_reg->min_align <<= min_val; + } if (max_val > ilog2(BPF_REGISTER_MAX_RANGE)) dst_reg->max_value = BPF_REGISTER_MAX_RANGE; else if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) @@ -1610,11 +1928,19 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, /* RSH by a negative number is undefined, and the BPF_RSH is an * unsigned shift, so make the appropriate casts. */ - if (min_val < 0 || dst_reg->min_value < 0) + if (min_val < 0 || dst_reg->min_value < 0) { dst_reg->min_value = BPF_REGISTER_MIN_RANGE; - else + } else { dst_reg->min_value = (u64)(dst_reg->min_value) >> min_val; + } + if (min_val < 0) { + dst_reg->min_align = 1; + } else { + dst_reg->min_align >>= (u64) min_val; + if (!dst_reg->min_align) + dst_reg->min_align = 1; + } if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) dst_reg->max_value >>= max_val; break; @@ -1714,8 +2040,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) */ regs[insn->dst_reg].type = CONST_IMM; regs[insn->dst_reg].imm = insn->imm; + regs[insn->dst_reg].id = 0; regs[insn->dst_reg].max_value = insn->imm; regs[insn->dst_reg].min_value = insn->imm; + regs[insn->dst_reg].min_align = calc_align(insn->imm); + regs[insn->dst_reg].value_from_signed = false; } } else if (opcode > BPF_END) { @@ -1779,6 +2108,17 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } else if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && + dst_reg->type == PTR_TO_STACK && + ((BPF_SRC(insn->code) == BPF_X && + regs[insn->src_reg].type == CONST_IMM) || + BPF_SRC(insn->code) == BPF_K)) { + if (BPF_SRC(insn->code) == BPF_X) + dst_reg->imm += regs[insn->src_reg].imm; + else + dst_reg->imm += insn->imm; + return 0; + } else if (opcode == BPF_ADD && + BPF_CLASS(insn->code) == BPF_ALU64 && (dst_reg->type == PTR_TO_PACKET || (BPF_SRC(insn->code) == BPF_X && regs[insn->src_reg].type == PTR_TO_PACKET))) { @@ -1811,6 +2151,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) * register as unknown. */ if (env->allow_ptr_leaks && + BPF_CLASS(insn->code) == BPF_ALU64 && opcode == BPF_ADD && (dst_reg->type == PTR_TO_MAP_VALUE || dst_reg->type == PTR_TO_MAP_VALUE_ADJ)) dst_reg->type = PTR_TO_MAP_VALUE_ADJ; @@ -1859,14 +2200,15 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, for (i = 0; i < MAX_BPF_REG; i++) if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id) - regs[i].range = dst_reg->off; + /* keep the maximum range already checked */ + regs[i].range = max(regs[i].range, dst_reg->off); for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] != STACK_SPILL) continue; reg = &state->spilled_regs[i / BPF_REG_SIZE]; if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id) - reg->range = dst_reg->off; + reg->range = max(reg->range, dst_reg->off); } } @@ -1878,38 +2220,63 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, struct bpf_reg_state *false_reg, u64 val, u8 opcode) { + bool value_from_signed = true; + bool is_range = true; + switch (opcode) { case BPF_JEQ: /* If this is false then we know nothing Jon Snow, but if it is * true then we know for sure. */ true_reg->max_value = true_reg->min_value = val; + is_range = false; break; case BPF_JNE: /* If this is true we know nothing Jon Snow, but if it is false * we know the value for sure; */ false_reg->max_value = false_reg->min_value = val; + is_range = false; break; case BPF_JGT: - /* Unsigned comparison, the minimum value is 0. */ - false_reg->min_value = 0; + value_from_signed = false; + /* fallthrough */ case BPF_JSGT: + if (true_reg->value_from_signed != value_from_signed) + reset_reg_range_values(true_reg, 0); + if (false_reg->value_from_signed != value_from_signed) + reset_reg_range_values(false_reg, 0); + if (opcode == BPF_JGT) { + /* Unsigned comparison, the minimum value is 0. */ + false_reg->min_value = 0; + } /* If this is false then we know the maximum val is val, * otherwise we know the min val is val+1. */ false_reg->max_value = val; + false_reg->value_from_signed = value_from_signed; true_reg->min_value = val + 1; + true_reg->value_from_signed = value_from_signed; break; case BPF_JGE: - /* Unsigned comparison, the minimum value is 0. */ - false_reg->min_value = 0; + value_from_signed = false; + /* fallthrough */ case BPF_JSGE: + if (true_reg->value_from_signed != value_from_signed) + reset_reg_range_values(true_reg, 0); + if (false_reg->value_from_signed != value_from_signed) + reset_reg_range_values(false_reg, 0); + if (opcode == BPF_JGE) { + /* Unsigned comparison, the minimum value is 0. */ + false_reg->min_value = 0; + } /* If this is false then we know the maximum value is val - 1, * otherwise we know the mimimum value is val. */ false_reg->max_value = val - 1; + false_reg->value_from_signed = value_from_signed; true_reg->min_value = val; + true_reg->value_from_signed = value_from_signed; break; default: break; @@ -1917,6 +2284,12 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, check_reg_overflow(false_reg); check_reg_overflow(true_reg); + if (is_range) { + if (__is_pointer_value(false, false_reg)) + reset_reg_range_values(false_reg, 0); + if (__is_pointer_value(false, true_reg)) + reset_reg_range_values(true_reg, 0); + } } /* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg @@ -1926,39 +2299,64 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, struct bpf_reg_state *false_reg, u64 val, u8 opcode) { + bool value_from_signed = true; + bool is_range = true; + switch (opcode) { case BPF_JEQ: /* If this is false then we know nothing Jon Snow, but if it is * true then we know for sure. */ true_reg->max_value = true_reg->min_value = val; + is_range = false; break; case BPF_JNE: /* If this is true we know nothing Jon Snow, but if it is false * we know the value for sure; */ false_reg->max_value = false_reg->min_value = val; + is_range = false; break; case BPF_JGT: - /* Unsigned comparison, the minimum value is 0. */ - true_reg->min_value = 0; + value_from_signed = false; + /* fallthrough */ case BPF_JSGT: + if (true_reg->value_from_signed != value_from_signed) + reset_reg_range_values(true_reg, 0); + if (false_reg->value_from_signed != value_from_signed) + reset_reg_range_values(false_reg, 0); + if (opcode == BPF_JGT) { + /* Unsigned comparison, the minimum value is 0. */ + true_reg->min_value = 0; + } /* * If this is false, then the val is <= the register, if it is * true the register <= to the val. */ false_reg->min_value = val; + false_reg->value_from_signed = value_from_signed; true_reg->max_value = val - 1; + true_reg->value_from_signed = value_from_signed; break; case BPF_JGE: - /* Unsigned comparison, the minimum value is 0. */ - true_reg->min_value = 0; + value_from_signed = false; + /* fallthrough */ case BPF_JSGE: + if (true_reg->value_from_signed != value_from_signed) + reset_reg_range_values(true_reg, 0); + if (false_reg->value_from_signed != value_from_signed) + reset_reg_range_values(false_reg, 0); + if (opcode == BPF_JGE) { + /* Unsigned comparison, the minimum value is 0. */ + true_reg->min_value = 0; + } /* If this is false then constant < register, if it is true then * the register < constant. */ false_reg->min_value = val + 1; + false_reg->value_from_signed = value_from_signed; true_reg->max_value = val; + true_reg->value_from_signed = value_from_signed; break; default: break; @@ -1966,6 +2364,12 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, check_reg_overflow(false_reg); check_reg_overflow(true_reg); + if (is_range) { + if (__is_pointer_value(false, false_reg)) + reset_reg_range_values(false_reg, 0); + if (__is_pointer_value(false, true_reg)) + reset_reg_range_values(true_reg, 0); + } } static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, @@ -1974,14 +2378,19 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, struct bpf_reg_state *reg = ®s[regno]; if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) { - reg->type = type; + if (type == UNKNOWN_VALUE) { + __mark_reg_unknown_value(regs, regno); + } else if (reg->map_ptr->inner_map_meta) { + reg->type = CONST_PTR_TO_MAP; + reg->map_ptr = reg->map_ptr->inner_map_meta; + } else { + reg->type = type; + } /* We don't need id from this point onwards anymore, thus we * should better reset it, so that state pruning has chances * to take effect. */ reg->id = 0; - if (type == UNKNOWN_VALUE) - __mark_reg_unknown_value(regs, regno); } } @@ -2144,16 +2553,11 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; if (insn->src_reg == 0) { - /* generic move 64-bit immediate into a register, - * only analyzer needs to collect the ld_imm value. - */ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; - if (!env->analyzer_ops) - return 0; - regs[insn->dst_reg].type = CONST_IMM; regs[insn->dst_reg].imm = imm; + regs[insn->dst_reg].id = 0; return 0; } @@ -2196,7 +2600,6 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) { struct bpf_reg_state *regs = env->cur_state.regs; u8 mode = BPF_MODE(insn->code); - struct bpf_reg_state *reg; int i, err; if (!may_access_skb(env->prog->type)) { @@ -2229,11 +2632,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* reset caller saved regs to unreadable */ - for (i = 0; i < CALLER_SAVED_REGS; i++) { - reg = regs + caller_saved[i]; - reg->type = NOT_INIT; - reg->imm = 0; - } + for (i = 0; i < CALLER_SAVED_REGS; i++) + mark_reg_not_init(regs, caller_saved[i]); /* mark destination R0 register as readable, since it contains * the value fetched from the packet @@ -2392,6 +2792,7 @@ peek_stack: env->explored_states[t + 1] = STATE_LIST_MARK; } else { /* conditional jump with two edges */ + env->explored_states[t] = STATE_LIST_MARK; ret = push_insn(t, t + 1, FALLTHROUGH, env); if (ret == 1) goto peek_stack; @@ -2443,7 +2844,8 @@ err_free: /* the following conditions reduce the number of explored insns * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet */ -static bool compare_ptrs_to_packet(struct bpf_reg_state *old, +static bool compare_ptrs_to_packet(struct bpf_verifier_env *env, + struct bpf_reg_state *old, struct bpf_reg_state *cur) { if (old->id != cur->id) @@ -2486,7 +2888,7 @@ static bool compare_ptrs_to_packet(struct bpf_reg_state *old, * 'if (R4 > data_end)' and all further insn were already good with r=20, * so they will be good with r=30 and we can prune the search. */ - if (old->off <= cur->off && + if (!env->strict_alignment && old->off <= cur->off && old->off >= old->range && cur->off >= cur->range) return true; @@ -2550,8 +2952,14 @@ static bool states_equal(struct bpf_verifier_env *env, rcur->type != NOT_INIT)) continue; + /* Don't care about the reg->id in this case. */ + if (rold->type == PTR_TO_MAP_VALUE_OR_NULL && + rcur->type == PTR_TO_MAP_VALUE_OR_NULL && + rold->map_ptr == rcur->map_ptr) + continue; + if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET && - compare_ptrs_to_packet(rold, rcur)) + compare_ptrs_to_packet(env, rold, rcur)) continue; return false; @@ -2569,6 +2977,8 @@ static bool states_equal(struct bpf_verifier_env *env, return false; if (i % BPF_REG_SIZE) continue; + if (old->stack_slot_type[i] != STACK_SPILL) + continue; if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE], &cur->spilled_regs[i / BPF_REG_SIZE], sizeof(old->spilled_regs[0]))) @@ -2664,7 +3074,7 @@ static int do_check(struct bpf_verifier_env *env) class = BPF_CLASS(insn->code); if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { - verbose("BPF program is too large. Proccessed %d insn\n", + verbose("BPF program is too large. Processed %d insn\n", insn_processed); return -E2BIG; } @@ -2684,15 +3094,22 @@ static int do_check(struct bpf_verifier_env *env) goto process_bpf_exit; } - if (log_level && do_print_state) { - verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx); + if (need_resched()) + cond_resched(); + + if (log_level > 1 || (log_level && do_print_state)) { + if (log_level > 1) + verbose("%d:", insn_idx); + else + verbose("\nfrom %d to %d:", + prev_insn_idx, insn_idx); print_verifier_state(&env->cur_state); do_print_state = false; } if (log_level) { verbose("%d: ", insn_idx); - print_bpf_insn(insn); + print_bpf_insn(env, insn); } err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx); @@ -2723,19 +3140,12 @@ static int do_check(struct bpf_verifier_env *env) /* check that memory (src_reg + off) is readable, * the state of dst_reg will be updated by this func */ - err = check_mem_access(env, insn->src_reg, insn->off, + err = check_mem_access(env, insn_idx, insn->src_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, insn->dst_reg); if (err) return err; - reset_reg_range_values(regs, insn->dst_reg); - if (BPF_SIZE(insn->code) != BPF_W && - BPF_SIZE(insn->code) != BPF_DW) { - insn_idx++; - continue; - } - prev_src_type = &env->insn_aux_data[insn_idx].ptr_type; if (*prev_src_type == NOT_INIT) { @@ -2763,7 +3173,7 @@ static int do_check(struct bpf_verifier_env *env) enum bpf_reg_type *prev_dst_type, dst_reg_type; if (BPF_MODE(insn->code) == BPF_XADD) { - err = check_xadd(env, insn); + err = check_xadd(env, insn_idx, insn); if (err) return err; insn_idx++; @@ -2782,7 +3192,7 @@ static int do_check(struct bpf_verifier_env *env) dst_reg_type = regs[insn->dst_reg].type; /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, insn->dst_reg, insn->off, + err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, insn->src_reg); if (err) @@ -2811,7 +3221,7 @@ static int do_check(struct bpf_verifier_env *env) return err; /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, insn->dst_reg, insn->off, + err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1); if (err) @@ -2829,7 +3239,7 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } - err = check_call(env, insn->imm); + err = check_call(env, insn->imm, insn_idx); if (err) return err; @@ -2909,20 +3319,38 @@ process_bpf_exit: insn_idx++; } - verbose("processed %d insns\n", insn_processed); + verbose("processed %d insns, stack depth %d\n", + insn_processed, env->prog->aux->stack_depth); return 0; } +static int check_map_prealloc(struct bpf_map *map) +{ + return (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_PERCPU_HASH && + map->map_type != BPF_MAP_TYPE_HASH_OF_MAPS) || + !(map->map_flags & BPF_F_NO_PREALLOC); +} + static int check_map_prog_compatibility(struct bpf_map *map, struct bpf_prog *prog) { - if (prog->type == BPF_PROG_TYPE_PERF_EVENT && - (map->map_type == BPF_MAP_TYPE_HASH || - map->map_type == BPF_MAP_TYPE_PERCPU_HASH) && - (map->map_flags & BPF_F_NO_PREALLOC)) { - verbose("perf_event programs can only use preallocated hash map\n"); - return -EINVAL; + /* Make sure that BPF_PROG_TYPE_PERF_EVENT programs only use + * preallocated hash maps, since doing memory allocation + * in overflow_handler can crash depending on where nmi got + * triggered. + */ + if (prog->type == BPF_PROG_TYPE_PERF_EVENT) { + if (!check_map_prealloc(map)) { + verbose("perf_event programs can only use preallocated hash map\n"); + return -EINVAL; + } + if (map->inner_map_meta && + !check_map_prealloc(map->inner_map_meta)) { + verbose("perf_event programs can only use preallocated inner hash map\n"); + return -EINVAL; + } } return 0; } @@ -3051,17 +3479,54 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) insn->src_reg = 0; } +/* single env->prog->insni[off] instruction was replaced with the range + * insni[off, off + cnt). Adjust corresponding insn_aux_data by copying + * [0, off) and [off, end) to new locations, so the patched range stays zero + */ +static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, + u32 off, u32 cnt) +{ + struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data; + + if (cnt == 1) + return 0; + new_data = vzalloc(sizeof(struct bpf_insn_aux_data) * prog_len); + if (!new_data) + return -ENOMEM; + memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off); + memcpy(new_data + off + cnt - 1, old_data + off, + sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); + env->insn_aux_data = new_data; + vfree(old_data); + return 0; +} + +static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, + const struct bpf_insn *patch, u32 len) +{ + struct bpf_prog *new_prog; + + new_prog = bpf_patch_insn_single(env->prog, off, patch, len); + if (!new_prog) + return NULL; + if (adjust_insn_aux_data(env, new_prog->len, off, len)) + return NULL; + return new_prog; +} + /* convert load instructions that access fields of 'struct __sk_buff' * into sequence of instructions that access fields of 'struct sk_buff' */ static int convert_ctx_accesses(struct bpf_verifier_env *env) { const struct bpf_verifier_ops *ops = env->prog->aux->ops; + int i, cnt, size, ctx_field_size, delta = 0; const int insn_cnt = env->prog->len; struct bpf_insn insn_buf[16], *insn; struct bpf_prog *new_prog; enum bpf_access_type type; - int i, cnt, delta = 0; + bool is_narrower_load; + u32 target_size; if (ops->gen_prologue) { cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, @@ -3070,10 +3535,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) verbose("bpf verifier is misconfigured\n"); return -EINVAL; } else if (cnt) { - new_prog = bpf_patch_insn_single(env->prog, 0, - insn_buf, cnt); + new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt); if (!new_prog) return -ENOMEM; + env->prog = new_prog; delta += cnt - 1; } @@ -3085,27 +3550,69 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn = env->prog->insnsi + delta; for (i = 0; i < insn_cnt; i++, insn++) { - if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) || + if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) || + insn->code == (BPF_LDX | BPF_MEM | BPF_H) || + insn->code == (BPF_LDX | BPF_MEM | BPF_W) || insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) type = BPF_READ; - else if (insn->code == (BPF_STX | BPF_MEM | BPF_W) || + else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) || + insn->code == (BPF_STX | BPF_MEM | BPF_H) || + insn->code == (BPF_STX | BPF_MEM | BPF_W) || insn->code == (BPF_STX | BPF_MEM | BPF_DW)) type = BPF_WRITE; else continue; - if (env->insn_aux_data[i].ptr_type != PTR_TO_CTX) + if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX) continue; - cnt = ops->convert_ctx_access(type, insn->dst_reg, insn->src_reg, - insn->off, insn_buf, env->prog); - if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size; + size = BPF_LDST_BYTES(insn); + + /* If the read access is a narrower load of the field, + * convert to a 4/8-byte load, to minimum program type specific + * convert_ctx_access changes. If conversion is successful, + * we will apply proper mask to the result. + */ + is_narrower_load = size < ctx_field_size; + if (is_narrower_load) { + u32 off = insn->off; + u8 size_code; + + if (type == BPF_WRITE) { + verbose("bpf verifier narrow ctx access misconfigured\n"); + return -EINVAL; + } + + size_code = BPF_H; + if (ctx_field_size == 4) + size_code = BPF_W; + else if (ctx_field_size == 8) + size_code = BPF_DW; + + insn->off = off & ~(ctx_field_size - 1); + insn->code = BPF_LDX | BPF_MEM | size_code; + } + + target_size = 0; + cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog, + &target_size); + if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) || + (ctx_field_size && !target_size)) { verbose("bpf verifier is misconfigured\n"); return -EINVAL; } - new_prog = bpf_patch_insn_single(env->prog, i + delta, insn_buf, - cnt); + if (is_narrower_load && size < target_size) { + if (ctx_field_size <= 4) + insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg, + (1 << size * 8) - 1); + else + insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg, + (1 << size * 8) - 1); + } + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; @@ -3119,6 +3626,90 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) return 0; } +/* fixup insn->imm field of bpf_call instructions + * and inline eligible helpers as explicit sequence of BPF instructions + * + * this function is called after eBPF program passed verification + */ +static int fixup_bpf_calls(struct bpf_verifier_env *env) +{ + struct bpf_prog *prog = env->prog; + struct bpf_insn *insn = prog->insnsi; + const struct bpf_func_proto *fn; + const int insn_cnt = prog->len; + struct bpf_insn insn_buf[16]; + struct bpf_prog *new_prog; + struct bpf_map *map_ptr; + int i, cnt, delta = 0; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (insn->code != (BPF_JMP | BPF_CALL)) + continue; + + if (insn->imm == BPF_FUNC_get_route_realm) + prog->dst_needed = 1; + if (insn->imm == BPF_FUNC_get_prandom_u32) + bpf_user_rnd_init_once(); + if (insn->imm == BPF_FUNC_tail_call) { + /* If we tail call into other programs, we + * cannot make any assumptions since they can + * be replaced dynamically during runtime in + * the program array. + */ + prog->cb_access = 1; + env->prog->aux->stack_depth = MAX_BPF_STACK; + + /* mark bpf_tail_call as different opcode to avoid + * conditional branch in the interpeter for every normal + * call and to prevent accidental JITing by JIT compiler + * that doesn't support bpf_tail_call yet + */ + insn->imm = 0; + insn->code = BPF_JMP | BPF_TAIL_CALL; + continue; + } + + if (ebpf_jit_enabled() && insn->imm == BPF_FUNC_map_lookup_elem) { + map_ptr = env->insn_aux_data[i + delta].map_ptr; + if (map_ptr == BPF_MAP_PTR_POISON || + !map_ptr->ops->map_gen_lookup) + goto patch_call_imm; + + cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf); + if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + verbose("bpf verifier is misconfigured\n"); + return -EINVAL; + } + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, + cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + + /* keep walking new program and skip insns we just inserted */ + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + +patch_call_imm: + fn = prog->aux->ops->get_func_proto(insn->imm); + /* all functions that have prototype and verifier allowed + * programs to call them, must be real in-kernel functions + */ + if (!fn->func) { + verbose("kernel subsystem misconfigured func %s#%d\n", + func_id_name(insn->imm), insn->imm); + return -EFAULT; + } + insn->imm = fn->func - __bpf_call_base; + } + + return 0; +} + static void free_states(struct bpf_verifier_env *env) { struct bpf_verifier_state_list *sl, *sln; @@ -3187,6 +3778,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) log_level = 0; } + env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) + env->strict_alignment = true; + ret = replace_map_fd_with_map_ptr(env); if (ret < 0) goto skip_full_check; @@ -3214,6 +3809,9 @@ skip_full_check: /* program is valid, convert *(u32*)(ctx + off) accesses */ ret = convert_ctx_accesses(env); + if (ret == 0) + ret = fixup_bpf_calls(env); + if (log_level && log_len >= log_size - 1) { BUG_ON(log_len >= log_size); /* verifier log exceeded user supplied buffer */ @@ -3289,6 +3887,10 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, log_level = 0; + env->strict_alignment = false; + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) + env->strict_alignment = true; + env->explored_states = kcalloc(env->prog->len, sizeof(struct bpf_verifier_state_list *), GFP_KERNEL); |