aboutsummaryrefslogtreecommitdiff
path: root/kernel/bpf
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/bpf')
-rw-r--r--kernel/bpf/Kconfig1
-rw-r--r--kernel/bpf/Makefile3
-rw-r--r--kernel/bpf/bloom_filter.c3
-rw-r--r--kernel/bpf/bpf_local_storage.c3
-rw-r--r--kernel/bpf/bpf_lru_list.c21
-rw-r--r--kernel/bpf/bpf_lru_list.h8
-rw-r--r--kernel/bpf/bpf_struct_ops.c24
-rw-r--r--kernel/bpf/btf.c117
-rw-r--r--kernel/bpf/cgroup.c15
-rw-r--r--kernel/bpf/core.c214
-rw-r--r--kernel/bpf/cpumap.c143
-rw-r--r--kernel/bpf/cpumask.c58
-rw-r--r--kernel/bpf/devmap.c5
-rw-r--r--kernel/bpf/disasm.c58
-rw-r--r--kernel/bpf/hashtab.c28
-rw-r--r--kernel/bpf/helpers.c198
-rw-r--r--kernel/bpf/inode.c33
-rw-r--r--kernel/bpf/log.c3
-rw-r--r--kernel/bpf/lpm_trie.c3
-rw-r--r--kernel/bpf/map_iter.c42
-rw-r--r--kernel/bpf/memalloc.c409
-rw-r--r--kernel/bpf/mprog.c447
-rw-r--r--kernel/bpf/offload.c1
-rw-r--r--kernel/bpf/preload/bpf_preload_kern.c4
-rw-r--r--kernel/bpf/preload/iterators/Makefile2
-rw-r--r--kernel/bpf/preload/iterators/iterators.bpf.c9
-rw-r--r--kernel/bpf/preload/iterators/iterators.lskel-little-endian.h526
-rw-r--r--kernel/bpf/queue_stack_maps.c4
-rw-r--r--kernel/bpf/reuseport_array.c3
-rw-r--r--kernel/bpf/ringbuf.c26
-rw-r--r--kernel/bpf/stackmap.c3
-rw-r--r--kernel/bpf/syscall.c614
-rw-r--r--kernel/bpf/tcx.c352
-rw-r--r--kernel/bpf/trampoline.c32
-rw-r--r--kernel/bpf/verifier.c1585
35 files changed, 3757 insertions, 1240 deletions
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index 2dfe1079f772..6a906ff93006 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -31,6 +31,7 @@ config BPF_SYSCALL
select TASKS_TRACE_RCU
select BINARY_PRINTF
select NET_SOCK_MSG if NET
+ select NET_XGRESS if NET
select PAGE_POOL if NET
default n
help
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 1d3892168d32..f526b7573e97 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -12,7 +12,7 @@ obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list
obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o
-obj-$(CONFIG_BPF_SYSCALL) += disasm.o
+obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o
obj-$(CONFIG_BPF_JIT) += trampoline.o
obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o
obj-$(CONFIG_BPF_JIT) += dispatcher.o
@@ -21,6 +21,7 @@ obj-$(CONFIG_BPF_SYSCALL) += devmap.o
obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
obj-$(CONFIG_BPF_SYSCALL) += offload.o
obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o
+obj-$(CONFIG_BPF_SYSCALL) += tcx.o
endif
ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c
index 540331b610a9..addf3dd57b59 100644
--- a/kernel/bpf/bloom_filter.c
+++ b/kernel/bpf/bloom_filter.c
@@ -86,9 +86,6 @@ static struct bpf_map *bloom_map_alloc(union bpf_attr *attr)
int numa_node = bpf_map_attr_numa_node(attr);
struct bpf_bloom_filter *bloom;
- if (!bpf_capable())
- return ERR_PTR(-EPERM);
-
if (attr->key_size != 0 || attr->value_size == 0 ||
attr->max_entries == 0 ||
attr->map_flags & ~BLOOM_CREATE_FLAG_MASK ||
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 47d9948d768f..b5149cfce7d4 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -723,9 +723,6 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
!attr->btf_key_type_id || !attr->btf_value_type_id)
return -EINVAL;
- if (!bpf_capable())
- return -EPERM;
-
if (attr->value_size > BPF_LOCAL_STORAGE_MAX_VALUE_SIZE)
return -E2BIG;
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index d99e89f113c4..3dabdd137d10 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -41,7 +41,12 @@ static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l)
/* bpf_lru_node helpers */
static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node)
{
- return node->ref;
+ return READ_ONCE(node->ref);
+}
+
+static void bpf_lru_node_clear_ref(struct bpf_lru_node *node)
+{
+ WRITE_ONCE(node->ref, 0);
}
static void bpf_lru_list_count_inc(struct bpf_lru_list *l,
@@ -89,7 +94,7 @@ static void __bpf_lru_node_move_in(struct bpf_lru_list *l,
bpf_lru_list_count_inc(l, tgt_type);
node->type = tgt_type;
- node->ref = 0;
+ bpf_lru_node_clear_ref(node);
list_move(&node->list, &l->lists[tgt_type]);
}
@@ -110,7 +115,7 @@ static void __bpf_lru_node_move(struct bpf_lru_list *l,
bpf_lru_list_count_inc(l, tgt_type);
node->type = tgt_type;
}
- node->ref = 0;
+ bpf_lru_node_clear_ref(node);
/* If the moving node is the next_inactive_rotation candidate,
* move the next_inactive_rotation pointer also.
@@ -353,7 +358,7 @@ static void __local_list_add_pending(struct bpf_lru *lru,
*(u32 *)((void *)node + lru->hash_offset) = hash;
node->cpu = cpu;
node->type = BPF_LRU_LOCAL_LIST_T_PENDING;
- node->ref = 0;
+ bpf_lru_node_clear_ref(node);
list_add(&node->list, local_pending_list(loc_l));
}
@@ -419,7 +424,7 @@ static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru,
if (!list_empty(free_list)) {
node = list_first_entry(free_list, struct bpf_lru_node, list);
*(u32 *)((void *)node + lru->hash_offset) = hash;
- node->ref = 0;
+ bpf_lru_node_clear_ref(node);
__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE);
}
@@ -522,7 +527,7 @@ static void bpf_common_lru_push_free(struct bpf_lru *lru,
}
node->type = BPF_LRU_LOCAL_LIST_T_FREE;
- node->ref = 0;
+ bpf_lru_node_clear_ref(node);
list_move(&node->list, local_free_list(loc_l));
raw_spin_unlock_irqrestore(&loc_l->lock, flags);
@@ -568,7 +573,7 @@ static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf,
node = (struct bpf_lru_node *)(buf + node_offset);
node->type = BPF_LRU_LIST_T_FREE;
- node->ref = 0;
+ bpf_lru_node_clear_ref(node);
list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
buf += elem_size;
}
@@ -594,7 +599,7 @@ again:
node = (struct bpf_lru_node *)(buf + node_offset);
node->cpu = cpu;
node->type = BPF_LRU_LIST_T_FREE;
- node->ref = 0;
+ bpf_lru_node_clear_ref(node);
list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
i++;
buf += elem_size;
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index 4ea227c9c1ad..cbd8d3720c2b 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
@@ -64,11 +64,8 @@ struct bpf_lru {
static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node)
{
- /* ref is an approximation on access frequency. It does not
- * have to be very accurate. Hence, no protection is used.
- */
- if (!node->ref)
- node->ref = 1;
+ if (!READ_ONCE(node->ref))
+ WRITE_ONCE(node->ref, 1);
}
int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
@@ -78,6 +75,5 @@ void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
void bpf_lru_destroy(struct bpf_lru *lru);
struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash);
void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node);
-void bpf_lru_promote(struct bpf_lru *lru, struct bpf_lru_node *node);
#endif
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index d3f0a4825fa6..fdc3e8705a3c 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -374,9 +374,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
struct bpf_struct_ops_value *uvalue, *kvalue;
const struct btf_member *member;
const struct btf_type *t = st_ops->type;
- struct bpf_tramp_links *tlinks = NULL;
+ struct bpf_tramp_links *tlinks;
void *udata, *kdata;
- int prog_fd, err = 0;
+ int prog_fd, err;
void *image, *image_end;
u32 i;
@@ -509,9 +509,12 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
}
if (st_map->map.map_flags & BPF_F_LINK) {
- err = st_ops->validate(kdata);
- if (err)
- goto reset_unlock;
+ err = 0;
+ if (st_ops->validate) {
+ err = st_ops->validate(kdata);
+ if (err)
+ goto reset_unlock;
+ }
set_memory_rox((long)st_map->image, 1);
/* Let bpf_link handle registration & unregistration.
*
@@ -655,9 +658,6 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
const struct btf_type *t, *vt;
struct bpf_map *map;
- if (!bpf_capable())
- return ERR_PTR(-EPERM);
-
st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id);
if (!st_ops)
return ERR_PTR(-ENOTSUPP);
@@ -666,9 +666,6 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
if (attr->value_size != vt->size)
return ERR_PTR(-EINVAL);
- if (attr->map_flags & BPF_F_LINK && (!st_ops->validate || !st_ops->update))
- return ERR_PTR(-EOPNOTSUPP);
-
t = st_ops->type;
st_map_size = sizeof(*st_map) +
@@ -818,7 +815,7 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map
struct bpf_struct_ops_map *st_map, *old_st_map;
struct bpf_map *old_map;
struct bpf_struct_ops_link *st_link;
- int err = 0;
+ int err;
st_link = container_of(link, struct bpf_struct_ops_link, link);
st_map = container_of(new_map, struct bpf_struct_ops_map, map);
@@ -826,6 +823,9 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map
if (!bpf_struct_ops_valid_to_reg(new_map))
return -EINVAL;
+ if (!st_map->st_ops->update)
+ return -EOPNOTSUPP;
+
mutex_lock(&update_mutex);
old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex));
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 72b32b7cd9cd..1095bbe29859 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -29,6 +29,7 @@
#include <net/netfilter/nf_bpf_link.h>
#include <net/sock.h>
+#include <net/xdp.h>
#include "../tools/lib/bpf/relo_core.h"
/* BTF (BPF Type Format) is the meta data format which describes
@@ -222,10 +223,17 @@ enum btf_kfunc_hook {
enum {
BTF_KFUNC_SET_MAX_CNT = 256,
BTF_DTOR_KFUNC_MAX_CNT = 256,
+ BTF_KFUNC_FILTER_MAX_CNT = 16,
+};
+
+struct btf_kfunc_hook_filter {
+ btf_kfunc_filter_t filters[BTF_KFUNC_FILTER_MAX_CNT];
+ u32 nr_filters;
};
struct btf_kfunc_set_tab {
struct btf_id_set8 *sets[BTF_KFUNC_HOOK_MAX];
+ struct btf_kfunc_hook_filter hook_filters[BTF_KFUNC_HOOK_MAX];
};
struct btf_id_dtor_kfunc_tab {
@@ -485,25 +493,26 @@ static bool btf_type_is_fwd(const struct btf_type *t)
return BTF_INFO_KIND(t->info) == BTF_KIND_FWD;
}
-static bool btf_type_nosize(const struct btf_type *t)
+static bool btf_type_is_datasec(const struct btf_type *t)
{
- return btf_type_is_void(t) || btf_type_is_fwd(t) ||
- btf_type_is_func(t) || btf_type_is_func_proto(t);
+ return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
}
-static bool btf_type_nosize_or_null(const struct btf_type *t)
+static bool btf_type_is_decl_tag(const struct btf_type *t)
{
- return !t || btf_type_nosize(t);
+ return BTF_INFO_KIND(t->info) == BTF_KIND_DECL_TAG;
}
-static bool btf_type_is_datasec(const struct btf_type *t)
+static bool btf_type_nosize(const struct btf_type *t)
{
- return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
+ return btf_type_is_void(t) || btf_type_is_fwd(t) ||
+ btf_type_is_func(t) || btf_type_is_func_proto(t) ||
+ btf_type_is_decl_tag(t);
}
-static bool btf_type_is_decl_tag(const struct btf_type *t)
+static bool btf_type_nosize_or_null(const struct btf_type *t)
{
- return BTF_INFO_KIND(t->info) == BTF_KIND_DECL_TAG;
+ return !t || btf_type_nosize(t);
}
static bool btf_type_is_decl_tag_target(const struct btf_type *t)
@@ -544,7 +553,7 @@ s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind)
return -ENOENT;
}
-static s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p)
+s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p)
{
struct btf *btf;
s32 ret;
@@ -6125,8 +6134,9 @@ static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,
const char *tname, *mname, *tag_value;
u32 vlen, elem_id, mid;
- *flag = 0;
again:
+ if (btf_type_is_modifier(t))
+ t = btf_type_skip_modifiers(btf, t->type, NULL);
tname = __btf_name_by_offset(btf, t->name_off);
if (!btf_type_is_struct(t)) {
bpf_log(log, "Type '%s' is not a struct\n", tname);
@@ -6134,6 +6144,14 @@ again:
}
vlen = btf_type_vlen(t);
+ if (BTF_INFO_KIND(t->info) == BTF_KIND_UNION && vlen != 1 && !(*flag & PTR_UNTRUSTED))
+ /*
+ * walking unions yields untrusted pointers
+ * with exception of __bpf_md_ptr and other
+ * unions with a single member
+ */
+ *flag |= PTR_UNTRUSTED;
+
if (off + size > t->size) {
/* If the last element is a variable size array, we may
* need to relax the rule.
@@ -6294,15 +6312,6 @@ error:
* of this field or inside of this struct
*/
if (btf_type_is_struct(mtype)) {
- if (BTF_INFO_KIND(mtype->info) == BTF_KIND_UNION &&
- btf_type_vlen(mtype) != 1)
- /*
- * walking unions yields untrusted pointers
- * with exception of __bpf_md_ptr and other
- * unions with a single member
- */
- *flag |= PTR_UNTRUSTED;
-
/* our field must be inside that union or struct */
t = mtype;
@@ -6360,7 +6369,7 @@ error:
* that also allows using an array of int as a scratch
* space. e.g. skb->cb[].
*/
- if (off + size > mtrue_end) {
+ if (off + size > mtrue_end && !(*flag & PTR_UNTRUSTED)) {
bpf_log(log,
"access beyond the end of member %s (mend:%u) in struct %s with off %u size %u\n",
mname, mtrue_end, tname, off, size);
@@ -6468,7 +6477,7 @@ bool btf_struct_ids_match(struct bpf_verifier_log *log,
bool strict)
{
const struct btf_type *type;
- enum bpf_type_flag flag;
+ enum bpf_type_flag flag = 0;
int err;
/* Are we already done? */
@@ -7665,9 +7674,12 @@ static int btf_check_kfunc_protos(struct btf *btf, u32 func_id, u32 func_flags)
/* Kernel Function (kfunc) BTF ID set registration API */
static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
- struct btf_id_set8 *add_set)
+ const struct btf_kfunc_id_set *kset)
{
+ struct btf_kfunc_hook_filter *hook_filter;
+ struct btf_id_set8 *add_set = kset->set;
bool vmlinux_set = !btf_is_module(btf);
+ bool add_filter = !!kset->filter;
struct btf_kfunc_set_tab *tab;
struct btf_id_set8 *set;
u32 set_cnt;
@@ -7682,6 +7694,24 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
return 0;
tab = btf->kfunc_set_tab;
+
+ if (tab && add_filter) {
+ u32 i;
+
+ hook_filter = &tab->hook_filters[hook];
+ for (i = 0; i < hook_filter->nr_filters; i++) {
+ if (hook_filter->filters[i] == kset->filter) {
+ add_filter = false;
+ break;
+ }
+ }
+
+ if (add_filter && hook_filter->nr_filters == BTF_KFUNC_FILTER_MAX_CNT) {
+ ret = -E2BIG;
+ goto end;
+ }
+ }
+
if (!tab) {
tab = kzalloc(sizeof(*tab), GFP_KERNEL | __GFP_NOWARN);
if (!tab)
@@ -7704,7 +7734,7 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
*/
if (!vmlinux_set) {
tab->sets[hook] = add_set;
- return 0;
+ goto do_add_filter;
}
/* In case of vmlinux sets, there may be more than one set being
@@ -7746,6 +7776,11 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
sort(set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func, NULL);
+do_add_filter:
+ if (add_filter) {
+ hook_filter = &tab->hook_filters[hook];
+ hook_filter->filters[hook_filter->nr_filters++] = kset->filter;
+ }
return 0;
end:
btf_free_kfunc_set_tab(btf);
@@ -7754,15 +7789,22 @@ end:
static u32 *__btf_kfunc_id_set_contains(const struct btf *btf,
enum btf_kfunc_hook hook,
- u32 kfunc_btf_id)
+ u32 kfunc_btf_id,
+ const struct bpf_prog *prog)
{
+ struct btf_kfunc_hook_filter *hook_filter;
struct btf_id_set8 *set;
- u32 *id;
+ u32 *id, i;
if (hook >= BTF_KFUNC_HOOK_MAX)
return NULL;
if (!btf->kfunc_set_tab)
return NULL;
+ hook_filter = &btf->kfunc_set_tab->hook_filters[hook];
+ for (i = 0; i < hook_filter->nr_filters; i++) {
+ if (hook_filter->filters[i](prog, kfunc_btf_id))
+ return NULL;
+ }
set = btf->kfunc_set_tab->sets[hook];
if (!set)
return NULL;
@@ -7817,23 +7859,25 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
* protection for looking up a well-formed btf->kfunc_set_tab.
*/
u32 *btf_kfunc_id_set_contains(const struct btf *btf,
- enum bpf_prog_type prog_type,
- u32 kfunc_btf_id)
+ u32 kfunc_btf_id,
+ const struct bpf_prog *prog)
{
+ enum bpf_prog_type prog_type = resolve_prog_type(prog);
enum btf_kfunc_hook hook;
u32 *kfunc_flags;
- kfunc_flags = __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id);
+ kfunc_flags = __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id, prog);
if (kfunc_flags)
return kfunc_flags;
hook = bpf_prog_type_to_kfunc_hook(prog_type);
- return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id);
+ return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id, prog);
}
-u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id)
+u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id,
+ const struct bpf_prog *prog)
{
- return __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id);
+ return __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id, prog);
}
static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
@@ -7848,10 +7892,8 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
pr_err("missing vmlinux BTF, cannot register kfuncs\n");
return -ENOENT;
}
- if (kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) {
- pr_err("missing module BTF, cannot register kfuncs\n");
- return -ENOENT;
- }
+ if (kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
+ pr_warn("missing module BTF, cannot register kfuncs\n");
return 0;
}
if (IS_ERR(btf))
@@ -7864,7 +7906,8 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
goto err_out;
}
- ret = btf_populate_kfunc_set(btf, hook, kset->set);
+ ret = btf_populate_kfunc_set(btf, hook, kset);
+
err_out:
btf_put(btf);
return ret;
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 517b6a5928cc..5b2741aa0d9b 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1826,6 +1826,12 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
ret = 1;
} else if (ctx.optlen > max_optlen || ctx.optlen < -1) {
/* optlen is out of bounds */
+ if (*optlen > PAGE_SIZE && ctx.optlen >= 0) {
+ pr_info_once("bpf setsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n",
+ ctx.optlen, max_optlen);
+ ret = 0;
+ goto out;
+ }
ret = -EFAULT;
} else {
/* optlen within bounds, run kernel handler */
@@ -1881,8 +1887,10 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
.optname = optname,
.current_task = current,
};
+ int orig_optlen;
int ret;
+ orig_optlen = max_optlen;
ctx.optlen = max_optlen;
max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
if (max_optlen < 0)
@@ -1905,6 +1913,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
ret = -EFAULT;
goto out;
}
+ orig_optlen = ctx.optlen;
if (copy_from_user(ctx.optval, optval,
min(ctx.optlen, max_optlen)) != 0) {
@@ -1922,6 +1931,12 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
goto out;
if (optval && (ctx.optlen > max_optlen || ctx.optlen < 0)) {
+ if (orig_optlen > PAGE_SIZE && ctx.optlen >= 0) {
+ pr_info_once("bpf getsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n",
+ ctx.optlen, max_optlen);
+ ret = retval;
+ goto out;
+ }
ret = -EFAULT;
goto out;
}
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7421487422d4..0f8f036d8bd1 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -61,6 +61,7 @@
#define AX regs[BPF_REG_AX]
#define ARG1 regs[BPF_REG_ARG1]
#define CTX regs[BPF_REG_CTX]
+#define OFF insn->off
#define IMM insn->imm
struct bpf_mem_alloc bpf_global_ma;
@@ -372,7 +373,12 @@ static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old,
{
const s32 off_min = S16_MIN, off_max = S16_MAX;
s32 delta = end_new - end_old;
- s32 off = insn->off;
+ s32 off;
+
+ if (insn->code == (BPF_JMP32 | BPF_JA))
+ off = insn->imm;
+ else
+ off = insn->off;
if (curr < pos && curr + off + 1 >= end_old)
off += delta;
@@ -380,8 +386,12 @@ static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old,
off -= delta;
if (off < off_min || off > off_max)
return -ERANGE;
- if (!probe_pass)
- insn->off = off;
+ if (!probe_pass) {
+ if (insn->code == (BPF_JMP32 | BPF_JA))
+ insn->imm = off;
+ else
+ insn->off = off;
+ }
return 0;
}
@@ -1271,7 +1281,7 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
case BPF_ALU | BPF_MOD | BPF_K:
*to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
*to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
- *to++ = BPF_ALU32_REG(from->code, from->dst_reg, BPF_REG_AX);
+ *to++ = BPF_ALU32_REG_OFF(from->code, from->dst_reg, BPF_REG_AX, from->off);
break;
case BPF_ALU64 | BPF_ADD | BPF_K:
@@ -1285,7 +1295,7 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
case BPF_ALU64 | BPF_MOD | BPF_K:
*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
- *to++ = BPF_ALU64_REG(from->code, from->dst_reg, BPF_REG_AX);
+ *to++ = BPF_ALU64_REG_OFF(from->code, from->dst_reg, BPF_REG_AX, from->off);
break;
case BPF_JMP | BPF_JEQ | BPF_K:
@@ -1523,6 +1533,7 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
INSN_3(ALU64, DIV, X), \
INSN_3(ALU64, MOD, X), \
INSN_2(ALU64, NEG), \
+ INSN_3(ALU64, END, TO_LE), \
/* Immediate based. */ \
INSN_3(ALU64, ADD, K), \
INSN_3(ALU64, SUB, K), \
@@ -1591,6 +1602,7 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
INSN_3(JMP, JSLE, K), \
INSN_3(JMP, JSET, K), \
INSN_2(JMP, JA), \
+ INSN_2(JMP32, JA), \
/* Store instructions. */ \
/* Register based. */ \
INSN_3(STX, MEM, B), \
@@ -1610,6 +1622,9 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
INSN_3(LDX, MEM, H), \
INSN_3(LDX, MEM, W), \
INSN_3(LDX, MEM, DW), \
+ INSN_3(LDX, MEMSX, B), \
+ INSN_3(LDX, MEMSX, H), \
+ INSN_3(LDX, MEMSX, W), \
/* Immediate based. */ \
INSN_3(LD, IMM, DW)
@@ -1635,12 +1650,6 @@ bool bpf_opcode_in_insntable(u8 code)
}
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
-u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr)
-{
- memset(dst, 0, size);
- return -EFAULT;
-}
-
/**
* ___bpf_prog_run - run eBPF program on a given context
* @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers
@@ -1666,6 +1675,9 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
[BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H,
[BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W,
[BPF_LDX | BPF_PROBE_MEM | BPF_DW] = &&LDX_PROBE_MEM_DW,
+ [BPF_LDX | BPF_PROBE_MEMSX | BPF_B] = &&LDX_PROBE_MEMSX_B,
+ [BPF_LDX | BPF_PROBE_MEMSX | BPF_H] = &&LDX_PROBE_MEMSX_H,
+ [BPF_LDX | BPF_PROBE_MEMSX | BPF_W] = &&LDX_PROBE_MEMSX_W,
};
#undef BPF_INSN_3_LBL
#undef BPF_INSN_2_LBL
@@ -1733,13 +1745,36 @@ select_insn:
DST = -DST;
CONT;
ALU_MOV_X:
- DST = (u32) SRC;
+ switch (OFF) {
+ case 0:
+ DST = (u32) SRC;
+ break;
+ case 8:
+ DST = (u32)(s8) SRC;
+ break;
+ case 16:
+ DST = (u32)(s16) SRC;
+ break;
+ }
CONT;
ALU_MOV_K:
DST = (u32) IMM;
CONT;
ALU64_MOV_X:
- DST = SRC;
+ switch (OFF) {
+ case 0:
+ DST = SRC;
+ break;
+ case 8:
+ DST = (s8) SRC;
+ break;
+ case 16:
+ DST = (s16) SRC;
+ break;
+ case 32:
+ DST = (s32) SRC;
+ break;
+ }
CONT;
ALU64_MOV_K:
DST = IMM;
@@ -1761,36 +1796,114 @@ select_insn:
(*(s64 *) &DST) >>= IMM;
CONT;
ALU64_MOD_X:
- div64_u64_rem(DST, SRC, &AX);
- DST = AX;
+ switch (OFF) {
+ case 0:
+ div64_u64_rem(DST, SRC, &AX);
+ DST = AX;
+ break;
+ case 1:
+ AX = div64_s64(DST, SRC);
+ DST = DST - AX * SRC;
+ break;
+ }
CONT;
ALU_MOD_X:
- AX = (u32) DST;
- DST = do_div(AX, (u32) SRC);
+ switch (OFF) {
+ case 0:
+ AX = (u32) DST;
+ DST = do_div(AX, (u32) SRC);
+ break;
+ case 1:
+ AX = abs((s32)DST);
+ AX = do_div(AX, abs((s32)SRC));
+ if ((s32)DST < 0)
+ DST = (u32)-AX;
+ else
+ DST = (u32)AX;
+ break;
+ }
CONT;
ALU64_MOD_K:
- div64_u64_rem(DST, IMM, &AX);
- DST = AX;
+ switch (OFF) {
+ case 0:
+ div64_u64_rem(DST, IMM, &AX);
+ DST = AX;
+ break;
+ case 1:
+ AX = div64_s64(DST, IMM);
+ DST = DST - AX * IMM;
+ break;
+ }
CONT;
ALU_MOD_K:
- AX = (u32) DST;
- DST = do_div(AX, (u32) IMM);
+ switch (OFF) {
+ case 0:
+ AX = (u32) DST;
+ DST = do_div(AX, (u32) IMM);
+ break;
+ case 1:
+ AX = abs((s32)DST);
+ AX = do_div(AX, abs((s32)IMM));
+ if ((s32)DST < 0)
+ DST = (u32)-AX;
+ else
+ DST = (u32)AX;
+ break;
+ }
CONT;
ALU64_DIV_X:
- DST = div64_u64(DST, SRC);
+ switch (OFF) {
+ case 0:
+ DST = div64_u64(DST, SRC);
+ break;
+ case 1:
+ DST = div64_s64(DST, SRC);
+ break;
+ }
CONT;
ALU_DIV_X:
- AX = (u32) DST;
- do_div(AX, (u32) SRC);
- DST = (u32) AX;
+ switch (OFF) {
+ case 0:
+ AX = (u32) DST;
+ do_div(AX, (u32) SRC);
+ DST = (u32) AX;
+ break;
+ case 1:
+ AX = abs((s32)DST);
+ do_div(AX, abs((s32)SRC));
+ if (((s32)DST < 0) == ((s32)SRC < 0))
+ DST = (u32)AX;
+ else
+ DST = (u32)-AX;
+ break;
+ }
CONT;
ALU64_DIV_K:
- DST = div64_u64(DST, IMM);
+ switch (OFF) {
+ case 0:
+ DST = div64_u64(DST, IMM);
+ break;
+ case 1:
+ DST = div64_s64(DST, IMM);
+ break;
+ }
CONT;
ALU_DIV_K:
- AX = (u32) DST;
- do_div(AX, (u32) IMM);
- DST = (u32) AX;
+ switch (OFF) {
+ case 0:
+ AX = (u32) DST;
+ do_div(AX, (u32) IMM);
+ DST = (u32) AX;
+ break;
+ case 1:
+ AX = abs((s32)DST);
+ do_div(AX, abs((s32)IMM));
+ if (((s32)DST < 0) == ((s32)IMM < 0))
+ DST = (u32)AX;
+ else
+ DST = (u32)-AX;
+ break;
+ }
CONT;
ALU_END_TO_BE:
switch (IMM) {
@@ -1818,6 +1931,19 @@ select_insn:
break;
}
CONT;
+ ALU64_END_TO_LE:
+ switch (IMM) {
+ case 16:
+ DST = (__force u16) __swab16(DST);
+ break;
+ case 32:
+ DST = (__force u32) __swab32(DST);
+ break;
+ case 64:
+ DST = (__force u64) __swab64(DST);
+ break;
+ }
+ CONT;
/* CALL */
JMP_CALL:
@@ -1867,6 +1993,9 @@ out:
JMP_JA:
insn += insn->off;
CONT;
+ JMP32_JA:
+ insn += insn->imm;
+ CONT;
JMP_EXIT:
return BPF_R0;
/* JMP */
@@ -1931,8 +2060,8 @@ out:
DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
CONT; \
LDX_PROBE_MEM_##SIZEOP: \
- bpf_probe_read_kernel(&DST, sizeof(SIZE), \
- (const void *)(long) (SRC + insn->off)); \
+ bpf_probe_read_kernel_common(&DST, sizeof(SIZE), \
+ (const void *)(long) (SRC + insn->off)); \
DST = *((SIZE *)&DST); \
CONT;
@@ -1942,6 +2071,21 @@ out:
LDST(DW, u64)
#undef LDST
+#define LDSX(SIZEOP, SIZE) \
+ LDX_MEMSX_##SIZEOP: \
+ DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
+ CONT; \
+ LDX_PROBE_MEMSX_##SIZEOP: \
+ bpf_probe_read_kernel_common(&DST, sizeof(SIZE), \
+ (const void *)(long) (SRC + insn->off)); \
+ DST = *((SIZE *)&DST); \
+ CONT;
+
+ LDSX(B, s8)
+ LDSX(H, s16)
+ LDSX(W, s32)
+#undef LDSX
+
#define ATOMIC_ALU_OP(BOP, KOP) \
case BOP: \
if (BPF_SIZE(insn->code) == BPF_W) \
@@ -2064,14 +2208,16 @@ EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
};
#undef PROG_NAME_LIST
#define PROG_NAME_LIST(stack_size) PROG_NAME_ARGS(stack_size),
-static u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5,
- const struct bpf_insn *insn) = {
+static __maybe_unused
+u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5,
+ const struct bpf_insn *insn) = {
EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
};
#undef PROG_NAME_LIST
+#ifdef CONFIG_BPF_SYSCALL
void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth)
{
stack_depth = max_t(u32, stack_depth, 1);
@@ -2080,7 +2226,7 @@ void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth)
__bpf_call_base_args;
insn->code = BPF_JMP | BPF_CALL_ARGS;
}
-
+#endif
#else
static unsigned int __bpf_prog_ret0_warn(const void *ctx,
const struct bpf_insn *insn)
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 8ec18faa74ac..e42a1bdb7f53 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -28,7 +28,7 @@
#include <linux/sched.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
-#include <linux/capability.h>
+#include <linux/completion.h>
#include <trace/events/xdp.h>
#include <linux/btf_ids.h>
@@ -61,8 +61,6 @@ struct bpf_cpu_map_entry {
/* XDP can run multiple RX-ring queues, need __percpu enqueue store */
struct xdp_bulk_queue __percpu *bulkq;
- struct bpf_cpu_map *cmap;
-
/* Queue with potential multi-producers, and single-consumer kthread */
struct ptr_ring *queue;
struct task_struct *kthread;
@@ -70,10 +68,8 @@ struct bpf_cpu_map_entry {
struct bpf_cpumap_val value;
struct bpf_prog *prog;
- atomic_t refcnt; /* Control when this struct can be free'ed */
- struct rcu_head rcu;
-
- struct work_struct kthread_stop_wq;
+ struct completion kthread_running;
+ struct rcu_work free_work;
};
struct bpf_cpu_map {
@@ -89,9 +85,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
u32 value_size = attr->value_size;
struct bpf_cpu_map *cmap;
- if (!bpf_capable())
- return ERR_PTR(-EPERM);
-
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
(value_size != offsetofend(struct bpf_cpumap_val, qsize) &&
@@ -121,27 +114,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
return &cmap->map;
}
-static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
-{
- atomic_inc(&rcpu->refcnt);
-}
-
-/* called from workqueue, to workaround syscall using preempt_disable */
-static void cpu_map_kthread_stop(struct work_struct *work)
-{
- struct bpf_cpu_map_entry *rcpu;
-
- rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq);
-
- /* Wait for flush in __cpu_map_entry_free(), via full RCU barrier,
- * as it waits until all in-flight call_rcu() callbacks complete.
- */
- rcu_barrier();
-
- /* kthread_stop will wake_up_process and wait for it to complete */
- kthread_stop(rcpu->kthread);
-}
-
static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
{
/* The tear-down procedure should have made sure that queue is
@@ -149,23 +121,16 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
* invoked cpu_map_kthread_stop(). Catch any broken behaviour
* gracefully and warn once.
*/
- struct xdp_frame *xdpf;
-
- while ((xdpf = ptr_ring_consume(ring)))
- if (WARN_ON_ONCE(xdpf))
- xdp_return_frame(xdpf);
-}
+ void *ptr;
-static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
-{
- if (atomic_dec_and_test(&rcpu->refcnt)) {
- if (rcpu->prog)
- bpf_prog_put(rcpu->prog);
- /* The queue should be empty at this point */
- __cpu_map_ring_cleanup(rcpu->queue);
- ptr_ring_cleanup(rcpu->queue, NULL);
- kfree(rcpu->queue);
- kfree(rcpu);
+ while ((ptr = ptr_ring_consume(ring))) {
+ WARN_ON_ONCE(1);
+ if (unlikely(__ptr_test_bit(0, &ptr))) {
+ __ptr_clear_bit(0, &ptr);
+ kfree_skb(ptr);
+ continue;
+ }
+ xdp_return_frame(ptr);
}
}
@@ -294,11 +259,11 @@ static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
return nframes;
}
-
static int cpu_map_kthread_run(void *data)
{
struct bpf_cpu_map_entry *rcpu = data;
+ complete(&rcpu->kthread_running);
set_current_state(TASK_INTERRUPTIBLE);
/* When kthread gives stop order, then rcpu have been disconnected
@@ -393,7 +358,6 @@ static int cpu_map_kthread_run(void *data)
}
__set_current_state(TASK_RUNNING);
- put_cpu_map_entry(rcpu);
return 0;
}
@@ -463,19 +427,23 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
goto free_ptr_ring;
/* Setup kthread */
+ init_completion(&rcpu->kthread_running);
rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
"cpumap/%d/map:%d", cpu,
map->id);
if (IS_ERR(rcpu->kthread))
goto free_prog;
- get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */
- get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */
-
/* Make sure kthread runs on a single CPU */
kthread_bind(rcpu->kthread, cpu);
wake_up_process(rcpu->kthread);
+ /* Make sure kthread has been running, so kthread_stop() will not
+ * stop the kthread prematurely and all pending frames or skbs
+ * will be handled by the kthread before kthread_stop() returns.
+ */
+ wait_for_completion(&rcpu->kthread_running);
+
return rcpu;
free_prog:
@@ -492,40 +460,40 @@ free_rcu:
return NULL;
}
-static void __cpu_map_entry_free(struct rcu_head *rcu)
+static void __cpu_map_entry_free(struct work_struct *work)
{
struct bpf_cpu_map_entry *rcpu;
/* This cpu_map_entry have been disconnected from map and one
- * RCU grace-period have elapsed. Thus, XDP cannot queue any
+ * RCU grace-period have elapsed. Thus, XDP cannot queue any
* new packets and cannot change/set flush_needed that can
* find this entry.
*/
- rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu);
+ rcpu = container_of(to_rcu_work(work), struct bpf_cpu_map_entry, free_work);
+
+ /* kthread_stop will wake_up_process and wait for it to complete.
+ * cpu_map_kthread_run() makes sure the pointer ring is empty
+ * before exiting.
+ */
+ kthread_stop(rcpu->kthread);
+ if (rcpu->prog)
+ bpf_prog_put(rcpu->prog);
+ /* The queue should be empty at this point */
+ __cpu_map_ring_cleanup(rcpu->queue);
+ ptr_ring_cleanup(rcpu->queue, NULL);
+ kfree(rcpu->queue);
free_percpu(rcpu->bulkq);
- /* Cannot kthread_stop() here, last put free rcpu resources */
- put_cpu_map_entry(rcpu);
+ kfree(rcpu);
}
-/* After xchg pointer to bpf_cpu_map_entry, use the call_rcu() to
- * ensure any driver rcu critical sections have completed, but this
- * does not guarantee a flush has happened yet. Because driver side
- * rcu_read_lock/unlock only protects the running XDP program. The
- * atomic xchg and NULL-ptr check in __cpu_map_flush() makes sure a
- * pending flush op doesn't fail.
- *
- * The bpf_cpu_map_entry is still used by the kthread, and there can
- * still be pending packets (in queue and percpu bulkq). A refcnt
- * makes sure to last user (kthread_stop vs. call_rcu) free memory
- * resources.
- *
- * The rcu callback __cpu_map_entry_free flush remaining packets in
- * percpu bulkq to queue. Due to caller map_delete_elem() disable
- * preemption, cannot call kthread_stop() to make sure queue is empty.
- * Instead a work_queue is started for stopping kthread,
- * cpu_map_kthread_stop, which waits for an RCU grace period before
- * stopping kthread, emptying the queue.
+/* After the xchg of the bpf_cpu_map_entry pointer, we need to make sure the old
+ * entry is no longer in use before freeing. We use queue_rcu_work() to call
+ * __cpu_map_entry_free() in a separate workqueue after waiting for an RCU grace
+ * period. This means that (a) all pending enqueue and flush operations have
+ * completed (because of the RCU callback), and (b) we are in a workqueue
+ * context where we can stop the kthread and wait for it to exit before freeing
+ * everything.
*/
static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
u32 key_cpu, struct bpf_cpu_map_entry *rcpu)
@@ -534,9 +502,8 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
old_rcpu = unrcu_pointer(xchg(&cmap->cpu_map[key_cpu], RCU_INITIALIZER(rcpu)));
if (old_rcpu) {
- call_rcu(&old_rcpu->rcu, __cpu_map_entry_free);
- INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop);
- schedule_work(&old_rcpu->kthread_stop_wq);
+ INIT_RCU_WORK(&old_rcpu->free_work, __cpu_map_entry_free);
+ queue_rcu_work(system_wq, &old_rcpu->free_work);
}
}
@@ -548,7 +515,7 @@ static long cpu_map_delete_elem(struct bpf_map *map, void *key)
if (key_cpu >= map->max_entries)
return -EINVAL;
- /* notice caller map_delete_elem() use preempt_disable() */
+ /* notice caller map_delete_elem() uses rcu_read_lock() */
__cpu_map_entry_replace(cmap, key_cpu, NULL);
return 0;
}
@@ -584,7 +551,6 @@ static long cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
rcpu = __cpu_map_entry_alloc(map, &cpumap_value, key_cpu);
if (!rcpu)
return -ENOMEM;
- rcpu->cmap = cmap;
}
rcu_read_lock();
__cpu_map_entry_replace(cmap, key_cpu, rcpu);
@@ -600,16 +566,15 @@ static void cpu_map_free(struct bpf_map *map)
/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
* so the bpf programs (can be more than one that used this map) were
* disconnected from events. Wait for outstanding critical sections in
- * these programs to complete. The rcu critical section only guarantees
- * no further "XDP/bpf-side" reads against bpf_cpu_map->cpu_map.
- * It does __not__ ensure pending flush operations (if any) are
- * complete.
+ * these programs to complete. synchronize_rcu() below not only
+ * guarantees no further "XDP/bpf-side" reads against
+ * bpf_cpu_map->cpu_map, but also ensure pending flush operations
+ * (if any) are completed.
*/
-
synchronize_rcu();
- /* For cpu_map the remote CPUs can still be using the entries
- * (struct bpf_cpu_map_entry).
+ /* The only possible user of bpf_cpu_map_entry is
+ * cpu_map_kthread_run().
*/
for (i = 0; i < cmap->map.max_entries; i++) {
struct bpf_cpu_map_entry *rcpu;
@@ -618,8 +583,8 @@ static void cpu_map_free(struct bpf_map *map)
if (!rcpu)
continue;
- /* bq flush and cleanup happens after RCU grace-period */
- __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
+ /* Stop kthread and cleanup entry directly */
+ __cpu_map_entry_free(&rcpu->free_work.work);
}
bpf_map_area_free(cmap->cpu_map);
bpf_map_area_free(cmap);
diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
index 7efdf5d770ca..6983af8e093c 100644
--- a/kernel/bpf/cpumask.c
+++ b/kernel/bpf/cpumask.c
@@ -9,7 +9,6 @@
/**
* struct bpf_cpumask - refcounted BPF cpumask wrapper structure
* @cpumask: The actual cpumask embedded in the struct.
- * @rcu: The RCU head used to free the cpumask with RCU safety.
* @usage: Object reference counter. When the refcount goes to 0, the
* memory is released back to the BPF allocator, which provides
* RCU safety.
@@ -25,7 +24,6 @@
*/
struct bpf_cpumask {
cpumask_t cpumask;
- struct rcu_head rcu;
refcount_t usage;
};
@@ -82,16 +80,6 @@ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask)
return cpumask;
}
-static void cpumask_free_cb(struct rcu_head *head)
-{
- struct bpf_cpumask *cpumask;
-
- cpumask = container_of(head, struct bpf_cpumask, rcu);
- migrate_disable();
- bpf_mem_cache_free(&bpf_cpumask_ma, cpumask);
- migrate_enable();
-}
-
/**
* bpf_cpumask_release() - Release a previously acquired BPF cpumask.
* @cpumask: The cpumask being released.
@@ -102,8 +90,12 @@ static void cpumask_free_cb(struct rcu_head *head)
*/
__bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask)
{
- if (refcount_dec_and_test(&cpumask->usage))
- call_rcu(&cpumask->rcu, cpumask_free_cb);
+ if (!refcount_dec_and_test(&cpumask->usage))
+ return;
+
+ migrate_disable();
+ bpf_mem_cache_free_rcu(&bpf_cpumask_ma, cpumask);
+ migrate_enable();
}
/**
@@ -132,6 +124,21 @@ __bpf_kfunc u32 bpf_cpumask_first_zero(const struct cpumask *cpumask)
}
/**
+ * bpf_cpumask_first_and() - Return the index of the first nonzero bit from the
+ * AND of two cpumasks.
+ * @src1: The first cpumask.
+ * @src2: The second cpumask.
+ *
+ * Find the index of the first nonzero bit of the AND of two cpumasks.
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc u32 bpf_cpumask_first_and(const struct cpumask *src1,
+ const struct cpumask *src2)
+{
+ return cpumask_first_and(src1, src2);
+}
+
+/**
* bpf_cpumask_set_cpu() - Set a bit for a CPU in a BPF cpumask.
* @cpu: The CPU to be set in the cpumask.
* @cpumask: The BPF cpumask in which a bit is being set.
@@ -367,7 +374,7 @@ __bpf_kfunc void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask
}
/**
- * bpf_cpumask_any() - Return a random set CPU from a cpumask.
+ * bpf_cpumask_any_distribute() - Return a random set CPU from a cpumask.
* @cpumask: The cpumask being queried.
*
* Return:
@@ -376,26 +383,28 @@ __bpf_kfunc void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask
*
* A struct bpf_cpumask pointer may be safely passed to @src.
*/
-__bpf_kfunc u32 bpf_cpumask_any(const struct cpumask *cpumask)
+__bpf_kfunc u32 bpf_cpumask_any_distribute(const struct cpumask *cpumask)
{
- return cpumask_any(cpumask);
+ return cpumask_any_distribute(cpumask);
}
/**
- * bpf_cpumask_any_and() - Return a random set CPU from the AND of two
- * cpumasks.
+ * bpf_cpumask_any_and_distribute() - Return a random set CPU from the AND of
+ * two cpumasks.
* @src1: The first cpumask.
* @src2: The second cpumask.
*
* Return:
- * * A random set bit within [0, num_cpus) if at least one bit is set.
+ * * A random set bit within [0, num_cpus) from the AND of two cpumasks, if at
+ * least one bit is set.
* * >= num_cpus if no bit is set.
*
* struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
*/
-__bpf_kfunc u32 bpf_cpumask_any_and(const struct cpumask *src1, const struct cpumask *src2)
+__bpf_kfunc u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1,
+ const struct cpumask *src2)
{
- return cpumask_any_and(src1, src2);
+ return cpumask_any_and_distribute(src1, src2);
}
__diag_pop();
@@ -406,6 +415,7 @@ BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_cpumask_first, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_first_and, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_set_cpu, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_clear_cpu, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_test_cpu, KF_RCU)
@@ -422,8 +432,8 @@ BTF_ID_FLAGS(func, bpf_cpumask_subset, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_empty, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_full, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU)
-BTF_ID_FLAGS(func, bpf_cpumask_any, KF_RCU)
-BTF_ID_FLAGS(func, bpf_cpumask_any_and, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU)
BTF_SET8_END(cpumask_kfunc_btf_ids)
static const struct btf_kfunc_id_set cpumask_kfunc_set = {
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 802692fa3905..4d42f6ed6c11 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -65,7 +65,6 @@ struct xdp_dev_bulk_queue {
struct bpf_dtab_netdev {
struct net_device *dev; /* must be first member, due to tracepoint */
struct hlist_node index_hlist;
- struct bpf_dtab *dtab;
struct bpf_prog *xdp_prog;
struct rcu_head rcu;
unsigned int idx;
@@ -160,9 +159,6 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
struct bpf_dtab *dtab;
int err;
- if (!capable(CAP_NET_ADMIN))
- return ERR_PTR(-EPERM);
-
dtab = bpf_map_area_alloc(sizeof(*dtab), NUMA_NO_NODE);
if (!dtab)
return ERR_PTR(-ENOMEM);
@@ -877,7 +873,6 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
}
dev->idx = idx;
- dev->dtab = dtab;
if (prog) {
dev->xdp_prog = prog;
dev->val.bpf_prog.id = prog->aux->id;
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index 7b4afb7d96db..49940c26a227 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -87,6 +87,17 @@ const char *const bpf_alu_string[16] = {
[BPF_END >> 4] = "endian",
};
+static const char *const bpf_alu_sign_string[16] = {
+ [BPF_DIV >> 4] = "s/=",
+ [BPF_MOD >> 4] = "s%=",
+};
+
+static const char *const bpf_movsx_string[4] = {
+ [0] = "(s8)",
+ [1] = "(s16)",
+ [3] = "(s32)",
+};
+
static const char *const bpf_atomic_alu_string[16] = {
[BPF_ADD >> 4] = "add",
[BPF_AND >> 4] = "and",
@@ -101,6 +112,12 @@ static const char *const bpf_ldst_string[] = {
[BPF_DW >> 3] = "u64",
};
+static const char *const bpf_ldsx_string[] = {
+ [BPF_W >> 3] = "s32",
+ [BPF_H >> 3] = "s16",
+ [BPF_B >> 3] = "s8",
+};
+
static const char *const bpf_jmp_string[16] = {
[BPF_JA >> 4] = "jmp",
[BPF_JEQ >> 4] = "==",
@@ -128,6 +145,27 @@ static void print_bpf_end_insn(bpf_insn_print_t verbose,
insn->imm, insn->dst_reg);
}
+static void print_bpf_bswap_insn(bpf_insn_print_t verbose,
+ void *private_data,
+ const struct bpf_insn *insn)
+{
+ verbose(private_data, "(%02x) r%d = bswap%d r%d\n",
+ insn->code, insn->dst_reg,
+ insn->imm, insn->dst_reg);
+}
+
+static bool is_sdiv_smod(const struct bpf_insn *insn)
+{
+ return (BPF_OP(insn->code) == BPF_DIV || BPF_OP(insn->code) == BPF_MOD) &&
+ insn->off == 1;
+}
+
+static bool is_movsx(const struct bpf_insn *insn)
+{
+ return BPF_OP(insn->code) == BPF_MOV &&
+ (insn->off == 8 || insn->off == 16 || insn->off == 32);
+}
+
void print_bpf_insn(const struct bpf_insn_cbs *cbs,
const struct bpf_insn *insn,
bool allow_ptr_leaks)
@@ -138,7 +176,7 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
if (class == BPF_ALU || class == BPF_ALU64) {
if (BPF_OP(insn->code) == BPF_END) {
if (class == BPF_ALU64)
- verbose(cbs->private_data, "BUG_alu64_%02x\n", insn->code);
+ print_bpf_bswap_insn(verbose, cbs->private_data, insn);
else
print_bpf_end_insn(verbose, cbs->private_data, insn);
} else if (BPF_OP(insn->code) == BPF_NEG) {
@@ -147,17 +185,20 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
insn->dst_reg, class == BPF_ALU ? 'w' : 'r',
insn->dst_reg);
} else if (BPF_SRC(insn->code) == BPF_X) {
- verbose(cbs->private_data, "(%02x) %c%d %s %c%d\n",
+ verbose(cbs->private_data, "(%02x) %c%d %s %s%c%d\n",
insn->code, class == BPF_ALU ? 'w' : 'r',
insn->dst_reg,
- bpf_alu_string[BPF_OP(insn->code) >> 4],
+ is_sdiv_smod(insn) ? bpf_alu_sign_string[BPF_OP(insn->code) >> 4]
+ : bpf_alu_string[BPF_OP(insn->code) >> 4],
+ is_movsx(insn) ? bpf_movsx_string[(insn->off >> 3) - 1] : "",
class == BPF_ALU ? 'w' : 'r',
insn->src_reg);
} else {
verbose(cbs->private_data, "(%02x) %c%d %s %d\n",
insn->code, class == BPF_ALU ? 'w' : 'r',
insn->dst_reg,
- bpf_alu_string[BPF_OP(insn->code) >> 4],
+ is_sdiv_smod(insn) ? bpf_alu_sign_string[BPF_OP(insn->code) >> 4]
+ : bpf_alu_string[BPF_OP(insn->code) >> 4],
insn->imm);
}
} else if (class == BPF_STX) {
@@ -218,13 +259,15 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
verbose(cbs->private_data, "BUG_st_%02x\n", insn->code);
}
} else if (class == BPF_LDX) {
- if (BPF_MODE(insn->code) != BPF_MEM) {
+ if (BPF_MODE(insn->code) != BPF_MEM && BPF_MODE(insn->code) != BPF_MEMSX) {
verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code);
return;
}
verbose(cbs->private_data, "(%02x) r%d = *(%s *)(r%d %+d)\n",
insn->code, insn->dst_reg,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ BPF_MODE(insn->code) == BPF_MEM ?
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3] :
+ bpf_ldsx_string[BPF_SIZE(insn->code) >> 3],
insn->src_reg, insn->off);
} else if (class == BPF_LD) {
if (BPF_MODE(insn->code) == BPF_ABS) {
@@ -279,6 +322,9 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
} else if (insn->code == (BPF_JMP | BPF_JA)) {
verbose(cbs->private_data, "(%02x) goto pc%+d\n",
insn->code, insn->off);
+ } else if (insn->code == (BPF_JMP32 | BPF_JA)) {
+ verbose(cbs->private_data, "(%02x) gotol pc%+d\n",
+ insn->code, insn->imm);
} else if (insn->code == (BPF_JMP | BPF_EXIT)) {
verbose(cbs->private_data, "(%02x) exit\n", insn->code);
} else if (BPF_SRC(insn->code) == BPF_X) {
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 9901efee4339..a8c7e1c5abfa 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -302,6 +302,7 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
struct htab_elem *l;
if (node) {
+ bpf_map_inc_elem_count(&htab->map);
l = container_of(node, struct htab_elem, lru_node);
memcpy(l->key, key, htab->map.key_size);
return l;
@@ -422,12 +423,6 @@ static int htab_map_alloc_check(union bpf_attr *attr)
BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
offsetof(struct htab_elem, hash_node.pprev));
- if (lru && !bpf_capable())
- /* LRU implementation is much complicated than other
- * maps. Hence, limit to CAP_BPF.
- */
- return -EPERM;
-
if (zero_seed && !capable(CAP_SYS_ADMIN))
/* Guard against local DoS, and discourage production use. */
return -EPERM;
@@ -516,12 +511,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
htab->n_buckets > U32_MAX / sizeof(struct bucket))
goto free_htab;
+ err = bpf_map_init_elem_count(&htab->map);
+ if (err)
+ goto free_htab;
+
err = -ENOMEM;
htab->buckets = bpf_map_area_alloc(htab->n_buckets *
sizeof(struct bucket),
htab->map.numa_node);
if (!htab->buckets)
- goto free_htab;
+ goto free_elem_count;
for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) {
htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map,
@@ -599,6 +598,8 @@ free_map_locked:
bpf_map_area_free(htab->buckets);
bpf_mem_alloc_destroy(&htab->pcpu_ma);
bpf_mem_alloc_destroy(&htab->ma);
+free_elem_count:
+ bpf_map_free_elem_count(&htab->map);
free_htab:
lockdep_unregister_key(&htab->lockdep_key);
bpf_map_area_free(htab);
@@ -810,6 +811,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
if (l == tgt_l) {
hlist_nulls_del_rcu(&l->hash_node);
check_and_free_fields(htab, l);
+ bpf_map_dec_elem_count(&htab->map);
break;
}
@@ -906,6 +908,8 @@ static bool is_map_full(struct bpf_htab *htab)
static void inc_elem_count(struct bpf_htab *htab)
{
+ bpf_map_inc_elem_count(&htab->map);
+
if (htab->use_percpu_counter)
percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH);
else
@@ -914,6 +918,8 @@ static void inc_elem_count(struct bpf_htab *htab)
static void dec_elem_count(struct bpf_htab *htab)
{
+ bpf_map_dec_elem_count(&htab->map);
+
if (htab->use_percpu_counter)
percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH);
else
@@ -926,6 +932,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
htab_put_fd_value(htab, l);
if (htab_is_prealloc(htab)) {
+ bpf_map_dec_elem_count(&htab->map);
check_and_free_fields(htab, l);
__pcpu_freelist_push(&htab->freelist, &l->fnode);
} else {
@@ -1006,6 +1013,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
if (!l)
return ERR_PTR(-E2BIG);
l_new = container_of(l, struct htab_elem, fnode);
+ bpf_map_inc_elem_count(&htab->map);
}
} else {
if (is_map_full(htab))
@@ -1174,6 +1182,7 @@ err:
static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
{
check_and_free_fields(htab, elem);
+ bpf_map_dec_elem_count(&htab->map);
bpf_lru_push_free(&htab->lru, &elem->lru_node);
}
@@ -1363,8 +1372,10 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
err:
htab_unlock_bucket(htab, b, hash, flags);
err_lock_bucket:
- if (l_new)
+ if (l_new) {
+ bpf_map_dec_elem_count(&htab->map);
bpf_lru_push_free(&htab->lru, &l_new->lru_node);
+ }
return ret;
}
@@ -1529,6 +1540,7 @@ static void htab_map_free(struct bpf_map *map)
prealloc_destroy(htab);
}
+ bpf_map_free_elem_count(map);
free_percpu(htab->extra_elems);
bpf_map_area_free(htab->buckets);
bpf_mem_alloc_destroy(&htab->pcpu_ma);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 8d368fa353f9..8bd3812fb8df 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -286,6 +286,7 @@ static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0");
BUILD_BUG_ON(sizeof(*l) != sizeof(__u32));
BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32));
+ preempt_disable();
arch_spin_lock(l);
}
@@ -294,6 +295,7 @@ static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
arch_spinlock_t *l = (void *)lock;
arch_spin_unlock(l);
+ preempt_enable();
}
#else
@@ -1423,7 +1425,7 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = {
#define DYNPTR_SIZE_MASK 0xFFFFFF
#define DYNPTR_RDONLY_BIT BIT(31)
-static bool bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr)
+static bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr)
{
return ptr->size & DYNPTR_RDONLY_BIT;
}
@@ -1443,11 +1445,18 @@ static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *pt
return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT;
}
-u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr)
+u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr)
{
return ptr->size & DYNPTR_SIZE_MASK;
}
+static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u32 new_size)
+{
+ u32 metadata = ptr->size & ~DYNPTR_SIZE_MASK;
+
+ ptr->size = new_size | metadata;
+}
+
int bpf_dynptr_check_size(u32 size)
{
return size > DYNPTR_MAX_SIZE ? -E2BIG : 0;
@@ -1469,7 +1478,7 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
static int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len)
{
- u32 size = bpf_dynptr_get_size(ptr);
+ u32 size = __bpf_dynptr_size(ptr);
if (len > size || offset > size - len)
return -E2BIG;
@@ -1563,7 +1572,7 @@ BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, v
enum bpf_dynptr_type type;
int err;
- if (!dst->data || bpf_dynptr_is_rdonly(dst))
+ if (!dst->data || __bpf_dynptr_is_rdonly(dst))
return -EINVAL;
err = bpf_dynptr_check_off_len(dst, offset, len);
@@ -1619,7 +1628,7 @@ BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u3
if (err)
return 0;
- if (bpf_dynptr_is_rdonly(ptr))
+ if (__bpf_dynptr_is_rdonly(ptr))
return 0;
type = bpf_dynptr_get_type(ptr);
@@ -1906,7 +1915,11 @@ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec)
if (rec)
bpf_obj_free_fields(rec, p);
- bpf_mem_free(&bpf_global_ma, p);
+
+ if (rec && rec->refcount_off >= 0)
+ bpf_mem_free_rcu(&bpf_global_ma, p);
+ else
+ bpf_mem_free(&bpf_global_ma, p);
}
__bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
@@ -1926,28 +1939,38 @@ __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta
* bpf_refcount type so that it is emitted in vmlinux BTF
*/
ref = (struct bpf_refcount *)(p__refcounted_kptr + meta->record->refcount_off);
+ if (!refcount_inc_not_zero((refcount_t *)ref))
+ return NULL;
- refcount_inc((refcount_t *)ref);
+ /* Verifier strips KF_RET_NULL if input is owned ref, see is_kfunc_ret_null
+ * in verifier.c
+ */
return (void *)p__refcounted_kptr;
}
-static int __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head,
+static int __bpf_list_add(struct bpf_list_node_kern *node,
+ struct bpf_list_head *head,
bool tail, struct btf_record *rec, u64 off)
{
- struct list_head *n = (void *)node, *h = (void *)head;
+ struct list_head *n = &node->list_head, *h = (void *)head;
/* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
* called on its fields, so init here
*/
if (unlikely(!h->next))
INIT_LIST_HEAD(h);
- if (!list_empty(n)) {
+
+ /* node->owner != NULL implies !list_empty(n), no need to separately
+ * check the latter
+ */
+ if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
/* Only called from BPF prog, no need to migrate_disable */
- __bpf_obj_drop_impl(n - off, rec);
+ __bpf_obj_drop_impl((void *)n - off, rec);
return -EINVAL;
}
tail ? list_add_tail(n, h) : list_add(n, h);
+ WRITE_ONCE(node->owner, head);
return 0;
}
@@ -1956,25 +1979,26 @@ __bpf_kfunc int bpf_list_push_front_impl(struct bpf_list_head *head,
struct bpf_list_node *node,
void *meta__ign, u64 off)
{
+ struct bpf_list_node_kern *n = (void *)node;
struct btf_struct_meta *meta = meta__ign;
- return __bpf_list_add(node, head, false,
- meta ? meta->record : NULL, off);
+ return __bpf_list_add(n, head, false, meta ? meta->record : NULL, off);
}
__bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head,
struct bpf_list_node *node,
void *meta__ign, u64 off)
{
+ struct bpf_list_node_kern *n = (void *)node;
struct btf_struct_meta *meta = meta__ign;
- return __bpf_list_add(node, head, true,
- meta ? meta->record : NULL, off);
+ return __bpf_list_add(n, head, true, meta ? meta->record : NULL, off);
}
static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail)
{
struct list_head *n, *h = (void *)head;
+ struct bpf_list_node_kern *node;
/* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
* called on its fields, so init here
@@ -1983,8 +2007,14 @@ static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tai
INIT_LIST_HEAD(h);
if (list_empty(h))
return NULL;
+
n = tail ? h->prev : h->next;
+ node = container_of(n, struct bpf_list_node_kern, list_head);
+ if (WARN_ON_ONCE(READ_ONCE(node->owner) != head))
+ return NULL;
+
list_del_init(n);
+ WRITE_ONCE(node->owner, NULL);
return (struct bpf_list_node *)n;
}
@@ -2001,31 +2031,40 @@ __bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head)
__bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
struct bpf_rb_node *node)
{
+ struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node;
struct rb_root_cached *r = (struct rb_root_cached *)root;
- struct rb_node *n = (struct rb_node *)node;
+ struct rb_node *n = &node_internal->rb_node;
- if (RB_EMPTY_NODE(n))
+ /* node_internal->owner != root implies either RB_EMPTY_NODE(n) or
+ * n is owned by some other tree. No need to check RB_EMPTY_NODE(n)
+ */
+ if (READ_ONCE(node_internal->owner) != root)
return NULL;
rb_erase_cached(n, r);
RB_CLEAR_NODE(n);
+ WRITE_ONCE(node_internal->owner, NULL);
return (struct bpf_rb_node *)n;
}
/* Need to copy rbtree_add_cached's logic here because our 'less' is a BPF
* program
*/
-static int __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
+static int __bpf_rbtree_add(struct bpf_rb_root *root,
+ struct bpf_rb_node_kern *node,
void *less, struct btf_record *rec, u64 off)
{
struct rb_node **link = &((struct rb_root_cached *)root)->rb_root.rb_node;
- struct rb_node *parent = NULL, *n = (struct rb_node *)node;
+ struct rb_node *parent = NULL, *n = &node->rb_node;
bpf_callback_t cb = (bpf_callback_t)less;
bool leftmost = true;
- if (!RB_EMPTY_NODE(n)) {
+ /* node->owner != NULL implies !RB_EMPTY_NODE(n), no need to separately
+ * check the latter
+ */
+ if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
/* Only called from BPF prog, no need to migrate_disable */
- __bpf_obj_drop_impl(n - off, rec);
+ __bpf_obj_drop_impl((void *)n - off, rec);
return -EINVAL;
}
@@ -2041,6 +2080,7 @@ static int __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
rb_link_node(n, parent, link);
rb_insert_color_cached(n, (struct rb_root_cached *)root, leftmost);
+ WRITE_ONCE(node->owner, root);
return 0;
}
@@ -2049,8 +2089,9 @@ __bpf_kfunc int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node
void *meta__ign, u64 off)
{
struct btf_struct_meta *meta = meta__ign;
+ struct bpf_rb_node_kern *n = (void *)node;
- return __bpf_rbtree_add(root, node, (void *)less, meta ? meta->record : NULL, off);
+ return __bpf_rbtree_add(root, n, (void *)less, meta ? meta->record : NULL, off);
}
__bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root)
@@ -2142,6 +2183,22 @@ __bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid)
return NULL;
return cgrp;
}
+
+/**
+ * bpf_task_under_cgroup - wrap task_under_cgroup_hierarchy() as a kfunc, test
+ * task's membership of cgroup ancestry.
+ * @task: the task to be tested
+ * @ancestor: possible ancestor of @task's cgroup
+ *
+ * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor.
+ * It follows all the same rules as cgroup_is_descendant, and only applies
+ * to the default hierarchy.
+ */
+__bpf_kfunc long bpf_task_under_cgroup(struct task_struct *task,
+ struct cgroup *ancestor)
+{
+ return task_under_cgroup_hierarchy(task, ancestor);
+}
#endif /* CONFIG_CGROUPS */
/**
@@ -2167,13 +2224,15 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)
* bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data.
* @ptr: The dynptr whose data slice to retrieve
* @offset: Offset into the dynptr
- * @buffer: User-provided buffer to copy contents into
- * @buffer__szk: Size (in bytes) of the buffer. This is the length of the
- * requested slice. This must be a constant.
+ * @buffer__opt: User-provided buffer to copy contents into. May be NULL
+ * @buffer__szk: Size (in bytes) of the buffer if present. This is the
+ * length of the requested slice. This must be a constant.
*
* For non-skb and non-xdp type dynptrs, there is no difference between
* bpf_dynptr_slice and bpf_dynptr_data.
*
+ * If buffer__opt is NULL, the call will fail if buffer_opt was needed.
+ *
* If the intention is to write to the data slice, please use
* bpf_dynptr_slice_rdwr.
*
@@ -2190,7 +2249,7 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)
* direct pointer)
*/
__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset,
- void *buffer, u32 buffer__szk)
+ void *buffer__opt, u32 buffer__szk)
{
enum bpf_dynptr_type type;
u32 len = buffer__szk;
@@ -2210,15 +2269,20 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset
case BPF_DYNPTR_TYPE_RINGBUF:
return ptr->data + ptr->offset + offset;
case BPF_DYNPTR_TYPE_SKB:
- return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer);
+ if (buffer__opt)
+ return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer__opt);
+ else
+ return skb_pointer_if_linear(ptr->data, ptr->offset + offset, len);
case BPF_DYNPTR_TYPE_XDP:
{
void *xdp_ptr = bpf_xdp_pointer(ptr->data, ptr->offset + offset, len);
- if (xdp_ptr)
+ if (!IS_ERR_OR_NULL(xdp_ptr))
return xdp_ptr;
- bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer, len, false);
- return buffer;
+ if (!buffer__opt)
+ return NULL;
+ bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__opt, len, false);
+ return buffer__opt;
}
default:
WARN_ONCE(true, "unknown dynptr type %d\n", type);
@@ -2230,13 +2294,15 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset
* bpf_dynptr_slice_rdwr() - Obtain a writable pointer to the dynptr data.
* @ptr: The dynptr whose data slice to retrieve
* @offset: Offset into the dynptr
- * @buffer: User-provided buffer to copy contents into
- * @buffer__szk: Size (in bytes) of the buffer. This is the length of the
- * requested slice. This must be a constant.
+ * @buffer__opt: User-provided buffer to copy contents into. May be NULL
+ * @buffer__szk: Size (in bytes) of the buffer if present. This is the
+ * length of the requested slice. This must be a constant.
*
* For non-skb and non-xdp type dynptrs, there is no difference between
* bpf_dynptr_slice and bpf_dynptr_data.
*
+ * If buffer__opt is NULL, the call will fail if buffer_opt was needed.
+ *
* The returned pointer is writable and may point to either directly the dynptr
* data at the requested offset or to the buffer if unable to obtain a direct
* data pointer to (example: the requested slice is to the paged area of an skb
@@ -2267,9 +2333,9 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset
* direct pointer)
*/
__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 offset,
- void *buffer, u32 buffer__szk)
+ void *buffer__opt, u32 buffer__szk)
{
- if (!ptr->data || bpf_dynptr_is_rdonly(ptr))
+ if (!ptr->data || __bpf_dynptr_is_rdonly(ptr))
return NULL;
/* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice.
@@ -2294,7 +2360,59 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 o
* will be copied out into the buffer and the user will need to call
* bpf_dynptr_write() to commit changes.
*/
- return bpf_dynptr_slice(ptr, offset, buffer, buffer__szk);
+ return bpf_dynptr_slice(ptr, offset, buffer__opt, buffer__szk);
+}
+
+__bpf_kfunc int bpf_dynptr_adjust(struct bpf_dynptr_kern *ptr, u32 start, u32 end)
+{
+ u32 size;
+
+ if (!ptr->data || start > end)
+ return -EINVAL;
+
+ size = __bpf_dynptr_size(ptr);
+
+ if (start > size || end > size)
+ return -ERANGE;
+
+ ptr->offset += start;
+ bpf_dynptr_set_size(ptr, end - start);
+
+ return 0;
+}
+
+__bpf_kfunc bool bpf_dynptr_is_null(struct bpf_dynptr_kern *ptr)
+{
+ return !ptr->data;
+}
+
+__bpf_kfunc bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr)
+{
+ if (!ptr->data)
+ return false;
+
+ return __bpf_dynptr_is_rdonly(ptr);
+}
+
+__bpf_kfunc __u32 bpf_dynptr_size(const struct bpf_dynptr_kern *ptr)
+{
+ if (!ptr->data)
+ return -EINVAL;
+
+ return __bpf_dynptr_size(ptr);
+}
+
+__bpf_kfunc int bpf_dynptr_clone(struct bpf_dynptr_kern *ptr,
+ struct bpf_dynptr_kern *clone__uninit)
+{
+ if (!ptr->data) {
+ bpf_dynptr_set_null(clone__uninit);
+ return -EINVAL;
+ }
+
+ *clone__uninit = *ptr;
+
+ return 0;
}
__bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj)
@@ -2325,7 +2443,7 @@ BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
#endif
BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE)
-BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE)
+BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_list_push_front_impl)
BTF_ID_FLAGS(func, bpf_list_push_back_impl)
BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
@@ -2341,6 +2459,7 @@ BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU)
#endif
BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
BTF_SET8_END(generic_btf_ids)
@@ -2369,6 +2488,11 @@ BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, bpf_dynptr_adjust)
+BTF_ID_FLAGS(func, bpf_dynptr_is_null)
+BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
+BTF_ID_FLAGS(func, bpf_dynptr_size)
+BTF_ID_FLAGS(func, bpf_dynptr_clone)
BTF_SET8_END(common_btf_ids)
static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 9948b542a470..99d0625b6c82 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -118,9 +118,8 @@ static struct inode *bpf_get_inode(struct super_block *sb,
return ERR_PTR(-ENOSPC);
inode->i_ino = get_next_ino();
- inode->i_atime = current_time(inode);
+ inode->i_atime = inode_set_ctime_current(inode);
inode->i_mtime = inode->i_atime;
- inode->i_ctime = inode->i_atime;
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
@@ -148,8 +147,7 @@ static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode,
d_instantiate(dentry, inode);
dget(dentry);
- dir->i_mtime = current_time(dir);
- dir->i_ctime = dir->i_mtime;
+ dir->i_mtime = inode_set_ctime_current(dir);
}
static int bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
@@ -435,7 +433,7 @@ static int bpf_iter_link_pin_kernel(struct dentry *parent,
return ret;
}
-static int bpf_obj_do_pin(const char __user *pathname, void *raw,
+static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw,
enum bpf_type type)
{
struct dentry *dentry;
@@ -444,22 +442,21 @@ static int bpf_obj_do_pin(const char __user *pathname, void *raw,
umode_t mode;
int ret;
- dentry = user_path_create(AT_FDCWD, pathname, &path, 0);
+ dentry = user_path_create(path_fd, pathname, &path, 0);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
-
- ret = security_path_mknod(&path, dentry, mode, 0);
- if (ret)
- goto out;
-
dir = d_inode(path.dentry);
if (dir->i_op != &bpf_dir_iops) {
ret = -EPERM;
goto out;
}
+ mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
+ ret = security_path_mknod(&path, dentry, mode, 0);
+ if (ret)
+ goto out;
+
switch (type) {
case BPF_TYPE_PROG:
ret = vfs_mkobj(dentry, mode, bpf_mkprog, raw);
@@ -478,7 +475,7 @@ out:
return ret;
}
-int bpf_obj_pin_user(u32 ufd, const char __user *pathname)
+int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname)
{
enum bpf_type type;
void *raw;
@@ -488,14 +485,14 @@ int bpf_obj_pin_user(u32 ufd, const char __user *pathname)
if (IS_ERR(raw))
return PTR_ERR(raw);
- ret = bpf_obj_do_pin(pathname, raw, type);
+ ret = bpf_obj_do_pin(path_fd, pathname, raw, type);
if (ret != 0)
bpf_any_put(raw, type);
return ret;
}
-static void *bpf_obj_do_get(const char __user *pathname,
+static void *bpf_obj_do_get(int path_fd, const char __user *pathname,
enum bpf_type *type, int flags)
{
struct inode *inode;
@@ -503,7 +500,7 @@ static void *bpf_obj_do_get(const char __user *pathname,
void *raw;
int ret;
- ret = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW, &path);
+ ret = user_path_at(path_fd, pathname, LOOKUP_FOLLOW, &path);
if (ret)
return ERR_PTR(ret);
@@ -527,7 +524,7 @@ out:
return ERR_PTR(ret);
}
-int bpf_obj_get_user(const char __user *pathname, int flags)
+int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags)
{
enum bpf_type type = BPF_TYPE_UNSPEC;
int f_flags;
@@ -538,7 +535,7 @@ int bpf_obj_get_user(const char __user *pathname, int flags)
if (f_flags < 0)
return f_flags;
- raw = bpf_obj_do_get(pathname, &type, f_flags);
+ raw = bpf_obj_do_get(path_fd, pathname, &type, f_flags);
if (IS_ERR(raw))
return PTR_ERR(raw);
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 046ddff37a76..850494423530 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -62,9 +62,6 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
- WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1,
- "verifier log line truncated - local buffer too short\n");
-
if (log->level == BPF_LOG_KERNEL) {
bool newline = n > 0 && log->kbuf[n - 1] == '\n';
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index e0d3ddf2037a..17c7e7782a1f 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -544,9 +544,6 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
{
struct lpm_trie *trie;
- if (!bpf_capable())
- return ERR_PTR(-EPERM);
-
/* check sanity of attributes */
if (attr->max_entries == 0 ||
!(attr->map_flags & BPF_F_NO_PREALLOC) ||
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
index b0fa190b0979..6fc9dae9edc8 100644
--- a/kernel/bpf/map_iter.c
+++ b/kernel/bpf/map_iter.c
@@ -78,8 +78,7 @@ static const struct seq_operations bpf_map_seq_ops = {
.show = bpf_map_seq_show,
};
-BTF_ID_LIST(btf_bpf_map_id)
-BTF_ID(struct, bpf_map)
+BTF_ID_LIST_GLOBAL_SINGLE(btf_bpf_map_id, struct, bpf_map)
static const struct bpf_iter_seq_info bpf_map_seq_info = {
.seq_ops = &bpf_map_seq_ops,
@@ -93,7 +92,7 @@ static struct bpf_iter_reg bpf_map_reg_info = {
.ctx_arg_info_size = 1,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__bpf_map, map),
- PTR_TO_BTF_ID_OR_NULL },
+ PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
},
.seq_info = &bpf_map_seq_info,
};
@@ -193,3 +192,40 @@ static int __init bpf_map_iter_init(void)
}
late_initcall(bpf_map_iter_init);
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "Global functions as their definitions will be in vmlinux BTF");
+
+__bpf_kfunc s64 bpf_map_sum_elem_count(const struct bpf_map *map)
+{
+ s64 *pcount;
+ s64 ret = 0;
+ int cpu;
+
+ if (!map || !map->elem_count)
+ return 0;
+
+ for_each_possible_cpu(cpu) {
+ pcount = per_cpu_ptr(map->elem_count, cpu);
+ ret += READ_ONCE(*pcount);
+ }
+ return ret;
+}
+
+__diag_pop();
+
+BTF_SET8_START(bpf_map_iter_kfunc_ids)
+BTF_ID_FLAGS(func, bpf_map_sum_elem_count, KF_TRUSTED_ARGS)
+BTF_SET8_END(bpf_map_iter_kfunc_ids)
+
+static const struct btf_kfunc_id_set bpf_map_iter_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &bpf_map_iter_kfunc_ids,
+};
+
+static int init_subsystem(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &bpf_map_iter_kfunc_set);
+}
+late_initcall(init_subsystem);
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 410637c225fb..9c49ae53deaf 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -98,11 +98,23 @@ struct bpf_mem_cache {
int free_cnt;
int low_watermark, high_watermark, batch;
int percpu_size;
+ bool draining;
+ struct bpf_mem_cache *tgt;
- struct rcu_head rcu;
+ /* list of objects to be freed after RCU GP */
struct llist_head free_by_rcu;
+ struct llist_node *free_by_rcu_tail;
struct llist_head waiting_for_gp;
+ struct llist_node *waiting_for_gp_tail;
+ struct rcu_head rcu;
atomic_t call_rcu_in_progress;
+ struct llist_head free_llist_extra_rcu;
+
+ /* list of objects to be freed after RCU tasks trace GP */
+ struct llist_head free_by_rcu_ttrace;
+ struct llist_head waiting_for_gp_ttrace;
+ struct rcu_head rcu_ttrace;
+ atomic_t call_rcu_ttrace_in_progress;
};
struct bpf_mem_caches {
@@ -153,67 +165,95 @@ static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
#endif
}
+static void inc_active(struct bpf_mem_cache *c, unsigned long *flags)
+{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ /* In RT irq_work runs in per-cpu kthread, so disable
+ * interrupts to avoid preemption and interrupts and
+ * reduce the chance of bpf prog executing on this cpu
+ * when active counter is busy.
+ */
+ local_irq_save(*flags);
+ /* alloc_bulk runs from irq_work which will not preempt a bpf
+ * program that does unit_alloc/unit_free since IRQs are
+ * disabled there. There is no race to increment 'active'
+ * counter. It protects free_llist from corruption in case NMI
+ * bpf prog preempted this loop.
+ */
+ WARN_ON_ONCE(local_inc_return(&c->active) != 1);
+}
+
+static void dec_active(struct bpf_mem_cache *c, unsigned long *flags)
+{
+ local_dec(&c->active);
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_restore(*flags);
+}
+
+static void add_obj_to_free_list(struct bpf_mem_cache *c, void *obj)
+{
+ unsigned long flags;
+
+ inc_active(c, &flags);
+ __llist_add(obj, &c->free_llist);
+ c->free_cnt++;
+ dec_active(c, &flags);
+}
+
/* Mostly runs from irq_work except __init phase. */
-static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
+static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic)
{
struct mem_cgroup *memcg = NULL, *old_memcg;
- unsigned long flags;
+ gfp_t gfp;
void *obj;
int i;
- memcg = get_memcg(c);
- old_memcg = set_active_memcg(memcg);
+ gfp = __GFP_NOWARN | __GFP_ACCOUNT;
+ gfp |= atomic ? GFP_NOWAIT : GFP_KERNEL;
+
for (i = 0; i < cnt; i++) {
/*
- * free_by_rcu is only manipulated by irq work refill_work().
- * IRQ works on the same CPU are called sequentially, so it is
- * safe to use __llist_del_first() here. If alloc_bulk() is
- * invoked by the initial prefill, there will be no running
- * refill_work(), so __llist_del_first() is fine as well.
- *
- * In most cases, objects on free_by_rcu are from the same CPU.
- * If some objects come from other CPUs, it doesn't incur any
- * harm because NUMA_NO_NODE means the preference for current
- * numa node and it is not a guarantee.
+ * For every 'c' llist_del_first(&c->free_by_rcu_ttrace); is
+ * done only by one CPU == current CPU. Other CPUs might
+ * llist_add() and llist_del_all() in parallel.
*/
- obj = __llist_del_first(&c->free_by_rcu);
- if (!obj) {
- /* Allocate, but don't deplete atomic reserves that typical
- * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
- * will allocate from the current numa node which is what we
- * want here.
- */
- obj = __alloc(c, node, GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT);
- if (!obj)
- break;
- }
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- /* In RT irq_work runs in per-cpu kthread, so disable
- * interrupts to avoid preemption and interrupts and
- * reduce the chance of bpf prog executing on this cpu
- * when active counter is busy.
- */
- local_irq_save(flags);
- /* alloc_bulk runs from irq_work which will not preempt a bpf
- * program that does unit_alloc/unit_free since IRQs are
- * disabled there. There is no race to increment 'active'
- * counter. It protects free_llist from corruption in case NMI
- * bpf prog preempted this loop.
+ obj = llist_del_first(&c->free_by_rcu_ttrace);
+ if (!obj)
+ break;
+ add_obj_to_free_list(c, obj);
+ }
+ if (i >= cnt)
+ return;
+
+ for (; i < cnt; i++) {
+ obj = llist_del_first(&c->waiting_for_gp_ttrace);
+ if (!obj)
+ break;
+ add_obj_to_free_list(c, obj);
+ }
+ if (i >= cnt)
+ return;
+
+ memcg = get_memcg(c);
+ old_memcg = set_active_memcg(memcg);
+ for (; i < cnt; i++) {
+ /* Allocate, but don't deplete atomic reserves that typical
+ * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
+ * will allocate from the current numa node which is what we
+ * want here.
*/
- WARN_ON_ONCE(local_inc_return(&c->active) != 1);
- __llist_add(obj, &c->free_llist);
- c->free_cnt++;
- local_dec(&c->active);
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- local_irq_restore(flags);
+ obj = __alloc(c, node, gfp);
+ if (!obj)
+ break;
+ add_obj_to_free_list(c, obj);
}
set_active_memcg(old_memcg);
mem_cgroup_put(memcg);
}
-static void free_one(struct bpf_mem_cache *c, void *obj)
+static void free_one(void *obj, bool percpu)
{
- if (c->percpu_size) {
+ if (percpu) {
free_percpu(((void **)obj)[1]);
kfree(obj);
return;
@@ -222,15 +262,24 @@ static void free_one(struct bpf_mem_cache *c, void *obj)
kfree(obj);
}
-static void __free_rcu(struct rcu_head *head)
+static int free_all(struct llist_node *llnode, bool percpu)
{
- struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
- struct llist_node *llnode = llist_del_all(&c->waiting_for_gp);
struct llist_node *pos, *t;
+ int cnt = 0;
- llist_for_each_safe(pos, t, llnode)
- free_one(c, pos);
- atomic_set(&c->call_rcu_in_progress, 0);
+ llist_for_each_safe(pos, t, llnode) {
+ free_one(pos, percpu);
+ cnt++;
+ }
+ return cnt;
+}
+
+static void __free_rcu(struct rcu_head *head)
+{
+ struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace);
+
+ free_all(llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size);
+ atomic_set(&c->call_rcu_ttrace_in_progress, 0);
}
static void __free_rcu_tasks_trace(struct rcu_head *head)
@@ -249,60 +298,128 @@ static void enque_to_free(struct bpf_mem_cache *c, void *obj)
struct llist_node *llnode = obj;
/* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work.
- * Nothing races to add to free_by_rcu list.
+ * Nothing races to add to free_by_rcu_ttrace list.
*/
- __llist_add(llnode, &c->free_by_rcu);
+ llist_add(llnode, &c->free_by_rcu_ttrace);
}
-static void do_call_rcu(struct bpf_mem_cache *c)
+static void do_call_rcu_ttrace(struct bpf_mem_cache *c)
{
struct llist_node *llnode, *t;
- if (atomic_xchg(&c->call_rcu_in_progress, 1))
+ if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) {
+ if (unlikely(READ_ONCE(c->draining))) {
+ llnode = llist_del_all(&c->free_by_rcu_ttrace);
+ free_all(llnode, !!c->percpu_size);
+ }
return;
+ }
+
+ WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace));
+ llist_for_each_safe(llnode, t, llist_del_all(&c->free_by_rcu_ttrace))
+ llist_add(llnode, &c->waiting_for_gp_ttrace);
+
+ if (unlikely(READ_ONCE(c->draining))) {
+ __free_rcu(&c->rcu_ttrace);
+ return;
+ }
- WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
- llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu))
- /* There is no concurrent __llist_add(waiting_for_gp) access.
- * It doesn't race with llist_del_all either.
- * But there could be two concurrent llist_del_all(waiting_for_gp):
- * from __free_rcu() and from drain_mem_cache().
- */
- __llist_add(llnode, &c->waiting_for_gp);
/* Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
* If RCU Tasks Trace grace period implies RCU grace period, free
* these elements directly, else use call_rcu() to wait for normal
* progs to finish and finally do free_one() on each element.
*/
- call_rcu_tasks_trace(&c->rcu, __free_rcu_tasks_trace);
+ call_rcu_tasks_trace(&c->rcu_ttrace, __free_rcu_tasks_trace);
}
static void free_bulk(struct bpf_mem_cache *c)
{
+ struct bpf_mem_cache *tgt = c->tgt;
struct llist_node *llnode, *t;
unsigned long flags;
int cnt;
+ WARN_ON_ONCE(tgt->unit_size != c->unit_size);
+
do {
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- local_irq_save(flags);
- WARN_ON_ONCE(local_inc_return(&c->active) != 1);
+ inc_active(c, &flags);
llnode = __llist_del_first(&c->free_llist);
if (llnode)
cnt = --c->free_cnt;
else
cnt = 0;
- local_dec(&c->active);
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- local_irq_restore(flags);
+ dec_active(c, &flags);
if (llnode)
- enque_to_free(c, llnode);
+ enque_to_free(tgt, llnode);
} while (cnt > (c->high_watermark + c->low_watermark) / 2);
/* and drain free_llist_extra */
llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra))
- enque_to_free(c, llnode);
- do_call_rcu(c);
+ enque_to_free(tgt, llnode);
+ do_call_rcu_ttrace(tgt);
+}
+
+static void __free_by_rcu(struct rcu_head *head)
+{
+ struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
+ struct bpf_mem_cache *tgt = c->tgt;
+ struct llist_node *llnode;
+
+ llnode = llist_del_all(&c->waiting_for_gp);
+ if (!llnode)
+ goto out;
+
+ llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace);
+
+ /* Objects went through regular RCU GP. Send them to RCU tasks trace */
+ do_call_rcu_ttrace(tgt);
+out:
+ atomic_set(&c->call_rcu_in_progress, 0);
+}
+
+static void check_free_by_rcu(struct bpf_mem_cache *c)
+{
+ struct llist_node *llnode, *t;
+ unsigned long flags;
+
+ /* drain free_llist_extra_rcu */
+ if (unlikely(!llist_empty(&c->free_llist_extra_rcu))) {
+ inc_active(c, &flags);
+ llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu))
+ if (__llist_add(llnode, &c->free_by_rcu))
+ c->free_by_rcu_tail = llnode;
+ dec_active(c, &flags);
+ }
+
+ if (llist_empty(&c->free_by_rcu))
+ return;
+
+ if (atomic_xchg(&c->call_rcu_in_progress, 1)) {
+ /*
+ * Instead of kmalloc-ing new rcu_head and triggering 10k
+ * call_rcu() to hit rcutree.qhimark and force RCU to notice
+ * the overload just ask RCU to hurry up. There could be many
+ * objects in free_by_rcu list.
+ * This hint reduces memory consumption for an artificial
+ * benchmark from 2 Gbyte to 150 Mbyte.
+ */
+ rcu_request_urgent_qs_task(current);
+ return;
+ }
+
+ WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
+
+ inc_active(c, &flags);
+ WRITE_ONCE(c->waiting_for_gp.first, __llist_del_all(&c->free_by_rcu));
+ c->waiting_for_gp_tail = c->free_by_rcu_tail;
+ dec_active(c, &flags);
+
+ if (unlikely(READ_ONCE(c->draining))) {
+ free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
+ atomic_set(&c->call_rcu_in_progress, 0);
+ } else {
+ call_rcu_hurry(&c->rcu, __free_by_rcu);
+ }
}
static void bpf_mem_refill(struct irq_work *work)
@@ -316,9 +433,11 @@ static void bpf_mem_refill(struct irq_work *work)
/* irq_work runs on this cpu and kmalloc will allocate
* from the current numa node which is what we want here.
*/
- alloc_bulk(c, c->batch, NUMA_NO_NODE);
+ alloc_bulk(c, c->batch, NUMA_NO_NODE, true);
else if (cnt > c->high_watermark)
free_bulk(c);
+
+ check_free_by_rcu(c);
}
static void notrace irq_work_raise(struct bpf_mem_cache *c)
@@ -362,7 +481,7 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
* prog won't be doing more than 4 map_update_elem from
* irq disabled region
*/
- alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu));
+ alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu), false);
}
/* When size != 0 bpf_mem_cache for each cpu.
@@ -401,6 +520,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
c->unit_size = unit_size;
c->objcg = objcg;
c->percpu_size = percpu_size;
+ c->tgt = c;
prefill_mem_cache(c, cpu);
}
ma->cache = pc;
@@ -423,6 +543,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
c = &cc->cache[i];
c->unit_size = sizes[i];
c->objcg = objcg;
+ c->tgt = c;
prefill_mem_cache(c, cpu);
}
}
@@ -432,27 +553,61 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
static void drain_mem_cache(struct bpf_mem_cache *c)
{
- struct llist_node *llnode, *t;
+ bool percpu = !!c->percpu_size;
/* No progs are using this bpf_mem_cache, but htab_map_free() called
* bpf_mem_cache_free() for all remaining elements and they can be in
- * free_by_rcu or in waiting_for_gp lists, so drain those lists now.
+ * free_by_rcu_ttrace or in waiting_for_gp_ttrace lists, so drain those lists now.
*
- * Except for waiting_for_gp list, there are no concurrent operations
+ * Except for waiting_for_gp_ttrace list, there are no concurrent operations
* on these lists, so it is safe to use __llist_del_all().
*/
- llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu))
- free_one(c, llnode);
- llist_for_each_safe(llnode, t, llist_del_all(&c->waiting_for_gp))
- free_one(c, llnode);
- llist_for_each_safe(llnode, t, __llist_del_all(&c->free_llist))
- free_one(c, llnode);
- llist_for_each_safe(llnode, t, __llist_del_all(&c->free_llist_extra))
- free_one(c, llnode);
+ free_all(llist_del_all(&c->free_by_rcu_ttrace), percpu);
+ free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu);
+ free_all(__llist_del_all(&c->free_llist), percpu);
+ free_all(__llist_del_all(&c->free_llist_extra), percpu);
+ free_all(__llist_del_all(&c->free_by_rcu), percpu);
+ free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu);
+ free_all(llist_del_all(&c->waiting_for_gp), percpu);
+}
+
+static void check_mem_cache(struct bpf_mem_cache *c)
+{
+ WARN_ON_ONCE(!llist_empty(&c->free_by_rcu_ttrace));
+ WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace));
+ WARN_ON_ONCE(!llist_empty(&c->free_llist));
+ WARN_ON_ONCE(!llist_empty(&c->free_llist_extra));
+ WARN_ON_ONCE(!llist_empty(&c->free_by_rcu));
+ WARN_ON_ONCE(!llist_empty(&c->free_llist_extra_rcu));
+ WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
+}
+
+static void check_leaked_objs(struct bpf_mem_alloc *ma)
+{
+ struct bpf_mem_caches *cc;
+ struct bpf_mem_cache *c;
+ int cpu, i;
+
+ if (ma->cache) {
+ for_each_possible_cpu(cpu) {
+ c = per_cpu_ptr(ma->cache, cpu);
+ check_mem_cache(c);
+ }
+ }
+ if (ma->caches) {
+ for_each_possible_cpu(cpu) {
+ cc = per_cpu_ptr(ma->caches, cpu);
+ for (i = 0; i < NUM_CACHES; i++) {
+ c = &cc->cache[i];
+ check_mem_cache(c);
+ }
+ }
+ }
}
static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
{
+ check_leaked_objs(ma);
free_percpu(ma->cache);
free_percpu(ma->caches);
ma->cache = NULL;
@@ -461,8 +616,8 @@ static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
static void free_mem_alloc(struct bpf_mem_alloc *ma)
{
- /* waiting_for_gp lists was drained, but __free_rcu might
- * still execute. Wait for it now before we freeing percpu caches.
+ /* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
+ * might still execute. Wait for them.
*
* rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
* but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
@@ -471,7 +626,8 @@ static void free_mem_alloc(struct bpf_mem_alloc *ma)
* rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by
* using rcu_trace_implies_rcu_gp() as well.
*/
- rcu_barrier_tasks_trace();
+ rcu_barrier(); /* wait for __free_by_rcu */
+ rcu_barrier_tasks_trace(); /* wait for __free_rcu */
if (!rcu_trace_implies_rcu_gp())
rcu_barrier();
free_mem_alloc_no_barrier(ma);
@@ -497,7 +653,7 @@ static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress)
return;
}
- copy = kmalloc(sizeof(*ma), GFP_KERNEL);
+ copy = kmemdup(ma, sizeof(*ma), GFP_KERNEL);
if (!copy) {
/* Slow path with inline barrier-s */
free_mem_alloc(ma);
@@ -505,10 +661,7 @@ static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress)
}
/* Defer barriers into worker to let the rest of map memory to be freed */
- copy->cache = ma->cache;
- ma->cache = NULL;
- copy->caches = ma->caches;
- ma->caches = NULL;
+ memset(ma, 0, sizeof(*ma));
INIT_WORK(&copy->work, free_mem_alloc_deferred);
queue_work(system_unbound_wq, &copy->work);
}
@@ -523,17 +676,10 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
rcu_in_progress = 0;
for_each_possible_cpu(cpu) {
c = per_cpu_ptr(ma->cache, cpu);
- /*
- * refill_work may be unfinished for PREEMPT_RT kernel
- * in which irq work is invoked in a per-CPU RT thread.
- * It is also possible for kernel with
- * arch_irq_work_has_interrupt() being false and irq
- * work is invoked in timer interrupt. So waiting for
- * the completion of irq work to ease the handling of
- * concurrency.
- */
+ WRITE_ONCE(c->draining, true);
irq_work_sync(&c->refill_work);
drain_mem_cache(c);
+ rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
}
/* objcg is the same across cpus */
@@ -547,8 +693,10 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
cc = per_cpu_ptr(ma->caches, cpu);
for (i = 0; i < NUM_CACHES; i++) {
c = &cc->cache[i];
+ WRITE_ONCE(c->draining, true);
irq_work_sync(&c->refill_work);
drain_mem_cache(c);
+ rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
}
}
@@ -580,8 +728,10 @@ static void notrace *unit_alloc(struct bpf_mem_cache *c)
local_irq_save(flags);
if (local_inc_return(&c->active) == 1) {
llnode = __llist_del_first(&c->free_llist);
- if (llnode)
+ if (llnode) {
cnt = --c->free_cnt;
+ *(struct bpf_mem_cache **)llnode = c;
+ }
}
local_dec(&c->active);
local_irq_restore(flags);
@@ -605,6 +755,12 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
BUILD_BUG_ON(LLIST_NODE_SZ > 8);
+ /*
+ * Remember bpf_mem_cache that allocated this object.
+ * The hint is not accurate.
+ */
+ c->tgt = *(struct bpf_mem_cache **)llnode;
+
local_irq_save(flags);
if (local_inc_return(&c->active) == 1) {
__llist_add(llnode, &c->free_llist);
@@ -626,6 +782,27 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
irq_work_raise(c);
}
+static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
+{
+ struct llist_node *llnode = ptr - LLIST_NODE_SZ;
+ unsigned long flags;
+
+ c->tgt = *(struct bpf_mem_cache **)llnode;
+
+ local_irq_save(flags);
+ if (local_inc_return(&c->active) == 1) {
+ if (__llist_add(llnode, &c->free_by_rcu))
+ c->free_by_rcu_tail = llnode;
+ } else {
+ llist_add(llnode, &c->free_llist_extra_rcu);
+ }
+ local_dec(&c->active);
+ local_irq_restore(flags);
+
+ if (!atomic_read(&c->call_rcu_in_progress))
+ irq_work_raise(c);
+}
+
/* Called from BPF program or from sys_bpf syscall.
* In both cases migration is disabled.
*/
@@ -659,6 +836,20 @@ void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
}
+void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
+{
+ int idx;
+
+ if (!ptr)
+ return;
+
+ idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
+ if (idx < 0)
+ return;
+
+ unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr);
+}
+
void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
{
void *ret;
@@ -675,6 +866,14 @@ void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
unit_free(this_cpu_ptr(ma->cache), ptr);
}
+void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
+{
+ if (!ptr)
+ return;
+
+ unit_free_rcu(this_cpu_ptr(ma->cache), ptr);
+}
+
/* Directly does a kfree() without putting 'ptr' back to the free_llist
* for reuse and without waiting for a rcu_tasks_trace gp.
* The caller must first go through the rcu_tasks_trace gp for 'ptr'
diff --git a/kernel/bpf/mprog.c b/kernel/bpf/mprog.c
new file mode 100644
index 000000000000..32d2c4829eb8
--- /dev/null
+++ b/kernel/bpf/mprog.c
@@ -0,0 +1,447 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Isovalent */
+
+#include <linux/bpf.h>
+#include <linux/bpf_mprog.h>
+
+static int bpf_mprog_link(struct bpf_tuple *tuple,
+ u32 id_or_fd, u32 flags,
+ enum bpf_prog_type type)
+{
+ struct bpf_link *link = ERR_PTR(-EINVAL);
+ bool id = flags & BPF_F_ID;
+
+ if (id)
+ link = bpf_link_by_id(id_or_fd);
+ else if (id_or_fd)
+ link = bpf_link_get_from_fd(id_or_fd);
+ if (IS_ERR(link))
+ return PTR_ERR(link);
+ if (type && link->prog->type != type) {
+ bpf_link_put(link);
+ return -EINVAL;
+ }
+
+ tuple->link = link;
+ tuple->prog = link->prog;
+ return 0;
+}
+
+static int bpf_mprog_prog(struct bpf_tuple *tuple,
+ u32 id_or_fd, u32 flags,
+ enum bpf_prog_type type)
+{
+ struct bpf_prog *prog = ERR_PTR(-EINVAL);
+ bool id = flags & BPF_F_ID;
+
+ if (id)
+ prog = bpf_prog_by_id(id_or_fd);
+ else if (id_or_fd)
+ prog = bpf_prog_get(id_or_fd);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+ if (type && prog->type != type) {
+ bpf_prog_put(prog);
+ return -EINVAL;
+ }
+
+ tuple->link = NULL;
+ tuple->prog = prog;
+ return 0;
+}
+
+static int bpf_mprog_tuple_relative(struct bpf_tuple *tuple,
+ u32 id_or_fd, u32 flags,
+ enum bpf_prog_type type)
+{
+ bool link = flags & BPF_F_LINK;
+ bool id = flags & BPF_F_ID;
+
+ memset(tuple, 0, sizeof(*tuple));
+ if (link)
+ return bpf_mprog_link(tuple, id_or_fd, flags, type);
+ /* If no relevant flag is set and no id_or_fd was passed, then
+ * tuple link/prog is just NULLed. This is the case when before/
+ * after selects first/last position without passing fd.
+ */
+ if (!id && !id_or_fd)
+ return 0;
+ return bpf_mprog_prog(tuple, id_or_fd, flags, type);
+}
+
+static void bpf_mprog_tuple_put(struct bpf_tuple *tuple)
+{
+ if (tuple->link)
+ bpf_link_put(tuple->link);
+ else if (tuple->prog)
+ bpf_prog_put(tuple->prog);
+}
+
+/* The bpf_mprog_{replace,delete}() operate on exact idx position with the
+ * one exception that for deletion we support delete from front/back. In
+ * case of front idx is -1, in case of back idx is bpf_mprog_total(entry).
+ * Adjustment to first and last entry is trivial. The bpf_mprog_insert()
+ * we have to deal with the following cases:
+ *
+ * idx + before:
+ *
+ * Insert P4 before P3: idx for old array is 1, idx for new array is 2,
+ * hence we adjust target idx for the new array, so that memmove copies
+ * P1 and P2 to the new entry, and we insert P4 into idx 2. Inserting
+ * before P1 would have old idx -1 and new idx 0.
+ *
+ * +--+--+--+ +--+--+--+--+ +--+--+--+--+
+ * |P1|P2|P3| ==> |P1|P2| |P3| ==> |P1|P2|P4|P3|
+ * +--+--+--+ +--+--+--+--+ +--+--+--+--+
+ *
+ * idx + after:
+ *
+ * Insert P4 after P2: idx for old array is 2, idx for new array is 2.
+ * Again, memmove copies P1 and P2 to the new entry, and we insert P4
+ * into idx 2. Inserting after P3 would have both old/new idx at 4 aka
+ * bpf_mprog_total(entry).
+ *
+ * +--+--+--+ +--+--+--+--+ +--+--+--+--+
+ * |P1|P2|P3| ==> |P1|P2| |P3| ==> |P1|P2|P4|P3|
+ * +--+--+--+ +--+--+--+--+ +--+--+--+--+
+ */
+static int bpf_mprog_replace(struct bpf_mprog_entry *entry,
+ struct bpf_mprog_entry **entry_new,
+ struct bpf_tuple *ntuple, int idx)
+{
+ struct bpf_mprog_fp *fp;
+ struct bpf_mprog_cp *cp;
+ struct bpf_prog *oprog;
+
+ bpf_mprog_read(entry, idx, &fp, &cp);
+ oprog = READ_ONCE(fp->prog);
+ bpf_mprog_write(fp, cp, ntuple);
+ if (!ntuple->link) {
+ WARN_ON_ONCE(cp->link);
+ bpf_prog_put(oprog);
+ }
+ *entry_new = entry;
+ return 0;
+}
+
+static int bpf_mprog_insert(struct bpf_mprog_entry *entry,
+ struct bpf_mprog_entry **entry_new,
+ struct bpf_tuple *ntuple, int idx, u32 flags)
+{
+ int total = bpf_mprog_total(entry);
+ struct bpf_mprog_entry *peer;
+ struct bpf_mprog_fp *fp;
+ struct bpf_mprog_cp *cp;
+
+ peer = bpf_mprog_peer(entry);
+ bpf_mprog_entry_copy(peer, entry);
+ if (idx == total)
+ goto insert;
+ else if (flags & BPF_F_BEFORE)
+ idx += 1;
+ bpf_mprog_entry_grow(peer, idx);
+insert:
+ bpf_mprog_read(peer, idx, &fp, &cp);
+ bpf_mprog_write(fp, cp, ntuple);
+ bpf_mprog_inc(peer);
+ *entry_new = peer;
+ return 0;
+}
+
+static int bpf_mprog_delete(struct bpf_mprog_entry *entry,
+ struct bpf_mprog_entry **entry_new,
+ struct bpf_tuple *dtuple, int idx)
+{
+ int total = bpf_mprog_total(entry);
+ struct bpf_mprog_entry *peer;
+
+ peer = bpf_mprog_peer(entry);
+ bpf_mprog_entry_copy(peer, entry);
+ if (idx == -1)
+ idx = 0;
+ else if (idx == total)
+ idx = total - 1;
+ bpf_mprog_entry_shrink(peer, idx);
+ bpf_mprog_dec(peer);
+ bpf_mprog_mark_for_release(peer, dtuple);
+ *entry_new = peer;
+ return 0;
+}
+
+/* In bpf_mprog_pos_*() we evaluate the target position for the BPF
+ * program/link that needs to be replaced, inserted or deleted for
+ * each "rule" independently. If all rules agree on that position
+ * or existing element, then enact replacement, addition or deletion.
+ * If this is not the case, then the request cannot be satisfied and
+ * we bail out with an error.
+ */
+static int bpf_mprog_pos_exact(struct bpf_mprog_entry *entry,
+ struct bpf_tuple *tuple)
+{
+ struct bpf_mprog_fp *fp;
+ struct bpf_mprog_cp *cp;
+ int i;
+
+ for (i = 0; i < bpf_mprog_total(entry); i++) {
+ bpf_mprog_read(entry, i, &fp, &cp);
+ if (tuple->prog == READ_ONCE(fp->prog))
+ return tuple->link == cp->link ? i : -EBUSY;
+ }
+ return -ENOENT;
+}
+
+static int bpf_mprog_pos_before(struct bpf_mprog_entry *entry,
+ struct bpf_tuple *tuple)
+{
+ struct bpf_mprog_fp *fp;
+ struct bpf_mprog_cp *cp;
+ int i;
+
+ for (i = 0; i < bpf_mprog_total(entry); i++) {
+ bpf_mprog_read(entry, i, &fp, &cp);
+ if (tuple->prog == READ_ONCE(fp->prog) &&
+ (!tuple->link || tuple->link == cp->link))
+ return i - 1;
+ }
+ return tuple->prog ? -ENOENT : -1;
+}
+
+static int bpf_mprog_pos_after(struct bpf_mprog_entry *entry,
+ struct bpf_tuple *tuple)
+{
+ struct bpf_mprog_fp *fp;
+ struct bpf_mprog_cp *cp;
+ int i;
+
+ for (i = 0; i < bpf_mprog_total(entry); i++) {
+ bpf_mprog_read(entry, i, &fp, &cp);
+ if (tuple->prog == READ_ONCE(fp->prog) &&
+ (!tuple->link || tuple->link == cp->link))
+ return i + 1;
+ }
+ return tuple->prog ? -ENOENT : bpf_mprog_total(entry);
+}
+
+int bpf_mprog_attach(struct bpf_mprog_entry *entry,
+ struct bpf_mprog_entry **entry_new,
+ struct bpf_prog *prog_new, struct bpf_link *link,
+ struct bpf_prog *prog_old,
+ u32 flags, u32 id_or_fd, u64 revision)
+{
+ struct bpf_tuple rtuple, ntuple = {
+ .prog = prog_new,
+ .link = link,
+ }, otuple = {
+ .prog = prog_old,
+ .link = link,
+ };
+ int ret, idx = -ERANGE, tidx;
+
+ if (revision && revision != bpf_mprog_revision(entry))
+ return -ESTALE;
+ if (bpf_mprog_exists(entry, prog_new))
+ return -EEXIST;
+ ret = bpf_mprog_tuple_relative(&rtuple, id_or_fd,
+ flags & ~BPF_F_REPLACE,
+ prog_new->type);
+ if (ret)
+ return ret;
+ if (flags & BPF_F_REPLACE) {
+ tidx = bpf_mprog_pos_exact(entry, &otuple);
+ if (tidx < 0) {
+ ret = tidx;
+ goto out;
+ }
+ idx = tidx;
+ }
+ if (flags & BPF_F_BEFORE) {
+ tidx = bpf_mprog_pos_before(entry, &rtuple);
+ if (tidx < -1 || (idx >= -1 && tidx != idx)) {
+ ret = tidx < -1 ? tidx : -ERANGE;
+ goto out;
+ }
+ idx = tidx;
+ }
+ if (flags & BPF_F_AFTER) {
+ tidx = bpf_mprog_pos_after(entry, &rtuple);
+ if (tidx < -1 || (idx >= -1 && tidx != idx)) {
+ ret = tidx < 0 ? tidx : -ERANGE;
+ goto out;
+ }
+ idx = tidx;
+ }
+ if (idx < -1) {
+ if (rtuple.prog || flags) {
+ ret = -EINVAL;
+ goto out;
+ }
+ idx = bpf_mprog_total(entry);
+ flags = BPF_F_AFTER;
+ }
+ if (idx >= bpf_mprog_max()) {
+ ret = -ERANGE;
+ goto out;
+ }
+ if (flags & BPF_F_REPLACE)
+ ret = bpf_mprog_replace(entry, entry_new, &ntuple, idx);
+ else
+ ret = bpf_mprog_insert(entry, entry_new, &ntuple, idx, flags);
+out:
+ bpf_mprog_tuple_put(&rtuple);
+ return ret;
+}
+
+static int bpf_mprog_fetch(struct bpf_mprog_entry *entry,
+ struct bpf_tuple *tuple, int idx)
+{
+ int total = bpf_mprog_total(entry);
+ struct bpf_mprog_cp *cp;
+ struct bpf_mprog_fp *fp;
+ struct bpf_prog *prog;
+ struct bpf_link *link;
+
+ if (idx == -1)
+ idx = 0;
+ else if (idx == total)
+ idx = total - 1;
+ bpf_mprog_read(entry, idx, &fp, &cp);
+ prog = READ_ONCE(fp->prog);
+ link = cp->link;
+ /* The deletion request can either be without filled tuple in which
+ * case it gets populated here based on idx, or with filled tuple
+ * where the only thing we end up doing is the WARN_ON_ONCE() assert.
+ * If we hit a BPF link at the given index, it must not be removed
+ * from opts path.
+ */
+ if (link && !tuple->link)
+ return -EBUSY;
+ WARN_ON_ONCE(tuple->prog && tuple->prog != prog);
+ WARN_ON_ONCE(tuple->link && tuple->link != link);
+ tuple->prog = prog;
+ tuple->link = link;
+ return 0;
+}
+
+int bpf_mprog_detach(struct bpf_mprog_entry *entry,
+ struct bpf_mprog_entry **entry_new,
+ struct bpf_prog *prog, struct bpf_link *link,
+ u32 flags, u32 id_or_fd, u64 revision)
+{
+ struct bpf_tuple rtuple, dtuple = {
+ .prog = prog,
+ .link = link,
+ };
+ int ret, idx = -ERANGE, tidx;
+
+ if (flags & BPF_F_REPLACE)
+ return -EINVAL;
+ if (revision && revision != bpf_mprog_revision(entry))
+ return -ESTALE;
+ if (!bpf_mprog_total(entry))
+ return -ENOENT;
+ ret = bpf_mprog_tuple_relative(&rtuple, id_or_fd, flags,
+ prog ? prog->type :
+ BPF_PROG_TYPE_UNSPEC);
+ if (ret)
+ return ret;
+ if (dtuple.prog) {
+ tidx = bpf_mprog_pos_exact(entry, &dtuple);
+ if (tidx < 0) {
+ ret = tidx;
+ goto out;
+ }
+ idx = tidx;
+ }
+ if (flags & BPF_F_BEFORE) {
+ tidx = bpf_mprog_pos_before(entry, &rtuple);
+ if (tidx < -1 || (idx >= -1 && tidx != idx)) {
+ ret = tidx < -1 ? tidx : -ERANGE;
+ goto out;
+ }
+ idx = tidx;
+ }
+ if (flags & BPF_F_AFTER) {
+ tidx = bpf_mprog_pos_after(entry, &rtuple);
+ if (tidx < -1 || (idx >= -1 && tidx != idx)) {
+ ret = tidx < 0 ? tidx : -ERANGE;
+ goto out;
+ }
+ idx = tidx;
+ }
+ if (idx < -1) {
+ if (rtuple.prog || flags) {
+ ret = -EINVAL;
+ goto out;
+ }
+ idx = bpf_mprog_total(entry);
+ flags = BPF_F_AFTER;
+ }
+ if (idx >= bpf_mprog_max()) {
+ ret = -ERANGE;
+ goto out;
+ }
+ ret = bpf_mprog_fetch(entry, &dtuple, idx);
+ if (ret)
+ goto out;
+ ret = bpf_mprog_delete(entry, entry_new, &dtuple, idx);
+out:
+ bpf_mprog_tuple_put(&rtuple);
+ return ret;
+}
+
+int bpf_mprog_query(const union bpf_attr *attr, union bpf_attr __user *uattr,
+ struct bpf_mprog_entry *entry)
+{
+ u32 __user *uprog_flags, *ulink_flags;
+ u32 __user *uprog_id, *ulink_id;
+ struct bpf_mprog_fp *fp;
+ struct bpf_mprog_cp *cp;
+ struct bpf_prog *prog;
+ const u32 flags = 0;
+ int i, ret = 0;
+ u32 id, count;
+ u64 revision;
+
+ if (attr->query.query_flags || attr->query.attach_flags)
+ return -EINVAL;
+ revision = bpf_mprog_revision(entry);
+ count = bpf_mprog_total(entry);
+ if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
+ return -EFAULT;
+ if (copy_to_user(&uattr->query.revision, &revision, sizeof(revision)))
+ return -EFAULT;
+ if (copy_to_user(&uattr->query.count, &count, sizeof(count)))
+ return -EFAULT;
+ uprog_id = u64_to_user_ptr(attr->query.prog_ids);
+ uprog_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
+ ulink_id = u64_to_user_ptr(attr->query.link_ids);
+ ulink_flags = u64_to_user_ptr(attr->query.link_attach_flags);
+ if (attr->query.count == 0 || !uprog_id || !count)
+ return 0;
+ if (attr->query.count < count) {
+ count = attr->query.count;
+ ret = -ENOSPC;
+ }
+ for (i = 0; i < bpf_mprog_max(); i++) {
+ bpf_mprog_read(entry, i, &fp, &cp);
+ prog = READ_ONCE(fp->prog);
+ if (!prog)
+ break;
+ id = prog->aux->id;
+ if (copy_to_user(uprog_id + i, &id, sizeof(id)))
+ return -EFAULT;
+ if (uprog_flags &&
+ copy_to_user(uprog_flags + i, &flags, sizeof(flags)))
+ return -EFAULT;
+ id = cp->link ? cp->link->id : 0;
+ if (ulink_id &&
+ copy_to_user(ulink_id + i, &id, sizeof(id)))
+ return -EFAULT;
+ if (ulink_flags &&
+ copy_to_user(ulink_flags + i, &flags, sizeof(flags)))
+ return -EFAULT;
+ if (i + 1 == count)
+ break;
+ }
+ return ret;
+}
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 8a26cd8814c1..3e4f2ec1af06 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -25,6 +25,7 @@
#include <linux/rhashtable.h>
#include <linux/rtnetlink.h>
#include <linux/rwsem.h>
+#include <net/xdp.h>
/* Protects offdevs, members of bpf_offload_netdev and offload members
* of all progs.
diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c
index b56f9f3314fd..0c63bc2cd895 100644
--- a/kernel/bpf/preload/bpf_preload_kern.c
+++ b/kernel/bpf/preload/bpf_preload_kern.c
@@ -23,9 +23,9 @@ static void free_links_and_skel(void)
static int preload(struct bpf_preload_info *obj)
{
- strlcpy(obj[0].link_name, "maps.debug", sizeof(obj[0].link_name));
+ strscpy(obj[0].link_name, "maps.debug", sizeof(obj[0].link_name));
obj[0].link = maps_link;
- strlcpy(obj[1].link_name, "progs.debug", sizeof(obj[1].link_name));
+ strscpy(obj[1].link_name, "progs.debug", sizeof(obj[1].link_name));
obj[1].link = progs_link;
return 0;
}
diff --git a/kernel/bpf/preload/iterators/Makefile b/kernel/bpf/preload/iterators/Makefile
index 8937dc6bc8d0..b83c2f5e9be1 100644
--- a/kernel/bpf/preload/iterators/Makefile
+++ b/kernel/bpf/preload/iterators/Makefile
@@ -50,7 +50,7 @@ iterators.lskel-%.h: $(OUTPUT)/%/iterators.bpf.o | $(BPFTOOL)
$(OUTPUT)/%/iterators.bpf.o: iterators.bpf.c $(BPFOBJ) | $(OUTPUT)
$(call msg,BPF,$@)
$(Q)mkdir -p $(@D)
- $(Q)$(CLANG) -g -O2 -target bpf -m$* $(INCLUDES) \
+ $(Q)$(CLANG) -g -O2 --target=bpf -m$* $(INCLUDES) \
-c $(filter %.c,$^) -o $@ && \
$(LLVM_STRIP) -g $@
diff --git a/kernel/bpf/preload/iterators/iterators.bpf.c b/kernel/bpf/preload/iterators/iterators.bpf.c
index 03af863314ea..b78968b63fab 100644
--- a/kernel/bpf/preload/iterators/iterators.bpf.c
+++ b/kernel/bpf/preload/iterators/iterators.bpf.c
@@ -73,6 +73,8 @@ static const char *get_name(struct btf *btf, long btf_id, const char *fallback)
return str + name_off;
}
+__s64 bpf_map_sum_elem_count(struct bpf_map *map) __ksym;
+
SEC("iter/bpf_map")
int dump_bpf_map(struct bpf_iter__bpf_map *ctx)
{
@@ -84,9 +86,12 @@ int dump_bpf_map(struct bpf_iter__bpf_map *ctx)
return 0;
if (seq_num == 0)
- BPF_SEQ_PRINTF(seq, " id name max_entries\n");
+ BPF_SEQ_PRINTF(seq, " id name max_entries cur_entries\n");
+
+ BPF_SEQ_PRINTF(seq, "%4u %-16s %10d %10lld\n",
+ map->id, map->name, map->max_entries,
+ bpf_map_sum_elem_count(map));
- BPF_SEQ_PRINTF(seq, "%4u %-16s%6d\n", map->id, map->name, map->max_entries);
return 0;
}
diff --git a/kernel/bpf/preload/iterators/iterators.lskel-little-endian.h b/kernel/bpf/preload/iterators/iterators.lskel-little-endian.h
index 70f236a82fe1..5b98ab02025e 100644
--- a/kernel/bpf/preload/iterators/iterators.lskel-little-endian.h
+++ b/kernel/bpf/preload/iterators/iterators.lskel-little-endian.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
-/* THIS FILE IS AUTOGENERATED! */
+/* THIS FILE IS AUTOGENERATED BY BPFTOOL! */
#ifndef __ITERATORS_BPF_SKEL_H__
#define __ITERATORS_BPF_SKEL_H__
@@ -18,8 +18,6 @@ struct iterators_bpf {
int dump_bpf_map_fd;
int dump_bpf_prog_fd;
} links;
- struct iterators_bpf__rodata {
- } *rodata;
};
static inline int
@@ -68,7 +66,6 @@ iterators_bpf__destroy(struct iterators_bpf *skel)
iterators_bpf__detach(skel);
skel_closenz(skel->progs.dump_bpf_map.prog_fd);
skel_closenz(skel->progs.dump_bpf_prog.prog_fd);
- skel_free_map_data(skel->rodata, skel->maps.rodata.initial_value, 4096);
skel_closenz(skel->maps.rodata.map_fd);
skel_free(skel);
}
@@ -81,15 +78,6 @@ iterators_bpf__open(void)
if (!skel)
goto cleanup;
skel->ctx.sz = (void *)&skel->links - (void *)skel;
- skel->rodata = skel_prep_map_data((void *)"\
-\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
-\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\
-\x25\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\
-\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\
-\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0", 4096, 98);
- if (!skel->rodata)
- goto cleanup;
- skel->maps.rodata.initial_value = (__u64) (long) skel->rodata;
return skel;
cleanup:
iterators_bpf__destroy(skel);
@@ -103,7 +91,7 @@ iterators_bpf__load(struct iterators_bpf *skel)
int err;
opts.ctx = (struct bpf_loader_ctx *)skel;
- opts.data_sz = 6056;
+ opts.data_sz = 6208;
opts.data = (void *)"\
\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
@@ -138,190 +126,197 @@ iterators_bpf__load(struct iterators_bpf *skel)
\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x9f\xeb\x01\0\
-\x18\0\0\0\0\0\0\0\x1c\x04\0\0\x1c\x04\0\0\xf9\x04\0\0\0\0\0\0\0\0\0\x02\x02\0\
+\x18\0\0\0\0\0\0\0\x80\x04\0\0\x80\x04\0\0\x31\x05\0\0\0\0\0\0\0\0\0\x02\x02\0\
\0\0\x01\0\0\0\x02\0\0\x04\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\0\0\0\x04\
\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x08\0\0\0\0\0\0\0\0\0\0\x02\x0d\0\0\0\0\0\0\
\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\0\0\0\0\0\0\x01\x04\0\0\0\x20\
-\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xa3\0\0\0\x03\0\0\x04\x18\0\0\0\xb1\0\
-\0\0\x09\0\0\0\0\0\0\0\xb5\0\0\0\x0b\0\0\0\x40\0\0\0\xc0\0\0\0\x0b\0\0\0\x80\0\
-\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xc8\0\0\0\0\0\0\x07\0\0\0\0\xd1\0\0\0\0\0\0\
-\x08\x0c\0\0\0\xd7\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\x94\x01\0\0\x03\0\0\x04\
-\x18\0\0\0\x9c\x01\0\0\x0e\0\0\0\0\0\0\0\x9f\x01\0\0\x11\0\0\0\x20\0\0\0\xa4\
-\x01\0\0\x0e\0\0\0\xa0\0\0\0\xb0\x01\0\0\0\0\0\x08\x0f\0\0\0\xb6\x01\0\0\0\0\0\
-\x01\x04\0\0\0\x20\0\0\0\xc3\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\0\0\0\
-\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xc8\x01\0\0\0\0\0\x01\x04\0\0\0\
-\x20\0\0\0\0\0\0\0\0\0\0\x02\x14\0\0\0\x2c\x02\0\0\x02\0\0\x04\x10\0\0\0\x13\0\
-\0\0\x03\0\0\0\0\0\0\0\x3f\x02\0\0\x15\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x18\0\
-\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x13\0\0\0\x44\x02\0\0\x01\0\0\x0c\
-\x16\0\0\0\x90\x02\0\0\x01\0\0\x04\x08\0\0\0\x99\x02\0\0\x19\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\x02\x1a\0\0\0\xea\x02\0\0\x06\0\0\x04\x38\0\0\0\x9c\x01\0\0\x0e\0\0\
-\0\0\0\0\0\x9f\x01\0\0\x11\0\0\0\x20\0\0\0\xf7\x02\0\0\x1b\0\0\0\xc0\0\0\0\x08\
-\x03\0\0\x15\0\0\0\0\x01\0\0\x11\x03\0\0\x1d\0\0\0\x40\x01\0\0\x1b\x03\0\0\x1e\
-\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x1c\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\0\0\0\0\
-\0\0\0\0\0\x02\x1f\0\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\x65\x03\0\0\x02\0\0\x04\
-\x08\0\0\0\x73\x03\0\0\x0e\0\0\0\0\0\0\0\x7c\x03\0\0\x0e\0\0\0\x20\0\0\0\x1b\
-\x03\0\0\x03\0\0\x04\x18\0\0\0\x86\x03\0\0\x1b\0\0\0\0\0\0\0\x8e\x03\0\0\x21\0\
-\0\0\x40\0\0\0\x94\x03\0\0\x23\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x22\0\0\0\0\0\
-\0\0\0\0\0\x02\x24\0\0\0\x98\x03\0\0\x01\0\0\x04\x04\0\0\0\xa3\x03\0\0\x0e\0\0\
-\0\0\0\0\0\x0c\x04\0\0\x01\0\0\x04\x04\0\0\0\x15\x04\0\0\x0e\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\0\x8b\x04\0\0\0\0\0\x0e\x25\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\0\x9f\x04\
-\0\0\0\0\0\x0e\x27\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\
-\x20\0\0\0\xb5\x04\0\0\0\0\0\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\
-\x1c\0\0\0\x12\0\0\0\x11\0\0\0\xca\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\xe1\x04\0\0\0\0\0\x0e\x2d\0\0\
-\0\x01\0\0\0\xe9\x04\0\0\x04\0\0\x0f\x62\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\0\x28\
-\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\0\0\
-\x11\0\0\0\xf1\x04\0\0\x01\0\0\x0f\x04\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\0\0\0\x62\
-\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\x74\
-\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\x70\
-\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x30\
-\x3a\x30\0\x2f\x77\x2f\x6e\x65\x74\x2d\x6e\x65\x78\x74\x2f\x6b\x65\x72\x6e\x65\
-\x6c\x2f\x62\x70\x66\x2f\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\
-\x74\x6f\x72\x73\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\
-\x63\0\x09\x73\x74\x72\x75\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\
-\x73\x65\x71\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\
-\x71\x3b\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\
-\x73\x65\x73\x73\x69\x6f\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\
-\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\x5f\x75\x36\x34\0\x75\x6e\x73\x69\x67\x6e\
-\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\0\x30\x3a\x31\0\x09\x73\x74\
-\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\
-\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\
-\x29\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x20\x63\
-\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\x3b\0\x30\
-\x3a\x32\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\
-\x29\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\
-\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\
-\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\
-\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\
-\0\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\
-\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\
-\x52\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\
-\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\
-\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\
-\x2d\x3e\x69\x64\x2c\x20\x6d\x61\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\
-\x70\x2d\x3e\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\
-\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\
-\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\
-\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\
-\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\
-\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\
-\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\
-\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\
-\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\
-\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\
-\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\
-\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\
-\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\
-\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\
-\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\
-\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\
-\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\
-\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\
-\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\
-\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\
-\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\
-\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\
-\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\
-\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\
-\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\
-\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\
-\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\
-\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\
-\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\
-\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\
-\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\
-\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\
-\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\
-\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\
-\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\
-\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\
-\x4e\x53\x45\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x2d\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0\0\
-\0\x04\0\0\0\x62\0\0\0\x01\0\0\0\x80\x04\0\0\0\0\0\0\0\0\0\0\x69\x74\x65\x72\
-\x61\x74\x6f\x72\x2e\x72\x6f\x64\x61\x74\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\x2f\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\
-\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\
-\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\
-\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
-\x61\x74\x74\x61\x63\x68\x65\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\
-\x25\x73\x20\x25\x73\x0a\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\
-\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\x1b\0\0\
-\0\0\0\x79\x11\0\0\0\0\0\0\x79\x11\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\
-\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x23\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\
-\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xe8\xff\0\0\0\0\xb7\x01\0\0\x04\0\0\0\xbf\x72\0\
-\0\0\0\0\0\x0f\x12\0\0\0\0\0\0\x7b\x2a\xf0\xff\0\0\0\0\x61\x71\x14\0\0\0\0\0\
-\x7b\x1a\xf8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\
-\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x23\0\0\0\xb7\x03\0\0\x0e\0\0\0\
-\xb7\x05\0\0\x18\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\
-\0\0\0\0\x07\0\0\0\0\0\0\0\x42\0\0\0\x7b\0\0\0\x1e\x3c\x01\0\x01\0\0\0\x42\0\0\
-\0\x7b\0\0\0\x24\x3c\x01\0\x02\0\0\0\x42\0\0\0\xee\0\0\0\x1d\x44\x01\0\x03\0\0\
-\0\x42\0\0\0\x0f\x01\0\0\x06\x4c\x01\0\x04\0\0\0\x42\0\0\0\x1a\x01\0\0\x17\x40\
-\x01\0\x05\0\0\0\x42\0\0\0\x1a\x01\0\0\x1d\x40\x01\0\x06\0\0\0\x42\0\0\0\x43\
-\x01\0\0\x06\x58\x01\0\x08\0\0\0\x42\0\0\0\x56\x01\0\0\x03\x5c\x01\0\x0f\0\0\0\
-\x42\0\0\0\xdc\x01\0\0\x02\x64\x01\0\x1f\0\0\0\x42\0\0\0\x2a\x02\0\0\x01\x6c\
-\x01\0\0\0\0\0\x02\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\
-\0\x10\0\0\0\x02\0\0\0\xea\0\0\0\0\0\0\0\x20\0\0\0\x02\0\0\0\x3e\0\0\0\0\0\0\0\
-\x28\0\0\0\x08\0\0\0\x3f\x01\0\0\0\0\0\0\x78\0\0\0\x0d\0\0\0\x3e\0\0\0\0\0\0\0\
-\x88\0\0\0\x0d\0\0\0\xea\0\0\0\0\0\0\0\xa8\0\0\0\x0d\0\0\0\x3f\x01\0\0\0\0\0\0\
-\x1a\0\0\0\x21\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\0\0\0\
-\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\
-\0\0\0\0\0\x0a\0\0\0\x01\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\x10\0\0\0\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x6d\
-\x61\x70\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\
-\0\0\0\0\x79\x12\x08\0\0\0\0\0\x15\x02\x3c\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x79\
-\x27\0\0\0\0\0\0\x79\x11\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\
-\0\x07\x04\0\0\xd0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\
-\x31\0\0\0\xb7\x03\0\0\x20\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x7b\
-\x6a\xc8\xff\0\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xd0\xff\0\0\0\0\xb7\x03\0\0\
-\x04\0\0\0\xbf\x79\0\0\0\0\0\0\x0f\x39\0\0\0\0\0\0\x79\x71\x28\0\0\0\0\0\x79\
-\x78\x30\0\0\0\0\0\x15\x08\x18\0\0\0\0\0\xb7\x02\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\
-\0\x61\x11\x04\0\0\0\0\0\x79\x83\x08\0\0\0\0\0\x67\x01\0\0\x03\0\0\0\x0f\x13\0\
-\0\0\0\0\0\x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf8\xff\xff\xff\
-\xb7\x02\0\0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\0\0\0\0\x79\xa3\xf8\xff\
-\0\0\0\0\x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf4\xff\xff\xff\
-\xb7\x02\0\0\x04\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x03\0\0\x04\0\0\0\x61\xa1\xf4\
-\xff\0\0\0\0\x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\x0f\x16\0\0\0\0\0\0\
-\xbf\x69\0\0\0\0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\0\0\0\0\x7b\x1a\xe0\
-\xff\0\0\0\0\x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x31\0\0\0\0\0\0\x7b\
-\x1a\xe8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\xff\xff\xff\x79\xa1\
-\xc8\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x51\0\0\0\xb7\x03\0\0\x11\0\0\0\
-\xb7\x05\0\0\x20\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\
-\0\0\0\0\x17\0\0\0\0\0\0\0\x42\0\0\0\x7b\0\0\0\x1e\x80\x01\0\x01\0\0\0\x42\0\0\
-\0\x7b\0\0\0\x24\x80\x01\0\x02\0\0\0\x42\0\0\0\x60\x02\0\0\x1f\x88\x01\0\x03\0\
-\0\0\x42\0\0\0\x84\x02\0\0\x06\x94\x01\0\x04\0\0\0\x42\0\0\0\x1a\x01\0\0\x17\
-\x84\x01\0\x05\0\0\0\x42\0\0\0\x9d\x02\0\0\x0e\xa0\x01\0\x06\0\0\0\x42\0\0\0\
-\x1a\x01\0\0\x1d\x84\x01\0\x07\0\0\0\x42\0\0\0\x43\x01\0\0\x06\xa4\x01\0\x09\0\
-\0\0\x42\0\0\0\xaf\x02\0\0\x03\xa8\x01\0\x11\0\0\0\x42\0\0\0\x1f\x03\0\0\x02\
-\xb0\x01\0\x18\0\0\0\x42\0\0\0\x5a\x03\0\0\x06\x04\x01\0\x1b\0\0\0\x42\0\0\0\0\
-\0\0\0\0\0\0\0\x1c\0\0\0\x42\0\0\0\xab\x03\0\0\x0f\x10\x01\0\x1d\0\0\0\x42\0\0\
-\0\xc0\x03\0\0\x2d\x14\x01\0\x1f\0\0\0\x42\0\0\0\xf7\x03\0\0\x0d\x0c\x01\0\x21\
-\0\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x22\0\0\0\x42\0\0\0\xc0\x03\0\0\x02\x14\x01\0\
-\x25\0\0\0\x42\0\0\0\x1e\x04\0\0\x0d\x18\x01\0\x28\0\0\0\x42\0\0\0\0\0\0\0\0\0\
-\0\0\x29\0\0\0\x42\0\0\0\x1e\x04\0\0\x0d\x18\x01\0\x2c\0\0\0\x42\0\0\0\x1e\x04\
-\0\0\x0d\x18\x01\0\x2d\0\0\0\x42\0\0\0\x4c\x04\0\0\x1b\x1c\x01\0\x2e\0\0\0\x42\
-\0\0\0\x4c\x04\0\0\x06\x1c\x01\0\x2f\0\0\0\x42\0\0\0\x6f\x04\0\0\x0d\x24\x01\0\
-\x31\0\0\0\x42\0\0\0\x1f\x03\0\0\x02\xb0\x01\0\x40\0\0\0\x42\0\0\0\x2a\x02\0\0\
-\x01\xc0\x01\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\
-\0\0\0\0\0\x10\0\0\0\x14\0\0\0\xea\0\0\0\0\0\0\0\x20\0\0\0\x14\0\0\0\x3e\0\0\0\
-\0\0\0\0\x28\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x30\0\0\0\x08\0\0\0\x3f\x01\0\0\
-\0\0\0\0\x88\0\0\0\x1a\0\0\0\x3e\0\0\0\0\0\0\0\x98\0\0\0\x1a\0\0\0\xea\0\0\0\0\
-\0\0\0\xb0\0\0\0\x1a\0\0\0\x52\x03\0\0\0\0\0\0\xb8\0\0\0\x1a\0\0\0\x56\x03\0\0\
-\0\0\0\0\xc8\0\0\0\x1f\0\0\0\x84\x03\0\0\0\0\0\0\xe0\0\0\0\x20\0\0\0\xea\0\0\0\
-\0\0\0\0\xf8\0\0\0\x20\0\0\0\x3e\0\0\0\0\0\0\0\x20\x01\0\0\x24\0\0\0\x3e\0\0\0\
-\0\0\0\0\x58\x01\0\0\x1a\0\0\0\xea\0\0\0\0\0\0\0\x68\x01\0\0\x20\0\0\0\x46\x04\
-\0\0\0\0\0\0\x90\x01\0\0\x1a\0\0\0\x3f\x01\0\0\0\0\0\0\xa0\x01\0\0\x1a\0\0\0\
-\x87\x04\0\0\0\0\0\0\xa8\x01\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x1a\0\0\0\x42\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0\x1c\0\0\
-\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x1a\0\
-\0\0\x01\0\0\0\0\0\0\0\x13\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\0\
-\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\
-\0\0\0\0";
- opts.insns_sz = 2216;
+\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xb0\0\0\0\x03\0\0\x04\x18\0\0\0\xbe\0\
+\0\0\x09\0\0\0\0\0\0\0\xc2\0\0\0\x0b\0\0\0\x40\0\0\0\xcd\0\0\0\x0b\0\0\0\x80\0\
+\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xd5\0\0\0\0\0\0\x07\0\0\0\0\xde\0\0\0\0\0\0\
+\x08\x0c\0\0\0\xe4\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\xae\x01\0\0\x03\0\0\x04\
+\x18\0\0\0\xb6\x01\0\0\x0e\0\0\0\0\0\0\0\xb9\x01\0\0\x11\0\0\0\x20\0\0\0\xbe\
+\x01\0\0\x0e\0\0\0\xa0\0\0\0\xca\x01\0\0\0\0\0\x08\x0f\0\0\0\xd0\x01\0\0\0\0\0\
+\x01\x04\0\0\0\x20\0\0\0\xdd\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\0\0\0\
+\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xe2\x01\0\0\0\0\0\x01\x04\0\0\0\
+\x20\0\0\0\0\0\0\0\x01\0\0\x0d\x14\0\0\0\x26\x05\0\0\x04\0\0\0\x2b\x02\0\0\0\0\
+\0\x08\x15\0\0\0\x31\x02\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\x01\x3b\x02\0\0\x01\0\
+\0\x0c\x13\0\0\0\0\0\0\0\0\0\0\x02\x18\0\0\0\x52\x02\0\0\x02\0\0\x04\x10\0\0\0\
+\x13\0\0\0\x03\0\0\0\0\0\0\0\x65\x02\0\0\x19\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\
+\x1c\0\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x17\0\0\0\x6a\x02\0\0\x01\0\
+\0\x0c\x1a\0\0\0\xb6\x02\0\0\x01\0\0\x04\x08\0\0\0\xbf\x02\0\0\x1d\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\x02\x1e\0\0\0\x10\x03\0\0\x06\0\0\x04\x38\0\0\0\xb6\x01\0\0\
+\x0e\0\0\0\0\0\0\0\xb9\x01\0\0\x11\0\0\0\x20\0\0\0\x1d\x03\0\0\x1f\0\0\0\xc0\0\
+\0\0\x2e\x03\0\0\x19\0\0\0\0\x01\0\0\x37\x03\0\0\x21\0\0\0\x40\x01\0\0\x41\x03\
+\0\0\x22\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\
+\0\0\0\0\0\0\0\0\0\x02\x23\0\0\0\0\0\0\0\0\0\0\x02\x24\0\0\0\x8b\x03\0\0\x02\0\
+\0\x04\x08\0\0\0\x99\x03\0\0\x0e\0\0\0\0\0\0\0\xa2\x03\0\0\x0e\0\0\0\x20\0\0\0\
+\x41\x03\0\0\x03\0\0\x04\x18\0\0\0\xac\x03\0\0\x1f\0\0\0\0\0\0\0\xb4\x03\0\0\
+\x25\0\0\0\x40\0\0\0\xba\x03\0\0\x27\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x26\0\0\
+\0\0\0\0\0\0\0\0\x02\x28\0\0\0\xbe\x03\0\0\x01\0\0\x04\x04\0\0\0\xc9\x03\0\0\
+\x0e\0\0\0\0\0\0\0\x32\x04\0\0\x01\0\0\x04\x04\0\0\0\x3b\x04\0\0\x0e\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x20\0\0\0\x12\0\0\0\x30\0\0\0\xb1\x04\0\0\0\0\0\
+\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x20\0\0\0\x12\0\0\0\x1a\0\0\0\
+\xc5\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x20\0\0\0\
+\x12\0\0\0\x20\0\0\0\xdb\x04\0\0\0\0\0\x0e\x2d\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\
+\0\0\0\0\x20\0\0\0\x12\0\0\0\x11\0\0\0\xf0\x04\0\0\0\0\0\x0e\x2f\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\x07\x05\0\0\0\0\0\x0e\
+\x31\0\0\0\x01\0\0\0\x0f\x05\0\0\x01\0\0\x0f\x04\0\0\0\x36\0\0\0\0\0\0\0\x04\0\
+\0\0\x16\x05\0\0\x04\0\0\x0f\x7b\0\0\0\x2a\0\0\0\0\0\0\0\x30\0\0\0\x2c\0\0\0\
+\x30\0\0\0\x1a\0\0\0\x2e\0\0\0\x4a\0\0\0\x20\0\0\0\x30\0\0\0\x6a\0\0\0\x11\0\0\
+\0\x1e\x05\0\0\x01\0\0\x0f\x04\0\0\0\x32\0\0\0\0\0\0\0\x04\0\0\0\x26\x05\0\0\0\
+\0\0\x0e\x06\0\0\0\x01\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\
+\x66\x5f\x6d\x61\x70\0\x6d\x65\x74\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\
+\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\
+\x62\x70\x66\x5f\x6d\x61\x70\0\x30\x3a\x30\0\x2f\x68\x6f\x6d\x65\x2f\x61\x73\
+\x70\x73\x6b\x2f\x73\x72\x63\x2f\x62\x70\x66\x2d\x6e\x65\x78\x74\x2f\x6b\x65\
+\x72\x6e\x65\x6c\x2f\x62\x70\x66\x2f\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\
+\x65\x72\x61\x74\x6f\x72\x73\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\
+\x70\x66\x2e\x63\0\x09\x73\x74\x72\x75\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\
+\x65\x20\x2a\x73\x65\x71\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\
+\x3e\x73\x65\x71\x3b\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\
+\x73\x65\x71\0\x73\x65\x73\x73\x69\x6f\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\
+\x75\x6d\0\x73\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\x5f\x75\x36\x34\0\x75\x6e\x73\
+\x69\x67\x6e\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\0\x30\x3a\x31\0\
+\x09\x73\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\
+\x70\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\
+\x6d\x61\x70\x29\0\x30\x3a\x32\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\
+\x6e\x75\x6d\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\
+\x71\x5f\x6e\x75\x6d\x3b\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\
+\x3d\x3d\x20\x30\x29\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\
+\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\
+\x72\x69\x65\x73\x20\x20\x63\x75\x72\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\x6e\
+\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\x6d\
+\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\x73\
+\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\x52\
+\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\x5f\
+\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\
+\x75\x20\x25\x2d\x31\x36\x73\x20\x20\x25\x31\x30\x64\x20\x20\x20\x25\x31\x30\
+\x6c\x6c\x64\x5c\x6e\x22\x2c\0\x7d\0\x5f\x5f\x73\x36\x34\0\x6c\x6f\x6e\x67\x20\
+\x6c\x6f\x6e\x67\0\x62\x70\x66\x5f\x6d\x61\x70\x5f\x73\x75\x6d\x5f\x65\x6c\x65\
+\x6d\x5f\x63\x6f\x75\x6e\x74\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\
+\x66\x5f\x70\x72\x6f\x67\0\x70\x72\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\
+\x5f\x70\x72\x6f\x67\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\
+\x09\x73\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\
+\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\
+\x20\x28\x21\x70\x72\x6f\x67\x29\0\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\
+\x78\0\x09\x61\x75\x78\x20\x3d\x20\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\
+\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\
+\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\
+\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\
+\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\
+\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\
+\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\
+\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\x75\
+\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\
+\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\
+\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\
+\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\
+\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\
+\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\
+\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\
+\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\
+\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\x29\x2c\x20\x74\x79\x70\x65\x73\x20\
+\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\x09\x73\x74\x72\x20\x3d\x20\x62\x74\
+\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\
+\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\
+\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\
+\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\x30\x3a\x32\x3a\x30\0\x09\x69\x66\
+\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3e\x3d\x20\x62\x74\x66\x2d\x3e\
+\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\x29\0\x09\x72\x65\x74\x75\x72\x6e\
+\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\
+\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\
+\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\
+\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\
+\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\
+\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\x4e\x53\x45\0\x2e\x6b\x73\x79\x6d\
+\x73\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\x64\x75\x6d\
+\x6d\x79\x5f\x6b\x73\x79\x6d\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\xc9\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0\0\0\x04\0\0\0\x7b\0\0\0\x01\0\0\0\
+\x80\0\0\0\0\0\0\0\0\0\0\0\x69\x74\x65\x72\x61\x74\x6f\x72\x2e\x72\x6f\x64\x61\
+\x74\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\x34\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x20\x20\
+\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x20\x20\x63\x75\x72\x5f\x65\
+\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x20\x25\
+\x31\x30\x64\x20\x20\x20\x25\x31\x30\x6c\x6c\x64\x0a\0\x20\x20\x69\x64\x20\x6e\
+\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\
+\x61\x63\x68\x65\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\
+\x25\x73\x0a\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x12\0\0\0\
+\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\x1d\0\0\0\0\0\x79\x21\
+\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xe0\xff\
+\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\
+\x30\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\
+\xe0\xff\0\0\0\0\xb7\x01\0\0\x04\0\0\0\xbf\x72\0\0\0\0\0\0\x0f\x12\0\0\0\0\0\0\
+\x7b\x2a\xe8\xff\0\0\0\0\x61\x71\x14\0\0\0\0\0\x7b\x1a\xf0\xff\0\0\0\0\xbf\x71\
+\0\0\0\0\0\0\x85\x20\0\0\0\0\0\0\x7b\x0a\xf8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\
+\x07\x04\0\0\xe0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\
+\x30\0\0\0\xb7\x03\0\0\x1a\0\0\0\xb7\x05\0\0\x20\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\
+\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\0\0\0\0\x07\0\0\0\0\0\0\0\x42\0\0\0\x88\0\0\0\
+\x1e\x44\x01\0\x01\0\0\0\x42\0\0\0\x88\0\0\0\x24\x44\x01\0\x02\0\0\0\x42\0\0\0\
+\xfb\0\0\0\x1d\x4c\x01\0\x03\0\0\0\x42\0\0\0\x1c\x01\0\0\x06\x54\x01\0\x04\0\0\
+\0\x42\0\0\0\x2b\x01\0\0\x1d\x48\x01\0\x05\0\0\0\x42\0\0\0\x50\x01\0\0\x06\x60\
+\x01\0\x07\0\0\0\x42\0\0\0\x63\x01\0\0\x03\x64\x01\0\x0e\0\0\0\x42\0\0\0\xf6\
+\x01\0\0\x02\x6c\x01\0\x21\0\0\0\x42\0\0\0\x29\x02\0\0\x01\x80\x01\0\0\0\0\0\
+\x02\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\
+\x02\0\0\0\xf7\0\0\0\0\0\0\0\x20\0\0\0\x08\0\0\0\x27\x01\0\0\0\0\0\0\x70\0\0\0\
+\x0d\0\0\0\x3e\0\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\0\xf7\0\0\0\0\0\0\0\xa0\0\0\0\
+\x0d\0\0\0\x27\x01\0\0\0\0\0\0\x1a\0\0\0\x23\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\
+\x70\x66\x5f\x6d\x61\x70\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\
+\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x09\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x62\x70\x66\x5f\x69\x74\
+\x65\x72\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\0\0\0\0\0\0\0\x62\x70\x66\x5f\x6d\
+\x61\x70\x5f\x73\x75\x6d\x5f\x65\x6c\x65\x6d\x5f\x63\x6f\x75\x6e\x74\0\0\x47\
+\x50\x4c\0\0\0\0\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x11\x08\0\0\0\0\
+\0\x15\x01\x3b\0\0\0\0\0\x79\x17\0\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\
+\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\
+\x18\x62\0\0\0\0\0\0\0\0\0\0\x4a\0\0\0\xb7\x03\0\0\x20\0\0\0\xb7\x05\0\0\0\0\0\
+\0\x85\0\0\0\x7e\0\0\0\x7b\x6a\xc8\xff\0\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xd0\
+\xff\0\0\0\0\xb7\x03\0\0\x04\0\0\0\xbf\x79\0\0\0\0\0\0\x0f\x39\0\0\0\0\0\0\x79\
+\x71\x28\0\0\0\0\0\x79\x78\x30\0\0\0\0\0\x15\x08\x18\0\0\0\0\0\xb7\x02\0\0\0\0\
+\0\0\x0f\x21\0\0\0\0\0\0\x61\x11\x04\0\0\0\0\0\x79\x83\x08\0\0\0\0\0\x67\x01\0\
+\0\x03\0\0\0\x0f\x13\0\0\0\0\0\0\x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\
+\x01\0\0\xf8\xff\xff\xff\xb7\x02\0\0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\
+\0\0\0\0\x79\xa3\xf8\xff\0\0\0\0\x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\
+\x01\0\0\xf4\xff\xff\xff\xb7\x02\0\0\x04\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x03\0\0\
+\x04\0\0\0\x61\xa1\xf4\xff\0\0\0\0\x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\
+\x0f\x16\0\0\0\0\0\0\xbf\x69\0\0\0\0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\
+\0\0\0\0\x7b\x1a\xe0\xff\0\0\0\0\x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\
+\x31\0\0\0\0\0\0\x7b\x1a\xe8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\
+\xff\xff\xff\x79\xa1\xc8\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x6a\0\0\0\xb7\
+\x03\0\0\x11\0\0\0\xb7\x05\0\0\x20\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\
+\x95\0\0\0\0\0\0\0\0\0\0\0\x1b\0\0\0\0\0\0\0\x42\0\0\0\x88\0\0\0\x1e\x94\x01\0\
+\x01\0\0\0\x42\0\0\0\x88\0\0\0\x24\x94\x01\0\x02\0\0\0\x42\0\0\0\x86\x02\0\0\
+\x1f\x9c\x01\0\x03\0\0\0\x42\0\0\0\xaa\x02\0\0\x06\xa8\x01\0\x04\0\0\0\x42\0\0\
+\0\xc3\x02\0\0\x0e\xb4\x01\0\x05\0\0\0\x42\0\0\0\x2b\x01\0\0\x1d\x98\x01\0\x06\
+\0\0\0\x42\0\0\0\x50\x01\0\0\x06\xb8\x01\0\x08\0\0\0\x42\0\0\0\xd5\x02\0\0\x03\
+\xbc\x01\0\x10\0\0\0\x42\0\0\0\x45\x03\0\0\x02\xc4\x01\0\x17\0\0\0\x42\0\0\0\
+\x80\x03\0\0\x06\x04\x01\0\x1a\0\0\0\x42\0\0\0\x45\x03\0\0\x02\xc4\x01\0\x1b\0\
+\0\0\x42\0\0\0\xd1\x03\0\0\x0f\x10\x01\0\x1c\0\0\0\x42\0\0\0\xe6\x03\0\0\x2d\
+\x14\x01\0\x1e\0\0\0\x42\0\0\0\x1d\x04\0\0\x0d\x0c\x01\0\x20\0\0\0\x42\0\0\0\
+\x45\x03\0\0\x02\xc4\x01\0\x21\0\0\0\x42\0\0\0\xe6\x03\0\0\x02\x14\x01\0\x24\0\
+\0\0\x42\0\0\0\x44\x04\0\0\x0d\x18\x01\0\x27\0\0\0\x42\0\0\0\x45\x03\0\0\x02\
+\xc4\x01\0\x28\0\0\0\x42\0\0\0\x44\x04\0\0\x0d\x18\x01\0\x2b\0\0\0\x42\0\0\0\
+\x44\x04\0\0\x0d\x18\x01\0\x2c\0\0\0\x42\0\0\0\x72\x04\0\0\x1b\x1c\x01\0\x2d\0\
+\0\0\x42\0\0\0\x72\x04\0\0\x06\x1c\x01\0\x2e\0\0\0\x42\0\0\0\x95\x04\0\0\x0d\
+\x24\x01\0\x30\0\0\0\x42\0\0\0\x45\x03\0\0\x02\xc4\x01\0\x3f\0\0\0\x42\0\0\0\
+\x29\x02\0\0\x01\xd4\x01\0\0\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\
+\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x18\0\0\0\xf7\0\0\0\0\0\0\0\x20\0\0\0\x1c\0\0\
+\0\x3e\0\0\0\0\0\0\0\x28\0\0\0\x08\0\0\0\x27\x01\0\0\0\0\0\0\x80\0\0\0\x1e\0\0\
+\0\x3e\0\0\0\0\0\0\0\x90\0\0\0\x1e\0\0\0\xf7\0\0\0\0\0\0\0\xa8\0\0\0\x1e\0\0\0\
+\x78\x03\0\0\0\0\0\0\xb0\0\0\0\x1e\0\0\0\x7c\x03\0\0\0\0\0\0\xc0\0\0\0\x23\0\0\
+\0\xaa\x03\0\0\0\0\0\0\xd8\0\0\0\x24\0\0\0\xf7\0\0\0\0\0\0\0\xf0\0\0\0\x24\0\0\
+\0\x3e\0\0\0\0\0\0\0\x18\x01\0\0\x28\0\0\0\x3e\0\0\0\0\0\0\0\x50\x01\0\0\x1e\0\
+\0\0\xf7\0\0\0\0\0\0\0\x60\x01\0\0\x24\0\0\0\x6c\x04\0\0\0\0\0\0\x88\x01\0\0\
+\x1e\0\0\0\x27\x01\0\0\0\0\0\0\x98\x01\0\0\x1e\0\0\0\xad\x04\0\0\0\0\0\0\xa0\
+\x01\0\0\x1c\0\0\0\x3e\0\0\0\0\0\0\0\x1a\0\0\0\x41\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\
+\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\
+\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x19\0\0\0\x01\0\0\0\0\0\0\0\
+\x12\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x62\x70\x66\x5f\
+\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0";
+ opts.insns_sz = 2456;
opts.insns = (void *)"\
\xbf\x16\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\x78\xff\xff\xff\xb7\x02\0\
\0\x88\0\0\0\xb7\x03\0\0\0\0\0\0\x85\0\0\0\x71\0\0\0\x05\0\x14\0\0\0\0\0\x61\
@@ -331,79 +326,83 @@ iterators_bpf__load(struct iterators_bpf *skel)
\0\0\0\x85\0\0\0\xa8\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x01\0\0\0\0\
\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xbf\x70\0\0\
\0\0\0\0\x95\0\0\0\0\0\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
-\x48\x0e\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\
-\0\0\x44\x0e\0\0\x63\x01\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\
-\0\0\0\0\x38\x0e\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x05\0\0\
-\x18\x61\0\0\0\0\0\0\0\0\0\0\x30\x0e\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x12\0\
-\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x30\x0e\0\0\xb7\x03\0\0\x1c\0\0\0\x85\0\0\0\
+\xe8\x0e\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\
+\0\0\xe4\x0e\0\0\x63\x01\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\
+\0\0\0\0\xd8\x0e\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x05\0\0\
+\x18\x61\0\0\0\0\0\0\0\0\0\0\xd0\x0e\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x12\0\
+\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\xd0\x0e\0\0\xb7\x03\0\0\x1c\0\0\0\x85\0\0\0\
\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\xd4\xff\0\0\0\0\x63\x7a\x78\xff\0\0\0\0\
-\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x0e\0\0\x63\x01\0\0\0\
+\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x20\x0f\0\0\x63\x01\0\0\0\
\0\0\0\x61\x60\x1c\0\0\0\0\0\x15\0\x03\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
-\x5c\x0e\0\0\x63\x01\0\0\0\0\0\0\xb7\x01\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\
-\0\x50\x0e\0\0\xb7\x03\0\0\x48\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\
+\xfc\x0e\0\0\x63\x01\0\0\0\0\0\0\xb7\x01\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\
+\0\xf0\x0e\0\0\xb7\x03\0\0\x48\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\
\xc5\x07\xc3\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x63\x71\0\0\0\0\0\
-\0\x79\x63\x20\0\0\0\0\0\x15\x03\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x98\
-\x0e\0\0\xb7\x02\0\0\x62\0\0\0\x61\x60\x04\0\0\0\0\0\x45\0\x02\0\x01\0\0\0\x85\
+\0\x79\x63\x20\0\0\0\0\0\x15\x03\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x38\
+\x0f\0\0\xb7\x02\0\0\x7b\0\0\0\x61\x60\x04\0\0\0\0\0\x45\0\x02\0\x01\0\0\0\x85\
\0\0\0\x94\0\0\0\x05\0\x01\0\0\0\0\0\x85\0\0\0\x71\0\0\0\x18\x62\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x08\x0f\0\0\x63\
-\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\
-\0\0\x10\x0f\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x98\x0e\0\0\
-\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x0f\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x02\0\
-\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x08\x0f\0\0\xb7\x03\0\0\x20\0\0\0\x85\0\0\0\
+\0\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xc0\x0f\0\0\x63\
+\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xb8\x0f\0\0\x18\x61\0\0\0\0\0\0\0\
+\0\0\0\xc8\x0f\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x38\x0f\0\0\
+\x18\x61\0\0\0\0\0\0\0\0\0\0\xd0\x0f\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x02\0\
+\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\xc0\x0f\0\0\xb7\x03\0\0\x20\0\0\0\x85\0\0\0\
\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x9f\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x28\x0f\0\0\x63\
-\x01\0\0\0\0\0\0\xb7\x01\0\0\x16\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x28\x0f\0\0\
+\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xe0\x0f\0\0\x63\
+\x01\0\0\0\0\0\0\xb7\x01\0\0\x16\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\xe0\x0f\0\0\
\xb7\x03\0\0\x04\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x92\xff\
-\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x30\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
-\x78\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x38\x0f\0\0\x18\
-\x61\0\0\0\0\0\0\0\0\0\0\x70\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\
-\0\0\0\x40\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb8\x11\0\0\x7b\x01\0\0\0\0\0\0\
-\x18\x60\0\0\0\0\0\0\0\0\0\0\x48\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xc8\x11\0\
-\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xe8\x10\0\0\x18\x61\0\0\0\0\
-\0\0\0\0\0\0\xe8\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xe0\x11\0\0\x7b\x01\0\0\0\0\0\0\x61\x60\x08\0\0\
-\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x11\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\
-\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x84\x11\0\0\x63\x01\0\0\0\0\0\0\x79\x60\
-\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x88\x11\0\0\x7b\x01\0\0\0\0\0\0\x61\
-\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb0\x11\0\0\x63\x01\0\0\0\0\0\
-\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xf8\x11\0\0\xb7\x02\0\0\x11\0\0\0\xb7\x03\0\0\
+\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xe8\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
+\x20\x12\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xf0\x0f\0\0\x18\
+\x61\0\0\0\0\0\0\0\0\0\0\x18\x12\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\
+\0\0\0\x08\x11\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x60\x12\0\0\x7b\x01\0\0\0\0\0\0\
+\x18\x60\0\0\0\0\0\0\0\0\0\0\x10\x11\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x70\x12\0\
+\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xa0\x11\0\0\x18\x61\0\0\0\0\
+\0\0\0\0\0\0\x90\x12\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x88\x12\0\0\x7b\x01\0\0\0\0\0\0\x61\x60\x08\0\0\
+\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x28\x12\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\
+\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x2c\x12\0\0\x63\x01\0\0\0\0\0\0\x79\x60\
+\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x30\x12\0\0\x7b\x01\0\0\0\0\0\0\x61\
+\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x58\x12\0\0\x63\x01\0\0\0\0\0\
+\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xa0\x12\0\0\xb7\x02\0\0\x11\0\0\0\xb7\x03\0\0\
\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\
-\x5c\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x68\x11\0\0\x63\x70\x6c\0\0\0\0\0\
-\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\0\x05\0\0\0\x18\x62\0\0\
-\0\0\0\0\0\0\0\0\x68\x11\0\0\xb7\x03\0\0\x8c\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\
-\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd8\x11\0\0\x61\x01\0\0\0\0\0\0\xd5\
-\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\x07\x4a\xff\0\0\
-\0\0\x63\x7a\x80\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x10\x12\0\0\x18\x61\0\
-\0\0\0\0\0\0\0\0\0\x10\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\
-\x18\x12\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x08\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\
-\x60\0\0\0\0\0\0\0\0\0\0\x28\x14\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x50\x17\0\0\
-\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x30\x14\0\0\x18\x61\0\0\0\0\0\
-\0\0\0\0\0\x60\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd0\x15\
-\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x78\x17\0\0\x7b\x01\0\0\0\0\
-\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x17\0\0\x63\x01\0\0\
-\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x1c\x17\0\0\x63\x01\
-\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x20\x17\0\0\x7b\
-\x01\0\0\0\0\0\0\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x48\x17\0\
-\0\x63\x01\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x90\x17\0\0\xb7\x02\0\0\x12\
-\0\0\0\xb7\x03\0\0\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\
-\0\0\0\0\0\xc5\x07\x13\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\x63\
-\x70\x6c\0\0\0\0\0\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\0\x05\
-\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\xb7\x03\0\0\x8c\0\0\0\x85\0\0\0\
-\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x70\x17\0\0\x61\x01\
-\0\0\0\0\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\
-\x07\x01\xff\0\0\0\0\x63\x7a\x84\xff\0\0\0\0\x61\xa1\x78\xff\0\0\0\0\xd5\x01\
-\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa0\x80\xff\0\0\0\0\
-\x63\x06\x28\0\0\0\0\0\x61\xa0\x84\xff\0\0\0\0\x63\x06\x2c\0\0\0\0\0\x18\x61\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x10\0\0\0\0\0\0\x63\x06\x18\0\0\0\0\0\xb7\0\0\0\
-\0\0\0\0\x95\0\0\0\0\0\0\0";
+\x5c\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x10\x12\0\0\x63\x70\x6c\0\0\0\0\0\
+\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\x18\x68\0\0\0\0\0\0\0\0\0\0\xa8\
+\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb8\x12\0\0\xb7\x02\0\0\x17\0\0\0\xb7\x03\
+\0\0\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\
+\x07\x4d\xff\0\0\0\0\x75\x07\x03\0\0\0\0\0\x62\x08\x04\0\0\0\0\0\x6a\x08\x02\0\
+\0\0\0\0\x05\0\x0a\0\0\0\0\0\x63\x78\x04\0\0\0\0\0\xbf\x79\0\0\0\0\0\0\x77\x09\
+\0\0\x20\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\x63\x90\0\0\0\0\0\0\x55\
+\x09\x02\0\0\0\0\0\x6a\x08\x02\0\0\0\0\0\x05\0\x01\0\0\0\0\0\x6a\x08\x02\0\x40\
+\0\0\0\xb7\x01\0\0\x05\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x10\x12\0\0\xb7\x03\0\
+\0\x8c\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\
+\0\0\x01\0\0\x61\x01\0\0\0\0\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\
+\0\0\0\xa8\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x80\x12\0\0\x61\x01\0\0\0\0\0\0\
+\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\x07\x2c\xff\
+\0\0\0\0\x63\x7a\x80\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd0\x12\0\0\x18\
+\x61\0\0\0\0\0\0\0\0\0\0\xa8\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\
+\0\0\0\xd8\x12\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xa0\x17\0\0\x7b\x01\0\0\0\0\0\0\
+\x18\x60\0\0\0\0\0\0\0\0\0\0\xe0\x14\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xe8\x17\0\
+\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xe8\x14\0\0\x18\x61\0\0\0\0\
+\0\0\0\0\0\0\xf8\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x78\
+\x16\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x18\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x10\x18\0\0\x7b\x01\0\0\
+\0\0\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb0\x17\0\0\x63\x01\
+\0\0\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb4\x17\0\0\x63\
+\x01\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb8\x17\0\0\
+\x7b\x01\0\0\0\0\0\0\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xe0\
+\x17\0\0\x63\x01\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x28\x18\0\0\xb7\x02\0\
+\0\x12\0\0\0\xb7\x03\0\0\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\
+\x07\0\0\0\0\0\0\xc5\x07\xf5\xfe\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x98\x17\0\
+\0\x63\x70\x6c\0\0\0\0\0\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\
+\0\x05\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x98\x17\0\0\xb7\x03\0\0\x8c\0\0\0\x85\
+\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x08\x18\0\0\
+\x61\x01\0\0\0\0\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\
+\0\0\xc5\x07\xe3\xfe\0\0\0\0\x63\x7a\x84\xff\0\0\0\0\x61\xa1\x78\xff\0\0\0\0\
+\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa0\x80\xff\
+\0\0\0\0\x63\x06\x28\0\0\0\0\0\x61\xa0\x84\xff\0\0\0\0\x63\x06\x2c\0\0\0\0\0\
+\x18\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x10\0\0\0\0\0\0\x63\x06\x18\0\0\0\0\0\
+\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0";
err = bpf_load_and_run(&opts);
if (err < 0)
return err;
- skel->rodata = skel_finalize_map_data(&skel->maps.rodata.initial_value,
- 4096, PROT_READ, skel->maps.rodata.map_fd);
- if (!skel->rodata)
- return -ENOMEM;
return 0;
}
@@ -422,4 +421,15 @@ iterators_bpf__open_and_load(void)
return skel;
}
+__attribute__((unused)) static void
+iterators_bpf__assert(struct iterators_bpf *s __attribute__((unused)))
+{
+#ifdef __cplusplus
+#define _Static_assert static_assert
+#endif
+#ifdef __cplusplus
+#undef _Static_assert
+#endif
+}
+
#endif /* __ITERATORS_BPF_SKEL_H__ */
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index 601609164ef3..8d2ddcb7566b 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -7,7 +7,6 @@
#include <linux/bpf.h>
#include <linux/list.h>
#include <linux/slab.h>
-#include <linux/capability.h>
#include <linux/btf_ids.h>
#include "percpu_freelist.h"
@@ -46,9 +45,6 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs)
/* Called from syscall */
static int queue_stack_map_alloc_check(union bpf_attr *attr)
{
- if (!bpf_capable())
- return -EPERM;
-
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 0 ||
attr->value_size == 0 ||
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index cbf2d8d784b8..4b4f9670f1a9 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -151,9 +151,6 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
int numa_node = bpf_map_attr_numa_node(attr);
struct reuseport_array *array;
- if (!bpf_capable())
- return ERR_PTR(-EPERM);
-
/* allocate all map elements and zero-initialize them */
array = bpf_map_area_alloc(struct_size(array, ptrs, attr->max_entries), numa_node);
if (!array)
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 875ac9b698d9..f045fde632e5 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -23,15 +23,6 @@
#define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4)
-/* Maximum size of ring buffer area is limited by 32-bit page offset within
- * record header, counted in pages. Reserve 8 bits for extensibility, and take
- * into account few extra pages for consumer/producer pages and
- * non-mmap()'able parts. This gives 64GB limit, which seems plenty for single
- * ring buffer.
- */
-#define RINGBUF_MAX_DATA_SZ \
- (((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE)
-
struct bpf_ringbuf {
wait_queue_head_t waitq;
struct irq_work work;
@@ -161,6 +152,17 @@ static void bpf_ringbuf_notify(struct irq_work *work)
wake_up_all(&rb->waitq);
}
+/* Maximum size of ring buffer area is limited by 32-bit page offset within
+ * record header, counted in pages. Reserve 8 bits for extensibility, and
+ * take into account few extra pages for consumer/producer pages and
+ * non-mmap()'able parts, the current maximum size would be:
+ *
+ * (((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE)
+ *
+ * This gives 64GB limit, which seems plenty for single ring buffer. Now
+ * considering that the maximum value of data_sz is (4GB - 1), there
+ * will be no overflow, so just note the size limit in the comments.
+ */
static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
{
struct bpf_ringbuf *rb;
@@ -193,12 +195,6 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
!PAGE_ALIGNED(attr->max_entries))
return ERR_PTR(-EINVAL);
-#ifdef CONFIG_64BIT
- /* on 32-bit arch, it's impossible to overflow record's hdr->pgoff */
- if (attr->max_entries > RINGBUF_MAX_DATA_SZ)
- return ERR_PTR(-E2BIG);
-#endif
-
rb_map = bpf_map_area_alloc(sizeof(*rb_map), NUMA_NO_NODE);
if (!rb_map)
return ERR_PTR(-ENOMEM);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index b25fce425b2c..458bb80b14d5 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -74,9 +74,6 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
u64 cost, n_buckets;
int err;
- if (!bpf_capable())
- return ERR_PTR(-EPERM);
-
if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f1c8733f76b8..ebeb0695305a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -37,6 +37,8 @@
#include <linux/trace_events.h>
#include <net/netfilter/nf_bpf_link.h>
+#include <net/tcx.h>
+
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
@@ -109,37 +111,6 @@ const struct bpf_map_ops bpf_map_offload_ops = {
.map_mem_usage = bpf_map_offload_map_mem_usage,
};
-static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
-{
- const struct bpf_map_ops *ops;
- u32 type = attr->map_type;
- struct bpf_map *map;
- int err;
-
- if (type >= ARRAY_SIZE(bpf_map_types))
- return ERR_PTR(-EINVAL);
- type = array_index_nospec(type, ARRAY_SIZE(bpf_map_types));
- ops = bpf_map_types[type];
- if (!ops)
- return ERR_PTR(-EINVAL);
-
- if (ops->map_alloc_check) {
- err = ops->map_alloc_check(attr);
- if (err)
- return ERR_PTR(err);
- }
- if (attr->map_ifindex)
- ops = &bpf_map_offload_ops;
- if (!ops->map_mem_usage)
- return ERR_PTR(-EINVAL);
- map = ops->map_alloc(attr);
- if (IS_ERR(map))
- return map;
- map->ops = ops;
- map->map_type = type;
- return map;
-}
-
static void bpf_map_write_active_inc(struct bpf_map *map)
{
atomic64_inc(&map->writecnt);
@@ -686,7 +657,6 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
if (!btf_is_kernel(field->kptr.btf)) {
pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
field->kptr.btf_id);
- WARN_ON_ONCE(!pointee_struct_meta);
migrate_disable();
__bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
pointee_struct_meta->record :
@@ -1127,7 +1097,9 @@ free_map_tab:
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
+ const struct bpf_map_ops *ops;
int numa_node = bpf_map_attr_numa_node(attr);
+ u32 map_type = attr->map_type;
struct bpf_map *map;
int f_flags;
int err;
@@ -1158,9 +1130,85 @@ static int map_create(union bpf_attr *attr)
return -EINVAL;
/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
- map = find_and_alloc_map(attr);
+ map_type = attr->map_type;
+ if (map_type >= ARRAY_SIZE(bpf_map_types))
+ return -EINVAL;
+ map_type = array_index_nospec(map_type, ARRAY_SIZE(bpf_map_types));
+ ops = bpf_map_types[map_type];
+ if (!ops)
+ return -EINVAL;
+
+ if (ops->map_alloc_check) {
+ err = ops->map_alloc_check(attr);
+ if (err)
+ return err;
+ }
+ if (attr->map_ifindex)
+ ops = &bpf_map_offload_ops;
+ if (!ops->map_mem_usage)
+ return -EINVAL;
+
+ /* Intent here is for unprivileged_bpf_disabled to block BPF map
+ * creation for unprivileged users; other actions depend
+ * on fd availability and access to bpffs, so are dependent on
+ * object creation success. Even with unprivileged BPF disabled,
+ * capability checks are still carried out.
+ */
+ if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
+ return -EPERM;
+
+ /* check privileged map type permissions */
+ switch (map_type) {
+ case BPF_MAP_TYPE_ARRAY:
+ case BPF_MAP_TYPE_PERCPU_ARRAY:
+ case BPF_MAP_TYPE_PROG_ARRAY:
+ case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
+ case BPF_MAP_TYPE_CGROUP_ARRAY:
+ case BPF_MAP_TYPE_ARRAY_OF_MAPS:
+ case BPF_MAP_TYPE_HASH:
+ case BPF_MAP_TYPE_PERCPU_HASH:
+ case BPF_MAP_TYPE_HASH_OF_MAPS:
+ case BPF_MAP_TYPE_RINGBUF:
+ case BPF_MAP_TYPE_USER_RINGBUF:
+ case BPF_MAP_TYPE_CGROUP_STORAGE:
+ case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
+ /* unprivileged */
+ break;
+ case BPF_MAP_TYPE_SK_STORAGE:
+ case BPF_MAP_TYPE_INODE_STORAGE:
+ case BPF_MAP_TYPE_TASK_STORAGE:
+ case BPF_MAP_TYPE_CGRP_STORAGE:
+ case BPF_MAP_TYPE_BLOOM_FILTER:
+ case BPF_MAP_TYPE_LPM_TRIE:
+ case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
+ case BPF_MAP_TYPE_STACK_TRACE:
+ case BPF_MAP_TYPE_QUEUE:
+ case BPF_MAP_TYPE_STACK:
+ case BPF_MAP_TYPE_LRU_HASH:
+ case BPF_MAP_TYPE_LRU_PERCPU_HASH:
+ case BPF_MAP_TYPE_STRUCT_OPS:
+ case BPF_MAP_TYPE_CPUMAP:
+ if (!bpf_capable())
+ return -EPERM;
+ break;
+ case BPF_MAP_TYPE_SOCKMAP:
+ case BPF_MAP_TYPE_SOCKHASH:
+ case BPF_MAP_TYPE_DEVMAP:
+ case BPF_MAP_TYPE_DEVMAP_HASH:
+ case BPF_MAP_TYPE_XSKMAP:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ break;
+ default:
+ WARN(1, "unsupported map type %d", map_type);
+ return -EPERM;
+ }
+
+ map = ops->map_alloc(attr);
if (IS_ERR(map))
return PTR_ERR(map);
+ map->ops = ops;
+ map->map_type = map_type;
err = bpf_obj_name_cpy(map->name, attr->map_name,
sizeof(attr->map_name));
@@ -1931,6 +1979,11 @@ static int map_freeze(const union bpf_attr *attr)
return -ENOTSUPP;
}
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
+ fdput(f);
+ return -EPERM;
+ }
+
mutex_lock(&map->freeze_mutex);
if (bpf_map_write_active(map)) {
err = -EBUSY;
@@ -1940,10 +1993,6 @@ static int map_freeze(const union bpf_attr *attr)
err = -EBUSY;
goto err_put;
}
- if (!bpf_capable()) {
- err = -EPERM;
- goto err_put;
- }
WRITE_ONCE(map->frozen, true);
err_put:
@@ -2506,7 +2555,6 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
struct btf *attach_btf = NULL;
int err;
char license[128];
- bool is_gpl;
if (CHECK_ATTR(BPF_PROG_LOAD))
return -EINVAL;
@@ -2525,15 +2573,15 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
!bpf_capable())
return -EPERM;
- /* copy eBPF program license from user space */
- if (strncpy_from_bpfptr(license,
- make_bpfptr(attr->license, uattr.is_kernel),
- sizeof(license) - 1) < 0)
- return -EFAULT;
- license[sizeof(license) - 1] = 0;
-
- /* eBPF programs must be GPL compatible to use GPL-ed functions */
- is_gpl = license_is_gpl_compatible(license);
+ /* Intent here is for unprivileged_bpf_disabled to block BPF program
+ * creation for unprivileged users; other actions depend
+ * on fd availability and access to bpffs, so are dependent on
+ * object creation success. Even with unprivileged BPF disabled,
+ * capability checks are still carried out for these
+ * and other operations.
+ */
+ if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
+ return -EPERM;
if (attr->insn_cnt == 0 ||
attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
@@ -2617,12 +2665,20 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
make_bpfptr(attr->insns, uattr.is_kernel),
bpf_prog_insn_size(prog)) != 0)
goto free_prog_sec;
+ /* copy eBPF program license from user space */
+ if (strncpy_from_bpfptr(license,
+ make_bpfptr(attr->license, uattr.is_kernel),
+ sizeof(license) - 1) < 0)
+ goto free_prog_sec;
+ license[sizeof(license) - 1] = 0;
+
+ /* eBPF programs must be GPL compatible to use GPL-ed functions */
+ prog->gpl_compatible = license_is_gpl_compatible(license) ? 1 : 0;
prog->orig_prog = NULL;
prog->jited = 0;
atomic64_set(&prog->aux->refcnt, 1);
- prog->gpl_compatible = is_gpl ? 1 : 0;
if (bpf_prog_is_dev_bound(prog->aux)) {
err = bpf_prog_dev_bound_init(prog, attr);
@@ -2701,23 +2757,38 @@ free_prog:
return err;
}
-#define BPF_OBJ_LAST_FIELD file_flags
+#define BPF_OBJ_LAST_FIELD path_fd
static int bpf_obj_pin(const union bpf_attr *attr)
{
- if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0)
+ int path_fd;
+
+ if (CHECK_ATTR(BPF_OBJ) || attr->file_flags & ~BPF_F_PATH_FD)
+ return -EINVAL;
+
+ /* path_fd has to be accompanied by BPF_F_PATH_FD flag */
+ if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd)
return -EINVAL;
- return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname));
+ path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD;
+ return bpf_obj_pin_user(attr->bpf_fd, path_fd,
+ u64_to_user_ptr(attr->pathname));
}
static int bpf_obj_get(const union bpf_attr *attr)
{
+ int path_fd;
+
if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 ||
- attr->file_flags & ~BPF_OBJ_FLAG_MASK)
+ attr->file_flags & ~(BPF_OBJ_FLAG_MASK | BPF_F_PATH_FD))
return -EINVAL;
- return bpf_obj_get_user(u64_to_user_ptr(attr->pathname),
+ /* path_fd has to be accompanied by BPF_F_PATH_FD flag */
+ if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd)
+ return -EINVAL;
+
+ path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD;
+ return bpf_obj_get_user(path_fd, u64_to_user_ptr(attr->pathname),
attr->file_flags);
}
@@ -2743,10 +2814,12 @@ static void bpf_link_free_id(int id)
/* Clean up bpf_link and corresponding anon_inode file and FD. After
* anon_inode is created, bpf_link can't be just kfree()'d due to deferred
- * anon_inode's release() call. This helper marksbpf_link as
+ * anon_inode's release() call. This helper marks bpf_link as
* defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt
* is not decremented, it's the responsibility of a calling code that failed
* to complete bpf_link initialization.
+ * This helper eventually calls link's dealloc callback, but does not call
+ * link's release callback.
*/
void bpf_link_cleanup(struct bpf_link_primer *primer)
{
@@ -2781,28 +2854,31 @@ static void bpf_link_put_deferred(struct work_struct *work)
bpf_link_free(link);
}
-/* bpf_link_put can be called from atomic context, but ensures that resources
- * are freed from process context
+/* bpf_link_put might be called from atomic context. It needs to be called
+ * from sleepable context in order to acquire sleeping locks during the process.
*/
void bpf_link_put(struct bpf_link *link)
{
if (!atomic64_dec_and_test(&link->refcnt))
return;
- if (in_atomic()) {
- INIT_WORK(&link->work, bpf_link_put_deferred);
- schedule_work(&link->work);
- } else {
- bpf_link_free(link);
- }
+ INIT_WORK(&link->work, bpf_link_put_deferred);
+ schedule_work(&link->work);
}
EXPORT_SYMBOL(bpf_link_put);
+static void bpf_link_put_direct(struct bpf_link *link)
+{
+ if (!atomic64_dec_and_test(&link->refcnt))
+ return;
+ bpf_link_free(link);
+}
+
static int bpf_link_release(struct inode *inode, struct file *filp)
{
struct bpf_link *link = filp->private_data;
- bpf_link_put(link);
+ bpf_link_put_direct(link);
return 0;
}
@@ -2972,10 +3048,17 @@ static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link,
{
struct bpf_tracing_link *tr_link =
container_of(link, struct bpf_tracing_link, link.link);
+ u32 target_btf_id, target_obj_id;
+ bpf_trampoline_unpack_key(tr_link->trampoline->key,
+ &target_obj_id, &target_btf_id);
seq_printf(seq,
- "attach_type:\t%d\n",
- tr_link->attach_type);
+ "attach_type:\t%d\n"
+ "target_obj_id:\t%u\n"
+ "target_btf_id:\t%u\n",
+ tr_link->attach_type,
+ target_obj_id,
+ target_btf_id);
}
static int bpf_tracing_link_fill_link_info(const struct bpf_link *link,
@@ -3215,6 +3298,25 @@ static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link,
raw_tp_link->btp->tp->name);
}
+static int bpf_copy_to_user(char __user *ubuf, const char *buf, u32 ulen,
+ u32 len)
+{
+ if (ulen >= len + 1) {
+ if (copy_to_user(ubuf, buf, len + 1))
+ return -EFAULT;
+ } else {
+ char zero = '\0';
+
+ if (copy_to_user(ubuf, buf, ulen - 1))
+ return -EFAULT;
+ if (put_user(zero, ubuf + ulen - 1))
+ return -EFAULT;
+ return -ENOSPC;
+ }
+
+ return 0;
+}
+
static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
struct bpf_link_info *info)
{
@@ -3233,20 +3335,7 @@ static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
if (!ubuf)
return 0;
- if (ulen >= tp_len + 1) {
- if (copy_to_user(ubuf, tp_name, tp_len + 1))
- return -EFAULT;
- } else {
- char zero = '\0';
-
- if (copy_to_user(ubuf, tp_name, ulen - 1))
- return -EFAULT;
- if (put_user(zero, ubuf + ulen - 1))
- return -EFAULT;
- return -ENOSPC;
- }
-
- return 0;
+ return bpf_copy_to_user(ubuf, tp_name, ulen, tp_len);
}
static const struct bpf_link_ops bpf_raw_tp_link_lops = {
@@ -3278,9 +3367,154 @@ static void bpf_perf_link_dealloc(struct bpf_link *link)
kfree(perf_link);
}
+static int bpf_perf_link_fill_common(const struct perf_event *event,
+ char __user *uname, u32 ulen,
+ u64 *probe_offset, u64 *probe_addr,
+ u32 *fd_type)
+{
+ const char *buf;
+ u32 prog_id;
+ size_t len;
+ int err;
+
+ if (!ulen ^ !uname)
+ return -EINVAL;
+
+ err = bpf_get_perf_event_info(event, &prog_id, fd_type, &buf,
+ probe_offset, probe_addr);
+ if (err)
+ return err;
+ if (!uname)
+ return 0;
+ if (buf) {
+ len = strlen(buf);
+ err = bpf_copy_to_user(uname, buf, ulen, len);
+ if (err)
+ return err;
+ } else {
+ char zero = '\0';
+
+ if (put_user(zero, uname))
+ return -EFAULT;
+ }
+ return 0;
+}
+
+#ifdef CONFIG_KPROBE_EVENTS
+static int bpf_perf_link_fill_kprobe(const struct perf_event *event,
+ struct bpf_link_info *info)
+{
+ char __user *uname;
+ u64 addr, offset;
+ u32 ulen, type;
+ int err;
+
+ uname = u64_to_user_ptr(info->perf_event.kprobe.func_name);
+ ulen = info->perf_event.kprobe.name_len;
+ err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr,
+ &type);
+ if (err)
+ return err;
+ if (type == BPF_FD_TYPE_KRETPROBE)
+ info->perf_event.type = BPF_PERF_EVENT_KRETPROBE;
+ else
+ info->perf_event.type = BPF_PERF_EVENT_KPROBE;
+
+ info->perf_event.kprobe.offset = offset;
+ if (!kallsyms_show_value(current_cred()))
+ addr = 0;
+ info->perf_event.kprobe.addr = addr;
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_UPROBE_EVENTS
+static int bpf_perf_link_fill_uprobe(const struct perf_event *event,
+ struct bpf_link_info *info)
+{
+ char __user *uname;
+ u64 addr, offset;
+ u32 ulen, type;
+ int err;
+
+ uname = u64_to_user_ptr(info->perf_event.uprobe.file_name);
+ ulen = info->perf_event.uprobe.name_len;
+ err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr,
+ &type);
+ if (err)
+ return err;
+
+ if (type == BPF_FD_TYPE_URETPROBE)
+ info->perf_event.type = BPF_PERF_EVENT_URETPROBE;
+ else
+ info->perf_event.type = BPF_PERF_EVENT_UPROBE;
+ info->perf_event.uprobe.offset = offset;
+ return 0;
+}
+#endif
+
+static int bpf_perf_link_fill_probe(const struct perf_event *event,
+ struct bpf_link_info *info)
+{
+#ifdef CONFIG_KPROBE_EVENTS
+ if (event->tp_event->flags & TRACE_EVENT_FL_KPROBE)
+ return bpf_perf_link_fill_kprobe(event, info);
+#endif
+#ifdef CONFIG_UPROBE_EVENTS
+ if (event->tp_event->flags & TRACE_EVENT_FL_UPROBE)
+ return bpf_perf_link_fill_uprobe(event, info);
+#endif
+ return -EOPNOTSUPP;
+}
+
+static int bpf_perf_link_fill_tracepoint(const struct perf_event *event,
+ struct bpf_link_info *info)
+{
+ char __user *uname;
+ u32 ulen;
+
+ uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name);
+ ulen = info->perf_event.tracepoint.name_len;
+ info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT;
+ return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL);
+}
+
+static int bpf_perf_link_fill_perf_event(const struct perf_event *event,
+ struct bpf_link_info *info)
+{
+ info->perf_event.event.type = event->attr.type;
+ info->perf_event.event.config = event->attr.config;
+ info->perf_event.type = BPF_PERF_EVENT_EVENT;
+ return 0;
+}
+
+static int bpf_perf_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_perf_link *perf_link;
+ const struct perf_event *event;
+
+ perf_link = container_of(link, struct bpf_perf_link, link);
+ event = perf_get_event(perf_link->perf_file);
+ if (IS_ERR(event))
+ return PTR_ERR(event);
+
+ switch (event->prog->type) {
+ case BPF_PROG_TYPE_PERF_EVENT:
+ return bpf_perf_link_fill_perf_event(event, info);
+ case BPF_PROG_TYPE_TRACEPOINT:
+ return bpf_perf_link_fill_tracepoint(event, info);
+ case BPF_PROG_TYPE_KPROBE:
+ return bpf_perf_link_fill_probe(event, info);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
static const struct bpf_link_ops bpf_perf_link_lops = {
.release = bpf_perf_link_release,
.dealloc = bpf_perf_link_dealloc,
+ .fill_link_info = bpf_perf_link_fill_link_info,
};
static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
@@ -3422,34 +3656,6 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
return fd;
}
-static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
- enum bpf_attach_type attach_type)
-{
- switch (prog->type) {
- case BPF_PROG_TYPE_CGROUP_SOCK:
- case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
- case BPF_PROG_TYPE_CGROUP_SOCKOPT:
- case BPF_PROG_TYPE_SK_LOOKUP:
- return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
- case BPF_PROG_TYPE_CGROUP_SKB:
- if (!capable(CAP_NET_ADMIN))
- /* cg-skb progs can be loaded by unpriv user.
- * check permissions at attach time.
- */
- return -EPERM;
- return prog->enforce_expected_attach_type &&
- prog->expected_attach_type != attach_type ?
- -EINVAL : 0;
- case BPF_PROG_TYPE_KPROBE:
- if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI &&
- attach_type != BPF_TRACE_KPROBE_MULTI)
- return -EINVAL;
- return 0;
- default:
- return 0;
- }
-}
-
static enum bpf_prog_type
attach_type_to_prog_type(enum bpf_attach_type attach_type)
{
@@ -3508,31 +3714,101 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
return BPF_PROG_TYPE_XDP;
case BPF_LSM_CGROUP:
return BPF_PROG_TYPE_LSM;
+ case BPF_TCX_INGRESS:
+ case BPF_TCX_EGRESS:
+ return BPF_PROG_TYPE_SCHED_CLS;
default:
return BPF_PROG_TYPE_UNSPEC;
}
}
-#define BPF_PROG_ATTACH_LAST_FIELD replace_bpf_fd
+static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
+ enum bpf_attach_type attach_type)
+{
+ enum bpf_prog_type ptype;
+
+ switch (prog->type) {
+ case BPF_PROG_TYPE_CGROUP_SOCK:
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+ case BPF_PROG_TYPE_SK_LOOKUP:
+ return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ if (!capable(CAP_NET_ADMIN))
+ /* cg-skb progs can be loaded by unpriv user.
+ * check permissions at attach time.
+ */
+ return -EPERM;
+ return prog->enforce_expected_attach_type &&
+ prog->expected_attach_type != attach_type ?
+ -EINVAL : 0;
+ case BPF_PROG_TYPE_EXT:
+ return 0;
+ case BPF_PROG_TYPE_NETFILTER:
+ if (attach_type != BPF_NETFILTER)
+ return -EINVAL;
+ return 0;
+ case BPF_PROG_TYPE_PERF_EVENT:
+ case BPF_PROG_TYPE_TRACEPOINT:
+ if (attach_type != BPF_PERF_EVENT)
+ return -EINVAL;
+ return 0;
+ case BPF_PROG_TYPE_KPROBE:
+ if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI &&
+ attach_type != BPF_TRACE_KPROBE_MULTI)
+ return -EINVAL;
+ if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI &&
+ attach_type != BPF_TRACE_UPROBE_MULTI)
+ return -EINVAL;
+ if (attach_type != BPF_PERF_EVENT &&
+ attach_type != BPF_TRACE_KPROBE_MULTI &&
+ attach_type != BPF_TRACE_UPROBE_MULTI)
+ return -EINVAL;
+ return 0;
+ case BPF_PROG_TYPE_SCHED_CLS:
+ if (attach_type != BPF_TCX_INGRESS &&
+ attach_type != BPF_TCX_EGRESS)
+ return -EINVAL;
+ return 0;
+ default:
+ ptype = attach_type_to_prog_type(attach_type);
+ if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type)
+ return -EINVAL;
+ return 0;
+ }
+}
+
+#define BPF_PROG_ATTACH_LAST_FIELD expected_revision
+
+#define BPF_F_ATTACH_MASK_BASE \
+ (BPF_F_ALLOW_OVERRIDE | \
+ BPF_F_ALLOW_MULTI | \
+ BPF_F_REPLACE)
-#define BPF_F_ATTACH_MASK \
- (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI | BPF_F_REPLACE)
+#define BPF_F_ATTACH_MASK_MPROG \
+ (BPF_F_REPLACE | \
+ BPF_F_BEFORE | \
+ BPF_F_AFTER | \
+ BPF_F_ID | \
+ BPF_F_LINK)
static int bpf_prog_attach(const union bpf_attr *attr)
{
enum bpf_prog_type ptype;
struct bpf_prog *prog;
+ u32 mask;
int ret;
if (CHECK_ATTR(BPF_PROG_ATTACH))
return -EINVAL;
- if (attr->attach_flags & ~BPF_F_ATTACH_MASK)
- return -EINVAL;
-
ptype = attach_type_to_prog_type(attr->attach_type);
if (ptype == BPF_PROG_TYPE_UNSPEC)
return -EINVAL;
+ mask = bpf_mprog_supported(ptype) ?
+ BPF_F_ATTACH_MASK_MPROG : BPF_F_ATTACH_MASK_BASE;
+ if (attr->attach_flags & ~mask)
+ return -EINVAL;
prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
if (IS_ERR(prog))
@@ -3568,6 +3844,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
else
ret = cgroup_bpf_prog_attach(attr, ptype, prog);
break;
+ case BPF_PROG_TYPE_SCHED_CLS:
+ ret = tcx_prog_attach(attr, prog);
+ break;
default:
ret = -EINVAL;
}
@@ -3577,25 +3856,41 @@ static int bpf_prog_attach(const union bpf_attr *attr)
return ret;
}
-#define BPF_PROG_DETACH_LAST_FIELD attach_type
+#define BPF_PROG_DETACH_LAST_FIELD expected_revision
static int bpf_prog_detach(const union bpf_attr *attr)
{
+ struct bpf_prog *prog = NULL;
enum bpf_prog_type ptype;
+ int ret;
if (CHECK_ATTR(BPF_PROG_DETACH))
return -EINVAL;
ptype = attach_type_to_prog_type(attr->attach_type);
+ if (bpf_mprog_supported(ptype)) {
+ if (ptype == BPF_PROG_TYPE_UNSPEC)
+ return -EINVAL;
+ if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG)
+ return -EINVAL;
+ if (attr->attach_bpf_fd) {
+ prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+ }
+ }
switch (ptype) {
case BPF_PROG_TYPE_SK_MSG:
case BPF_PROG_TYPE_SK_SKB:
- return sock_map_prog_detach(attr, ptype);
+ ret = sock_map_prog_detach(attr, ptype);
+ break;
case BPF_PROG_TYPE_LIRC_MODE2:
- return lirc_prog_detach(attr);
+ ret = lirc_prog_detach(attr);
+ break;
case BPF_PROG_TYPE_FLOW_DISSECTOR:
- return netns_bpf_prog_detach(attr, ptype);
+ ret = netns_bpf_prog_detach(attr, ptype);
+ break;
case BPF_PROG_TYPE_CGROUP_DEVICE:
case BPF_PROG_TYPE_CGROUP_SKB:
case BPF_PROG_TYPE_CGROUP_SOCK:
@@ -3604,13 +3899,21 @@ static int bpf_prog_detach(const union bpf_attr *attr)
case BPF_PROG_TYPE_CGROUP_SYSCTL:
case BPF_PROG_TYPE_SOCK_OPS:
case BPF_PROG_TYPE_LSM:
- return cgroup_bpf_prog_detach(attr, ptype);
+ ret = cgroup_bpf_prog_detach(attr, ptype);
+ break;
+ case BPF_PROG_TYPE_SCHED_CLS:
+ ret = tcx_prog_detach(attr, prog);
+ break;
default:
- return -EINVAL;
+ ret = -EINVAL;
}
+
+ if (prog)
+ bpf_prog_put(prog);
+ return ret;
}
-#define BPF_PROG_QUERY_LAST_FIELD query.prog_attach_flags
+#define BPF_PROG_QUERY_LAST_FIELD query.link_attach_flags
static int bpf_prog_query(const union bpf_attr *attr,
union bpf_attr __user *uattr)
@@ -3658,6 +3961,9 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_SK_MSG_VERDICT:
case BPF_SK_SKB_VERDICT:
return sock_map_bpf_prog_query(attr, uattr);
+ case BPF_TCX_INGRESS:
+ case BPF_TCX_EGRESS:
+ return tcx_prog_query(attr, uattr);
default:
return -EINVAL;
}
@@ -4575,10 +4881,9 @@ err_put:
return err;
}
-#define BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.cookies
+#define BPF_LINK_CREATE_LAST_FIELD link_create.uprobe_multi.pid
static int link_create(union bpf_attr *attr, bpfptr_t uattr)
{
- enum bpf_prog_type ptype;
struct bpf_prog *prog;
int ret;
@@ -4598,38 +4903,6 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
goto out;
switch (prog->type) {
- case BPF_PROG_TYPE_EXT:
- break;
- case BPF_PROG_TYPE_NETFILTER:
- if (attr->link_create.attach_type != BPF_NETFILTER) {
- ret = -EINVAL;
- goto out;
- }
- break;
- case BPF_PROG_TYPE_PERF_EVENT:
- case BPF_PROG_TYPE_TRACEPOINT:
- if (attr->link_create.attach_type != BPF_PERF_EVENT) {
- ret = -EINVAL;
- goto out;
- }
- break;
- case BPF_PROG_TYPE_KPROBE:
- if (attr->link_create.attach_type != BPF_PERF_EVENT &&
- attr->link_create.attach_type != BPF_TRACE_KPROBE_MULTI) {
- ret = -EINVAL;
- goto out;
- }
- break;
- default:
- ptype = attach_type_to_prog_type(attr->link_create.attach_type);
- if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) {
- ret = -EINVAL;
- goto out;
- }
- break;
- }
-
- switch (prog->type) {
case BPF_PROG_TYPE_CGROUP_SKB:
case BPF_PROG_TYPE_CGROUP_SOCK:
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
@@ -4671,6 +4944,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
case BPF_PROG_TYPE_XDP:
ret = bpf_xdp_link_attach(attr, prog);
break;
+ case BPF_PROG_TYPE_SCHED_CLS:
+ ret = tcx_link_attach(attr, prog);
+ break;
case BPF_PROG_TYPE_NETFILTER:
ret = bpf_nf_link_attach(attr, prog);
break;
@@ -4682,8 +4958,10 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
case BPF_PROG_TYPE_KPROBE:
if (attr->link_create.attach_type == BPF_PERF_EVENT)
ret = bpf_perf_link_attach(attr, prog);
- else
+ else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI)
ret = bpf_kprobe_multi_link_attach(attr, prog);
+ else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI)
+ ret = bpf_uprobe_multi_link_attach(attr, prog);
break;
default:
ret = -EINVAL;
@@ -4778,7 +5056,7 @@ out_put_progs:
if (ret)
bpf_prog_put(new_prog);
out_put_link:
- bpf_link_put(link);
+ bpf_link_put_direct(link);
return ret;
}
@@ -4801,7 +5079,7 @@ static int link_detach(union bpf_attr *attr)
else
ret = -EOPNOTSUPP;
- bpf_link_put(link);
+ bpf_link_put_direct(link);
return ret;
}
@@ -4871,7 +5149,7 @@ static int bpf_link_get_fd_by_id(const union bpf_attr *attr)
fd = bpf_link_new_fd(link);
if (fd < 0)
- bpf_link_put(link);
+ bpf_link_put_direct(link);
return fd;
}
@@ -4948,7 +5226,7 @@ static int bpf_iter_create(union bpf_attr *attr)
return PTR_ERR(link);
err = bpf_iter_new_fd(link);
- bpf_link_put(link);
+ bpf_link_put_direct(link);
return err;
}
@@ -5018,23 +5296,8 @@ out_prog_put:
static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
{
union bpf_attr attr;
- bool capable;
int err;
- capable = bpf_capable() || !sysctl_unprivileged_bpf_disabled;
-
- /* Intent here is for unprivileged_bpf_disabled to block key object
- * creation commands for unprivileged users; other actions depend
- * of fd availability and access to bpffs, so are dependent on
- * object creation success. Capabilities are later verified for
- * operations such as load and map create, so even with unprivileged
- * BPF disabled, capability checks are still carried out for these
- * and other operations.
- */
- if (!capable &&
- (cmd == BPF_MAP_CREATE || cmd == BPF_PROG_LOAD))
- return -EPERM;
-
err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
if (err)
return err;
@@ -5394,7 +5657,8 @@ static int bpf_unpriv_handler(struct ctl_table *table, int write,
*(int *)table->data = unpriv_enable;
}
- unpriv_ebpf_notify(unpriv_enable);
+ if (write)
+ unpriv_ebpf_notify(unpriv_enable);
return ret;
}
diff --git a/kernel/bpf/tcx.c b/kernel/bpf/tcx.c
new file mode 100644
index 000000000000..13f0b5dc8262
--- /dev/null
+++ b/kernel/bpf/tcx.c
@@ -0,0 +1,352 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Isovalent */
+
+#include <linux/bpf.h>
+#include <linux/bpf_mprog.h>
+#include <linux/netdevice.h>
+
+#include <net/tcx.h>
+
+int tcx_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ bool created, ingress = attr->attach_type == BPF_TCX_INGRESS;
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_mprog_entry *entry, *entry_new;
+ struct bpf_prog *replace_prog = NULL;
+ struct net_device *dev;
+ int ret;
+
+ rtnl_lock();
+ dev = __dev_get_by_index(net, attr->target_ifindex);
+ if (!dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+ if (attr->attach_flags & BPF_F_REPLACE) {
+ replace_prog = bpf_prog_get_type(attr->replace_bpf_fd,
+ prog->type);
+ if (IS_ERR(replace_prog)) {
+ ret = PTR_ERR(replace_prog);
+ replace_prog = NULL;
+ goto out;
+ }
+ }
+ entry = tcx_entry_fetch_or_create(dev, ingress, &created);
+ if (!entry) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = bpf_mprog_attach(entry, &entry_new, prog, NULL, replace_prog,
+ attr->attach_flags, attr->relative_fd,
+ attr->expected_revision);
+ if (!ret) {
+ if (entry != entry_new) {
+ tcx_entry_update(dev, entry_new, ingress);
+ tcx_entry_sync();
+ tcx_skeys_inc(ingress);
+ }
+ bpf_mprog_commit(entry);
+ } else if (created) {
+ tcx_entry_free(entry);
+ }
+out:
+ if (replace_prog)
+ bpf_prog_put(replace_prog);
+ rtnl_unlock();
+ return ret;
+}
+
+int tcx_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ bool ingress = attr->attach_type == BPF_TCX_INGRESS;
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_mprog_entry *entry, *entry_new;
+ struct net_device *dev;
+ int ret;
+
+ rtnl_lock();
+ dev = __dev_get_by_index(net, attr->target_ifindex);
+ if (!dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+ entry = tcx_entry_fetch(dev, ingress);
+ if (!entry) {
+ ret = -ENOENT;
+ goto out;
+ }
+ ret = bpf_mprog_detach(entry, &entry_new, prog, NULL, attr->attach_flags,
+ attr->relative_fd, attr->expected_revision);
+ if (!ret) {
+ if (!tcx_entry_is_active(entry_new))
+ entry_new = NULL;
+ tcx_entry_update(dev, entry_new, ingress);
+ tcx_entry_sync();
+ tcx_skeys_dec(ingress);
+ bpf_mprog_commit(entry);
+ if (!entry_new)
+ tcx_entry_free(entry);
+ }
+out:
+ rtnl_unlock();
+ return ret;
+}
+
+void tcx_uninstall(struct net_device *dev, bool ingress)
+{
+ struct bpf_mprog_entry *entry, *entry_new = NULL;
+ struct bpf_tuple tuple = {};
+ struct bpf_mprog_fp *fp;
+ struct bpf_mprog_cp *cp;
+ bool active;
+
+ entry = tcx_entry_fetch(dev, ingress);
+ if (!entry)
+ return;
+ active = tcx_entry(entry)->miniq_active;
+ if (active)
+ bpf_mprog_clear_all(entry, &entry_new);
+ tcx_entry_update(dev, entry_new, ingress);
+ tcx_entry_sync();
+ bpf_mprog_foreach_tuple(entry, fp, cp, tuple) {
+ if (tuple.link)
+ tcx_link(tuple.link)->dev = NULL;
+ else
+ bpf_prog_put(tuple.prog);
+ tcx_skeys_dec(ingress);
+ }
+ if (!active)
+ tcx_entry_free(entry);
+}
+
+int tcx_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr)
+{
+ bool ingress = attr->query.attach_type == BPF_TCX_INGRESS;
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_mprog_entry *entry;
+ struct net_device *dev;
+ int ret;
+
+ rtnl_lock();
+ dev = __dev_get_by_index(net, attr->query.target_ifindex);
+ if (!dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+ entry = tcx_entry_fetch(dev, ingress);
+ if (!entry) {
+ ret = -ENOENT;
+ goto out;
+ }
+ ret = bpf_mprog_query(attr, uattr, entry);
+out:
+ rtnl_unlock();
+ return ret;
+}
+
+static int tcx_link_prog_attach(struct bpf_link *link, u32 flags, u32 id_or_fd,
+ u64 revision)
+{
+ struct tcx_link *tcx = tcx_link(link);
+ bool created, ingress = tcx->location == BPF_TCX_INGRESS;
+ struct bpf_mprog_entry *entry, *entry_new;
+ struct net_device *dev = tcx->dev;
+ int ret;
+
+ ASSERT_RTNL();
+ entry = tcx_entry_fetch_or_create(dev, ingress, &created);
+ if (!entry)
+ return -ENOMEM;
+ ret = bpf_mprog_attach(entry, &entry_new, link->prog, link, NULL, flags,
+ id_or_fd, revision);
+ if (!ret) {
+ if (entry != entry_new) {
+ tcx_entry_update(dev, entry_new, ingress);
+ tcx_entry_sync();
+ tcx_skeys_inc(ingress);
+ }
+ bpf_mprog_commit(entry);
+ } else if (created) {
+ tcx_entry_free(entry);
+ }
+ return ret;
+}
+
+static void tcx_link_release(struct bpf_link *link)
+{
+ struct tcx_link *tcx = tcx_link(link);
+ bool ingress = tcx->location == BPF_TCX_INGRESS;
+ struct bpf_mprog_entry *entry, *entry_new;
+ struct net_device *dev;
+ int ret = 0;
+
+ rtnl_lock();
+ dev = tcx->dev;
+ if (!dev)
+ goto out;
+ entry = tcx_entry_fetch(dev, ingress);
+ if (!entry) {
+ ret = -ENOENT;
+ goto out;
+ }
+ ret = bpf_mprog_detach(entry, &entry_new, link->prog, link, 0, 0, 0);
+ if (!ret) {
+ if (!tcx_entry_is_active(entry_new))
+ entry_new = NULL;
+ tcx_entry_update(dev, entry_new, ingress);
+ tcx_entry_sync();
+ tcx_skeys_dec(ingress);
+ bpf_mprog_commit(entry);
+ if (!entry_new)
+ tcx_entry_free(entry);
+ tcx->dev = NULL;
+ }
+out:
+ WARN_ON_ONCE(ret);
+ rtnl_unlock();
+}
+
+static int tcx_link_update(struct bpf_link *link, struct bpf_prog *nprog,
+ struct bpf_prog *oprog)
+{
+ struct tcx_link *tcx = tcx_link(link);
+ bool ingress = tcx->location == BPF_TCX_INGRESS;
+ struct bpf_mprog_entry *entry, *entry_new;
+ struct net_device *dev;
+ int ret = 0;
+
+ rtnl_lock();
+ dev = tcx->dev;
+ if (!dev) {
+ ret = -ENOLINK;
+ goto out;
+ }
+ if (oprog && link->prog != oprog) {
+ ret = -EPERM;
+ goto out;
+ }
+ oprog = link->prog;
+ if (oprog == nprog) {
+ bpf_prog_put(nprog);
+ goto out;
+ }
+ entry = tcx_entry_fetch(dev, ingress);
+ if (!entry) {
+ ret = -ENOENT;
+ goto out;
+ }
+ ret = bpf_mprog_attach(entry, &entry_new, nprog, link, oprog,
+ BPF_F_REPLACE | BPF_F_ID,
+ link->prog->aux->id, 0);
+ if (!ret) {
+ WARN_ON_ONCE(entry != entry_new);
+ oprog = xchg(&link->prog, nprog);
+ bpf_prog_put(oprog);
+ bpf_mprog_commit(entry);
+ }
+out:
+ rtnl_unlock();
+ return ret;
+}
+
+static void tcx_link_dealloc(struct bpf_link *link)
+{
+ kfree(tcx_link(link));
+}
+
+static void tcx_link_fdinfo(const struct bpf_link *link, struct seq_file *seq)
+{
+ const struct tcx_link *tcx = tcx_link_const(link);
+ u32 ifindex = 0;
+
+ rtnl_lock();
+ if (tcx->dev)
+ ifindex = tcx->dev->ifindex;
+ rtnl_unlock();
+
+ seq_printf(seq, "ifindex:\t%u\n", ifindex);
+ seq_printf(seq, "attach_type:\t%u (%s)\n",
+ tcx->location,
+ tcx->location == BPF_TCX_INGRESS ? "ingress" : "egress");
+}
+
+static int tcx_link_fill_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ const struct tcx_link *tcx = tcx_link_const(link);
+ u32 ifindex = 0;
+
+ rtnl_lock();
+ if (tcx->dev)
+ ifindex = tcx->dev->ifindex;
+ rtnl_unlock();
+
+ info->tcx.ifindex = ifindex;
+ info->tcx.attach_type = tcx->location;
+ return 0;
+}
+
+static int tcx_link_detach(struct bpf_link *link)
+{
+ tcx_link_release(link);
+ return 0;
+}
+
+static const struct bpf_link_ops tcx_link_lops = {
+ .release = tcx_link_release,
+ .detach = tcx_link_detach,
+ .dealloc = tcx_link_dealloc,
+ .update_prog = tcx_link_update,
+ .show_fdinfo = tcx_link_fdinfo,
+ .fill_link_info = tcx_link_fill_info,
+};
+
+static int tcx_link_init(struct tcx_link *tcx,
+ struct bpf_link_primer *link_primer,
+ const union bpf_attr *attr,
+ struct net_device *dev,
+ struct bpf_prog *prog)
+{
+ bpf_link_init(&tcx->link, BPF_LINK_TYPE_TCX, &tcx_link_lops, prog);
+ tcx->location = attr->link_create.attach_type;
+ tcx->dev = dev;
+ return bpf_link_prime(&tcx->link, link_primer);
+}
+
+int tcx_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_link_primer link_primer;
+ struct net_device *dev;
+ struct tcx_link *tcx;
+ int ret;
+
+ rtnl_lock();
+ dev = __dev_get_by_index(net, attr->link_create.target_ifindex);
+ if (!dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+ tcx = kzalloc(sizeof(*tcx), GFP_USER);
+ if (!tcx) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = tcx_link_init(tcx, &link_primer, attr, dev, prog);
+ if (ret) {
+ kfree(tcx);
+ goto out;
+ }
+ ret = tcx_link_prog_attach(&tcx->link, attr->link_create.flags,
+ attr->link_create.tcx.relative_fd,
+ attr->link_create.tcx.expected_revision);
+ if (ret) {
+ tcx->dev = NULL;
+ bpf_link_cleanup(&link_primer);
+ goto out;
+ }
+ ret = bpf_link_settle(&link_primer);
+out:
+ rtnl_unlock();
+ return ret;
+}
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index ac021bc43a66..78acf28d4873 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -251,11 +251,8 @@ bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_a
return tlinks;
}
-static void __bpf_tramp_image_put_deferred(struct work_struct *work)
+static void bpf_tramp_image_free(struct bpf_tramp_image *im)
{
- struct bpf_tramp_image *im;
-
- im = container_of(work, struct bpf_tramp_image, work);
bpf_image_ksym_del(&im->ksym);
bpf_jit_free_exec(im->image);
bpf_jit_uncharge_modmem(PAGE_SIZE);
@@ -263,6 +260,14 @@ static void __bpf_tramp_image_put_deferred(struct work_struct *work)
kfree_rcu(im, rcu);
}
+static void __bpf_tramp_image_put_deferred(struct work_struct *work)
+{
+ struct bpf_tramp_image *im;
+
+ im = container_of(work, struct bpf_tramp_image, work);
+ bpf_tramp_image_free(im);
+}
+
/* callback, fexit step 3 or fentry step 2 */
static void __bpf_tramp_image_put_rcu(struct rcu_head *rcu)
{
@@ -344,7 +349,7 @@ static void bpf_tramp_image_put(struct bpf_tramp_image *im)
call_rcu_tasks_trace(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
}
-static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
+static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key)
{
struct bpf_tramp_image *im;
struct bpf_ksym *ksym;
@@ -371,7 +376,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
ksym = &im->ksym;
INIT_LIST_HEAD_RCU(&ksym->lnode);
- snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu_%u", key, idx);
+ snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", key);
bpf_image_ksym_add(image, ksym);
return im;
@@ -401,11 +406,10 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut
err = unregister_fentry(tr, tr->cur_image->image);
bpf_tramp_image_put(tr->cur_image);
tr->cur_image = NULL;
- tr->selector = 0;
goto out;
}
- im = bpf_tramp_image_alloc(tr->key, tr->selector);
+ im = bpf_tramp_image_alloc(tr->key);
if (IS_ERR(im)) {
err = PTR_ERR(im);
goto out;
@@ -438,12 +442,11 @@ again:
&tr->func.model, tr->flags, tlinks,
tr->func.addr);
if (err < 0)
- goto out;
+ goto out_free;
set_memory_rox((long)im->image, 1);
- WARN_ON(tr->cur_image && tr->selector == 0);
- WARN_ON(!tr->cur_image && tr->selector);
+ WARN_ON(tr->cur_image && total == 0);
if (tr->cur_image)
/* progs already running at this address */
err = modify_fentry(tr, tr->cur_image->image, im->image, lock_direct_mutex);
@@ -468,18 +471,21 @@ again:
}
#endif
if (err)
- goto out;
+ goto out_free;
if (tr->cur_image)
bpf_tramp_image_put(tr->cur_image);
tr->cur_image = im;
- tr->selector++;
out:
/* If any error happens, restore previous flags */
if (err)
tr->flags = orig_flags;
kfree(tlinks);
return err;
+
+out_free:
+ bpf_tramp_image_free(im);
+ goto out;
}
static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index cf5f230360f5..bb78212fa5b2 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -25,6 +25,8 @@
#include <linux/btf_ids.h>
#include <linux/poison.h>
#include <linux/module.h>
+#include <linux/cpumask.h>
+#include <net/xdp.h>
#include "disasm.h"
@@ -197,6 +199,7 @@ static int ref_set_non_owning(struct bpf_verifier_env *env,
struct bpf_reg_state *reg);
static void specialize_kfunc(struct bpf_verifier_env *env,
u32 func_id, u16 offset, unsigned long *addr);
+static bool is_trusted_reg(const struct bpf_reg_state *reg);
static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
{
@@ -240,6 +243,12 @@ static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state)
(poisoned ? BPF_MAP_KEY_POISON : 0ULL);
}
+static bool bpf_helper_call(const struct bpf_insn *insn)
+{
+ return insn->code == (BPF_JMP | BPF_CALL) &&
+ insn->src_reg == 0;
+}
+
static bool bpf_pseudo_call(const struct bpf_insn *insn)
{
return insn->code == (BPF_JMP | BPF_CALL) &&
@@ -273,11 +282,6 @@ struct bpf_call_arg_meta {
struct btf_field *kptr_field;
};
-struct btf_and_id {
- struct btf *btf;
- u32 btf_id;
-};
-
struct bpf_kfunc_call_arg_meta {
/* In parameters */
struct btf *btf;
@@ -296,10 +300,21 @@ struct bpf_kfunc_call_arg_meta {
u64 value;
bool found;
} arg_constant;
- union {
- struct btf_and_id arg_obj_drop;
- struct btf_and_id arg_refcount_acquire;
- };
+
+ /* arg_{btf,btf_id,owning_ref} are used by kfunc-specific handling,
+ * generally to pass info about user-defined local kptr types to later
+ * verification logic
+ * bpf_obj_drop
+ * Record the local kptr type to be drop'd
+ * bpf_refcount_acquire (via KF_ARG_PTR_TO_REFCOUNTED_KPTR arg type)
+ * Record the local kptr type to be refcount_incr'd and use
+ * arg_owning_ref to determine whether refcount_acquire should be
+ * fallible
+ */
+ struct btf *arg_btf;
+ u32 arg_btf_id;
+ bool arg_owning_ref;
+
struct {
struct btf_field *field;
} arg_list_head;
@@ -309,6 +324,7 @@ struct bpf_kfunc_call_arg_meta {
struct {
enum bpf_dynptr_type type;
u32 id;
+ u32 ref_obj_id;
} initialized_dynptr;
struct {
u8 spi;
@@ -429,8 +445,11 @@ static bool type_may_be_null(u32 type)
return type & PTR_MAYBE_NULL;
}
-static bool reg_type_not_null(enum bpf_reg_type type)
+static bool reg_not_null(const struct bpf_reg_state *reg)
{
+ enum bpf_reg_type type;
+
+ type = reg->type;
if (type_may_be_null(type))
return false;
@@ -440,6 +459,7 @@ static bool reg_type_not_null(enum bpf_reg_type type)
type == PTR_TO_MAP_VALUE ||
type == PTR_TO_MAP_KEY ||
type == PTR_TO_SOCK_COMMON ||
+ (type == PTR_TO_BTF_ID && is_trusted_reg(reg)) ||
type == PTR_TO_MEM;
}
@@ -468,6 +488,13 @@ static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg)
return rec;
}
+static bool subprog_is_global(const struct bpf_verifier_env *env, int subprog)
+{
+ struct bpf_func_info_aux *aux = env->prog->aux->func_info_aux;
+
+ return aux && aux[subprog].linkage == BTF_FUNC_GLOBAL;
+}
+
static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
{
return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK);
@@ -515,6 +542,8 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id)
return func_id == BPF_FUNC_dynptr_data;
}
+static bool is_callback_calling_kfunc(u32 btf_id);
+
static bool is_callback_calling_function(enum bpf_func_id func_id)
{
return func_id == BPF_FUNC_for_each_map_elem ||
@@ -524,6 +553,11 @@ static bool is_callback_calling_function(enum bpf_func_id func_id)
func_id == BPF_FUNC_user_ringbuf_drain;
}
+static bool is_async_callback_calling_function(enum bpf_func_id func_id)
+{
+ return func_id == BPF_FUNC_timer_set_callback;
+}
+
static bool is_storage_get_function(enum bpf_func_id func_id)
{
return func_id == BPF_FUNC_sk_storage_get ||
@@ -604,9 +638,9 @@ static const char *reg_type_str(struct bpf_verifier_env *env,
type & PTR_TRUSTED ? "trusted_" : ""
);
- snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s",
+ snprintf(env->tmp_str_buf, TMP_STR_BUF_LEN, "%s%s%s",
prefix, str[base_type(type)], postfix);
- return env->type_str_buf;
+ return env->tmp_str_buf;
}
static char slot_type_char[] = {
@@ -847,11 +881,11 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
struct bpf_func_state *state, int spi);
static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
- enum bpf_arg_type arg_type, int insn_idx)
+ enum bpf_arg_type arg_type, int insn_idx, int clone_ref_obj_id)
{
struct bpf_func_state *state = func(env, reg);
enum bpf_dynptr_type type;
- int spi, i, id, err;
+ int spi, i, err;
spi = dynptr_get_spi(env, reg);
if (spi < 0)
@@ -887,7 +921,13 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_
if (dynptr_type_refcounted(type)) {
/* The id is used to track proper releasing */
- id = acquire_reference_state(env, insn_idx);
+ int id;
+
+ if (clone_ref_obj_id)
+ id = clone_ref_obj_id;
+ else
+ id = acquire_reference_state(env, insn_idx);
+
if (id < 0)
return id;
@@ -901,24 +941,15 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_
return 0;
}
-static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi)
{
- struct bpf_func_state *state = func(env, reg);
- int spi, i;
-
- spi = dynptr_get_spi(env, reg);
- if (spi < 0)
- return spi;
+ int i;
for (i = 0; i < BPF_REG_SIZE; i++) {
state->stack[spi].slot_type[i] = STACK_INVALID;
state->stack[spi - 1].slot_type[i] = STACK_INVALID;
}
- /* Invalidate any slices associated with this dynptr */
- if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type))
- WARN_ON_ONCE(release_reference(env, state->stack[spi].spilled_ptr.ref_obj_id));
-
__mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
__mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
@@ -945,6 +976,50 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re
*/
state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN;
+}
+
+static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, ref_obj_id, i;
+
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+
+ if (!dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) {
+ invalidate_dynptr(env, state, spi);
+ return 0;
+ }
+
+ ref_obj_id = state->stack[spi].spilled_ptr.ref_obj_id;
+
+ /* If the dynptr has a ref_obj_id, then we need to invalidate
+ * two things:
+ *
+ * 1) Any dynptrs with a matching ref_obj_id (clones)
+ * 2) Any slices derived from this dynptr.
+ */
+
+ /* Invalidate any slices associated with this dynptr */
+ WARN_ON_ONCE(release_reference(env, ref_obj_id));
+
+ /* Invalidate any dynptr clones */
+ for (i = 1; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].spilled_ptr.ref_obj_id != ref_obj_id)
+ continue;
+
+ /* it should always be the case that if the ref obj id
+ * matches then the stack slot also belongs to a
+ * dynptr
+ */
+ if (state->stack[i].slot_type[0] != STACK_DYNPTR) {
+ verbose(env, "verifier internal error: misconfigured ref_obj_id\n");
+ return -EFAULT;
+ }
+ if (state->stack[i].spilled_ptr.dynptr.first_slot)
+ invalidate_dynptr(env, state, i);
+ }
return 0;
}
@@ -1254,6 +1329,12 @@ static bool is_spilled_reg(const struct bpf_stack_state *stack)
return stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL;
}
+static bool is_spilled_scalar_reg(const struct bpf_stack_state *stack)
+{
+ return stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL &&
+ stack->spilled_ptr.type == SCALAR_VALUE;
+}
+
static void scrub_spilled_slot(u8 *stype)
{
if (*stype != STACK_INVALID)
@@ -2775,7 +2856,10 @@ static int check_subprogs(struct bpf_verifier_env *env)
goto next;
if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL)
goto next;
- off = i + insn[i].off + 1;
+ if (code == (BPF_JMP32 | BPF_JA))
+ off = i + insn[i].imm + 1;
+ else
+ off = i + insn[i].off + 1;
if (off < subprog_start || off >= subprog_end) {
verbose(env, "jump out of range from insn %d to %d\n", i, off);
return -EINVAL;
@@ -2787,6 +2871,7 @@ next:
* or unconditional jump back
*/
if (code != (BPF_JMP | BPF_EXIT) &&
+ code != (BPF_JMP32 | BPF_JA) &&
code != (BPF_JMP | BPF_JA)) {
verbose(env, "last insn is not an exit or jmp\n");
return -EINVAL;
@@ -2932,8 +3017,10 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
}
+ if (class == BPF_ALU64 && op == BPF_END && (insn->imm == 16 || insn->imm == 32))
+ return false;
+
if (class == BPF_ALU64 || class == BPF_JMP ||
- /* BPF_END always use BPF_ALU class. */
(class == BPF_ALU && op == BPF_END && insn->imm == 64))
return true;
@@ -3144,12 +3231,172 @@ static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
return btf_name_by_offset(desc_btf, func->name_off);
}
+static inline void bt_init(struct backtrack_state *bt, u32 frame)
+{
+ bt->frame = frame;
+}
+
+static inline void bt_reset(struct backtrack_state *bt)
+{
+ struct bpf_verifier_env *env = bt->env;
+
+ memset(bt, 0, sizeof(*bt));
+ bt->env = env;
+}
+
+static inline u32 bt_empty(struct backtrack_state *bt)
+{
+ u64 mask = 0;
+ int i;
+
+ for (i = 0; i <= bt->frame; i++)
+ mask |= bt->reg_masks[i] | bt->stack_masks[i];
+
+ return mask == 0;
+}
+
+static inline int bt_subprog_enter(struct backtrack_state *bt)
+{
+ if (bt->frame == MAX_CALL_FRAMES - 1) {
+ verbose(bt->env, "BUG subprog enter from frame %d\n", bt->frame);
+ WARN_ONCE(1, "verifier backtracking bug");
+ return -EFAULT;
+ }
+ bt->frame++;
+ return 0;
+}
+
+static inline int bt_subprog_exit(struct backtrack_state *bt)
+{
+ if (bt->frame == 0) {
+ verbose(bt->env, "BUG subprog exit from frame 0\n");
+ WARN_ONCE(1, "verifier backtracking bug");
+ return -EFAULT;
+ }
+ bt->frame--;
+ return 0;
+}
+
+static inline void bt_set_frame_reg(struct backtrack_state *bt, u32 frame, u32 reg)
+{
+ bt->reg_masks[frame] |= 1 << reg;
+}
+
+static inline void bt_clear_frame_reg(struct backtrack_state *bt, u32 frame, u32 reg)
+{
+ bt->reg_masks[frame] &= ~(1 << reg);
+}
+
+static inline void bt_set_reg(struct backtrack_state *bt, u32 reg)
+{
+ bt_set_frame_reg(bt, bt->frame, reg);
+}
+
+static inline void bt_clear_reg(struct backtrack_state *bt, u32 reg)
+{
+ bt_clear_frame_reg(bt, bt->frame, reg);
+}
+
+static inline void bt_set_frame_slot(struct backtrack_state *bt, u32 frame, u32 slot)
+{
+ bt->stack_masks[frame] |= 1ull << slot;
+}
+
+static inline void bt_clear_frame_slot(struct backtrack_state *bt, u32 frame, u32 slot)
+{
+ bt->stack_masks[frame] &= ~(1ull << slot);
+}
+
+static inline void bt_set_slot(struct backtrack_state *bt, u32 slot)
+{
+ bt_set_frame_slot(bt, bt->frame, slot);
+}
+
+static inline void bt_clear_slot(struct backtrack_state *bt, u32 slot)
+{
+ bt_clear_frame_slot(bt, bt->frame, slot);
+}
+
+static inline u32 bt_frame_reg_mask(struct backtrack_state *bt, u32 frame)
+{
+ return bt->reg_masks[frame];
+}
+
+static inline u32 bt_reg_mask(struct backtrack_state *bt)
+{
+ return bt->reg_masks[bt->frame];
+}
+
+static inline u64 bt_frame_stack_mask(struct backtrack_state *bt, u32 frame)
+{
+ return bt->stack_masks[frame];
+}
+
+static inline u64 bt_stack_mask(struct backtrack_state *bt)
+{
+ return bt->stack_masks[bt->frame];
+}
+
+static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg)
+{
+ return bt->reg_masks[bt->frame] & (1 << reg);
+}
+
+static inline bool bt_is_slot_set(struct backtrack_state *bt, u32 slot)
+{
+ return bt->stack_masks[bt->frame] & (1ull << slot);
+}
+
+/* format registers bitmask, e.g., "r0,r2,r4" for 0x15 mask */
+static void fmt_reg_mask(char *buf, ssize_t buf_sz, u32 reg_mask)
+{
+ DECLARE_BITMAP(mask, 64);
+ bool first = true;
+ int i, n;
+
+ buf[0] = '\0';
+
+ bitmap_from_u64(mask, reg_mask);
+ for_each_set_bit(i, mask, 32) {
+ n = snprintf(buf, buf_sz, "%sr%d", first ? "" : ",", i);
+ first = false;
+ buf += n;
+ buf_sz -= n;
+ if (buf_sz < 0)
+ break;
+ }
+}
+/* format stack slots bitmask, e.g., "-8,-24,-40" for 0x15 mask */
+static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask)
+{
+ DECLARE_BITMAP(mask, 64);
+ bool first = true;
+ int i, n;
+
+ buf[0] = '\0';
+
+ bitmap_from_u64(mask, stack_mask);
+ for_each_set_bit(i, mask, 64) {
+ n = snprintf(buf, buf_sz, "%s%d", first ? "" : ",", -(i + 1) * 8);
+ first = false;
+ buf += n;
+ buf_sz -= n;
+ if (buf_sz < 0)
+ break;
+ }
+}
+
/* For given verifier state backtrack_insn() is called from the last insn to
* the first insn. Its purpose is to compute a bitmask of registers and
* stack slots that needs precision in the parent verifier state.
+ *
+ * @idx is an index of the instruction we are currently processing;
+ * @subseq_idx is an index of the subsequent instruction that:
+ * - *would be* executed next, if jump history is viewed in forward order;
+ * - *was* processed previously during backtracking.
*/
-static int backtrack_insn(struct bpf_verifier_env *env, int idx,
- u32 *reg_mask, u64 *stack_mask)
+static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
+ struct backtrack_state *bt)
{
const struct bpf_insn_cbs cbs = {
.cb_call = disasm_kfunc_name,
@@ -3160,29 +3407,33 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
u8 class = BPF_CLASS(insn->code);
u8 opcode = BPF_OP(insn->code);
u8 mode = BPF_MODE(insn->code);
- u32 dreg = 1u << insn->dst_reg;
- u32 sreg = 1u << insn->src_reg;
- u32 spi;
+ u32 dreg = insn->dst_reg;
+ u32 sreg = insn->src_reg;
+ u32 spi, i;
if (insn->code == 0)
return 0;
if (env->log.level & BPF_LOG_LEVEL2) {
- verbose(env, "regs=%x stack=%llx before ", *reg_mask, *stack_mask);
+ fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_reg_mask(bt));
+ verbose(env, "mark_precise: frame%d: regs=%s ",
+ bt->frame, env->tmp_str_buf);
+ fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt));
+ verbose(env, "stack=%s before ", env->tmp_str_buf);
verbose(env, "%d: ", idx);
print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
}
if (class == BPF_ALU || class == BPF_ALU64) {
- if (!(*reg_mask & dreg))
+ if (!bt_is_reg_set(bt, dreg))
return 0;
if (opcode == BPF_MOV) {
if (BPF_SRC(insn->code) == BPF_X) {
- /* dreg = sreg
+ /* dreg = sreg or dreg = (s8, s16, s32)sreg
* dreg needs precision after this insn
* sreg needs precision before this insn
*/
- *reg_mask &= ~dreg;
- *reg_mask |= sreg;
+ bt_clear_reg(bt, dreg);
+ bt_set_reg(bt, sreg);
} else {
/* dreg = K
* dreg needs precision after this insn.
@@ -3190,7 +3441,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
* as precise=true in this verifier state.
* No further markings in parent are necessary
*/
- *reg_mask &= ~dreg;
+ bt_clear_reg(bt, dreg);
}
} else {
if (BPF_SRC(insn->code) == BPF_X) {
@@ -3198,15 +3449,15 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
* both dreg and sreg need precision
* before this insn
*/
- *reg_mask |= sreg;
+ bt_set_reg(bt, sreg);
} /* else dreg += K
* dreg still needs precision before this insn
*/
}
} else if (class == BPF_LDX) {
- if (!(*reg_mask & dreg))
+ if (!bt_is_reg_set(bt, dreg))
return 0;
- *reg_mask &= ~dreg;
+ bt_clear_reg(bt, dreg);
/* scalars can only be spilled into stack w/o losing precision.
* Load from any other memory can be zero extended.
@@ -3227,9 +3478,9 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
WARN_ONCE(1, "verifier backtracking bug");
return -EFAULT;
}
- *stack_mask |= 1ull << spi;
+ bt_set_slot(bt, spi);
} else if (class == BPF_STX || class == BPF_ST) {
- if (*reg_mask & dreg)
+ if (bt_is_reg_set(bt, dreg))
/* stx & st shouldn't be using _scalar_ dst_reg
* to access memory. It means backtracking
* encountered a case of pointer subtraction.
@@ -3244,20 +3495,92 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
WARN_ONCE(1, "verifier backtracking bug");
return -EFAULT;
}
- if (!(*stack_mask & (1ull << spi)))
+ if (!bt_is_slot_set(bt, spi))
return 0;
- *stack_mask &= ~(1ull << spi);
+ bt_clear_slot(bt, spi);
if (class == BPF_STX)
- *reg_mask |= sreg;
+ bt_set_reg(bt, sreg);
} else if (class == BPF_JMP || class == BPF_JMP32) {
- if (opcode == BPF_CALL) {
- if (insn->src_reg == BPF_PSEUDO_CALL)
- return -ENOTSUPP;
- /* BPF helpers that invoke callback subprogs are
- * equivalent to BPF_PSEUDO_CALL above
+ if (bpf_pseudo_call(insn)) {
+ int subprog_insn_idx, subprog;
+
+ subprog_insn_idx = idx + insn->imm + 1;
+ subprog = find_subprog(env, subprog_insn_idx);
+ if (subprog < 0)
+ return -EFAULT;
+
+ if (subprog_is_global(env, subprog)) {
+ /* check that jump history doesn't have any
+ * extra instructions from subprog; the next
+ * instruction after call to global subprog
+ * should be literally next instruction in
+ * caller program
+ */
+ WARN_ONCE(idx + 1 != subseq_idx, "verifier backtracking bug");
+ /* r1-r5 are invalidated after subprog call,
+ * so for global func call it shouldn't be set
+ * anymore
+ */
+ if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
+ verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
+ WARN_ONCE(1, "verifier backtracking bug");
+ return -EFAULT;
+ }
+ /* global subprog always sets R0 */
+ bt_clear_reg(bt, BPF_REG_0);
+ return 0;
+ } else {
+ /* static subprog call instruction, which
+ * means that we are exiting current subprog,
+ * so only r1-r5 could be still requested as
+ * precise, r0 and r6-r10 or any stack slot in
+ * the current frame should be zero by now
+ */
+ if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
+ verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
+ WARN_ONCE(1, "verifier backtracking bug");
+ return -EFAULT;
+ }
+ /* we don't track register spills perfectly,
+ * so fallback to force-precise instead of failing */
+ if (bt_stack_mask(bt) != 0)
+ return -ENOTSUPP;
+ /* propagate r1-r5 to the caller */
+ for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
+ if (bt_is_reg_set(bt, i)) {
+ bt_clear_reg(bt, i);
+ bt_set_frame_reg(bt, bt->frame - 1, i);
+ }
+ }
+ if (bt_subprog_exit(bt))
+ return -EFAULT;
+ return 0;
+ }
+ } else if ((bpf_helper_call(insn) &&
+ is_callback_calling_function(insn->imm) &&
+ !is_async_callback_calling_function(insn->imm)) ||
+ (bpf_pseudo_kfunc_call(insn) && is_callback_calling_kfunc(insn->imm))) {
+ /* callback-calling helper or kfunc call, which means
+ * we are exiting from subprog, but unlike the subprog
+ * call handling above, we shouldn't propagate
+ * precision of r1-r5 (if any requested), as they are
+ * not actually arguments passed directly to callback
+ * subprogs
*/
- if (insn->src_reg == 0 && is_callback_calling_function(insn->imm))
+ if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
+ verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
+ WARN_ONCE(1, "verifier backtracking bug");
+ return -EFAULT;
+ }
+ if (bt_stack_mask(bt) != 0)
return -ENOTSUPP;
+ /* clear r1-r5 in callback subprog's mask */
+ for (i = BPF_REG_1; i <= BPF_REG_5; i++)
+ bt_clear_reg(bt, i);
+ if (bt_subprog_exit(bt))
+ return -EFAULT;
+ return 0;
+ } else if (opcode == BPF_CALL) {
/* kfunc with imm==0 is invalid and fixup_kfunc_call will
* catch this error later. Make backtracking conservative
* with ENOTSUPP.
@@ -3265,19 +3588,51 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && insn->imm == 0)
return -ENOTSUPP;
/* regular helper call sets R0 */
- *reg_mask &= ~1;
- if (*reg_mask & 0x3f) {
+ bt_clear_reg(bt, BPF_REG_0);
+ if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
/* if backtracing was looking for registers R1-R5
* they should have been found already.
*/
- verbose(env, "BUG regs %x\n", *reg_mask);
+ verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
WARN_ONCE(1, "verifier backtracking bug");
return -EFAULT;
}
} else if (opcode == BPF_EXIT) {
- return -ENOTSUPP;
+ bool r0_precise;
+
+ if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
+ /* if backtracing was looking for registers R1-R5
+ * they should have been found already.
+ */
+ verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
+ WARN_ONCE(1, "verifier backtracking bug");
+ return -EFAULT;
+ }
+
+ /* BPF_EXIT in subprog or callback always returns
+ * right after the call instruction, so by checking
+ * whether the instruction at subseq_idx-1 is subprog
+ * call or not we can distinguish actual exit from
+ * *subprog* from exit from *callback*. In the former
+ * case, we need to propagate r0 precision, if
+ * necessary. In the former we never do that.
+ */
+ r0_precise = subseq_idx - 1 >= 0 &&
+ bpf_pseudo_call(&env->prog->insnsi[subseq_idx - 1]) &&
+ bt_is_reg_set(bt, BPF_REG_0);
+
+ bt_clear_reg(bt, BPF_REG_0);
+ if (bt_subprog_enter(bt))
+ return -EFAULT;
+
+ if (r0_precise)
+ bt_set_reg(bt, BPF_REG_0);
+ /* r6-r9 and stack slots will stay set in caller frame
+ * bitmasks until we return back from callee(s)
+ */
+ return 0;
} else if (BPF_SRC(insn->code) == BPF_X) {
- if (!(*reg_mask & (dreg | sreg)))
+ if (!bt_is_reg_set(bt, dreg) && !bt_is_reg_set(bt, sreg))
return 0;
/* dreg <cond> sreg
* Both dreg and sreg need precision before
@@ -3285,7 +3640,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
* before it would be equally necessary to
* propagate it to dreg.
*/
- *reg_mask |= (sreg | dreg);
+ bt_set_reg(bt, dreg);
+ bt_set_reg(bt, sreg);
/* else dreg <cond> K
* Only dreg still needs precision before
* this insn, so for the K-based conditional
@@ -3293,9 +3649,9 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
*/
}
} else if (class == BPF_LD) {
- if (!(*reg_mask & dreg))
+ if (!bt_is_reg_set(bt, dreg))
return 0;
- *reg_mask &= ~dreg;
+ bt_clear_reg(bt, dreg);
/* It's ld_imm64 or ld_abs or ld_ind.
* For ld_imm64 no further tracking of precision
* into parent is necessary
@@ -3366,6 +3722,11 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env,
struct bpf_reg_state *reg;
int i, j;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "mark_precise: frame%d: falling back to forcing all scalars precise\n",
+ st->curframe);
+ }
+
/* big hammer: mark all scalars precise in this path.
* pop_stack may still get !precise scalars.
* We also skip current state and go straight to first parent state,
@@ -3377,17 +3738,25 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env,
func = st->frame[i];
for (j = 0; j < BPF_REG_FP; j++) {
reg = &func->regs[j];
- if (reg->type != SCALAR_VALUE)
+ if (reg->type != SCALAR_VALUE || reg->precise)
continue;
reg->precise = true;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "force_precise: frame%d: forcing r%d to be precise\n",
+ i, j);
+ }
}
for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
if (!is_spilled_reg(&func->stack[j]))
continue;
reg = &func->stack[j].spilled_ptr;
- if (reg->type != SCALAR_VALUE)
+ if (reg->type != SCALAR_VALUE || reg->precise)
continue;
reg->precise = true;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "force_precise: frame%d: forcing fp%d to be precise\n",
+ i, -(j + 1) * 8);
+ }
}
}
}
@@ -3418,6 +3787,96 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_
}
}
+static bool idset_contains(struct bpf_idset *s, u32 id)
+{
+ u32 i;
+
+ for (i = 0; i < s->count; ++i)
+ if (s->ids[i] == id)
+ return true;
+
+ return false;
+}
+
+static int idset_push(struct bpf_idset *s, u32 id)
+{
+ if (WARN_ON_ONCE(s->count >= ARRAY_SIZE(s->ids)))
+ return -EFAULT;
+ s->ids[s->count++] = id;
+ return 0;
+}
+
+static void idset_reset(struct bpf_idset *s)
+{
+ s->count = 0;
+}
+
+/* Collect a set of IDs for all registers currently marked as precise in env->bt.
+ * Mark all registers with these IDs as precise.
+ */
+static int mark_precise_scalar_ids(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+{
+ struct bpf_idset *precise_ids = &env->idset_scratch;
+ struct backtrack_state *bt = &env->bt;
+ struct bpf_func_state *func;
+ struct bpf_reg_state *reg;
+ DECLARE_BITMAP(mask, 64);
+ int i, fr;
+
+ idset_reset(precise_ids);
+
+ for (fr = bt->frame; fr >= 0; fr--) {
+ func = st->frame[fr];
+
+ bitmap_from_u64(mask, bt_frame_reg_mask(bt, fr));
+ for_each_set_bit(i, mask, 32) {
+ reg = &func->regs[i];
+ if (!reg->id || reg->type != SCALAR_VALUE)
+ continue;
+ if (idset_push(precise_ids, reg->id))
+ return -EFAULT;
+ }
+
+ bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr));
+ for_each_set_bit(i, mask, 64) {
+ if (i >= func->allocated_stack / BPF_REG_SIZE)
+ break;
+ if (!is_spilled_scalar_reg(&func->stack[i]))
+ continue;
+ reg = &func->stack[i].spilled_ptr;
+ if (!reg->id)
+ continue;
+ if (idset_push(precise_ids, reg->id))
+ return -EFAULT;
+ }
+ }
+
+ for (fr = 0; fr <= st->curframe; ++fr) {
+ func = st->frame[fr];
+
+ for (i = BPF_REG_0; i < BPF_REG_10; ++i) {
+ reg = &func->regs[i];
+ if (!reg->id)
+ continue;
+ if (!idset_contains(precise_ids, reg->id))
+ continue;
+ bt_set_frame_reg(bt, fr, i);
+ }
+ for (i = 0; i < func->allocated_stack / BPF_REG_SIZE; ++i) {
+ if (!is_spilled_scalar_reg(&func->stack[i]))
+ continue;
+ reg = &func->stack[i].spilled_ptr;
+ if (!reg->id)
+ continue;
+ if (!idset_contains(precise_ids, reg->id))
+ continue;
+ bt_set_frame_slot(bt, fr, i);
+ }
+ }
+
+ return 0;
+}
+
/*
* __mark_chain_precision() backtracks BPF program instruction sequence and
* chain of verifier states making sure that register *regno* (if regno >= 0)
@@ -3505,62 +3964,74 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_
* mark_all_scalars_imprecise() to hopefully get more permissive and generic
* finalized states which help in short circuiting more future states.
*/
-static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int regno,
- int spi)
+static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
{
+ struct backtrack_state *bt = &env->bt;
struct bpf_verifier_state *st = env->cur_state;
int first_idx = st->first_insn_idx;
int last_idx = env->insn_idx;
+ int subseq_idx = -1;
struct bpf_func_state *func;
struct bpf_reg_state *reg;
- u32 reg_mask = regno >= 0 ? 1u << regno : 0;
- u64 stack_mask = spi >= 0 ? 1ull << spi : 0;
bool skip_first = true;
- bool new_marks = false;
- int i, err;
+ int i, fr, err;
if (!env->bpf_capable)
return 0;
+ /* set frame number from which we are starting to backtrack */
+ bt_init(bt, env->cur_state->curframe);
+
/* Do sanity checks against current state of register and/or stack
* slot, but don't set precise flag in current state, as precision
* tracking in the current state is unnecessary.
*/
- func = st->frame[frame];
+ func = st->frame[bt->frame];
if (regno >= 0) {
reg = &func->regs[regno];
if (reg->type != SCALAR_VALUE) {
WARN_ONCE(1, "backtracing misuse");
return -EFAULT;
}
- new_marks = true;
+ bt_set_reg(bt, regno);
}
- while (spi >= 0) {
- if (!is_spilled_reg(&func->stack[spi])) {
- stack_mask = 0;
- break;
- }
- reg = &func->stack[spi].spilled_ptr;
- if (reg->type != SCALAR_VALUE) {
- stack_mask = 0;
- break;
- }
- new_marks = true;
- break;
- }
-
- if (!new_marks)
- return 0;
- if (!reg_mask && !stack_mask)
+ if (bt_empty(bt))
return 0;
for (;;) {
DECLARE_BITMAP(mask, 64);
u32 history = st->jmp_history_cnt;
- if (env->log.level & BPF_LOG_LEVEL2)
- verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx);
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n",
+ bt->frame, last_idx, first_idx, subseq_idx);
+ }
+
+ /* If some register with scalar ID is marked as precise,
+ * make sure that all registers sharing this ID are also precise.
+ * This is needed to estimate effect of find_equal_scalars().
+ * Do this at the last instruction of each state,
+ * bpf_reg_state::id fields are valid for these instructions.
+ *
+ * Allows to track precision in situation like below:
+ *
+ * r2 = unknown value
+ * ...
+ * --- state #0 ---
+ * ...
+ * r1 = r2 // r1 and r2 now share the same ID
+ * ...
+ * --- state #1 {r1.id = A, r2.id = A} ---
+ * ...
+ * if (r2 > 10) goto exit; // find_equal_scalars() assigns range to r1
+ * ...
+ * --- state #2 {r1.id = A, r2.id = A} ---
+ * r3 = r10
+ * r3 += r1 // need to mark both r1 and r2
+ */
+ if (mark_precise_scalar_ids(env, st))
+ return -EFAULT;
if (last_idx < 0) {
/* we are at the entry into subprog, which
@@ -3571,12 +4042,13 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r
if (st->curframe == 0 &&
st->frame[0]->subprogno > 0 &&
st->frame[0]->callsite == BPF_MAIN_FUNC &&
- stack_mask == 0 && (reg_mask & ~0x3e) == 0) {
- bitmap_from_u64(mask, reg_mask);
+ bt_stack_mask(bt) == 0 &&
+ (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) == 0) {
+ bitmap_from_u64(mask, bt_reg_mask(bt));
for_each_set_bit(i, mask, 32) {
reg = &st->frame[0]->regs[i];
if (reg->type != SCALAR_VALUE) {
- reg_mask &= ~(1u << i);
+ bt_clear_reg(bt, i);
continue;
}
reg->precise = true;
@@ -3584,8 +4056,8 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r
return 0;
}
- verbose(env, "BUG backtracing func entry subprog %d reg_mask %x stack_mask %llx\n",
- st->frame[0]->subprogno, reg_mask, stack_mask);
+ verbose(env, "BUG backtracking func entry subprog %d reg_mask %x stack_mask %llx\n",
+ st->frame[0]->subprogno, bt_reg_mask(bt), bt_stack_mask(bt));
WARN_ONCE(1, "verifier backtracking bug");
return -EFAULT;
}
@@ -3595,15 +4067,16 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r
err = 0;
skip_first = false;
} else {
- err = backtrack_insn(env, i, &reg_mask, &stack_mask);
+ err = backtrack_insn(env, i, subseq_idx, bt);
}
if (err == -ENOTSUPP) {
- mark_all_scalars_precise(env, st);
+ mark_all_scalars_precise(env, env->cur_state);
+ bt_reset(bt);
return 0;
} else if (err) {
return err;
}
- if (!reg_mask && !stack_mask)
+ if (bt_empty(bt))
/* Found assignment(s) into tracked register in this state.
* Since this state is already marked, just return.
* Nothing to be tracked further in the parent state.
@@ -3611,6 +4084,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r
return 0;
if (i == first_idx)
break;
+ subseq_idx = i;
i = get_prev_insn_idx(st, i, &history);
if (i >= env->prog->len) {
/* This can happen if backtracking reached insn 0
@@ -3628,84 +4102,95 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r
if (!st)
break;
- new_marks = false;
- func = st->frame[frame];
- bitmap_from_u64(mask, reg_mask);
- for_each_set_bit(i, mask, 32) {
- reg = &func->regs[i];
- if (reg->type != SCALAR_VALUE) {
- reg_mask &= ~(1u << i);
- continue;
+ for (fr = bt->frame; fr >= 0; fr--) {
+ func = st->frame[fr];
+ bitmap_from_u64(mask, bt_frame_reg_mask(bt, fr));
+ for_each_set_bit(i, mask, 32) {
+ reg = &func->regs[i];
+ if (reg->type != SCALAR_VALUE) {
+ bt_clear_frame_reg(bt, fr, i);
+ continue;
+ }
+ if (reg->precise)
+ bt_clear_frame_reg(bt, fr, i);
+ else
+ reg->precise = true;
}
- if (!reg->precise)
- new_marks = true;
- reg->precise = true;
- }
- bitmap_from_u64(mask, stack_mask);
- for_each_set_bit(i, mask, 64) {
- if (i >= func->allocated_stack / BPF_REG_SIZE) {
- /* the sequence of instructions:
- * 2: (bf) r3 = r10
- * 3: (7b) *(u64 *)(r3 -8) = r0
- * 4: (79) r4 = *(u64 *)(r10 -8)
- * doesn't contain jmps. It's backtracked
- * as a single block.
- * During backtracking insn 3 is not recognized as
- * stack access, so at the end of backtracking
- * stack slot fp-8 is still marked in stack_mask.
- * However the parent state may not have accessed
- * fp-8 and it's "unallocated" stack space.
- * In such case fallback to conservative.
- */
- mark_all_scalars_precise(env, st);
- return 0;
- }
+ bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr));
+ for_each_set_bit(i, mask, 64) {
+ if (i >= func->allocated_stack / BPF_REG_SIZE) {
+ /* the sequence of instructions:
+ * 2: (bf) r3 = r10
+ * 3: (7b) *(u64 *)(r3 -8) = r0
+ * 4: (79) r4 = *(u64 *)(r10 -8)
+ * doesn't contain jmps. It's backtracked
+ * as a single block.
+ * During backtracking insn 3 is not recognized as
+ * stack access, so at the end of backtracking
+ * stack slot fp-8 is still marked in stack_mask.
+ * However the parent state may not have accessed
+ * fp-8 and it's "unallocated" stack space.
+ * In such case fallback to conservative.
+ */
+ mark_all_scalars_precise(env, env->cur_state);
+ bt_reset(bt);
+ return 0;
+ }
- if (!is_spilled_reg(&func->stack[i])) {
- stack_mask &= ~(1ull << i);
- continue;
+ if (!is_spilled_scalar_reg(&func->stack[i])) {
+ bt_clear_frame_slot(bt, fr, i);
+ continue;
+ }
+ reg = &func->stack[i].spilled_ptr;
+ if (reg->precise)
+ bt_clear_frame_slot(bt, fr, i);
+ else
+ reg->precise = true;
}
- reg = &func->stack[i].spilled_ptr;
- if (reg->type != SCALAR_VALUE) {
- stack_mask &= ~(1ull << i);
- continue;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN,
+ bt_frame_reg_mask(bt, fr));
+ verbose(env, "mark_precise: frame%d: parent state regs=%s ",
+ fr, env->tmp_str_buf);
+ fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN,
+ bt_frame_stack_mask(bt, fr));
+ verbose(env, "stack=%s: ", env->tmp_str_buf);
+ print_verifier_state(env, func, true);
}
- if (!reg->precise)
- new_marks = true;
- reg->precise = true;
- }
- if (env->log.level & BPF_LOG_LEVEL2) {
- verbose(env, "parent %s regs=%x stack=%llx marks:",
- new_marks ? "didn't have" : "already had",
- reg_mask, stack_mask);
- print_verifier_state(env, func, true);
}
- if (!reg_mask && !stack_mask)
- break;
- if (!new_marks)
- break;
+ if (bt_empty(bt))
+ return 0;
+ subseq_idx = first_idx;
last_idx = st->last_insn_idx;
first_idx = st->first_insn_idx;
}
+
+ /* if we still have requested precise regs or slots, we missed
+ * something (e.g., stack access through non-r10 register), so
+ * fallback to marking all precise
+ */
+ if (!bt_empty(bt)) {
+ mark_all_scalars_precise(env, env->cur_state);
+ bt_reset(bt);
+ }
+
return 0;
}
int mark_chain_precision(struct bpf_verifier_env *env, int regno)
{
- return __mark_chain_precision(env, env->cur_state->curframe, regno, -1);
-}
-
-static int mark_chain_precision_frame(struct bpf_verifier_env *env, int frame, int regno)
-{
- return __mark_chain_precision(env, frame, regno, -1);
+ return __mark_chain_precision(env, regno);
}
-static int mark_chain_precision_stack_frame(struct bpf_verifier_env *env, int frame, int spi)
+/* mark_chain_precision_batch() assumes that env->bt is set in the caller to
+ * desired reg and stack masks across all relevant frames
+ */
+static int mark_chain_precision_batch(struct bpf_verifier_env *env)
{
- return __mark_chain_precision(env, frame, -1, spi);
+ return __mark_chain_precision(env, -1);
}
static bool is_spillable_regtype(enum bpf_reg_type type)
@@ -4070,6 +4555,7 @@ static void mark_reg_stack_read(struct bpf_verifier_env *env,
for (i = min_off; i < max_off; i++) {
slot = -i - 1;
spi = slot / BPF_REG_SIZE;
+ mark_stack_slot_scratched(env, spi);
stype = ptr_state->stack[spi].slot_type;
if (stype[slot % BPF_REG_SIZE] != STACK_ZERO)
break;
@@ -4121,6 +4607,8 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
stype = reg_state->stack[spi].slot_type;
reg = &reg_state->stack[spi].spilled_ptr;
+ mark_stack_slot_scratched(env, spi);
+
if (is_spilled_reg(&reg_state->stack[spi])) {
u8 spill_size = 1;
@@ -4502,20 +4990,22 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
struct bpf_reg_state *reg, u32 regno)
{
const char *targ_name = btf_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id);
- int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU;
+ int perm_flags;
const char *reg_name = "";
- /* Only unreferenced case accepts untrusted pointers */
- if (kptr_field->type == BPF_KPTR_UNREF)
- perm_flags |= PTR_UNTRUSTED;
+ if (btf_is_kernel(reg->btf)) {
+ perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU;
+
+ /* Only unreferenced case accepts untrusted pointers */
+ if (kptr_field->type == BPF_KPTR_UNREF)
+ perm_flags |= PTR_UNTRUSTED;
+ } else {
+ perm_flags = PTR_MAYBE_NULL | MEM_ALLOC;
+ }
if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags))
goto bad_type;
- if (!btf_is_kernel(reg->btf)) {
- verbose(env, "R%d must point to kernel BTF\n", regno);
- return -EINVAL;
- }
/* We need to verify reg->type and reg->btf, before accessing reg->btf */
reg_name = btf_type_name(reg->btf, reg->btf_id);
@@ -4528,7 +5018,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
if (__check_ptr_off_reg(env, reg, regno, true))
return -EACCES;
- /* A full type match is needed, as BTF can be vmlinux or module BTF, and
+ /* A full type match is needed, as BTF can be vmlinux, module or prog BTF, and
* we also need to take into account the reg->off.
*
* We want to support cases like:
@@ -4574,7 +5064,9 @@ bad_type:
*/
static bool in_rcu_cs(struct bpf_verifier_env *env)
{
- return env->cur_state->active_rcu_lock || !env->prog->aux->sleepable;
+ return env->cur_state->active_rcu_lock ||
+ env->cur_state->active_lock.ptr ||
+ !env->prog->aux->sleepable;
}
/* Once GCC supports btf_type_tag the following mechanism will be replaced with tag check */
@@ -4932,12 +5424,25 @@ static bool is_flow_key_reg(struct bpf_verifier_env *env, int regno)
return reg->type == PTR_TO_FLOW_KEYS;
}
+static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
+#ifdef CONFIG_NET
+ [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
+ [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
+ [PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
+#endif
+ [CONST_PTR_TO_MAP] = btf_bpf_map_id,
+};
+
static bool is_trusted_reg(const struct bpf_reg_state *reg)
{
/* A referenced register is always trusted. */
if (reg->ref_obj_id)
return true;
+ /* Types listed in the reg2btf_ids are always trusted */
+ if (reg2btf_ids[base_type(reg->type)])
+ return true;
+
/* If a register is not referenced, it is trusted if it has the
* MEM_ALLOC or PTR_TRUSTED type modifiers, and no others. Some of the
* other type modifiers may be safe, but we elect to take an opt-in
@@ -5093,16 +5598,17 @@ static int update_stack_depth(struct bpf_verifier_env *env,
* Since recursion is prevented by check_cfg() this algorithm
* only needs a local stack of MAX_CALL_FRAMES to remember callsites
*/
-static int check_max_stack_depth(struct bpf_verifier_env *env)
+static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
{
- int depth = 0, frame = 0, idx = 0, i = 0, subprog_end;
struct bpf_subprog_info *subprog = env->subprog_info;
struct bpf_insn *insn = env->prog->insnsi;
+ int depth = 0, frame = 0, i, subprog_end;
bool tail_call_reachable = false;
int ret_insn[MAX_CALL_FRAMES];
int ret_prog[MAX_CALL_FRAMES];
int j;
+ i = subprog[idx].start;
process_func:
/* protect against potential stack overflow that might happen when
* bpf2bpf calls get combined with tailcalls. Limit the caller's stack
@@ -5141,7 +5647,7 @@ process_func:
continue_func:
subprog_end = subprog[idx + 1].start;
for (; i < subprog_end; i++) {
- int next_insn;
+ int next_insn, sidx;
if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i))
continue;
@@ -5151,21 +5657,23 @@ continue_func:
/* find the callee */
next_insn = i + insn[i].imm + 1;
- idx = find_subprog(env, next_insn);
- if (idx < 0) {
+ sidx = find_subprog(env, next_insn);
+ if (sidx < 0) {
WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
next_insn);
return -EFAULT;
}
- if (subprog[idx].is_async_cb) {
- if (subprog[idx].has_tail_call) {
+ if (subprog[sidx].is_async_cb) {
+ if (subprog[sidx].has_tail_call) {
verbose(env, "verifier bug. subprog has tail_call and async cb\n");
return -EFAULT;
}
- /* async callbacks don't increase bpf prog stack size */
- continue;
+ /* async callbacks don't increase bpf prog stack size unless called directly */
+ if (!bpf_pseudo_call(insn + i))
+ continue;
}
i = next_insn;
+ idx = sidx;
if (subprog[idx].has_tail_call)
tail_call_reachable = true;
@@ -5201,6 +5709,22 @@ continue_func:
goto continue_func;
}
+static int check_max_stack_depth(struct bpf_verifier_env *env)
+{
+ struct bpf_subprog_info *si = env->subprog_info;
+ int ret;
+
+ for (int i = 0; i < env->subprog_cnt; i++) {
+ if (!i || si[i].is_async_cb) {
+ ret = check_max_stack_depth_subprog(env, i);
+ if (ret < 0)
+ return ret;
+ }
+ continue;
+ }
+ return 0;
+}
+
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
static int get_callee_stack_depth(struct bpf_verifier_env *env,
const struct bpf_insn *insn, int idx)
@@ -5314,6 +5838,147 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
__reg_combine_64_into_32(reg);
}
+static void set_sext64_default_val(struct bpf_reg_state *reg, int size)
+{
+ if (size == 1) {
+ reg->smin_value = reg->s32_min_value = S8_MIN;
+ reg->smax_value = reg->s32_max_value = S8_MAX;
+ } else if (size == 2) {
+ reg->smin_value = reg->s32_min_value = S16_MIN;
+ reg->smax_value = reg->s32_max_value = S16_MAX;
+ } else {
+ /* size == 4 */
+ reg->smin_value = reg->s32_min_value = S32_MIN;
+ reg->smax_value = reg->s32_max_value = S32_MAX;
+ }
+ reg->umin_value = reg->u32_min_value = 0;
+ reg->umax_value = U64_MAX;
+ reg->u32_max_value = U32_MAX;
+ reg->var_off = tnum_unknown;
+}
+
+static void coerce_reg_to_size_sx(struct bpf_reg_state *reg, int size)
+{
+ s64 init_s64_max, init_s64_min, s64_max, s64_min, u64_cval;
+ u64 top_smax_value, top_smin_value;
+ u64 num_bits = size * 8;
+
+ if (tnum_is_const(reg->var_off)) {
+ u64_cval = reg->var_off.value;
+ if (size == 1)
+ reg->var_off = tnum_const((s8)u64_cval);
+ else if (size == 2)
+ reg->var_off = tnum_const((s16)u64_cval);
+ else
+ /* size == 4 */
+ reg->var_off = tnum_const((s32)u64_cval);
+
+ u64_cval = reg->var_off.value;
+ reg->smax_value = reg->smin_value = u64_cval;
+ reg->umax_value = reg->umin_value = u64_cval;
+ reg->s32_max_value = reg->s32_min_value = u64_cval;
+ reg->u32_max_value = reg->u32_min_value = u64_cval;
+ return;
+ }
+
+ top_smax_value = ((u64)reg->smax_value >> num_bits) << num_bits;
+ top_smin_value = ((u64)reg->smin_value >> num_bits) << num_bits;
+
+ if (top_smax_value != top_smin_value)
+ goto out;
+
+ /* find the s64_min and s64_min after sign extension */
+ if (size == 1) {
+ init_s64_max = (s8)reg->smax_value;
+ init_s64_min = (s8)reg->smin_value;
+ } else if (size == 2) {
+ init_s64_max = (s16)reg->smax_value;
+ init_s64_min = (s16)reg->smin_value;
+ } else {
+ init_s64_max = (s32)reg->smax_value;
+ init_s64_min = (s32)reg->smin_value;
+ }
+
+ s64_max = max(init_s64_max, init_s64_min);
+ s64_min = min(init_s64_max, init_s64_min);
+
+ /* both of s64_max/s64_min positive or negative */
+ if ((s64_max >= 0) == (s64_min >= 0)) {
+ reg->smin_value = reg->s32_min_value = s64_min;
+ reg->smax_value = reg->s32_max_value = s64_max;
+ reg->umin_value = reg->u32_min_value = s64_min;
+ reg->umax_value = reg->u32_max_value = s64_max;
+ reg->var_off = tnum_range(s64_min, s64_max);
+ return;
+ }
+
+out:
+ set_sext64_default_val(reg, size);
+}
+
+static void set_sext32_default_val(struct bpf_reg_state *reg, int size)
+{
+ if (size == 1) {
+ reg->s32_min_value = S8_MIN;
+ reg->s32_max_value = S8_MAX;
+ } else {
+ /* size == 2 */
+ reg->s32_min_value = S16_MIN;
+ reg->s32_max_value = S16_MAX;
+ }
+ reg->u32_min_value = 0;
+ reg->u32_max_value = U32_MAX;
+}
+
+static void coerce_subreg_to_size_sx(struct bpf_reg_state *reg, int size)
+{
+ s32 init_s32_max, init_s32_min, s32_max, s32_min, u32_val;
+ u32 top_smax_value, top_smin_value;
+ u32 num_bits = size * 8;
+
+ if (tnum_is_const(reg->var_off)) {
+ u32_val = reg->var_off.value;
+ if (size == 1)
+ reg->var_off = tnum_const((s8)u32_val);
+ else
+ reg->var_off = tnum_const((s16)u32_val);
+
+ u32_val = reg->var_off.value;
+ reg->s32_min_value = reg->s32_max_value = u32_val;
+ reg->u32_min_value = reg->u32_max_value = u32_val;
+ return;
+ }
+
+ top_smax_value = ((u32)reg->s32_max_value >> num_bits) << num_bits;
+ top_smin_value = ((u32)reg->s32_min_value >> num_bits) << num_bits;
+
+ if (top_smax_value != top_smin_value)
+ goto out;
+
+ /* find the s32_min and s32_min after sign extension */
+ if (size == 1) {
+ init_s32_max = (s8)reg->s32_max_value;
+ init_s32_min = (s8)reg->s32_min_value;
+ } else {
+ /* size == 2 */
+ init_s32_max = (s16)reg->s32_max_value;
+ init_s32_min = (s16)reg->s32_min_value;
+ }
+ s32_max = max(init_s32_max, init_s32_min);
+ s32_min = min(init_s32_max, init_s32_min);
+
+ if ((s32_min >= 0) == (s32_max >= 0)) {
+ reg->s32_min_value = s32_min;
+ reg->s32_max_value = s32_max;
+ reg->u32_min_value = (u32)s32_min;
+ reg->u32_max_value = (u32)s32_max;
+ return;
+ }
+
+out:
+ set_sext32_default_val(reg, size);
+}
+
static bool bpf_map_is_rdonly(const struct bpf_map *map)
{
/* A map is considered read-only if the following condition are true:
@@ -5334,7 +5999,8 @@ static bool bpf_map_is_rdonly(const struct bpf_map *map)
!bpf_map_write_active(map);
}
-static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val)
+static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val,
+ bool is_ldsx)
{
void *ptr;
u64 addr;
@@ -5347,13 +6013,13 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val)
switch (size) {
case sizeof(u8):
- *val = (u64)*(u8 *)ptr;
+ *val = is_ldsx ? (s64)*(s8 *)ptr : (u64)*(u8 *)ptr;
break;
case sizeof(u16):
- *val = (u64)*(u16 *)ptr;
+ *val = is_ldsx ? (s64)*(s16 *)ptr : (u64)*(u16 *)ptr;
break;
case sizeof(u32):
- *val = (u64)*(u32 *)ptr;
+ *val = is_ldsx ? (s64)*(s32 *)ptr : (u64)*(u32 *)ptr;
break;
case sizeof(u64):
*val = *(u64 *)ptr;
@@ -5537,7 +6203,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
* program allocated objects (which always have ref_obj_id > 0),
* but not for untrusted PTR_TO_BTF_ID | MEM_ALLOC.
*/
- if (atype != BPF_READ && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ if (atype != BPF_READ && !type_is_ptr_alloc_obj(reg->type)) {
verbose(env, "only read is supported\n");
return -EACCES;
}
@@ -5586,6 +6252,11 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
type_is_rcu_or_null(env, reg, field_name, btf_id)) {
/* __rcu tagged pointers can be NULL */
flag |= MEM_RCU | PTR_MAYBE_NULL;
+
+ /* We always trust them */
+ if (type_is_rcu_or_null(env, reg, field_name, btf_id) &&
+ flag & PTR_UNTRUSTED)
+ flag &= ~PTR_UNTRUSTED;
} else if (flag & (MEM_PERCPU | MEM_USER)) {
/* keep as-is */
} else {
@@ -5767,7 +6438,7 @@ static int check_stack_access_within_bounds(
*/
static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno,
int off, int bpf_size, enum bpf_access_type t,
- int value_regno, bool strict_alignment_once)
+ int value_regno, bool strict_alignment_once, bool is_ldsx)
{
struct bpf_reg_state *regs = cur_regs(env);
struct bpf_reg_state *reg = regs + regno;
@@ -5828,7 +6499,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
u64 val = 0;
err = bpf_map_direct_read(map, map_off, size,
- &val);
+ &val, is_ldsx);
if (err)
return err;
@@ -5998,8 +6669,11 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ &&
regs[value_regno].type == SCALAR_VALUE) {
- /* b/h/w load zero-extends, mark upper bits as known 0 */
- coerce_reg_to_size(&regs[value_regno], size);
+ if (!is_ldsx)
+ /* b/h/w load zero-extends, mark upper bits as known 0 */
+ coerce_reg_to_size(&regs[value_regno], size);
+ else
+ coerce_reg_to_size_sx(&regs[value_regno], size);
}
return err;
}
@@ -6091,17 +6765,17 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
* case to simulate the register fill.
*/
err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
- BPF_SIZE(insn->code), BPF_READ, -1, true);
+ BPF_SIZE(insn->code), BPF_READ, -1, true, false);
if (!err && load_reg >= 0)
err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_READ, load_reg,
- true);
+ true, false);
if (err)
return err;
/* Check whether we can write into the same memory. */
err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
- BPF_SIZE(insn->code), BPF_WRITE, -1, true);
+ BPF_SIZE(insn->code), BPF_WRITE, -1, true, false);
if (err)
return err;
@@ -6347,7 +7021,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
return zero_size_allowed ? 0 : -EACCES;
return check_mem_access(env, env->insn_idx, regno, offset, BPF_B,
- atype, -1, false);
+ atype, -1, false, false);
}
fallthrough;
@@ -6680,7 +7354,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno,
* type, and declare it as 'const struct bpf_dynptr *' in their prototype.
*/
static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn_idx,
- enum bpf_arg_type arg_type)
+ enum bpf_arg_type arg_type, int clone_ref_obj_id)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
int err;
@@ -6719,12 +7393,12 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn
/* we write BPF_DW bits (8 bytes) at a time */
for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) {
err = check_mem_access(env, insn_idx, regno,
- i, BPF_DW, BPF_WRITE, -1, false);
+ i, BPF_DW, BPF_WRITE, -1, false, false);
if (err)
return err;
}
- err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx);
+ err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx, clone_ref_obj_id);
} else /* MEM_RDONLY and None case from above */ {
/* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */
if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) {
@@ -6812,7 +7486,7 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
for (i = 0; i < nr_slots * 8; i += BPF_REG_SIZE) {
err = check_mem_access(env, insn_idx, regno,
- i, BPF_DW, BPF_WRITE, -1, false);
+ i, BPF_DW, BPF_WRITE, -1, false, false);
if (err)
return err;
}
@@ -7146,14 +7820,18 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
* ARG_PTR_TO_MEM + MAYBE_NULL is compatible with PTR_TO_MEM and PTR_TO_MEM + MAYBE_NULL,
* but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM but NOT with PTR_TO_MEM + MAYBE_NULL
*
+ * ARG_PTR_TO_MEM is compatible with PTR_TO_MEM that is tagged with a dynptr type.
+ *
* Therefore we fold these flags depending on the arg_type before comparison.
*/
if (arg_type & MEM_RDONLY)
type &= ~MEM_RDONLY;
if (arg_type & PTR_MAYBE_NULL)
type &= ~PTR_MAYBE_NULL;
+ if (base_type(arg_type) == ARG_PTR_TO_MEM)
+ type &= ~DYNPTR_TYPE_FLAG_MASK;
- if (meta->func_id == BPF_FUNC_kptr_xchg && type & MEM_ALLOC)
+ if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type))
type &= ~MEM_ALLOC;
for (i = 0; i < ARRAY_SIZE(compatible->types); i++) {
@@ -7242,7 +7920,10 @@ found:
verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n");
return -EFAULT;
}
- /* Handled by helper specific checks */
+ if (meta->func_id == BPF_FUNC_kptr_xchg) {
+ if (map_kptr_match_type(env, meta->kptr_field, reg, regno))
+ return -EACCES;
+ }
break;
case PTR_TO_BTF_ID | MEM_PERCPU:
case PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED:
@@ -7294,17 +7975,6 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
if (arg_type_is_dynptr(arg_type) && type == PTR_TO_STACK)
return 0;
- if ((type_is_ptr_alloc_obj(type) || type_is_non_owning_ref(type)) && reg->off) {
- if (reg_find_field_offset(reg, reg->off, BPF_GRAPH_NODE_OR_ROOT))
- return __check_ptr_off_reg(env, reg, regno, true);
-
- verbose(env, "R%d must have zero offset when passed to release func\n",
- regno);
- verbose(env, "No graph node or root found at R%d type:%s off:%d\n", regno,
- btf_type_name(reg->btf, reg->btf_id), reg->off);
- return -EINVAL;
- }
-
/* Doing check_ptr_off_reg check for the offset will catch this
* because fixed_off_ok is false, but checking here allows us
* to give the user a better error message.
@@ -7339,6 +8009,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
case PTR_TO_BTF_ID | PTR_TRUSTED:
case PTR_TO_BTF_ID | MEM_RCU:
case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF:
+ case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU:
/* When referenced PTR_TO_BTF_ID is passed to release function,
* its fixed offset must be 0. In the other cases, fixed offset
* can be non-zero. This was already checked above. So pass
@@ -7634,7 +8305,7 @@ skip_type_check:
err = check_mem_size_reg(env, reg, regno, true, meta);
break;
case ARG_PTR_TO_DYNPTR:
- err = process_dynptr_func(env, regno, insn_idx, arg_type);
+ err = process_dynptr_func(env, regno, insn_idx, arg_type, 0);
if (err)
return err;
break;
@@ -8181,17 +8852,13 @@ static int set_callee_state(struct bpf_verifier_env *env,
struct bpf_func_state *caller,
struct bpf_func_state *callee, int insn_idx);
-static bool is_callback_calling_kfunc(u32 btf_id);
-
static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx, int subprog,
set_callee_state_fn set_callee_state_cb)
{
struct bpf_verifier_state *state = env->cur_state;
- struct bpf_func_info_aux *func_info_aux;
struct bpf_func_state *caller, *callee;
int err;
- bool is_global = false;
if (state->curframe + 1 >= MAX_CALL_FRAMES) {
verbose(env, "the call stack of %d frames is too deep\n",
@@ -8206,13 +8873,10 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return -EFAULT;
}
- func_info_aux = env->prog->aux->func_info_aux;
- if (func_info_aux)
- is_global = func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
err = btf_check_subprog_call(env, subprog, caller->regs);
if (err == -EFAULT)
return err;
- if (is_global) {
+ if (subprog_is_global(env, subprog)) {
if (err) {
verbose(env, "Caller passes invalid args into func#%d\n",
subprog);
@@ -8639,19 +9303,33 @@ static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type,
{
struct bpf_reg_state *ret_reg = &regs[BPF_REG_0];
- if (ret_type != RET_INTEGER ||
- (func_id != BPF_FUNC_get_stack &&
- func_id != BPF_FUNC_get_task_stack &&
- func_id != BPF_FUNC_probe_read_str &&
- func_id != BPF_FUNC_probe_read_kernel_str &&
- func_id != BPF_FUNC_probe_read_user_str))
+ if (ret_type != RET_INTEGER)
return;
- ret_reg->smax_value = meta->msize_max_value;
- ret_reg->s32_max_value = meta->msize_max_value;
- ret_reg->smin_value = -MAX_ERRNO;
- ret_reg->s32_min_value = -MAX_ERRNO;
- reg_bounds_sync(ret_reg);
+ switch (func_id) {
+ case BPF_FUNC_get_stack:
+ case BPF_FUNC_get_task_stack:
+ case BPF_FUNC_probe_read_str:
+ case BPF_FUNC_probe_read_kernel_str:
+ case BPF_FUNC_probe_read_user_str:
+ ret_reg->smax_value = meta->msize_max_value;
+ ret_reg->s32_max_value = meta->msize_max_value;
+ ret_reg->smin_value = -MAX_ERRNO;
+ ret_reg->s32_min_value = -MAX_ERRNO;
+ reg_bounds_sync(ret_reg);
+ break;
+ case BPF_FUNC_get_smp_processor_id:
+ ret_reg->umax_value = nr_cpu_ids - 1;
+ ret_reg->u32_max_value = nr_cpu_ids - 1;
+ ret_reg->smax_value = nr_cpu_ids - 1;
+ ret_reg->s32_max_value = nr_cpu_ids - 1;
+ ret_reg->umin_value = 0;
+ ret_reg->u32_min_value = 0;
+ ret_reg->smin_value = 0;
+ ret_reg->s32_min_value = 0;
+ reg_bounds_sync(ret_reg);
+ break;
+ }
}
static int
@@ -8945,7 +9623,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
*/
for (i = 0; i < meta.access_size; i++) {
err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B,
- BPF_WRITE, -1, false);
+ BPF_WRITE, -1, false, false);
if (err)
return err;
}
@@ -9327,11 +10005,6 @@ static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta)
return meta->kfunc_flags & KF_ACQUIRE;
}
-static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
-{
- return meta->kfunc_flags & KF_RET_NULL;
-}
-
static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta)
{
return meta->kfunc_flags & KF_RELEASE;
@@ -9401,6 +10074,11 @@ static bool is_kfunc_arg_const_mem_size(const struct btf *btf,
return __kfunc_param_match_suffix(btf, arg, "__szk");
}
+static bool is_kfunc_arg_optional(const struct btf *btf, const struct btf_param *arg)
+{
+ return __kfunc_param_match_suffix(btf, arg, "__opt");
+}
+
static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg)
{
return __kfunc_param_match_suffix(btf, arg, "__k");
@@ -9554,15 +10232,6 @@ static bool __btf_type_is_scalar_struct(struct bpf_verifier_env *env,
return true;
}
-
-static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
-#ifdef CONFIG_NET
- [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
- [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
- [PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
-#endif
-};
-
enum kfunc_ptr_arg_type {
KF_ARG_PTR_TO_CTX,
KF_ARG_PTR_TO_ALLOC_BTF_ID, /* Allocated object */
@@ -9598,6 +10267,7 @@ enum special_kfunc_type {
KF_bpf_dynptr_from_xdp,
KF_bpf_dynptr_slice,
KF_bpf_dynptr_slice_rdwr,
+ KF_bpf_dynptr_clone,
};
BTF_SET_START(special_kfunc_set)
@@ -9617,6 +10287,7 @@ BTF_ID(func, bpf_dynptr_from_skb)
BTF_ID(func, bpf_dynptr_from_xdp)
BTF_ID(func, bpf_dynptr_slice)
BTF_ID(func, bpf_dynptr_slice_rdwr)
+BTF_ID(func, bpf_dynptr_clone)
BTF_SET_END(special_kfunc_set)
BTF_ID_LIST(special_kfunc_list)
@@ -9638,6 +10309,17 @@ BTF_ID(func, bpf_dynptr_from_skb)
BTF_ID(func, bpf_dynptr_from_xdp)
BTF_ID(func, bpf_dynptr_slice)
BTF_ID(func, bpf_dynptr_slice_rdwr)
+BTF_ID(func, bpf_dynptr_clone)
+
+static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
+{
+ if (meta->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] &&
+ meta->arg_owning_ref) {
+ return false;
+ }
+
+ return meta->kfunc_flags & KF_RET_NULL;
+}
static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta)
{
@@ -9794,6 +10476,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
struct bpf_verifier_state *state = env->cur_state;
+ struct btf_record *rec = reg_btf_record(reg);
if (!state->active_lock.ptr) {
verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n");
@@ -9806,6 +10489,9 @@ static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state
}
reg->type |= NON_OWN_REF;
+ if (rec->refcount_off >= 0)
+ reg->type |= MEM_RCU;
+
return 0;
}
@@ -10116,6 +10802,8 @@ __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env,
node_off, btf_name_by_offset(reg->btf, t->name_off));
return -EINVAL;
}
+ meta->arg_btf = reg->btf;
+ meta->arg_btf_id = reg->btf_id;
if (node_off != field->graph_root.node_offset) {
verbose(env, "arg#1 offset=%d, but expected %s at offset=%d in struct %s\n",
@@ -10326,13 +11014,14 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
}
if (meta->btf == btf_vmlinux &&
meta->func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
- meta->arg_obj_drop.btf = reg->btf;
- meta->arg_obj_drop.btf_id = reg->btf_id;
+ meta->arg_btf = reg->btf;
+ meta->arg_btf_id = reg->btf_id;
}
break;
case KF_ARG_PTR_TO_DYNPTR:
{
enum bpf_arg_type dynptr_arg_type = ARG_PTR_TO_DYNPTR;
+ int clone_ref_obj_id = 0;
if (reg->type != PTR_TO_STACK &&
reg->type != CONST_PTR_TO_DYNPTR) {
@@ -10346,12 +11035,28 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
if (is_kfunc_arg_uninit(btf, &args[i]))
dynptr_arg_type |= MEM_UNINIT;
- if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb])
+ if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) {
dynptr_arg_type |= DYNPTR_TYPE_SKB;
- else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_xdp])
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_xdp]) {
dynptr_arg_type |= DYNPTR_TYPE_XDP;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] &&
+ (dynptr_arg_type & MEM_UNINIT)) {
+ enum bpf_dynptr_type parent_type = meta->initialized_dynptr.type;
+
+ if (parent_type == BPF_DYNPTR_TYPE_INVALID) {
+ verbose(env, "verifier internal error: no dynptr type for parent of clone\n");
+ return -EFAULT;
+ }
- ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type);
+ dynptr_arg_type |= (unsigned int)get_dynptr_type_flag(parent_type);
+ clone_ref_obj_id = meta->initialized_dynptr.ref_obj_id;
+ if (dynptr_type_refcounted(parent_type) && !clone_ref_obj_id) {
+ verbose(env, "verifier internal error: missing ref obj id for parent of clone\n");
+ return -EFAULT;
+ }
+ }
+
+ ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type, clone_ref_obj_id);
if (ret < 0)
return ret;
@@ -10364,6 +11069,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
}
meta->initialized_dynptr.id = id;
meta->initialized_dynptr.type = dynptr_get_type(env, reg);
+ meta->initialized_dynptr.ref_obj_id = dynptr_ref_obj_id(env, reg);
}
break;
@@ -10467,13 +11173,17 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
break;
case KF_ARG_PTR_TO_MEM_SIZE:
{
+ struct bpf_reg_state *buff_reg = &regs[regno];
+ const struct btf_param *buff_arg = &args[i];
struct bpf_reg_state *size_reg = &regs[regno + 1];
const struct btf_param *size_arg = &args[i + 1];
- ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1);
- if (ret < 0) {
- verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1);
- return ret;
+ if (!register_is_null(buff_reg) || !is_kfunc_arg_optional(meta->btf, buff_arg)) {
+ ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1);
+ if (ret < 0) {
+ verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1);
+ return ret;
+ }
}
if (is_kfunc_arg_const_mem_size(meta->btf, size_arg, size_reg)) {
@@ -10497,10 +11207,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
meta->subprogno = reg->subprogno;
break;
case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
- if (!type_is_ptr_alloc_obj(reg->type) && !type_is_non_owning_ref(reg->type)) {
+ if (!type_is_ptr_alloc_obj(reg->type)) {
verbose(env, "arg#%d is neither owning or non-owning ref\n", i);
return -EINVAL;
}
+ if (!type_is_non_owning_ref(reg->type))
+ meta->arg_owning_ref = true;
rec = reg_btf_record(reg);
if (!rec) {
@@ -10512,12 +11224,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
verbose(env, "arg#%d doesn't point to a type with bpf_refcount field\n", i);
return -EINVAL;
}
- if (rec->refcount_off >= 0) {
- verbose(env, "bpf_refcount_acquire calls are disabled for now\n");
- return -EINVAL;
- }
- meta->arg_refcount_acquire.btf = reg->btf;
- meta->arg_refcount_acquire.btf_id = reg->btf_id;
+
+ meta->arg_btf = reg->btf;
+ meta->arg_btf_id = reg->btf_id;
break;
}
}
@@ -10558,7 +11267,7 @@ static int fetch_kfunc_meta(struct bpf_verifier_env *env,
*kfunc_name = func_name;
func_proto = btf_type_by_id(desc_btf, func->type);
- kfunc_flags = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), func_id);
+ kfunc_flags = btf_kfunc_id_set_contains(desc_btf, func_id, env->prog);
if (!kfunc_flags) {
return -EACCES;
}
@@ -10620,6 +11329,11 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
struct bpf_func_state *state;
struct bpf_reg_state *reg;
+ if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) {
+ verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n");
+ return -EACCES;
+ }
+
if (rcu_lock) {
verbose(env, "nested rcu read lock (kernel function %s)\n", func_name);
return -EINVAL;
@@ -10663,6 +11377,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
release_ref_obj_id = regs[BPF_REG_2].ref_obj_id;
insn_aux->insert_off = regs[BPF_REG_2].off;
+ insn_aux->kptr_struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id);
err = ref_convert_owning_non_owning(env, release_ref_obj_id);
if (err) {
verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n",
@@ -10749,12 +11464,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
} else if (meta.func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
- regs[BPF_REG_0].btf = meta.arg_refcount_acquire.btf;
- regs[BPF_REG_0].btf_id = meta.arg_refcount_acquire.btf_id;
+ regs[BPF_REG_0].btf = meta.arg_btf;
+ regs[BPF_REG_0].btf_id = meta.arg_btf_id;
insn_aux->kptr_struct_meta =
- btf_find_struct_meta(meta.arg_refcount_acquire.btf,
- meta.arg_refcount_acquire.btf_id);
+ btf_find_struct_meta(meta.arg_btf,
+ meta.arg_btf_id);
} else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] ||
meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) {
struct btf_field *field = meta.arg_list_head.field;
@@ -10884,8 +11599,8 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) {
if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
insn_aux->kptr_struct_meta =
- btf_find_struct_meta(meta.arg_obj_drop.btf,
- meta.arg_obj_drop.btf_id);
+ btf_find_struct_meta(meta.arg_btf,
+ meta.arg_btf_id);
}
}
}
@@ -12371,7 +13086,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
} else {
if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
(insn->imm != 16 && insn->imm != 32 && insn->imm != 64) ||
- BPF_CLASS(insn->code) == BPF_ALU64) {
+ (BPF_CLASS(insn->code) == BPF_ALU64 &&
+ BPF_SRC(insn->code) != BPF_TO_LE)) {
verbose(env, "BPF_END uses reserved fields\n");
return -EINVAL;
}
@@ -12396,11 +13112,24 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
} else if (opcode == BPF_MOV) {
if (BPF_SRC(insn->code) == BPF_X) {
- if (insn->imm != 0 || insn->off != 0) {
+ if (insn->imm != 0) {
verbose(env, "BPF_MOV uses reserved fields\n");
return -EINVAL;
}
+ if (BPF_CLASS(insn->code) == BPF_ALU) {
+ if (insn->off != 0 && insn->off != 8 && insn->off != 16) {
+ verbose(env, "BPF_MOV uses reserved fields\n");
+ return -EINVAL;
+ }
+ } else {
+ if (insn->off != 0 && insn->off != 8 && insn->off != 16 &&
+ insn->off != 32) {
+ verbose(env, "BPF_MOV uses reserved fields\n");
+ return -EINVAL;
+ }
+ }
+
/* check src operand */
err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
@@ -12420,20 +13149,46 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (BPF_SRC(insn->code) == BPF_X) {
struct bpf_reg_state *src_reg = regs + insn->src_reg;
struct bpf_reg_state *dst_reg = regs + insn->dst_reg;
+ bool need_id = src_reg->type == SCALAR_VALUE && !src_reg->id &&
+ !tnum_is_const(src_reg->var_off);
if (BPF_CLASS(insn->code) == BPF_ALU64) {
- /* case: R1 = R2
- * copy register state to dest reg
- */
- if (src_reg->type == SCALAR_VALUE && !src_reg->id)
- /* Assign src and dst registers the same ID
- * that will be used by find_equal_scalars()
- * to propagate min/max range.
+ if (insn->off == 0) {
+ /* case: R1 = R2
+ * copy register state to dest reg
*/
- src_reg->id = ++env->id_gen;
- copy_register_state(dst_reg, src_reg);
- dst_reg->live |= REG_LIVE_WRITTEN;
- dst_reg->subreg_def = DEF_NOT_SUBREG;
+ if (need_id)
+ /* Assign src and dst registers the same ID
+ * that will be used by find_equal_scalars()
+ * to propagate min/max range.
+ */
+ src_reg->id = ++env->id_gen;
+ copy_register_state(dst_reg, src_reg);
+ dst_reg->live |= REG_LIVE_WRITTEN;
+ dst_reg->subreg_def = DEF_NOT_SUBREG;
+ } else {
+ /* case: R1 = (s8, s16 s32)R2 */
+ if (is_pointer_value(env, insn->src_reg)) {
+ verbose(env,
+ "R%d sign-extension part of pointer\n",
+ insn->src_reg);
+ return -EACCES;
+ } else if (src_reg->type == SCALAR_VALUE) {
+ bool no_sext;
+
+ no_sext = src_reg->umax_value < (1ULL << (insn->off - 1));
+ if (no_sext && need_id)
+ src_reg->id = ++env->id_gen;
+ copy_register_state(dst_reg, src_reg);
+ if (!no_sext)
+ dst_reg->id = 0;
+ coerce_reg_to_size_sx(dst_reg, insn->off >> 3);
+ dst_reg->live |= REG_LIVE_WRITTEN;
+ dst_reg->subreg_def = DEF_NOT_SUBREG;
+ } else {
+ mark_reg_unknown(env, regs, insn->dst_reg);
+ }
+ }
} else {
/* R1 = (u32) R2 */
if (is_pointer_value(env, insn->src_reg)) {
@@ -12442,19 +13197,33 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
insn->src_reg);
return -EACCES;
} else if (src_reg->type == SCALAR_VALUE) {
- bool is_src_reg_u32 = src_reg->umax_value <= U32_MAX;
-
- if (is_src_reg_u32 && !src_reg->id)
- src_reg->id = ++env->id_gen;
- copy_register_state(dst_reg, src_reg);
- /* Make sure ID is cleared if src_reg is not in u32 range otherwise
- * dst_reg min/max could be incorrectly
- * propagated into src_reg by find_equal_scalars()
- */
- if (!is_src_reg_u32)
- dst_reg->id = 0;
- dst_reg->live |= REG_LIVE_WRITTEN;
- dst_reg->subreg_def = env->insn_idx + 1;
+ if (insn->off == 0) {
+ bool is_src_reg_u32 = src_reg->umax_value <= U32_MAX;
+
+ if (is_src_reg_u32 && need_id)
+ src_reg->id = ++env->id_gen;
+ copy_register_state(dst_reg, src_reg);
+ /* Make sure ID is cleared if src_reg is not in u32
+ * range otherwise dst_reg min/max could be incorrectly
+ * propagated into src_reg by find_equal_scalars()
+ */
+ if (!is_src_reg_u32)
+ dst_reg->id = 0;
+ dst_reg->live |= REG_LIVE_WRITTEN;
+ dst_reg->subreg_def = env->insn_idx + 1;
+ } else {
+ /* case: W1 = (s8, s16)W2 */
+ bool no_sext = src_reg->umax_value < (1ULL << (insn->off - 1));
+
+ if (no_sext && need_id)
+ src_reg->id = ++env->id_gen;
+ copy_register_state(dst_reg, src_reg);
+ if (!no_sext)
+ dst_reg->id = 0;
+ dst_reg->live |= REG_LIVE_WRITTEN;
+ dst_reg->subreg_def = env->insn_idx + 1;
+ coerce_subreg_to_size_sx(dst_reg, insn->off >> 3);
+ }
} else {
mark_reg_unknown(env, regs,
insn->dst_reg);
@@ -12485,7 +13254,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
} else { /* all other ALU ops: and, sub, xor, add, ... */
if (BPF_SRC(insn->code) == BPF_X) {
- if (insn->imm != 0 || insn->off != 0) {
+ if (insn->imm != 0 || insn->off > 1 ||
+ (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) {
verbose(env, "BPF_ALU uses reserved fields\n");
return -EINVAL;
}
@@ -12494,7 +13264,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (err)
return err;
} else {
- if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
+ if (insn->src_reg != BPF_REG_0 || insn->off > 1 ||
+ (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) {
verbose(env, "BPF_ALU uses reserved fields\n");
return -EINVAL;
}
@@ -12776,7 +13547,7 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode,
bool is_jmp32)
{
if (__is_pointer_value(false, reg)) {
- if (!reg_type_not_null(reg->type))
+ if (!reg_not_null(reg))
return -1;
/* If pointer is valid tests against zero will fail so we can
@@ -13279,6 +14050,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
return -EINVAL;
}
+ /* check src2 operand */
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ dst_reg = &regs[insn->dst_reg];
if (BPF_SRC(insn->code) == BPF_X) {
if (insn->imm != 0) {
verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
@@ -13290,12 +14067,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
if (err)
return err;
- if (is_pointer_value(env, insn->src_reg)) {
+ src_reg = &regs[insn->src_reg];
+ if (!(reg_is_pkt_pointer_any(dst_reg) && reg_is_pkt_pointer_any(src_reg)) &&
+ is_pointer_value(env, insn->src_reg)) {
verbose(env, "R%d pointer comparison prohibited\n",
insn->src_reg);
return -EACCES;
}
- src_reg = &regs[insn->src_reg];
} else {
if (insn->src_reg != BPF_REG_0) {
verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
@@ -13303,12 +14081,6 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
}
}
- /* check src2 operand */
- err = check_reg_arg(env, insn->dst_reg, SRC_OP);
- if (err)
- return err;
-
- dst_reg = &regs[insn->dst_reg];
is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
if (BPF_SRC(insn->code) == BPF_K) {
@@ -14038,7 +14810,7 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns,
static int visit_insn(int t, struct bpf_verifier_env *env)
{
struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t];
- int ret;
+ int ret, off;
if (bpf_pseudo_func(insn))
return visit_func_call_insn(t, insns, env, true);
@@ -14086,14 +14858,19 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
if (BPF_SRC(insn->code) != BPF_K)
return -EINVAL;
+ if (BPF_CLASS(insn->code) == BPF_JMP)
+ off = insn->off;
+ else
+ off = insn->imm;
+
/* unconditional jump with single edge */
- ret = push_insn(t, t + insn->off + 1, FALLTHROUGH, env,
+ ret = push_insn(t, t + off + 1, FALLTHROUGH, env,
true);
if (ret)
return ret;
- mark_prune_point(env, t + insn->off + 1);
- mark_jmp_point(env, t + insn->off + 1);
+ mark_prune_point(env, t + off + 1);
+ mark_jmp_point(env, t + off + 1);
return ret;
@@ -14600,8 +15377,9 @@ static bool range_within(struct bpf_reg_state *old,
* So we look through our idmap to see if this old id has been seen before. If
* so, we require the new id to match; otherwise, we add the id pair to the map.
*/
-static bool check_ids(u32 old_id, u32 cur_id, struct bpf_id_pair *idmap)
+static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
{
+ struct bpf_id_pair *map = idmap->map;
unsigned int i;
/* either both IDs should be set or both should be zero */
@@ -14612,20 +15390,34 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_id_pair *idmap)
return true;
for (i = 0; i < BPF_ID_MAP_SIZE; i++) {
- if (!idmap[i].old) {
+ if (!map[i].old) {
/* Reached an empty slot; haven't seen this id before */
- idmap[i].old = old_id;
- idmap[i].cur = cur_id;
+ map[i].old = old_id;
+ map[i].cur = cur_id;
return true;
}
- if (idmap[i].old == old_id)
- return idmap[i].cur == cur_id;
+ if (map[i].old == old_id)
+ return map[i].cur == cur_id;
+ if (map[i].cur == cur_id)
+ return false;
}
/* We ran out of idmap slots, which should be impossible */
WARN_ON_ONCE(1);
return false;
}
+/* Similar to check_ids(), but allocate a unique temporary ID
+ * for 'old_id' or 'cur_id' of zero.
+ * This makes pairs like '0 vs unique ID', 'unique ID vs 0' valid.
+ */
+static bool check_scalar_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
+{
+ old_id = old_id ? old_id : ++idmap->tmp_id_gen;
+ cur_id = cur_id ? cur_id : ++idmap->tmp_id_gen;
+
+ return check_ids(old_id, cur_id, idmap);
+}
+
static void clean_func_state(struct bpf_verifier_env *env,
struct bpf_func_state *st)
{
@@ -14724,7 +15516,7 @@ next:
static bool regs_exact(const struct bpf_reg_state *rold,
const struct bpf_reg_state *rcur,
- struct bpf_id_pair *idmap)
+ struct bpf_idmap *idmap)
{
return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
check_ids(rold->id, rcur->id, idmap) &&
@@ -14733,7 +15525,7 @@ static bool regs_exact(const struct bpf_reg_state *rold,
/* Returns true if (rold safe implies rcur safe) */
static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
- struct bpf_reg_state *rcur, struct bpf_id_pair *idmap)
+ struct bpf_reg_state *rcur, struct bpf_idmap *idmap)
{
if (!(rold->live & REG_LIVE_READ))
/* explored state didn't use this */
@@ -14770,15 +15562,42 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
switch (base_type(rold->type)) {
case SCALAR_VALUE:
- if (regs_exact(rold, rcur, idmap))
- return true;
- if (env->explore_alu_limits)
- return false;
+ if (env->explore_alu_limits) {
+ /* explore_alu_limits disables tnum_in() and range_within()
+ * logic and requires everything to be strict
+ */
+ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
+ check_scalar_ids(rold->id, rcur->id, idmap);
+ }
if (!rold->precise)
return true;
- /* new val must satisfy old val knowledge */
+ /* Why check_ids() for scalar registers?
+ *
+ * Consider the following BPF code:
+ * 1: r6 = ... unbound scalar, ID=a ...
+ * 2: r7 = ... unbound scalar, ID=b ...
+ * 3: if (r6 > r7) goto +1
+ * 4: r6 = r7
+ * 5: if (r6 > X) goto ...
+ * 6: ... memory operation using r7 ...
+ *
+ * First verification path is [1-6]:
+ * - at (4) same bpf_reg_state::id (b) would be assigned to r6 and r7;
+ * - at (5) r6 would be marked <= X, find_equal_scalars() would also mark
+ * r7 <= X, because r6 and r7 share same id.
+ * Next verification path is [1-4, 6].
+ *
+ * Instruction (6) would be reached in two states:
+ * I. r6{.id=b}, r7{.id=b} via path 1-6;
+ * II. r6{.id=a}, r7{.id=b} via path 1-4, 6.
+ *
+ * Use check_ids() to distinguish these states.
+ * ---
+ * Also verify that new value satisfies old value range knowledge.
+ */
return range_within(rold, rcur) &&
- tnum_in(rold->var_off, rcur->var_off);
+ tnum_in(rold->var_off, rcur->var_off) &&
+ check_scalar_ids(rold->id, rcur->id, idmap);
case PTR_TO_MAP_KEY:
case PTR_TO_MAP_VALUE:
case PTR_TO_MEM:
@@ -14824,7 +15643,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
}
static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
- struct bpf_func_state *cur, struct bpf_id_pair *idmap)
+ struct bpf_func_state *cur, struct bpf_idmap *idmap)
{
int i, spi;
@@ -14927,7 +15746,7 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
}
static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur,
- struct bpf_id_pair *idmap)
+ struct bpf_idmap *idmap)
{
int i;
@@ -14975,13 +15794,13 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
for (i = 0; i < MAX_BPF_REG; i++)
if (!regsafe(env, &old->regs[i], &cur->regs[i],
- env->idmap_scratch))
+ &env->idmap_scratch))
return false;
- if (!stacksafe(env, old, cur, env->idmap_scratch))
+ if (!stacksafe(env, old, cur, &env->idmap_scratch))
return false;
- if (!refsafe(old, cur, env->idmap_scratch))
+ if (!refsafe(old, cur, &env->idmap_scratch))
return false;
return true;
@@ -14996,7 +15815,8 @@ static bool states_equal(struct bpf_verifier_env *env,
if (old->curframe != cur->curframe)
return false;
- memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch));
+ env->idmap_scratch.tmp_id_gen = env->id_gen;
+ memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map));
/* Verification state from speculative execution simulation
* must never prune a non-speculative execution one.
@@ -15014,7 +15834,7 @@ static bool states_equal(struct bpf_verifier_env *env,
return false;
if (old->active_lock.id &&
- !check_ids(old->active_lock.id, cur->active_lock.id, env->idmap_scratch))
+ !check_ids(old->active_lock.id, cur->active_lock.id, &env->idmap_scratch))
return false;
if (old->active_rcu_lock != cur->active_rcu_lock)
@@ -15121,20 +15941,25 @@ static int propagate_precision(struct bpf_verifier_env *env,
struct bpf_reg_state *state_reg;
struct bpf_func_state *state;
int i, err = 0, fr;
+ bool first;
for (fr = old->curframe; fr >= 0; fr--) {
state = old->frame[fr];
state_reg = state->regs;
+ first = true;
for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
if (state_reg->type != SCALAR_VALUE ||
!state_reg->precise ||
!(state_reg->live & REG_LIVE_READ))
continue;
- if (env->log.level & BPF_LOG_LEVEL2)
- verbose(env, "frame %d: propagating r%d\n", fr, i);
- err = mark_chain_precision_frame(env, fr, i);
- if (err < 0)
- return err;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ if (first)
+ verbose(env, "frame %d: propagating r%d", fr, i);
+ else
+ verbose(env, ",r%d", i);
+ }
+ bt_set_frame_reg(&env->bt, fr, i);
+ first = false;
}
for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
@@ -15145,14 +15970,24 @@ static int propagate_precision(struct bpf_verifier_env *env,
!state_reg->precise ||
!(state_reg->live & REG_LIVE_READ))
continue;
- if (env->log.level & BPF_LOG_LEVEL2)
- verbose(env, "frame %d: propagating fp%d\n",
- fr, (-i - 1) * BPF_REG_SIZE);
- err = mark_chain_precision_stack_frame(env, fr, i);
- if (err < 0)
- return err;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ if (first)
+ verbose(env, "frame %d: propagating fp%d",
+ fr, (-i - 1) * BPF_REG_SIZE);
+ else
+ verbose(env, ",fp%d", (-i - 1) * BPF_REG_SIZE);
+ }
+ bt_set_frame_slot(&env->bt, fr, i);
+ first = false;
}
+ if (!first)
+ verbose(env, "\n");
}
+
+ err = mark_chain_precision_batch(env);
+ if (err < 0)
+ return err;
+
return 0;
}
@@ -15582,7 +16417,7 @@ static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type typ
* Have to support a use case when one path through
* the program yields TRUSTED pointer while another
* is UNTRUSTED. Fallback to UNTRUSTED to generate
- * BPF_PROBE_MEM.
+ * BPF_PROBE_MEM/BPF_PROBE_MEMSX.
*/
*prev_type = PTR_TO_BTF_ID | PTR_UNTRUSTED;
} else {
@@ -15723,7 +16558,8 @@ static int do_check(struct bpf_verifier_env *env)
*/
err = check_mem_access(env, env->insn_idx, insn->src_reg,
insn->off, BPF_SIZE(insn->code),
- BPF_READ, insn->dst_reg, false);
+ BPF_READ, insn->dst_reg, false,
+ BPF_MODE(insn->code) == BPF_MEMSX);
if (err)
return err;
@@ -15760,7 +16596,7 @@ static int do_check(struct bpf_verifier_env *env)
/* check that memory (dst_reg + off) is writeable */
err = check_mem_access(env, env->insn_idx, insn->dst_reg,
insn->off, BPF_SIZE(insn->code),
- BPF_WRITE, insn->src_reg, false);
+ BPF_WRITE, insn->src_reg, false, false);
if (err)
return err;
@@ -15785,7 +16621,7 @@ static int do_check(struct bpf_verifier_env *env)
/* check that memory (dst_reg + off) is writeable */
err = check_mem_access(env, env->insn_idx, insn->dst_reg,
insn->off, BPF_SIZE(insn->code),
- BPF_WRITE, -1, false);
+ BPF_WRITE, -1, false, false);
if (err)
return err;
@@ -15830,15 +16666,18 @@ static int do_check(struct bpf_verifier_env *env)
mark_reg_scratched(env, BPF_REG_0);
} else if (opcode == BPF_JA) {
if (BPF_SRC(insn->code) != BPF_K ||
- insn->imm != 0 ||
insn->src_reg != BPF_REG_0 ||
insn->dst_reg != BPF_REG_0 ||
- class == BPF_JMP32) {
+ (class == BPF_JMP && insn->imm != 0) ||
+ (class == BPF_JMP32 && insn->off != 0)) {
verbose(env, "BPF_JA uses reserved fields\n");
return -EINVAL;
}
- env->insn_idx += insn->off + 1;
+ if (class == BPF_JMP)
+ env->insn_idx += insn->off + 1;
+ else
+ env->insn_idx += insn->imm + 1;
continue;
} else if (opcode == BPF_EXIT) {
@@ -15857,7 +16696,8 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
- if (env->cur_state->active_rcu_lock) {
+ if (env->cur_state->active_rcu_lock &&
+ !in_rbtree_lock_required_cb(env)) {
verbose(env, "bpf_rcu_read_unlock is missing\n");
return -EINVAL;
}
@@ -16137,11 +16977,6 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
verbose(env, "tracing progs cannot use bpf_spin_lock yet\n");
return -EINVAL;
}
-
- if (prog->aux->sleepable) {
- verbose(env, "sleepable progs cannot use bpf_spin_lock yet\n");
- return -EINVAL;
- }
}
if (btf_record_has_field(map->record, BPF_TIMER)) {
@@ -16213,7 +17048,8 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
for (i = 0; i < insn_cnt; i++, insn++) {
if (BPF_CLASS(insn->code) == BPF_LDX &&
- (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) {
+ ((BPF_MODE(insn->code) != BPF_MEM && BPF_MODE(insn->code) != BPF_MEMSX) ||
+ insn->imm != 0)) {
verbose(env, "BPF_LDX uses reserved fields\n");
return -EINVAL;
}
@@ -16684,13 +17520,13 @@ static bool insn_is_cond_jump(u8 code)
{
u8 op;
+ op = BPF_OP(code);
if (BPF_CLASS(code) == BPF_JMP32)
- return true;
+ return op != BPF_JA;
if (BPF_CLASS(code) != BPF_JMP)
return false;
- op = BPF_OP(code);
return op != BPF_JA && op != BPF_EXIT && op != BPF_CALL;
}
@@ -16907,11 +17743,15 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
for (i = 0; i < insn_cnt; i++, insn++) {
bpf_convert_ctx_access_t convert_ctx_access;
+ u8 mode;
if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) ||
insn->code == (BPF_LDX | BPF_MEM | BPF_H) ||
insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
- insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) {
+ insn->code == (BPF_LDX | BPF_MEM | BPF_DW) ||
+ insn->code == (BPF_LDX | BPF_MEMSX | BPF_B) ||
+ insn->code == (BPF_LDX | BPF_MEMSX | BPF_H) ||
+ insn->code == (BPF_LDX | BPF_MEMSX | BPF_W)) {
type = BPF_READ;
} else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) ||
insn->code == (BPF_STX | BPF_MEM | BPF_H) ||
@@ -16970,8 +17810,12 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
*/
case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED:
if (type == BPF_READ) {
- insn->code = BPF_LDX | BPF_PROBE_MEM |
- BPF_SIZE((insn)->code);
+ if (BPF_MODE(insn->code) == BPF_MEM)
+ insn->code = BPF_LDX | BPF_PROBE_MEM |
+ BPF_SIZE((insn)->code);
+ else
+ insn->code = BPF_LDX | BPF_PROBE_MEMSX |
+ BPF_SIZE((insn)->code);
env->prog->aux->num_exentries++;
}
continue;
@@ -16981,6 +17825,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size;
size = BPF_LDST_BYTES(insn);
+ mode = BPF_MODE(insn->code);
/* If the read access is a narrower load of the field,
* convert to a 4/8-byte load, to minimum program type specific
@@ -17040,6 +17885,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
(1ULL << size * 8) - 1);
}
}
+ if (mode == BPF_MEMSX)
+ insn_buf[cnt++] = BPF_RAW_INSN(BPF_ALU64 | BPF_MOV | BPF_X,
+ insn->dst_reg, insn->dst_reg,
+ size * 8, 0);
new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
if (!new_prog)
@@ -17159,7 +18008,8 @@ static int jit_subprogs(struct bpf_verifier_env *env)
insn = func[i]->insnsi;
for (j = 0; j < func[i]->len; j++, insn++) {
if (BPF_CLASS(insn->code) == BPF_LDX &&
- BPF_MODE(insn->code) == BPF_PROBE_MEM)
+ (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
+ BPF_MODE(insn->code) == BPF_PROBE_MEMSX))
num_exentries++;
}
func[i]->aux->num_exentries = num_exentries;
@@ -17431,6 +18281,13 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
+ if (desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] &&
+ !kptr_struct_meta) {
+ verbose(env, "verifier internal error: kptr_struct_meta expected at insn_idx %d\n",
+ insn_idx);
+ return -EFAULT;
+ }
+
insn_buf[0] = addr[0];
insn_buf[1] = addr[1];
insn_buf[2] = *insn;
@@ -17438,6 +18295,7 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
} else if (desc->func_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
desc->func_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
int struct_meta_reg = BPF_REG_3;
int node_offset_reg = BPF_REG_4;
@@ -17447,6 +18305,12 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
node_offset_reg = BPF_REG_5;
}
+ if (!kptr_struct_meta) {
+ verbose(env, "verifier internal error: kptr_struct_meta expected at insn_idx %d\n",
+ insn_idx);
+ return -EFAULT;
+ }
+
__fixup_collection_insert_kfunc(&env->insn_aux_data[insn_idx], struct_meta_reg,
node_offset_reg, insn, insn_buf, cnt);
} else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
@@ -18617,7 +19481,8 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
* in the fmodret id set with the KF_SLEEPABLE flag.
*/
else {
- u32 *flags = btf_kfunc_is_modify_return(btf, btf_id);
+ u32 *flags = btf_kfunc_is_modify_return(btf, btf_id,
+ prog);
if (flags && (*flags & KF_SLEEPABLE))
ret = 0;
@@ -18645,7 +19510,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
return -EINVAL;
}
ret = -EINVAL;
- if (btf_kfunc_is_modify_return(btf, btf_id) ||
+ if (btf_kfunc_is_modify_return(btf, btf_id, prog) ||
!check_attach_modify_return(addr, tname))
ret = 0;
if (ret) {
@@ -18812,6 +19677,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
if (!env)
return -ENOMEM;
+ env->bt.env = env;
+
len = (*prog)->len;
env->insn_aux_data =
vzalloc(array_size(sizeof(struct bpf_insn_aux_data), len));