aboutsummaryrefslogtreecommitdiff
path: root/kernel/bpf
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/bpf')
-rw-r--r--kernel/bpf/Makefile2
-rw-r--r--kernel/bpf/arraymap.c37
-rw-r--r--kernel/bpf/bpf_cgrp_storage.c6
-rw-r--r--kernel/bpf/bpf_lsm.c27
-rw-r--r--kernel/bpf/bpf_struct_ops.c35
-rw-r--r--kernel/bpf/btf.c11
-rw-r--r--kernel/bpf/cgroup.c6
-rw-r--r--kernel/bpf/core.c53
-rw-r--r--kernel/bpf/cpumask.c20
-rw-r--r--kernel/bpf/dispatcher.c7
-rw-r--r--kernel/bpf/hashtab.c12
-rw-r--r--kernel/bpf/helpers.c38
-rw-r--r--kernel/bpf/inode.c326
-rw-r--r--kernel/bpf/log.c42
-rw-r--r--kernel/bpf/map_in_map.c17
-rw-r--r--kernel/bpf/map_in_map.h2
-rw-r--r--kernel/bpf/syscall.c294
-rw-r--r--kernel/bpf/tnum.c6
-rw-r--r--kernel/bpf/token.c271
-rw-r--r--kernel/bpf/trampoline.c101
-rw-r--r--kernel/bpf/verifier.c558
21 files changed, 1416 insertions, 455 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index f526b7573e97..4ce95acfcaa7 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
endif
CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index c85ff9162a5c..13358675ff2e 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -82,7 +82,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
int numa_node = bpf_map_attr_numa_node(attr);
u32 elem_size, index_mask, max_entries;
- bool bypass_spec_v1 = bpf_bypass_spec_v1();
+ bool bypass_spec_v1 = bpf_bypass_spec_v1(NULL);
u64 array_size, mask64;
struct bpf_array *array;
@@ -867,11 +867,11 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
}
if (old_ptr)
- map->ops->map_fd_put_ptr(old_ptr);
+ map->ops->map_fd_put_ptr(map, old_ptr, true);
return 0;
}
-static long fd_array_map_delete_elem(struct bpf_map *map, void *key)
+static long __fd_array_map_delete_elem(struct bpf_map *map, void *key, bool need_defer)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
void *old_ptr;
@@ -890,13 +890,18 @@ static long fd_array_map_delete_elem(struct bpf_map *map, void *key)
}
if (old_ptr) {
- map->ops->map_fd_put_ptr(old_ptr);
+ map->ops->map_fd_put_ptr(map, old_ptr, need_defer);
return 0;
} else {
return -ENOENT;
}
}
+static long fd_array_map_delete_elem(struct bpf_map *map, void *key)
+{
+ return __fd_array_map_delete_elem(map, key, true);
+}
+
static void *prog_fd_array_get_ptr(struct bpf_map *map,
struct file *map_file, int fd)
{
@@ -913,8 +918,9 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map,
return prog;
}
-static void prog_fd_array_put_ptr(void *ptr)
+static void prog_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
{
+ /* bpf_prog is freed after one RCU or tasks trace grace period */
bpf_prog_put(ptr);
}
@@ -924,13 +930,13 @@ static u32 prog_fd_array_sys_lookup_elem(void *ptr)
}
/* decrement refcnt of all bpf_progs that are stored in this map */
-static void bpf_fd_array_map_clear(struct bpf_map *map)
+static void bpf_fd_array_map_clear(struct bpf_map *map, bool need_defer)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
for (i = 0; i < array->map.max_entries; i++)
- fd_array_map_delete_elem(map, &i);
+ __fd_array_map_delete_elem(map, &i, need_defer);
}
static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
@@ -1071,7 +1077,7 @@ static void prog_array_map_clear_deferred(struct work_struct *work)
{
struct bpf_map *map = container_of(work, struct bpf_array_aux,
work)->map;
- bpf_fd_array_map_clear(map);
+ bpf_fd_array_map_clear(map, true);
bpf_map_put(map);
}
@@ -1151,7 +1157,7 @@ static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
{
struct bpf_event_entry *ee;
- ee = kzalloc(sizeof(*ee), GFP_ATOMIC);
+ ee = kzalloc(sizeof(*ee), GFP_KERNEL);
if (ee) {
ee->event = perf_file->private_data;
ee->perf_file = perf_file;
@@ -1201,8 +1207,9 @@ err_out:
return ee;
}
-static void perf_event_fd_array_put_ptr(void *ptr)
+static void perf_event_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
{
+ /* bpf_perf_event is freed after one RCU grace period */
bpf_event_entry_free_rcu(ptr);
}
@@ -1220,7 +1227,7 @@ static void perf_event_fd_array_release(struct bpf_map *map,
for (i = 0; i < array->map.max_entries; i++) {
ee = READ_ONCE(array->ptrs[i]);
if (ee && ee->map_file == map_file)
- fd_array_map_delete_elem(map, &i);
+ __fd_array_map_delete_elem(map, &i, true);
}
rcu_read_unlock();
}
@@ -1228,7 +1235,7 @@ static void perf_event_fd_array_release(struct bpf_map *map,
static void perf_event_fd_array_map_free(struct bpf_map *map)
{
if (map->map_flags & BPF_F_PRESERVE_ELEMS)
- bpf_fd_array_map_clear(map);
+ bpf_fd_array_map_clear(map, false);
fd_array_map_free(map);
}
@@ -1256,7 +1263,7 @@ static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
return cgroup_get_from_fd(fd);
}
-static void cgroup_fd_array_put_ptr(void *ptr)
+static void cgroup_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
{
/* cgroup_put free cgrp after a rcu grace period */
cgroup_put(ptr);
@@ -1264,7 +1271,7 @@ static void cgroup_fd_array_put_ptr(void *ptr)
static void cgroup_fd_array_free(struct bpf_map *map)
{
- bpf_fd_array_map_clear(map);
+ bpf_fd_array_map_clear(map, false);
fd_array_map_free(map);
}
@@ -1309,7 +1316,7 @@ static void array_of_map_free(struct bpf_map *map)
* is protected by fdget/fdput.
*/
bpf_map_meta_free(map->inner_map_meta);
- bpf_fd_array_map_clear(map);
+ bpf_fd_array_map_clear(map, false);
fd_array_map_free(map);
}
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index d44fe8dd9732..28efd0a3f220 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -82,7 +82,7 @@ static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key)
int fd;
fd = *(int *)key;
- cgroup = cgroup_get_from_fd(fd);
+ cgroup = cgroup_v1v2_get_from_fd(fd);
if (IS_ERR(cgroup))
return ERR_CAST(cgroup);
@@ -101,7 +101,7 @@ static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key,
int fd;
fd = *(int *)key;
- cgroup = cgroup_get_from_fd(fd);
+ cgroup = cgroup_v1v2_get_from_fd(fd);
if (IS_ERR(cgroup))
return PTR_ERR(cgroup);
@@ -131,7 +131,7 @@ static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key)
int err, fd;
fd = *(int *)key;
- cgroup = cgroup_get_from_fd(fd);
+ cgroup = cgroup_v1v2_get_from_fd(fd);
if (IS_ERR(cgroup))
return PTR_ERR(cgroup);
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index e14c822f8911..63b4dc495125 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -260,9 +260,15 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
BTF_SET_START(sleepable_lsm_hooks)
BTF_ID(func, bpf_lsm_bpf)
BTF_ID(func, bpf_lsm_bpf_map)
-BTF_ID(func, bpf_lsm_bpf_map_alloc_security)
-BTF_ID(func, bpf_lsm_bpf_map_free_security)
+BTF_ID(func, bpf_lsm_bpf_map_create)
+BTF_ID(func, bpf_lsm_bpf_map_free)
BTF_ID(func, bpf_lsm_bpf_prog)
+BTF_ID(func, bpf_lsm_bpf_prog_load)
+BTF_ID(func, bpf_lsm_bpf_prog_free)
+BTF_ID(func, bpf_lsm_bpf_token_create)
+BTF_ID(func, bpf_lsm_bpf_token_free)
+BTF_ID(func, bpf_lsm_bpf_token_cmd)
+BTF_ID(func, bpf_lsm_bpf_token_capable)
BTF_ID(func, bpf_lsm_bprm_check_security)
BTF_ID(func, bpf_lsm_bprm_committed_creds)
BTF_ID(func, bpf_lsm_bprm_committing_creds)
@@ -298,6 +304,18 @@ BTF_ID(func, bpf_lsm_kernel_module_request)
BTF_ID(func, bpf_lsm_kernel_read_file)
BTF_ID(func, bpf_lsm_kernfs_init_security)
+#ifdef CONFIG_SECURITY_PATH
+BTF_ID(func, bpf_lsm_path_unlink)
+BTF_ID(func, bpf_lsm_path_mkdir)
+BTF_ID(func, bpf_lsm_path_rmdir)
+BTF_ID(func, bpf_lsm_path_truncate)
+BTF_ID(func, bpf_lsm_path_symlink)
+BTF_ID(func, bpf_lsm_path_link)
+BTF_ID(func, bpf_lsm_path_rename)
+BTF_ID(func, bpf_lsm_path_chmod)
+BTF_ID(func, bpf_lsm_path_chown)
+#endif /* CONFIG_SECURITY_PATH */
+
#ifdef CONFIG_KEYS
BTF_ID(func, bpf_lsm_key_free)
#endif /* CONFIG_KEYS */
@@ -345,9 +363,8 @@ BTF_ID(func, bpf_lsm_userns_create)
BTF_SET_END(sleepable_lsm_hooks)
BTF_SET_START(untrusted_lsm_hooks)
-BTF_ID(func, bpf_lsm_bpf_map_free_security)
-BTF_ID(func, bpf_lsm_bpf_prog_alloc_security)
-BTF_ID(func, bpf_lsm_bpf_prog_free_security)
+BTF_ID(func, bpf_lsm_bpf_map_free)
+BTF_ID(func, bpf_lsm_bpf_prog_free)
BTF_ID(func, bpf_lsm_file_alloc_security)
BTF_ID(func, bpf_lsm_file_free_security)
#ifdef CONFIG_SECURITY_NETWORK
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index db6176fb64dc..02068bd0e4d9 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -352,18 +352,24 @@ const struct bpf_link_ops bpf_struct_ops_link_lops = {
int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
struct bpf_tramp_link *link,
const struct btf_func_model *model,
- void *image, void *image_end)
+ void *stub_func, void *image, void *image_end)
{
- u32 flags;
+ u32 flags = BPF_TRAMP_F_INDIRECT;
+ int size;
tlinks[BPF_TRAMP_FENTRY].links[0] = link;
tlinks[BPF_TRAMP_FENTRY].nr_links = 1;
- /* BPF_TRAMP_F_RET_FENTRY_RET is only used by bpf_struct_ops,
- * and it must be used alone.
- */
- flags = model->ret_size > 0 ? BPF_TRAMP_F_RET_FENTRY_RET : 0;
+
+ if (model->ret_size > 0)
+ flags |= BPF_TRAMP_F_RET_FENTRY_RET;
+
+ size = arch_bpf_trampoline_size(model, flags, tlinks, NULL);
+ if (size < 0)
+ return size;
+ if (size > (unsigned long)image_end - (unsigned long)image)
+ return -E2BIG;
return arch_prepare_bpf_trampoline(NULL, image, image_end,
- model, flags, tlinks, NULL);
+ model, flags, tlinks, stub_func);
}
static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
@@ -497,11 +503,12 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
err = bpf_struct_ops_prepare_trampoline(tlinks, link,
&st_ops->func_models[i],
+ *(void **)(st_ops->cfi_stubs + moff),
image, image_end);
if (err < 0)
goto reset_unlock;
- *(void **)(kdata + moff) = image;
+ *(void **)(kdata + moff) = image + cfi_get_offset();
image += err;
/* put prog_id to udata */
@@ -515,7 +522,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
if (err)
goto reset_unlock;
}
- set_memory_rox((long)st_map->image, 1);
+ arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
/* Let bpf_link handle registration & unregistration.
*
* Pair with smp_load_acquire() during lookup_elem().
@@ -524,7 +531,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
goto unlock;
}
- set_memory_rox((long)st_map->image, 1);
+ arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
err = st_ops->reg(kdata);
if (likely(!err)) {
/* This refcnt increment on the map here after
@@ -547,8 +554,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
* there was a race in registering the struct_ops (under the same name) to
* a sub-system through different struct_ops's maps.
*/
- set_memory_nx((long)st_map->image, 1);
- set_memory_rw((long)st_map->image, 1);
+ arch_unprotect_bpf_trampoline(st_map->image, PAGE_SIZE);
reset_unlock:
bpf_struct_ops_map_put_progs(st_map);
@@ -616,7 +622,7 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map)
bpf_struct_ops_map_put_progs(st_map);
bpf_map_area_free(st_map->links);
if (st_map->image) {
- bpf_jit_free_exec(st_map->image);
+ arch_free_bpf_trampoline(st_map->image, PAGE_SIZE);
bpf_jit_uncharge_modmem(PAGE_SIZE);
}
bpf_map_area_free(st_map->uvalue);
@@ -691,7 +697,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
return ERR_PTR(ret);
}
- st_map->image = bpf_jit_alloc_exec(PAGE_SIZE);
+ st_map->image = arch_alloc_bpf_trampoline(PAGE_SIZE);
if (!st_map->image) {
/* __bpf_struct_ops_map_free() uses st_map->image as flag
* for "charged or not". In this case, we need to unchange
@@ -711,7 +717,6 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
}
mutex_init(&st_map->lock);
- set_vm_flush_reset_perms(st_map->image);
bpf_map_init_from_attr(map, attr);
return map;
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 63cf4128fc05..d56433bf8aba 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6956,7 +6956,7 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
* (either PTR_TO_CTX or SCALAR_VALUE).
*/
int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
- struct bpf_reg_state *regs, bool is_ex_cb)
+ struct bpf_reg_state *regs, u32 *arg_cnt)
{
struct bpf_verifier_log *log = &env->log;
struct bpf_prog *prog = env->prog;
@@ -7013,6 +7013,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
tname, nargs, MAX_BPF_FUNC_REG_ARGS);
return -EINVAL;
}
+ *arg_cnt = nargs;
/* check that function returns int, exception cb also requires this */
t = btf_type_by_id(btf, t->type);
while (btf_type_is_modifier(t))
@@ -7062,14 +7063,6 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
i, btf_type_str(t), tname);
return -EINVAL;
}
- /* We have already ensured that the callback returns an integer, just
- * like all global subprogs. We need to determine it only has a single
- * scalar argument.
- */
- if (is_ex_cb && (nargs != 1 || regs[BPF_REG_1].type != SCALAR_VALUE)) {
- bpf_log(log, "exception cb only supports single integer argument\n");
- return -EINVAL;
- }
return 0;
}
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 491d20038cbe..98e0e3835b28 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1630,7 +1630,7 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_perf_event_output:
return &bpf_event_output_data_proto;
default:
- return bpf_base_func_proto(func_id);
+ return bpf_base_func_proto(func_id, prog);
}
}
@@ -2191,7 +2191,7 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_perf_event_output:
return &bpf_event_output_data_proto;
default:
- return bpf_base_func_proto(func_id);
+ return bpf_base_func_proto(func_id, prog);
}
}
@@ -2348,7 +2348,7 @@ cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_perf_event_output:
return &bpf_event_output_data_proto;
default:
- return bpf_base_func_proto(func_id);
+ return bpf_base_func_proto(func_id, prog);
}
}
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index fe254ae035fe..14ace23d517b 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -121,6 +121,9 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
#endif
INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode);
+#ifdef CONFIG_FINEIBT
+ INIT_LIST_HEAD_RCU(&fp->aux->ksym_prefix.lnode);
+#endif
mutex_init(&fp->aux->used_maps_mutex);
mutex_init(&fp->aux->dst_mutex);
@@ -679,7 +682,7 @@ static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp)
void bpf_prog_kallsyms_add(struct bpf_prog *fp)
{
if (!bpf_prog_kallsyms_candidate(fp) ||
- !bpf_capable())
+ !bpf_token_capable(fp->aux->token, CAP_BPF))
return;
bpf_prog_ksym_set_addr(fp);
@@ -687,6 +690,23 @@ void bpf_prog_kallsyms_add(struct bpf_prog *fp)
fp->aux->ksym.prog = true;
bpf_ksym_add(&fp->aux->ksym);
+
+#ifdef CONFIG_FINEIBT
+ /*
+ * When FineIBT, code in the __cfi_foo() symbols can get executed
+ * and hence unwinder needs help.
+ */
+ if (cfi_mode != CFI_FINEIBT)
+ return;
+
+ snprintf(fp->aux->ksym_prefix.name, KSYM_NAME_LEN,
+ "__cfi_%s", fp->aux->ksym.name);
+
+ fp->aux->ksym_prefix.start = (unsigned long) fp->bpf_func - 16;
+ fp->aux->ksym_prefix.end = (unsigned long) fp->bpf_func;
+
+ bpf_ksym_add(&fp->aux->ksym_prefix);
+#endif
}
void bpf_prog_kallsyms_del(struct bpf_prog *fp)
@@ -695,6 +715,11 @@ void bpf_prog_kallsyms_del(struct bpf_prog *fp)
return;
bpf_ksym_del(&fp->aux->ksym);
+#ifdef CONFIG_FINEIBT
+ if (cfi_mode != CFI_FINEIBT)
+ return;
+ bpf_ksym_del(&fp->aux->ksym_prefix);
+#endif
}
static struct bpf_ksym *bpf_ksym_find(unsigned long addr)
@@ -932,20 +957,20 @@ out:
return ptr;
}
-void bpf_prog_pack_free(struct bpf_binary_header *hdr)
+void bpf_prog_pack_free(void *ptr, u32 size)
{
struct bpf_prog_pack *pack = NULL, *tmp;
unsigned int nbits;
unsigned long pos;
mutex_lock(&pack_mutex);
- if (hdr->size > BPF_PROG_PACK_SIZE) {
- bpf_jit_free_exec(hdr);
+ if (size > BPF_PROG_PACK_SIZE) {
+ bpf_jit_free_exec(ptr);
goto out;
}
list_for_each_entry(tmp, &pack_list, list) {
- if ((void *)hdr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > (void *)hdr) {
+ if (ptr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > ptr) {
pack = tmp;
break;
}
@@ -954,10 +979,10 @@ void bpf_prog_pack_free(struct bpf_binary_header *hdr)
if (WARN_ONCE(!pack, "bpf_prog_pack bug\n"))
goto out;
- nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size);
- pos = ((unsigned long)hdr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT;
+ nbits = BPF_PROG_SIZE_TO_NBITS(size);
+ pos = ((unsigned long)ptr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT;
- WARN_ONCE(bpf_arch_text_invalidate(hdr, hdr->size),
+ WARN_ONCE(bpf_arch_text_invalidate(ptr, size),
"bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n");
bitmap_clear(pack->bitmap, pos, nbits);
@@ -1104,8 +1129,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
*rw_header = kvmalloc(size, GFP_KERNEL);
if (!*rw_header) {
- bpf_arch_text_copy(&ro_header->size, &size, sizeof(size));
- bpf_prog_pack_free(ro_header);
+ bpf_prog_pack_free(ro_header, size);
bpf_jit_uncharge_modmem(size);
return NULL;
}
@@ -1136,7 +1160,7 @@ int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
kvfree(rw_header);
if (IS_ERR(ptr)) {
- bpf_prog_pack_free(ro_header);
+ bpf_prog_pack_free(ro_header, ro_header->size);
return PTR_ERR(ptr);
}
return 0;
@@ -1157,7 +1181,7 @@ void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
{
u32 size = ro_header->size;
- bpf_prog_pack_free(ro_header);
+ bpf_prog_pack_free(ro_header, size);
kvfree(rw_header);
bpf_jit_uncharge_modmem(size);
}
@@ -2668,12 +2692,16 @@ void __bpf_free_used_maps(struct bpf_prog_aux *aux,
struct bpf_map **used_maps, u32 len)
{
struct bpf_map *map;
+ bool sleepable;
u32 i;
+ sleepable = aux->sleepable;
for (i = 0; i < len; i++) {
map = used_maps[i];
if (map->ops->map_poke_untrack)
map->ops->map_poke_untrack(map, aux);
+ if (sleepable)
+ atomic64_dec(&map->sleepable_refcnt);
bpf_map_put(map);
}
}
@@ -2751,6 +2779,7 @@ void bpf_prog_free(struct bpf_prog *fp)
if (aux->dst_prog)
bpf_prog_put(aux->dst_prog);
+ bpf_token_put(aux->token);
INIT_WORK(&aux->work, bpf_prog_free_deferred);
schedule_work(&aux->work);
}
diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
index e01c741e54e7..2e73533a3811 100644
--- a/kernel/bpf/cpumask.c
+++ b/kernel/bpf/cpumask.c
@@ -96,6 +96,12 @@ __bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask)
migrate_enable();
}
+__bpf_kfunc void bpf_cpumask_release_dtor(void *cpumask)
+{
+ bpf_cpumask_release(cpumask);
+}
+CFI_NOSEAL(bpf_cpumask_release_dtor);
+
/**
* bpf_cpumask_first() - Get the index of the first nonzero bit in the cpumask.
* @cpumask: The cpumask being queried.
@@ -405,6 +411,17 @@ __bpf_kfunc u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1,
return cpumask_any_and_distribute(src1, src2);
}
+/**
+ * bpf_cpumask_weight() - Return the number of bits in @cpumask.
+ * @cpumask: The cpumask being queried.
+ *
+ * Count the number of set bits in the given cpumask.
+ */
+__bpf_kfunc u32 bpf_cpumask_weight(const struct cpumask *cpumask)
+{
+ return cpumask_weight(cpumask);
+}
+
__bpf_kfunc_end_defs();
BTF_SET8_START(cpumask_kfunc_btf_ids)
@@ -432,6 +449,7 @@ BTF_ID_FLAGS(func, bpf_cpumask_full, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_weight, KF_RCU)
BTF_SET8_END(cpumask_kfunc_btf_ids)
static const struct btf_kfunc_id_set cpumask_kfunc_set = {
@@ -441,7 +459,7 @@ static const struct btf_kfunc_id_set cpumask_kfunc_set = {
BTF_ID_LIST(cpumask_dtor_ids)
BTF_ID(struct, bpf_cpumask)
-BTF_ID(func, bpf_cpumask_release)
+BTF_ID(func, bpf_cpumask_release_dtor)
static int __init cpumask_kfunc_init(void)
{
diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c
index fa3e9225aedc..70fb82bf1637 100644
--- a/kernel/bpf/dispatcher.c
+++ b/kernel/bpf/dispatcher.c
@@ -150,14 +150,11 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
goto out;
d->rw_image = bpf_jit_alloc_exec(PAGE_SIZE);
if (!d->rw_image) {
- u32 size = PAGE_SIZE;
-
- bpf_arch_text_copy(d->image, &size, sizeof(size));
- bpf_prog_pack_free((struct bpf_binary_header *)d->image);
+ bpf_prog_pack_free(d->image, PAGE_SIZE);
d->image = NULL;
goto out;
}
- bpf_image_ksym_add(d->image, &d->ksym);
+ bpf_image_ksym_add(d->image, PAGE_SIZE, &d->ksym);
}
prev_num_progs = d->num_progs;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index fd8d4b0addfc..ec3bdcc6a3cf 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -897,7 +897,7 @@ static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
if (map->ops->map_fd_put_ptr) {
ptr = fd_htab_map_get_ptr(map, l);
- map->ops->map_fd_put_ptr(ptr);
+ map->ops->map_fd_put_ptr(map, ptr, true);
}
}
@@ -2484,7 +2484,7 @@ static void fd_htab_map_free(struct bpf_map *map)
hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
void *ptr = fd_htab_map_get_ptr(map, l);
- map->ops->map_fd_put_ptr(ptr);
+ map->ops->map_fd_put_ptr(map, ptr, false);
}
}
@@ -2523,9 +2523,15 @@ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
if (IS_ERR(ptr))
return PTR_ERR(ptr);
+ /* The htab bucket lock is always held during update operations in fd
+ * htab map, and the following rcu_read_lock() is only used to avoid
+ * the WARN_ON_ONCE in htab_map_update_elem().
+ */
+ rcu_read_lock();
ret = htab_map_update_elem(map, key, &ptr, map_flags);
+ rcu_read_unlock();
if (ret)
- map->ops->map_fd_put_ptr(ptr);
+ map->ops->map_fd_put_ptr(map, ptr, false);
return ret;
}
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index b45a8381f9bd..07fd4b5704f3 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -32,12 +32,13 @@
*
* Different map implementations will rely on rcu in map methods
* lookup/update/delete, therefore eBPF programs must run under rcu lock
- * if program is allowed to access maps, so check rcu_read_lock_held in
- * all three functions.
+ * if program is allowed to access maps, so check rcu_read_lock_held() or
+ * rcu_read_lock_trace_held() in all three functions.
*/
BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key)
{
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
+ !rcu_read_lock_bh_held());
return (unsigned long) map->ops->map_lookup_elem(map, key);
}
@@ -53,7 +54,8 @@ const struct bpf_func_proto bpf_map_lookup_elem_proto = {
BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key,
void *, value, u64, flags)
{
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
+ !rcu_read_lock_bh_held());
return map->ops->map_update_elem(map, key, value, flags);
}
@@ -70,7 +72,8 @@ const struct bpf_func_proto bpf_map_update_elem_proto = {
BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key)
{
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
+ !rcu_read_lock_bh_held());
return map->ops->map_delete_elem(map, key);
}
@@ -1676,7 +1679,7 @@ const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak;
const struct bpf_func_proto bpf_task_pt_regs_proto __weak;
const struct bpf_func_proto *
-bpf_base_func_proto(enum bpf_func_id func_id)
+bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_map_lookup_elem:
@@ -1727,7 +1730,7 @@ bpf_base_func_proto(enum bpf_func_id func_id)
break;
}
- if (!bpf_capable())
+ if (!bpf_token_capable(prog->aux->token, CAP_BPF))
return NULL;
switch (func_id) {
@@ -1785,7 +1788,7 @@ bpf_base_func_proto(enum bpf_func_id func_id)
break;
}
- if (!perfmon_capable())
+ if (!bpf_token_capable(prog->aux->token, CAP_PERFMON))
return NULL;
switch (func_id) {
@@ -2147,6 +2150,12 @@ __bpf_kfunc void bpf_task_release(struct task_struct *p)
put_task_struct_rcu_user(p);
}
+__bpf_kfunc void bpf_task_release_dtor(void *p)
+{
+ put_task_struct_rcu_user(p);
+}
+CFI_NOSEAL(bpf_task_release_dtor);
+
#ifdef CONFIG_CGROUPS
/**
* bpf_cgroup_acquire - Acquire a reference to a cgroup. A cgroup acquired by
@@ -2171,6 +2180,12 @@ __bpf_kfunc void bpf_cgroup_release(struct cgroup *cgrp)
cgroup_put(cgrp);
}
+__bpf_kfunc void bpf_cgroup_release_dtor(void *cgrp)
+{
+ cgroup_put(cgrp);
+}
+CFI_NOSEAL(bpf_cgroup_release_dtor);
+
/**
* bpf_cgroup_ancestor - Perform a lookup on an entry in a cgroup's ancestor
* array. A cgroup returned by this kfunc which is not subsequently stored in a
@@ -2522,7 +2537,7 @@ __bpf_kfunc void bpf_throw(u64 cookie)
* which skips compiler generated instrumentation to do the same.
*/
kasan_unpoison_task_stack_below((void *)(long)ctx.sp);
- ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp);
+ ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp, 0, 0);
WARN(1, "A call to BPF exception callback should never return\n");
}
@@ -2567,10 +2582,10 @@ static const struct btf_kfunc_id_set generic_kfunc_set = {
BTF_ID_LIST(generic_dtor_ids)
BTF_ID(struct, task_struct)
-BTF_ID(func, bpf_task_release)
+BTF_ID(func, bpf_task_release_dtor)
#ifdef CONFIG_CGROUPS
BTF_ID(struct, cgroup)
-BTF_ID(func, bpf_cgroup_release)
+BTF_ID(func, bpf_cgroup_release_dtor)
#endif
BTF_SET8_START(common_btf_ids)
@@ -2627,6 +2642,7 @@ static int __init kfunc_init(void)
ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &generic_kfunc_set);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &generic_kfunc_set);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set);
ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors,
ARRAY_SIZE(generic_dtors),
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 1aafb2ff2e95..4383b3d13a55 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -20,6 +20,7 @@
#include <linux/filter.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
+#include <linux/kstrtox.h>
#include "preload/bpf_preload.h"
enum bpf_type {
@@ -98,9 +99,9 @@ static const struct inode_operations bpf_prog_iops = { };
static const struct inode_operations bpf_map_iops = { };
static const struct inode_operations bpf_link_iops = { };
-static struct inode *bpf_get_inode(struct super_block *sb,
- const struct inode *dir,
- umode_t mode)
+struct inode *bpf_get_inode(struct super_block *sb,
+ const struct inode *dir,
+ umode_t mode)
{
struct inode *inode;
@@ -594,15 +595,183 @@ struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type typ
}
EXPORT_SYMBOL(bpf_prog_get_type_path);
+struct bpffs_btf_enums {
+ const struct btf *btf;
+ const struct btf_type *cmd_t;
+ const struct btf_type *map_t;
+ const struct btf_type *prog_t;
+ const struct btf_type *attach_t;
+};
+
+static int find_bpffs_btf_enums(struct bpffs_btf_enums *info)
+{
+ const struct btf *btf;
+ const struct btf_type *t;
+ const char *name;
+ int i, n;
+
+ memset(info, 0, sizeof(*info));
+
+ btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+ if (!btf)
+ return -ENOENT;
+
+ info->btf = btf;
+
+ for (i = 1, n = btf_nr_types(btf); i < n; i++) {
+ t = btf_type_by_id(btf, i);
+ if (!btf_type_is_enum(t))
+ continue;
+
+ name = btf_name_by_offset(btf, t->name_off);
+ if (!name)
+ continue;
+
+ if (strcmp(name, "bpf_cmd") == 0)
+ info->cmd_t = t;
+ else if (strcmp(name, "bpf_map_type") == 0)
+ info->map_t = t;
+ else if (strcmp(name, "bpf_prog_type") == 0)
+ info->prog_t = t;
+ else if (strcmp(name, "bpf_attach_type") == 0)
+ info->attach_t = t;
+ else
+ continue;
+
+ if (info->cmd_t && info->map_t && info->prog_t && info->attach_t)
+ return 0;
+ }
+
+ return -ESRCH;
+}
+
+static bool find_btf_enum_const(const struct btf *btf, const struct btf_type *enum_t,
+ const char *prefix, const char *str, int *value)
+{
+ const struct btf_enum *e;
+ const char *name;
+ int i, n, pfx_len = strlen(prefix);
+
+ *value = 0;
+
+ if (!btf || !enum_t)
+ return false;
+
+ for (i = 0, n = btf_vlen(enum_t); i < n; i++) {
+ e = &btf_enum(enum_t)[i];
+
+ name = btf_name_by_offset(btf, e->name_off);
+ if (!name || strncasecmp(name, prefix, pfx_len) != 0)
+ continue;
+
+ /* match symbolic name case insensitive and ignoring prefix */
+ if (strcasecmp(name + pfx_len, str) == 0) {
+ *value = e->val;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static void seq_print_delegate_opts(struct seq_file *m,
+ const char *opt_name,
+ const struct btf *btf,
+ const struct btf_type *enum_t,
+ const char *prefix,
+ u64 delegate_msk, u64 any_msk)
+{
+ const struct btf_enum *e;
+ bool first = true;
+ const char *name;
+ u64 msk;
+ int i, n, pfx_len = strlen(prefix);
+
+ delegate_msk &= any_msk; /* clear unknown bits */
+
+ if (delegate_msk == 0)
+ return;
+
+ seq_printf(m, ",%s", opt_name);
+ if (delegate_msk == any_msk) {
+ seq_printf(m, "=any");
+ return;
+ }
+
+ if (btf && enum_t) {
+ for (i = 0, n = btf_vlen(enum_t); i < n; i++) {
+ e = &btf_enum(enum_t)[i];
+ name = btf_name_by_offset(btf, e->name_off);
+ if (!name || strncasecmp(name, prefix, pfx_len) != 0)
+ continue;
+ msk = 1ULL << e->val;
+ if (delegate_msk & msk) {
+ /* emit lower-case name without prefix */
+ seq_printf(m, "%c", first ? '=' : ':');
+ name += pfx_len;
+ while (*name) {
+ seq_printf(m, "%c", tolower(*name));
+ name++;
+ }
+
+ delegate_msk &= ~msk;
+ first = false;
+ }
+ }
+ }
+ if (delegate_msk)
+ seq_printf(m, "%c0x%llx", first ? '=' : ':', delegate_msk);
+}
+
/*
* Display the mount options in /proc/mounts.
*/
static int bpf_show_options(struct seq_file *m, struct dentry *root)
{
- umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX;
-
+ struct bpf_mount_opts *opts = root->d_sb->s_fs_info;
+ struct inode *inode = d_inode(root);
+ umode_t mode = inode->i_mode & S_IALLUGO & ~S_ISVTX;
+ u64 mask;
+
+ if (!uid_eq(inode->i_uid, GLOBAL_ROOT_UID))
+ seq_printf(m, ",uid=%u",
+ from_kuid_munged(&init_user_ns, inode->i_uid));
+ if (!gid_eq(inode->i_gid, GLOBAL_ROOT_GID))
+ seq_printf(m, ",gid=%u",
+ from_kgid_munged(&init_user_ns, inode->i_gid));
if (mode != S_IRWXUGO)
seq_printf(m, ",mode=%o", mode);
+
+ if (opts->delegate_cmds || opts->delegate_maps ||
+ opts->delegate_progs || opts->delegate_attachs) {
+ struct bpffs_btf_enums info;
+
+ /* ignore errors, fallback to hex */
+ (void)find_bpffs_btf_enums(&info);
+
+ mask = (1ULL << __MAX_BPF_CMD) - 1;
+ seq_print_delegate_opts(m, "delegate_cmds",
+ info.btf, info.cmd_t, "BPF_",
+ opts->delegate_cmds, mask);
+
+ mask = (1ULL << __MAX_BPF_MAP_TYPE) - 1;
+ seq_print_delegate_opts(m, "delegate_maps",
+ info.btf, info.map_t, "BPF_MAP_TYPE_",
+ opts->delegate_maps, mask);
+
+ mask = (1ULL << __MAX_BPF_PROG_TYPE) - 1;
+ seq_print_delegate_opts(m, "delegate_progs",
+ info.btf, info.prog_t, "BPF_PROG_TYPE_",
+ opts->delegate_progs, mask);
+
+ mask = (1ULL << __MAX_BPF_ATTACH_TYPE) - 1;
+ seq_print_delegate_opts(m, "delegate_attachs",
+ info.btf, info.attach_t, "BPF_",
+ opts->delegate_attachs, mask);
+ }
+
return 0;
}
@@ -617,7 +786,7 @@ static void bpf_free_inode(struct inode *inode)
free_inode_nonrcu(inode);
}
-static const struct super_operations bpf_super_ops = {
+const struct super_operations bpf_super_ops = {
.statfs = simple_statfs,
.drop_inode = generic_delete_inode,
.show_options = bpf_show_options,
@@ -625,23 +794,33 @@ static const struct super_operations bpf_super_ops = {
};
enum {
+ OPT_UID,
+ OPT_GID,
OPT_MODE,
+ OPT_DELEGATE_CMDS,
+ OPT_DELEGATE_MAPS,
+ OPT_DELEGATE_PROGS,
+ OPT_DELEGATE_ATTACHS,
};
static const struct fs_parameter_spec bpf_fs_parameters[] = {
+ fsparam_u32 ("uid", OPT_UID),
+ fsparam_u32 ("gid", OPT_GID),
fsparam_u32oct ("mode", OPT_MODE),
+ fsparam_string ("delegate_cmds", OPT_DELEGATE_CMDS),
+ fsparam_string ("delegate_maps", OPT_DELEGATE_MAPS),
+ fsparam_string ("delegate_progs", OPT_DELEGATE_PROGS),
+ fsparam_string ("delegate_attachs", OPT_DELEGATE_ATTACHS),
{}
};
-struct bpf_mount_opts {
- umode_t mode;
-};
-
static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- struct bpf_mount_opts *opts = fc->fs_private;
+ struct bpf_mount_opts *opts = fc->s_fs_info;
struct fs_parse_result result;
- int opt;
+ kuid_t uid;
+ kgid_t gid;
+ int opt, err;
opt = fs_parse(fc, bpf_fs_parameters, param, &result);
if (opt < 0) {
@@ -662,12 +841,104 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
}
switch (opt) {
+ case OPT_UID:
+ uid = make_kuid(current_user_ns(), result.uint_32);
+ if (!uid_valid(uid))
+ goto bad_value;
+
+ /*
+ * The requested uid must be representable in the
+ * filesystem's idmapping.
+ */
+ if (!kuid_has_mapping(fc->user_ns, uid))
+ goto bad_value;
+
+ opts->uid = uid;
+ break;
+ case OPT_GID:
+ gid = make_kgid(current_user_ns(), result.uint_32);
+ if (!gid_valid(gid))
+ goto bad_value;
+
+ /*
+ * The requested gid must be representable in the
+ * filesystem's idmapping.
+ */
+ if (!kgid_has_mapping(fc->user_ns, gid))
+ goto bad_value;
+
+ opts->gid = gid;
+ break;
case OPT_MODE:
opts->mode = result.uint_32 & S_IALLUGO;
break;
+ case OPT_DELEGATE_CMDS:
+ case OPT_DELEGATE_MAPS:
+ case OPT_DELEGATE_PROGS:
+ case OPT_DELEGATE_ATTACHS: {
+ struct bpffs_btf_enums info;
+ const struct btf_type *enum_t;
+ const char *enum_pfx;
+ u64 *delegate_msk, msk = 0;
+ char *p;
+ int val;
+
+ /* ignore errors, fallback to hex */
+ (void)find_bpffs_btf_enums(&info);
+
+ switch (opt) {
+ case OPT_DELEGATE_CMDS:
+ delegate_msk = &opts->delegate_cmds;
+ enum_t = info.cmd_t;
+ enum_pfx = "BPF_";
+ break;
+ case OPT_DELEGATE_MAPS:
+ delegate_msk = &opts->delegate_maps;
+ enum_t = info.map_t;
+ enum_pfx = "BPF_MAP_TYPE_";
+ break;
+ case OPT_DELEGATE_PROGS:
+ delegate_msk = &opts->delegate_progs;
+ enum_t = info.prog_t;
+ enum_pfx = "BPF_PROG_TYPE_";
+ break;
+ case OPT_DELEGATE_ATTACHS:
+ delegate_msk = &opts->delegate_attachs;
+ enum_t = info.attach_t;
+ enum_pfx = "BPF_";
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ while ((p = strsep(&param->string, ":"))) {
+ if (strcmp(p, "any") == 0) {
+ msk |= ~0ULL;
+ } else if (find_btf_enum_const(info.btf, enum_t, enum_pfx, p, &val)) {
+ msk |= 1ULL << val;
+ } else {
+ err = kstrtou64(p, 0, &msk);
+ if (err)
+ return err;
+ }
+ }
+
+ /* Setting delegation mount options requires privileges */
+ if (msk && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ *delegate_msk |= msk;
+ break;
+ }
+ default:
+ /* ignore unknown mount options */
+ break;
}
return 0;
+
+bad_value:
+ return invalfc(fc, "Bad value for '%s'", param->key);
}
struct bpf_preload_ops *bpf_preload_ops;
@@ -739,10 +1010,14 @@ out:
static int bpf_fill_super(struct super_block *sb, struct fs_context *fc)
{
static const struct tree_descr bpf_rfiles[] = { { "" } };
- struct bpf_mount_opts *opts = fc->fs_private;
+ struct bpf_mount_opts *opts = sb->s_fs_info;
struct inode *inode;
int ret;
+ /* Mounting an instance of BPF FS requires privileges */
+ if (fc->user_ns != &init_user_ns && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles);
if (ret)
return ret;
@@ -750,6 +1025,8 @@ static int bpf_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_op = &bpf_super_ops;
inode = sb->s_root->d_inode;
+ inode->i_uid = opts->uid;
+ inode->i_gid = opts->gid;
inode->i_op = &bpf_dir_iops;
inode->i_mode &= ~S_IALLUGO;
populate_bpffs(sb->s_root);
@@ -764,7 +1041,7 @@ static int bpf_get_tree(struct fs_context *fc)
static void bpf_free_fc(struct fs_context *fc)
{
- kfree(fc->fs_private);
+ kfree(fc->s_fs_info);
}
static const struct fs_context_operations bpf_context_ops = {
@@ -785,18 +1062,35 @@ static int bpf_init_fs_context(struct fs_context *fc)
return -ENOMEM;
opts->mode = S_IRWXUGO;
+ opts->uid = current_fsuid();
+ opts->gid = current_fsgid();
+
+ /* start out with no BPF token delegation enabled */
+ opts->delegate_cmds = 0;
+ opts->delegate_maps = 0;
+ opts->delegate_progs = 0;
+ opts->delegate_attachs = 0;
- fc->fs_private = opts;
+ fc->s_fs_info = opts;
fc->ops = &bpf_context_ops;
return 0;
}
+static void bpf_kill_super(struct super_block *sb)
+{
+ struct bpf_mount_opts *opts = sb->s_fs_info;
+
+ kill_litter_super(sb);
+ kfree(opts);
+}
+
static struct file_system_type bpf_fs_type = {
.owner = THIS_MODULE,
.name = "bpf",
.init_fs_context = bpf_init_fs_context,
.parameters = bpf_fs_parameters,
- .kill_sb = kill_litter_super,
+ .kill_sb = bpf_kill_super,
+ .fs_flags = FS_USERNS_MOUNT,
};
static int __init bpf_init(void)
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 3505f3e5ae96..594a234f122b 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -539,6 +539,19 @@ static void verbose_snum(struct bpf_verifier_env *env, s64 num)
verbose(env, "%#llx", num);
}
+int tnum_strn(char *str, size_t size, struct tnum a)
+{
+ /* print as a constant, if tnum is fully known */
+ if (a.mask == 0) {
+ if (is_unum_decimal(a.value))
+ return snprintf(str, size, "%llu", a.value);
+ else
+ return snprintf(str, size, "%#llx", a.value);
+ }
+ return snprintf(str, size, "(%#llx; %#llx)", a.value, a.mask);
+}
+EXPORT_SYMBOL_GPL(tnum_strn);
+
static void print_scalar_ranges(struct bpf_verifier_env *env,
const struct bpf_reg_state *reg,
const char **sep)
@@ -615,6 +628,12 @@ static bool type_is_map_ptr(enum bpf_reg_type t) {
}
}
+/*
+ * _a stands for append, was shortened to avoid multiline statements below.
+ * This macro is used to output a comma separated list of attributes.
+ */
+#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, ##__VA_ARGS__); sep = ","; })
+
static void print_reg_state(struct bpf_verifier_env *env,
const struct bpf_func_state *state,
const struct bpf_reg_state *reg)
@@ -630,11 +649,6 @@ static void print_reg_state(struct bpf_verifier_env *env,
verbose_snum(env, reg->var_off.value + reg->off);
return;
}
-/*
- * _a stands for append, was shortened to avoid multiline statements below.
- * This macro is used to output a comma separated list of attributes.
- */
-#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, ##__VA_ARGS__); sep = ","; })
verbose(env, "%s", reg_type_str(env, t));
if (t == PTR_TO_STACK) {
@@ -669,6 +683,12 @@ static void print_reg_state(struct bpf_verifier_env *env,
verbose_a("r=");
verbose_unum(env, reg->range);
}
+ if (base_type(t) == PTR_TO_MEM) {
+ verbose_a("sz=");
+ verbose_unum(env, reg->mem_size);
+ }
+ if (t == CONST_PTR_TO_DYNPTR)
+ verbose_a("type=%s", dynptr_type_str(reg->dynptr.type));
if (tnum_is_const(reg->var_off)) {
/* a pointer register with fixed offset */
if (reg->var_off.value) {
@@ -685,8 +705,6 @@ static void print_reg_state(struct bpf_verifier_env *env,
}
}
verbose(env, ")");
-
-#undef verbose_a
}
void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_state *state,
@@ -710,6 +728,7 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_st
}
for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
char types_buf[BPF_REG_SIZE + 1];
+ const char *sep = "";
bool valid = false;
u8 slot_type;
int j;
@@ -748,9 +767,14 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_st
verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
print_liveness(env, reg->live);
- verbose(env, "=dynptr_%s", dynptr_type_str(reg->dynptr.type));
+ verbose(env, "=dynptr_%s(", dynptr_type_str(reg->dynptr.type));
+ if (reg->id)
+ verbose_a("id=%d", reg->id);
if (reg->ref_obj_id)
- verbose(env, "(ref_id=%d)", reg->ref_obj_id);
+ verbose_a("ref_id=%d", reg->ref_obj_id);
+ if (reg->dynptr_id)
+ verbose_a("dynptr_id=%d", reg->dynptr_id);
+ verbose(env, ")");
break;
case STACK_ITER:
/* only main slot has ref_obj_id set; skip others */
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index cd5eafaba97e..8ef269e66ba5 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -127,12 +127,21 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map,
return inner_map;
}
-void bpf_map_fd_put_ptr(void *ptr)
+void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
{
- /* ptr->ops->map_free() has to go through one
- * rcu grace period by itself.
+ struct bpf_map *inner_map = ptr;
+
+ /* Defer the freeing of inner map according to the sleepable attribute
+ * of bpf program which owns the outer map, so unnecessary waiting for
+ * RCU tasks trace grace period can be avoided.
*/
- bpf_map_put(ptr);
+ if (need_defer) {
+ if (atomic64_read(&map->sleepable_refcnt))
+ WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true);
+ else
+ WRITE_ONCE(inner_map->free_after_rcu_gp, true);
+ }
+ bpf_map_put(inner_map);
}
u32 bpf_map_fd_sys_lookup_elem(void *ptr)
diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h
index bcb7534afb3c..7d61602354de 100644
--- a/kernel/bpf/map_in_map.h
+++ b/kernel/bpf/map_in_map.h
@@ -13,7 +13,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd);
void bpf_map_meta_free(struct bpf_map *map_meta);
void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file,
int ufd);
-void bpf_map_fd_put_ptr(void *ptr);
+void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer);
u32 bpf_map_fd_sys_lookup_elem(void *ptr);
#endif
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5e43ddd1b83f..8faa1a20edf8 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -142,9 +142,13 @@ static u32 bpf_map_value_size(const struct bpf_map *map)
static void maybe_wait_bpf_programs(struct bpf_map *map)
{
- /* Wait for any running BPF programs to complete so that
- * userspace, when we return to it, knows that all programs
- * that could be running use the new map value.
+ /* Wait for any running non-sleepable BPF programs to complete so that
+ * userspace, when we return to it, knows that all non-sleepable
+ * programs that could be running use the new map value. For sleepable
+ * BPF programs, synchronize_rcu_tasks_trace() should be used to wait
+ * for the completions of these programs, but considering the waiting
+ * time can be very long and userspace may think it will hang forever,
+ * so don't handle sleepable BPF programs now.
*/
if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS ||
map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
@@ -180,15 +184,11 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
err = bpf_percpu_cgroup_storage_update(map, key, value,
flags);
} else if (IS_FD_ARRAY(map)) {
- rcu_read_lock();
err = bpf_fd_array_map_update_elem(map, map_file, key, value,
flags);
- rcu_read_unlock();
} else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
- rcu_read_lock();
err = bpf_fd_htab_map_update_elem(map, map_file, key, value,
flags);
- rcu_read_unlock();
} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
/* rcu_read_lock() is not needed */
err = bpf_fd_reuseport_array_update_elem(map, key, value,
@@ -203,7 +203,6 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
rcu_read_unlock();
}
bpf_enable_instrumentation();
- maybe_wait_bpf_programs(map);
return err;
}
@@ -264,7 +263,6 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
}
bpf_enable_instrumentation();
- maybe_wait_bpf_programs(map);
return err;
}
@@ -694,6 +692,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
{
struct bpf_map *map = container_of(work, struct bpf_map, work);
struct btf_record *rec = map->record;
+ struct btf *btf = map->btf;
security_bpf_map_free(map);
bpf_map_release_memcg(map);
@@ -709,6 +708,10 @@ static void bpf_map_free_deferred(struct work_struct *work)
* template bpf_map struct used during verification.
*/
btf_record_free(rec);
+ /* Delay freeing of btf for maps, as map_free callback may need
+ * struct_meta info which will be freed with btf_put().
+ */
+ btf_put(btf);
}
static void bpf_map_put_uref(struct bpf_map *map)
@@ -719,6 +722,28 @@ static void bpf_map_put_uref(struct bpf_map *map)
}
}
+static void bpf_map_free_in_work(struct bpf_map *map)
+{
+ INIT_WORK(&map->work, bpf_map_free_deferred);
+ /* Avoid spawning kworkers, since they all might contend
+ * for the same mutex like slab_mutex.
+ */
+ queue_work(system_unbound_wq, &map->work);
+}
+
+static void bpf_map_free_rcu_gp(struct rcu_head *rcu)
+{
+ bpf_map_free_in_work(container_of(rcu, struct bpf_map, rcu));
+}
+
+static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu)
+{
+ if (rcu_trace_implies_rcu_gp())
+ bpf_map_free_rcu_gp(rcu);
+ else
+ call_rcu(rcu, bpf_map_free_rcu_gp);
+}
+
/* decrement map refcnt and schedule it for freeing via workqueue
* (underlying map implementation ops->map_free() might sleep)
*/
@@ -727,12 +752,14 @@ void bpf_map_put(struct bpf_map *map)
if (atomic64_dec_and_test(&map->refcnt)) {
/* bpf_map_free_id() must be called first */
bpf_map_free_id(map);
- btf_put(map->btf);
- INIT_WORK(&map->work, bpf_map_free_deferred);
- /* Avoid spawning kworkers, since they all might contend
- * for the same mutex like slab_mutex.
- */
- queue_work(system_unbound_wq, &map->work);
+
+ WARN_ON_ONCE(atomic64_read(&map->sleepable_refcnt));
+ if (READ_ONCE(map->free_after_mult_rcu_gp))
+ call_rcu_tasks_trace(&map->rcu, bpf_map_free_mult_rcu_gp);
+ else if (READ_ONCE(map->free_after_rcu_gp))
+ call_rcu(&map->rcu, bpf_map_free_rcu_gp);
+ else
+ bpf_map_free_in_work(map);
}
}
EXPORT_SYMBOL_GPL(bpf_map_put);
@@ -984,8 +1011,8 @@ int map_check_no_btf(const struct bpf_map *map,
return -ENOTSUPP;
}
-static int map_check_btf(struct bpf_map *map, const struct btf *btf,
- u32 btf_key_id, u32 btf_value_id)
+static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
+ const struct btf *btf, u32 btf_key_id, u32 btf_value_id)
{
const struct btf_type *key_type, *value_type;
u32 key_size, value_size;
@@ -1013,7 +1040,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
if (!IS_ERR_OR_NULL(map->record)) {
int i;
- if (!bpf_capable()) {
+ if (!bpf_token_capable(token, CAP_BPF)) {
ret = -EPERM;
goto free_map_tab;
}
@@ -1096,11 +1123,17 @@ free_map_tab:
return ret;
}
-#define BPF_MAP_CREATE_LAST_FIELD map_extra
+static bool bpf_net_capable(void)
+{
+ return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN);
+}
+
+#define BPF_MAP_CREATE_LAST_FIELD map_token_fd
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
const struct bpf_map_ops *ops;
+ struct bpf_token *token = NULL;
int numa_node = bpf_map_attr_numa_node(attr);
u32 map_type = attr->map_type;
struct bpf_map *map;
@@ -1151,14 +1184,32 @@ static int map_create(union bpf_attr *attr)
if (!ops->map_mem_usage)
return -EINVAL;
+ if (attr->map_token_fd) {
+ token = bpf_token_get_from_fd(attr->map_token_fd);
+ if (IS_ERR(token))
+ return PTR_ERR(token);
+
+ /* if current token doesn't grant map creation permissions,
+ * then we can't use this token, so ignore it and rely on
+ * system-wide capabilities checks
+ */
+ if (!bpf_token_allow_cmd(token, BPF_MAP_CREATE) ||
+ !bpf_token_allow_map_type(token, attr->map_type)) {
+ bpf_token_put(token);
+ token = NULL;
+ }
+ }
+
+ err = -EPERM;
+
/* Intent here is for unprivileged_bpf_disabled to block BPF map
* creation for unprivileged users; other actions depend
* on fd availability and access to bpffs, so are dependent on
* object creation success. Even with unprivileged BPF disabled,
* capability checks are still carried out.
*/
- if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
- return -EPERM;
+ if (sysctl_unprivileged_bpf_disabled && !bpf_token_capable(token, CAP_BPF))
+ goto put_token;
/* check privileged map type permissions */
switch (map_type) {
@@ -1191,25 +1242,27 @@ static int map_create(union bpf_attr *attr)
case BPF_MAP_TYPE_LRU_PERCPU_HASH:
case BPF_MAP_TYPE_STRUCT_OPS:
case BPF_MAP_TYPE_CPUMAP:
- if (!bpf_capable())
- return -EPERM;
+ if (!bpf_token_capable(token, CAP_BPF))
+ goto put_token;
break;
case BPF_MAP_TYPE_SOCKMAP:
case BPF_MAP_TYPE_SOCKHASH:
case BPF_MAP_TYPE_DEVMAP:
case BPF_MAP_TYPE_DEVMAP_HASH:
case BPF_MAP_TYPE_XSKMAP:
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
+ if (!bpf_token_capable(token, CAP_NET_ADMIN))
+ goto put_token;
break;
default:
WARN(1, "unsupported map type %d", map_type);
- return -EPERM;
+ goto put_token;
}
map = ops->map_alloc(attr);
- if (IS_ERR(map))
- return PTR_ERR(map);
+ if (IS_ERR(map)) {
+ err = PTR_ERR(map);
+ goto put_token;
+ }
map->ops = ops;
map->map_type = map_type;
@@ -1246,7 +1299,7 @@ static int map_create(union bpf_attr *attr)
map->btf = btf;
if (attr->btf_value_type_id) {
- err = map_check_btf(map, btf, attr->btf_key_type_id,
+ err = map_check_btf(map, token, btf, attr->btf_key_type_id,
attr->btf_value_type_id);
if (err)
goto free_map;
@@ -1258,15 +1311,16 @@ static int map_create(union bpf_attr *attr)
attr->btf_vmlinux_value_type_id;
}
- err = security_bpf_map_alloc(map);
+ err = security_bpf_map_create(map, attr, token);
if (err)
- goto free_map;
+ goto free_map_sec;
err = bpf_map_alloc_id(map);
if (err)
goto free_map_sec;
bpf_map_save_memcg(map);
+ bpf_token_put(token);
err = bpf_map_new_fd(map, f_flags);
if (err < 0) {
@@ -1287,6 +1341,8 @@ free_map_sec:
free_map:
btf_put(map->btf);
map->ops->map_free(map);
+put_token:
+ bpf_token_put(token);
return err;
}
@@ -1524,6 +1580,8 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
}
err = bpf_map_update_value(map, f.file, key, value, attr->flags);
+ if (!err)
+ maybe_wait_bpf_programs(map);
kvfree(value);
free_key:
@@ -1579,7 +1637,8 @@ static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr)
err = map->ops->map_delete_elem(map, key);
rcu_read_unlock();
bpf_enable_instrumentation();
- maybe_wait_bpf_programs(map);
+ if (!err)
+ maybe_wait_bpf_programs(map);
out:
kvfree(key);
err_put:
@@ -1676,6 +1735,9 @@ int generic_map_delete_batch(struct bpf_map *map,
if (!max_count)
return 0;
+ if (put_user(0, &uattr->batch.count))
+ return -EFAULT;
+
key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
if (!key)
return -ENOMEM;
@@ -1705,7 +1767,6 @@ int generic_map_delete_batch(struct bpf_map *map,
kvfree(key);
- maybe_wait_bpf_programs(map);
return err;
}
@@ -1733,6 +1794,9 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
if (!max_count)
return 0;
+ if (put_user(0, &uattr->batch.count))
+ return -EFAULT;
+
key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
if (!key)
return -ENOMEM;
@@ -1763,6 +1827,7 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
kvfree(value);
kvfree(key);
+
return err;
}
@@ -2108,7 +2173,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
kvfree(aux->func_info);
kfree(aux->func_info_aux);
free_uid(aux->user);
- security_bpf_prog_free(aux);
+ security_bpf_prog_free(aux->prog);
bpf_prog_free(aux->prog);
}
@@ -2554,13 +2619,15 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
}
/* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD log_true_size
+#define BPF_PROG_LOAD_LAST_FIELD prog_token_fd
static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
{
enum bpf_prog_type type = attr->prog_type;
struct bpf_prog *prog, *dst_prog = NULL;
struct btf *attach_btf = NULL;
+ struct bpf_token *token = NULL;
+ bool bpf_cap;
int err;
char license[128];
@@ -2577,10 +2644,31 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
BPF_F_TEST_REG_INVARIANTS))
return -EINVAL;
+ bpf_prog_load_fixup_attach_type(attr);
+
+ if (attr->prog_token_fd) {
+ token = bpf_token_get_from_fd(attr->prog_token_fd);
+ if (IS_ERR(token))
+ return PTR_ERR(token);
+ /* if current token doesn't grant prog loading permissions,
+ * then we can't use this token, so ignore it and rely on
+ * system-wide capabilities checks
+ */
+ if (!bpf_token_allow_cmd(token, BPF_PROG_LOAD) ||
+ !bpf_token_allow_prog_type(token, attr->prog_type,
+ attr->expected_attach_type)) {
+ bpf_token_put(token);
+ token = NULL;
+ }
+ }
+
+ bpf_cap = bpf_token_capable(token, CAP_BPF);
+ err = -EPERM;
+
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
(attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
- !bpf_capable())
- return -EPERM;
+ !bpf_cap)
+ goto put_token;
/* Intent here is for unprivileged_bpf_disabled to block BPF program
* creation for unprivileged users; other actions depend
@@ -2589,21 +2677,23 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
* capability checks are still carried out for these
* and other operations.
*/
- if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
- return -EPERM;
+ if (sysctl_unprivileged_bpf_disabled && !bpf_cap)
+ goto put_token;
if (attr->insn_cnt == 0 ||
- attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
- return -E2BIG;
+ attr->insn_cnt > (bpf_cap ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) {
+ err = -E2BIG;
+ goto put_token;
+ }
if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
type != BPF_PROG_TYPE_CGROUP_SKB &&
- !bpf_capable())
- return -EPERM;
+ !bpf_cap)
+ goto put_token;
- if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN) && !capable(CAP_SYS_ADMIN))
- return -EPERM;
- if (is_perfmon_prog_type(type) && !perfmon_capable())
- return -EPERM;
+ if (is_net_admin_prog_type(type) && !bpf_token_capable(token, CAP_NET_ADMIN))
+ goto put_token;
+ if (is_perfmon_prog_type(type) && !bpf_token_capable(token, CAP_PERFMON))
+ goto put_token;
/* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog
* or btf, we need to check which one it is
@@ -2613,27 +2703,33 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
if (IS_ERR(dst_prog)) {
dst_prog = NULL;
attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd);
- if (IS_ERR(attach_btf))
- return -EINVAL;
+ if (IS_ERR(attach_btf)) {
+ err = -EINVAL;
+ goto put_token;
+ }
if (!btf_is_kernel(attach_btf)) {
/* attaching through specifying bpf_prog's BTF
* objects directly might be supported eventually
*/
btf_put(attach_btf);
- return -ENOTSUPP;
+ err = -ENOTSUPP;
+ goto put_token;
}
}
} else if (attr->attach_btf_id) {
/* fall back to vmlinux BTF, if BTF type ID is specified */
attach_btf = bpf_get_btf_vmlinux();
- if (IS_ERR(attach_btf))
- return PTR_ERR(attach_btf);
- if (!attach_btf)
- return -EINVAL;
+ if (IS_ERR(attach_btf)) {
+ err = PTR_ERR(attach_btf);
+ goto put_token;
+ }
+ if (!attach_btf) {
+ err = -EINVAL;
+ goto put_token;
+ }
btf_get(attach_btf);
}
- bpf_prog_load_fixup_attach_type(attr);
if (bpf_prog_load_check_attach(type, attr->expected_attach_type,
attach_btf, attr->attach_btf_id,
dst_prog)) {
@@ -2641,7 +2737,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
bpf_prog_put(dst_prog);
if (attach_btf)
btf_put(attach_btf);
- return -EINVAL;
+ err = -EINVAL;
+ goto put_token;
}
/* plain bpf_prog allocation */
@@ -2651,7 +2748,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
bpf_prog_put(dst_prog);
if (attach_btf)
btf_put(attach_btf);
- return -ENOMEM;
+ err = -EINVAL;
+ goto put_token;
}
prog->expected_attach_type = attr->expected_attach_type;
@@ -2662,9 +2760,9 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
- err = security_bpf_prog_alloc(prog->aux);
- if (err)
- goto free_prog;
+ /* move token into prog->aux, reuse taken refcnt */
+ prog->aux->token = token;
+ token = NULL;
prog->aux->user = get_current_user();
prog->len = attr->insn_cnt;
@@ -2673,12 +2771,12 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
if (copy_from_bpfptr(prog->insns,
make_bpfptr(attr->insns, uattr.is_kernel),
bpf_prog_insn_size(prog)) != 0)
- goto free_prog_sec;
+ goto free_prog;
/* copy eBPF program license from user space */
if (strncpy_from_bpfptr(license,
make_bpfptr(attr->license, uattr.is_kernel),
sizeof(license) - 1) < 0)
- goto free_prog_sec;
+ goto free_prog;
license[sizeof(license) - 1] = 0;
/* eBPF programs must be GPL compatible to use GPL-ed functions */
@@ -2692,25 +2790,29 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
if (bpf_prog_is_dev_bound(prog->aux)) {
err = bpf_prog_dev_bound_init(prog, attr);
if (err)
- goto free_prog_sec;
+ goto free_prog;
}
if (type == BPF_PROG_TYPE_EXT && dst_prog &&
bpf_prog_is_dev_bound(dst_prog->aux)) {
err = bpf_prog_dev_bound_inherit(prog, dst_prog);
if (err)
- goto free_prog_sec;
+ goto free_prog;
}
/* find program type: socket_filter vs tracing_filter */
err = find_prog_type(type, prog);
if (err < 0)
- goto free_prog_sec;
+ goto free_prog;
prog->aux->load_time = ktime_get_boottime_ns();
err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name,
sizeof(attr->prog_name));
if (err < 0)
+ goto free_prog;
+
+ err = security_bpf_prog_load(prog, attr, token);
+ if (err)
goto free_prog_sec;
/* run eBPF verifier */
@@ -2756,13 +2858,16 @@ free_used_maps:
*/
__bpf_prog_put_noref(prog, prog->aux->real_func_cnt);
return err;
+
free_prog_sec:
- free_uid(prog->aux->user);
- security_bpf_prog_free(prog->aux);
+ security_bpf_prog_free(prog);
free_prog:
+ free_uid(prog->aux->user);
if (prog->aux->attach_btf)
btf_put(prog->aux->attach_btf);
bpf_prog_free(prog);
+put_token:
+ bpf_token_put(token);
return err;
}
@@ -3752,7 +3857,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
case BPF_PROG_TYPE_SK_LOOKUP:
return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
case BPF_PROG_TYPE_CGROUP_SKB:
- if (!capable(CAP_NET_ADMIN))
+ if (!bpf_token_capable(prog->aux->token, CAP_NET_ADMIN))
/* cg-skb progs can be loaded by unpriv user.
* check permissions at attach time.
*/
@@ -3955,7 +4060,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
static int bpf_prog_query(const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
- if (!capable(CAP_NET_ADMIN))
+ if (!bpf_net_capable())
return -EPERM;
if (CHECK_ATTR(BPF_PROG_QUERY))
return -EINVAL;
@@ -4723,15 +4828,31 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
return err;
}
-#define BPF_BTF_LOAD_LAST_FIELD btf_log_true_size
+#define BPF_BTF_LOAD_LAST_FIELD btf_token_fd
static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
{
+ struct bpf_token *token = NULL;
+
if (CHECK_ATTR(BPF_BTF_LOAD))
return -EINVAL;
- if (!bpf_capable())
+ if (attr->btf_token_fd) {
+ token = bpf_token_get_from_fd(attr->btf_token_fd);
+ if (IS_ERR(token))
+ return PTR_ERR(token);
+ if (!bpf_token_allow_cmd(token, BPF_BTF_LOAD)) {
+ bpf_token_put(token);
+ token = NULL;
+ }
+ }
+
+ if (!bpf_token_capable(token, CAP_BPF)) {
+ bpf_token_put(token);
return -EPERM;
+ }
+
+ bpf_token_put(token);
return btf_new_fd(attr, uattr, uattr_size);
}
@@ -4920,8 +5041,10 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
else
BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr);
err_put:
- if (has_write)
+ if (has_write) {
+ maybe_wait_bpf_programs(map);
bpf_map_write_active_dec(map);
+ }
fdput(f);
return err;
}
@@ -5323,6 +5446,11 @@ static int bpf_prog_bind_map(union bpf_attr *attr)
goto out_unlock;
}
+ /* The bpf program will not access the bpf map, but for the sake of
+ * simplicity, increase sleepable_refcnt for sleepable program as well.
+ */
+ if (prog->aux->sleepable)
+ atomic64_inc(&map->sleepable_refcnt);
memcpy(used_maps_new, used_maps_old,
sizeof(used_maps_old[0]) * prog->aux->used_map_cnt);
used_maps_new[prog->aux->used_map_cnt] = map;
@@ -5342,6 +5470,20 @@ out_prog_put:
return ret;
}
+#define BPF_TOKEN_CREATE_LAST_FIELD token_create.bpffs_fd
+
+static int token_create(union bpf_attr *attr)
+{
+ if (CHECK_ATTR(BPF_TOKEN_CREATE))
+ return -EINVAL;
+
+ /* no flags are supported yet */
+ if (attr->token_create.flags)
+ return -EINVAL;
+
+ return bpf_token_create(attr);
+}
+
static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
{
union bpf_attr attr;
@@ -5475,6 +5617,9 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
case BPF_PROG_BIND_MAP:
err = bpf_prog_bind_map(&attr);
break;
+ case BPF_TOKEN_CREATE:
+ err = token_create(&attr);
+ break;
default:
err = -EINVAL;
break;
@@ -5581,7 +5726,7 @@ static const struct bpf_func_proto bpf_sys_bpf_proto = {
const struct bpf_func_proto * __weak
tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
- return bpf_base_func_proto(func_id);
+ return bpf_base_func_proto(func_id, prog);
}
BPF_CALL_1(bpf_sys_close, u32, fd)
@@ -5631,7 +5776,8 @@ syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_sys_bpf:
- return !perfmon_capable() ? NULL : &bpf_sys_bpf_proto;
+ return !bpf_token_capable(prog->aux->token, CAP_PERFMON)
+ ? NULL : &bpf_sys_bpf_proto;
case BPF_FUNC_btf_find_by_name_kind:
return &bpf_btf_find_by_name_kind_proto;
case BPF_FUNC_sys_close:
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index f4c91c9b27d7..9dbc31b25e3d 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -172,12 +172,6 @@ bool tnum_in(struct tnum a, struct tnum b)
return a.value == b.value;
}
-int tnum_strn(char *str, size_t size, struct tnum a)
-{
- return snprintf(str, size, "(%#llx; %#llx)", a.value, a.mask);
-}
-EXPORT_SYMBOL_GPL(tnum_strn);
-
int tnum_sbin(char *str, size_t size, struct tnum a)
{
size_t n;
diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c
new file mode 100644
index 000000000000..a86fccd57e2d
--- /dev/null
+++ b/kernel/bpf/token.c
@@ -0,0 +1,271 @@
+#include <linux/bpf.h>
+#include <linux/vmalloc.h>
+#include <linux/fdtable.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/idr.h>
+#include <linux/namei.h>
+#include <linux/user_namespace.h>
+#include <linux/security.h>
+
+bool bpf_token_capable(const struct bpf_token *token, int cap)
+{
+ /* BPF token allows ns_capable() level of capabilities, but only if
+ * token's userns is *exactly* the same as current user's userns
+ */
+ if (token && current_user_ns() == token->userns) {
+ if (ns_capable(token->userns, cap) ||
+ (cap != CAP_SYS_ADMIN && ns_capable(token->userns, CAP_SYS_ADMIN)))
+ return security_bpf_token_capable(token, cap) == 0;
+ }
+ /* otherwise fallback to capable() checks */
+ return capable(cap) || (cap != CAP_SYS_ADMIN && capable(CAP_SYS_ADMIN));
+}
+
+void bpf_token_inc(struct bpf_token *token)
+{
+ atomic64_inc(&token->refcnt);
+}
+
+static void bpf_token_free(struct bpf_token *token)
+{
+ security_bpf_token_free(token);
+ put_user_ns(token->userns);
+ kvfree(token);
+}
+
+static void bpf_token_put_deferred(struct work_struct *work)
+{
+ struct bpf_token *token = container_of(work, struct bpf_token, work);
+
+ bpf_token_free(token);
+}
+
+void bpf_token_put(struct bpf_token *token)
+{
+ if (!token)
+ return;
+
+ if (!atomic64_dec_and_test(&token->refcnt))
+ return;
+
+ INIT_WORK(&token->work, bpf_token_put_deferred);
+ schedule_work(&token->work);
+}
+
+static int bpf_token_release(struct inode *inode, struct file *filp)
+{
+ struct bpf_token *token = filp->private_data;
+
+ bpf_token_put(token);
+ return 0;
+}
+
+static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp)
+{
+ struct bpf_token *token = filp->private_data;
+ u64 mask;
+
+ BUILD_BUG_ON(__MAX_BPF_CMD >= 64);
+ mask = (1ULL << __MAX_BPF_CMD) - 1;
+ if ((token->allowed_cmds & mask) == mask)
+ seq_printf(m, "allowed_cmds:\tany\n");
+ else
+ seq_printf(m, "allowed_cmds:\t0x%llx\n", token->allowed_cmds);
+
+ BUILD_BUG_ON(__MAX_BPF_MAP_TYPE >= 64);
+ mask = (1ULL << __MAX_BPF_MAP_TYPE) - 1;
+ if ((token->allowed_maps & mask) == mask)
+ seq_printf(m, "allowed_maps:\tany\n");
+ else
+ seq_printf(m, "allowed_maps:\t0x%llx\n", token->allowed_maps);
+
+ BUILD_BUG_ON(__MAX_BPF_PROG_TYPE >= 64);
+ mask = (1ULL << __MAX_BPF_PROG_TYPE) - 1;
+ if ((token->allowed_progs & mask) == mask)
+ seq_printf(m, "allowed_progs:\tany\n");
+ else
+ seq_printf(m, "allowed_progs:\t0x%llx\n", token->allowed_progs);
+
+ BUILD_BUG_ON(__MAX_BPF_ATTACH_TYPE >= 64);
+ mask = (1ULL << __MAX_BPF_ATTACH_TYPE) - 1;
+ if ((token->allowed_attachs & mask) == mask)
+ seq_printf(m, "allowed_attachs:\tany\n");
+ else
+ seq_printf(m, "allowed_attachs:\t0x%llx\n", token->allowed_attachs);
+}
+
+#define BPF_TOKEN_INODE_NAME "bpf-token"
+
+static const struct inode_operations bpf_token_iops = { };
+
+static const struct file_operations bpf_token_fops = {
+ .release = bpf_token_release,
+ .show_fdinfo = bpf_token_show_fdinfo,
+};
+
+int bpf_token_create(union bpf_attr *attr)
+{
+ struct bpf_mount_opts *mnt_opts;
+ struct bpf_token *token = NULL;
+ struct user_namespace *userns;
+ struct inode *inode;
+ struct file *file;
+ struct path path;
+ struct fd f;
+ umode_t mode;
+ int err, fd;
+
+ f = fdget(attr->token_create.bpffs_fd);
+ if (!f.file)
+ return -EBADF;
+
+ path = f.file->f_path;
+ path_get(&path);
+ fdput(f);
+
+ if (path.dentry != path.mnt->mnt_sb->s_root) {
+ err = -EINVAL;
+ goto out_path;
+ }
+ if (path.mnt->mnt_sb->s_op != &bpf_super_ops) {
+ err = -EINVAL;
+ goto out_path;
+ }
+ err = path_permission(&path, MAY_ACCESS);
+ if (err)
+ goto out_path;
+
+ userns = path.dentry->d_sb->s_user_ns;
+ /*
+ * Enforce that creators of BPF tokens are in the same user
+ * namespace as the BPF FS instance. This makes reasoning about
+ * permissions a lot easier and we can always relax this later.
+ */
+ if (current_user_ns() != userns) {
+ err = -EPERM;
+ goto out_path;
+ }
+ if (!ns_capable(userns, CAP_BPF)) {
+ err = -EPERM;
+ goto out_path;
+ }
+
+ mnt_opts = path.dentry->d_sb->s_fs_info;
+ if (mnt_opts->delegate_cmds == 0 &&
+ mnt_opts->delegate_maps == 0 &&
+ mnt_opts->delegate_progs == 0 &&
+ mnt_opts->delegate_attachs == 0) {
+ err = -ENOENT; /* no BPF token delegation is set up */
+ goto out_path;
+ }
+
+ mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
+ inode = bpf_get_inode(path.mnt->mnt_sb, NULL, mode);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto out_path;
+ }
+
+ inode->i_op = &bpf_token_iops;
+ inode->i_fop = &bpf_token_fops;
+ clear_nlink(inode); /* make sure it is unlinked */
+
+ file = alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME, O_RDWR, &bpf_token_fops);
+ if (IS_ERR(file)) {
+ iput(inode);
+ err = PTR_ERR(file);
+ goto out_path;
+ }
+
+ token = kvzalloc(sizeof(*token), GFP_USER);
+ if (!token) {
+ err = -ENOMEM;
+ goto out_file;
+ }
+
+ atomic64_set(&token->refcnt, 1);
+
+ /* remember bpffs owning userns for future ns_capable() checks */
+ token->userns = get_user_ns(userns);
+
+ token->allowed_cmds = mnt_opts->delegate_cmds;
+ token->allowed_maps = mnt_opts->delegate_maps;
+ token->allowed_progs = mnt_opts->delegate_progs;
+ token->allowed_attachs = mnt_opts->delegate_attachs;
+
+ err = security_bpf_token_create(token, attr, &path);
+ if (err)
+ goto out_token;
+
+ fd = get_unused_fd_flags(O_CLOEXEC);
+ if (fd < 0) {
+ err = fd;
+ goto out_token;
+ }
+
+ file->private_data = token;
+ fd_install(fd, file);
+
+ path_put(&path);
+ return fd;
+
+out_token:
+ bpf_token_free(token);
+out_file:
+ fput(file);
+out_path:
+ path_put(&path);
+ return err;
+}
+
+struct bpf_token *bpf_token_get_from_fd(u32 ufd)
+{
+ struct fd f = fdget(ufd);
+ struct bpf_token *token;
+
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+ if (f.file->f_op != &bpf_token_fops) {
+ fdput(f);
+ return ERR_PTR(-EINVAL);
+ }
+
+ token = f.file->private_data;
+ bpf_token_inc(token);
+ fdput(f);
+
+ return token;
+}
+
+bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd)
+{
+ /* BPF token can be used only within exactly the same userns in which
+ * it was created
+ */
+ if (!token || current_user_ns() != token->userns)
+ return false;
+ if (!(token->allowed_cmds & (1ULL << cmd)))
+ return false;
+ return security_bpf_token_cmd(token, cmd) == 0;
+}
+
+bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type)
+{
+ if (!token || type >= __MAX_BPF_MAP_TYPE)
+ return false;
+
+ return token->allowed_maps & (1ULL << type);
+}
+
+bool bpf_token_allow_prog_type(const struct bpf_token *token,
+ enum bpf_prog_type prog_type,
+ enum bpf_attach_type attach_type)
+{
+ if (!token || prog_type >= __MAX_BPF_PROG_TYPE || attach_type >= __MAX_BPF_ATTACH_TYPE)
+ return false;
+
+ return (token->allowed_progs & (1ULL << prog_type)) &&
+ (token->allowed_attachs & (1ULL << attach_type));
+}
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index e97aeda3a86b..d382f5ebe06c 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -115,10 +115,10 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
(ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC);
}
-void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym)
+void bpf_image_ksym_add(void *data, unsigned int size, struct bpf_ksym *ksym)
{
ksym->start = (unsigned long) data;
- ksym->end = ksym->start + PAGE_SIZE;
+ ksym->end = ksym->start + size;
bpf_ksym_add(ksym);
perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start,
PAGE_SIZE, false, ksym->name);
@@ -254,8 +254,8 @@ bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_a
static void bpf_tramp_image_free(struct bpf_tramp_image *im)
{
bpf_image_ksym_del(&im->ksym);
- bpf_jit_free_exec(im->image);
- bpf_jit_uncharge_modmem(PAGE_SIZE);
+ arch_free_bpf_trampoline(im->image, im->size);
+ bpf_jit_uncharge_modmem(im->size);
percpu_ref_exit(&im->pcref);
kfree_rcu(im, rcu);
}
@@ -349,7 +349,7 @@ static void bpf_tramp_image_put(struct bpf_tramp_image *im)
call_rcu_tasks_trace(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
}
-static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key)
+static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, int size)
{
struct bpf_tramp_image *im;
struct bpf_ksym *ksym;
@@ -360,15 +360,15 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key)
if (!im)
goto out;
- err = bpf_jit_charge_modmem(PAGE_SIZE);
+ err = bpf_jit_charge_modmem(size);
if (err)
goto out_free_im;
+ im->size = size;
err = -ENOMEM;
- im->image = image = bpf_jit_alloc_exec(PAGE_SIZE);
+ im->image = image = arch_alloc_bpf_trampoline(size);
if (!image)
goto out_uncharge;
- set_vm_flush_reset_perms(image);
err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL);
if (err)
@@ -377,13 +377,13 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key)
ksym = &im->ksym;
INIT_LIST_HEAD_RCU(&ksym->lnode);
snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", key);
- bpf_image_ksym_add(image, ksym);
+ bpf_image_ksym_add(image, size, ksym);
return im;
out_free_image:
- bpf_jit_free_exec(im->image);
+ arch_free_bpf_trampoline(im->image, im->size);
out_uncharge:
- bpf_jit_uncharge_modmem(PAGE_SIZE);
+ bpf_jit_uncharge_modmem(size);
out_free_im:
kfree(im);
out:
@@ -396,7 +396,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut
struct bpf_tramp_links *tlinks;
u32 orig_flags = tr->flags;
bool ip_arg = false;
- int err, total;
+ int err, total, size;
tlinks = bpf_trampoline_get_progs(tr, &total, &ip_arg);
if (IS_ERR(tlinks))
@@ -409,12 +409,6 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut
goto out;
}
- im = bpf_tramp_image_alloc(tr->key);
- if (IS_ERR(im)) {
- err = PTR_ERR(im);
- goto out;
- }
-
/* clear all bits except SHARE_IPMODIFY and TAIL_CALL_CTX */
tr->flags &= (BPF_TRAMP_F_SHARE_IPMODIFY | BPF_TRAMP_F_TAIL_CALL_CTX);
@@ -438,13 +432,31 @@ again:
tr->flags |= BPF_TRAMP_F_ORIG_STACK;
#endif
- err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE,
+ size = arch_bpf_trampoline_size(&tr->func.model, tr->flags,
+ tlinks, tr->func.addr);
+ if (size < 0) {
+ err = size;
+ goto out;
+ }
+
+ if (size > PAGE_SIZE) {
+ err = -E2BIG;
+ goto out;
+ }
+
+ im = bpf_tramp_image_alloc(tr->key, size);
+ if (IS_ERR(im)) {
+ err = PTR_ERR(im);
+ goto out;
+ }
+
+ err = arch_prepare_bpf_trampoline(im, im->image, im->image + size,
&tr->func.model, tr->flags, tlinks,
tr->func.addr);
if (err < 0)
goto out_free;
- set_memory_rox((long)im->image, 1);
+ arch_protect_bpf_trampoline(im->image, im->size);
WARN_ON(tr->cur_image && total == 0);
if (tr->cur_image)
@@ -464,9 +476,8 @@ again:
tr->fops->func = NULL;
tr->fops->trampoline = 0;
- /* reset im->image memory attr for arch_prepare_bpf_trampoline */
- set_memory_nx((long)im->image, 1);
- set_memory_rw((long)im->image, 1);
+ /* free im memory and reallocate later */
+ bpf_tramp_image_free(im);
goto again;
}
#endif
@@ -1032,10 +1043,50 @@ bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog)
}
int __weak
-arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end,
+arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
const struct btf_func_model *m, u32 flags,
struct bpf_tramp_links *tlinks,
- void *orig_call)
+ void *func_addr)
+{
+ return -ENOTSUPP;
+}
+
+void * __weak arch_alloc_bpf_trampoline(unsigned int size)
+{
+ void *image;
+
+ if (WARN_ON_ONCE(size > PAGE_SIZE))
+ return NULL;
+ image = bpf_jit_alloc_exec(PAGE_SIZE);
+ if (image)
+ set_vm_flush_reset_perms(image);
+ return image;
+}
+
+void __weak arch_free_bpf_trampoline(void *image, unsigned int size)
+{
+ WARN_ON_ONCE(size > PAGE_SIZE);
+ /* bpf_jit_free_exec doesn't need "size", but
+ * bpf_prog_pack_free() needs it.
+ */
+ bpf_jit_free_exec(image);
+}
+
+void __weak arch_protect_bpf_trampoline(void *image, unsigned int size)
+{
+ WARN_ON_ONCE(size > PAGE_SIZE);
+ set_memory_rox((long)image, 1);
+}
+
+void __weak arch_unprotect_bpf_trampoline(void *image, unsigned int size)
+{
+ WARN_ON_ONCE(size > PAGE_SIZE);
+ set_memory_nx((long)image, 1);
+ set_memory_rw((long)image, 1);
+}
+
+int __weak arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
+ struct bpf_tramp_links *tlinks, void *func_addr)
{
return -ENOTSUPP;
}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8e7b6072e3f4..9456ee0ad129 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -362,20 +362,23 @@ __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
static void verbose_invalid_scalar(struct bpf_verifier_env *env,
struct bpf_reg_state *reg,
- struct tnum *range, const char *ctx,
+ struct bpf_retval_range range, const char *ctx,
const char *reg_name)
{
- char tn_buf[48];
+ bool unknown = true;
- verbose(env, "At %s the register %s ", ctx, reg_name);
- if (!tnum_is_unknown(reg->var_off)) {
- tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "has value %s", tn_buf);
- } else {
- verbose(env, "has unknown scalar value");
+ verbose(env, "%s the register %s has", ctx, reg_name);
+ if (reg->smin_value > S64_MIN) {
+ verbose(env, " smin=%lld", reg->smin_value);
+ unknown = false;
+ }
+ if (reg->smax_value < S64_MAX) {
+ verbose(env, " smax=%lld", reg->smax_value);
+ unknown = false;
}
- tnum_strn(tn_buf, sizeof(tn_buf), *range);
- verbose(env, " should have been in %s\n", tn_buf);
+ if (unknown)
+ verbose(env, " unknown scalar value");
+ verbose(env, " should have been in [%d, %d]\n", range.minval, range.maxval);
}
static bool type_may_be_null(u32 type)
@@ -439,6 +442,25 @@ static struct bpf_func_info_aux *subprog_aux(const struct bpf_verifier_env *env,
return &env->prog->aux->func_info_aux[subprog];
}
+static struct bpf_subprog_info *subprog_info(struct bpf_verifier_env *env, int subprog)
+{
+ return &env->subprog_info[subprog];
+}
+
+static void mark_subprog_exc_cb(struct bpf_verifier_env *env, int subprog)
+{
+ struct bpf_subprog_info *info = subprog_info(env, subprog);
+
+ info->is_cb = true;
+ info->is_async_cb = true;
+ info->is_exception_cb = true;
+}
+
+static bool subprog_is_exc_cb(struct bpf_verifier_env *env, int subprog)
+{
+ return subprog_info(env, subprog)->is_exception_cb;
+}
+
static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
{
return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK);
@@ -1141,6 +1163,21 @@ static bool is_spilled_scalar_reg(const struct bpf_stack_state *stack)
stack->spilled_ptr.type == SCALAR_VALUE;
}
+/* Mark stack slot as STACK_MISC, unless it is already STACK_INVALID, in which
+ * case they are equivalent, or it's STACK_ZERO, in which case we preserve
+ * more precise STACK_ZERO.
+ * Note, in uprivileged mode leaving STACK_INVALID is wrong, so we take
+ * env->allow_ptr_leaks into account and force STACK_MISC, if necessary.
+ */
+static void mark_stack_slot_misc(struct bpf_verifier_env *env, u8 *stype)
+{
+ if (*stype == STACK_ZERO)
+ return;
+ if (env->allow_ptr_leaks && *stype == STACK_INVALID)
+ return;
+ *stype = STACK_MISC;
+}
+
static void scrub_spilled_slot(u8 *stype)
{
if (*stype != STACK_INVALID)
@@ -1241,9 +1278,16 @@ static int resize_reference_state(struct bpf_func_state *state, size_t n)
return 0;
}
-static int grow_stack_state(struct bpf_func_state *state, int size)
+/* Possibly update state->allocated_stack to be at least size bytes. Also
+ * possibly update the function's high-water mark in its bpf_subprog_info.
+ */
+static int grow_stack_state(struct bpf_verifier_env *env, struct bpf_func_state *state, int size)
{
- size_t old_n = state->allocated_stack / BPF_REG_SIZE, n = size / BPF_REG_SIZE;
+ size_t old_n = state->allocated_stack / BPF_REG_SIZE, n;
+
+ /* The stack size is always a multiple of BPF_REG_SIZE. */
+ size = round_up(size, BPF_REG_SIZE);
+ n = size / BPF_REG_SIZE;
if (old_n >= n)
return 0;
@@ -1253,6 +1297,11 @@ static int grow_stack_state(struct bpf_func_state *state, int size)
return -ENOMEM;
state->allocated_stack = size;
+
+ /* update known max for given subprogram */
+ if (env->subprog_info[state->subprogno].stack_depth < size)
+ env->subprog_info[state->subprogno].stack_depth = size;
+
return 0;
}
@@ -1352,8 +1401,8 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
int i, err;
dst_state->jmp_history = copy_array(dst_state->jmp_history, src->jmp_history,
- src->jmp_history_cnt, sizeof(struct bpf_idx_pair),
- GFP_USER);
+ src->jmp_history_cnt, sizeof(*dst_state->jmp_history),
+ GFP_USER);
if (!dst_state->jmp_history)
return -ENOMEM;
dst_state->jmp_history_cnt = src->jmp_history_cnt;
@@ -1728,10 +1777,14 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg)
__mark_reg_known(reg, 0);
}
-static void __mark_reg_const_zero(struct bpf_reg_state *reg)
+static void __mark_reg_const_zero(const struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
__mark_reg_known(reg, 0);
reg->type = SCALAR_VALUE;
+ /* all scalars are assumed imprecise initially (unless unprivileged,
+ * in which case everything is forced to be precise)
+ */
+ reg->precise = !env->bpf_capable;
}
static void mark_reg_known_zero(struct bpf_verifier_env *env,
@@ -2305,6 +2358,11 @@ static void init_reg_state(struct bpf_verifier_env *env,
regs[BPF_REG_FP].frameno = state->frameno;
}
+static struct bpf_retval_range retval_range(s32 minval, s32 maxval)
+{
+ return (struct bpf_retval_range){ minval, maxval };
+}
+
#define BPF_MAIN_FUNC (-1)
static void init_func_state(struct bpf_verifier_env *env,
struct bpf_func_state *state,
@@ -2313,7 +2371,7 @@ static void init_func_state(struct bpf_verifier_env *env,
state->callsite = callsite;
state->frameno = frameno;
state->subprogno = subprogno;
- state->callback_ret_range = tnum_range(0, 0);
+ state->callback_ret_range = retval_range(0, 0);
init_reg_state(env, state);
mark_verifier_state_scratched(env);
}
@@ -2857,6 +2915,7 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
if (env->subprog_info[i].start != ex_cb_insn)
continue;
env->exception_callback_subprog = i;
+ mark_subprog_exc_cb(env, i);
break;
}
}
@@ -3213,6 +3272,21 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
return __check_reg_arg(env, state->regs, regno, t);
}
+static int insn_stack_access_flags(int frameno, int spi)
+{
+ return INSN_F_STACK_ACCESS | (spi << INSN_F_SPI_SHIFT) | frameno;
+}
+
+static int insn_stack_access_spi(int insn_flags)
+{
+ return (insn_flags >> INSN_F_SPI_SHIFT) & INSN_F_SPI_MASK;
+}
+
+static int insn_stack_access_frameno(int insn_flags)
+{
+ return insn_flags & INSN_F_FRAMENO_MASK;
+}
+
static void mark_jmp_point(struct bpf_verifier_env *env, int idx)
{
env->insn_aux_data[idx].jmp_point = true;
@@ -3224,28 +3298,51 @@ static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx)
}
/* for any branch, call, exit record the history of jmps in the given state */
-static int push_jmp_history(struct bpf_verifier_env *env,
- struct bpf_verifier_state *cur)
+static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur,
+ int insn_flags)
{
u32 cnt = cur->jmp_history_cnt;
- struct bpf_idx_pair *p;
+ struct bpf_jmp_history_entry *p;
size_t alloc_size;
- if (!is_jmp_point(env, env->insn_idx))
+ /* combine instruction flags if we already recorded this instruction */
+ if (env->cur_hist_ent) {
+ /* atomic instructions push insn_flags twice, for READ and
+ * WRITE sides, but they should agree on stack slot
+ */
+ WARN_ONCE((env->cur_hist_ent->flags & insn_flags) &&
+ (env->cur_hist_ent->flags & insn_flags) != insn_flags,
+ "verifier insn history bug: insn_idx %d cur flags %x new flags %x\n",
+ env->insn_idx, env->cur_hist_ent->flags, insn_flags);
+ env->cur_hist_ent->flags |= insn_flags;
return 0;
+ }
cnt++;
alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p)));
p = krealloc(cur->jmp_history, alloc_size, GFP_USER);
if (!p)
return -ENOMEM;
- p[cnt - 1].idx = env->insn_idx;
- p[cnt - 1].prev_idx = env->prev_insn_idx;
cur->jmp_history = p;
+
+ p = &cur->jmp_history[cnt - 1];
+ p->idx = env->insn_idx;
+ p->prev_idx = env->prev_insn_idx;
+ p->flags = insn_flags;
cur->jmp_history_cnt = cnt;
+ env->cur_hist_ent = p;
+
return 0;
}
+static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_state *st,
+ u32 hist_end, int insn_idx)
+{
+ if (hist_end > 0 && st->jmp_history[hist_end - 1].idx == insn_idx)
+ return &st->jmp_history[hist_end - 1];
+ return NULL;
+}
+
/* Backtrack one insn at a time. If idx is not at the top of recorded
* history then previous instruction came from straight line execution.
* Return -ENOENT if we exhausted all instructions within given state.
@@ -3372,16 +3469,6 @@ static inline void bt_clear_frame_slot(struct backtrack_state *bt, u32 frame, u3
bt->stack_masks[frame] &= ~(1ull << slot);
}
-static inline void bt_set_slot(struct backtrack_state *bt, u32 slot)
-{
- bt_set_frame_slot(bt, bt->frame, slot);
-}
-
-static inline void bt_clear_slot(struct backtrack_state *bt, u32 slot)
-{
- bt_clear_frame_slot(bt, bt->frame, slot);
-}
-
static inline u32 bt_frame_reg_mask(struct backtrack_state *bt, u32 frame)
{
return bt->reg_masks[frame];
@@ -3407,9 +3494,9 @@ static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg)
return bt->reg_masks[bt->frame] & (1 << reg);
}
-static inline bool bt_is_slot_set(struct backtrack_state *bt, u32 slot)
+static inline bool bt_is_frame_slot_set(struct backtrack_state *bt, u32 frame, u32 slot)
{
- return bt->stack_masks[bt->frame] & (1ull << slot);
+ return bt->stack_masks[frame] & (1ull << slot);
}
/* format registers bitmask, e.g., "r0,r2,r4" for 0x15 mask */
@@ -3463,7 +3550,7 @@ static bool calls_callback(struct bpf_verifier_env *env, int insn_idx);
* - *was* processed previously during backtracking.
*/
static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
- struct backtrack_state *bt)
+ struct bpf_jmp_history_entry *hist, struct backtrack_state *bt)
{
const struct bpf_insn_cbs cbs = {
.cb_call = disasm_kfunc_name,
@@ -3476,7 +3563,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
u8 mode = BPF_MODE(insn->code);
u32 dreg = insn->dst_reg;
u32 sreg = insn->src_reg;
- u32 spi, i;
+ u32 spi, i, fr;
if (insn->code == 0)
return 0;
@@ -3537,20 +3624,15 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
* by 'precise' mark in corresponding register of this state.
* No further tracking necessary.
*/
- if (insn->src_reg != BPF_REG_FP)
+ if (!hist || !(hist->flags & INSN_F_STACK_ACCESS))
return 0;
-
/* dreg = *(u64 *)[fp - off] was a fill from the stack.
* that [fp - off] slot contains scalar that needs to be
* tracked with precision
*/
- spi = (-insn->off - 1) / BPF_REG_SIZE;
- if (spi >= 64) {
- verbose(env, "BUG spi %d\n", spi);
- WARN_ONCE(1, "verifier backtracking bug");
- return -EFAULT;
- }
- bt_set_slot(bt, spi);
+ spi = insn_stack_access_spi(hist->flags);
+ fr = insn_stack_access_frameno(hist->flags);
+ bt_set_frame_slot(bt, fr, spi);
} else if (class == BPF_STX || class == BPF_ST) {
if (bt_is_reg_set(bt, dreg))
/* stx & st shouldn't be using _scalar_ dst_reg
@@ -3559,17 +3641,13 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
*/
return -ENOTSUPP;
/* scalars can only be spilled into stack */
- if (insn->dst_reg != BPF_REG_FP)
+ if (!hist || !(hist->flags & INSN_F_STACK_ACCESS))
return 0;
- spi = (-insn->off - 1) / BPF_REG_SIZE;
- if (spi >= 64) {
- verbose(env, "BUG spi %d\n", spi);
- WARN_ONCE(1, "verifier backtracking bug");
- return -EFAULT;
- }
- if (!bt_is_slot_set(bt, spi))
+ spi = insn_stack_access_spi(hist->flags);
+ fr = insn_stack_access_frameno(hist->flags);
+ if (!bt_is_frame_slot_set(bt, fr, spi))
return 0;
- bt_clear_slot(bt, spi);
+ bt_clear_frame_slot(bt, fr, spi);
if (class == BPF_STX)
bt_set_reg(bt, sreg);
} else if (class == BPF_JMP || class == BPF_JMP32) {
@@ -3613,10 +3691,14 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
WARN_ONCE(1, "verifier backtracking bug");
return -EFAULT;
}
- /* we don't track register spills perfectly,
- * so fallback to force-precise instead of failing */
- if (bt_stack_mask(bt) != 0)
- return -ENOTSUPP;
+ /* we are now tracking register spills correctly,
+ * so any instance of leftover slots is a bug
+ */
+ if (bt_stack_mask(bt) != 0) {
+ verbose(env, "BUG stack slots %llx\n", bt_stack_mask(bt));
+ WARN_ONCE(1, "verifier backtracking bug (subprog leftover stack slots)");
+ return -EFAULT;
+ }
/* propagate r1-r5 to the caller */
for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
if (bt_is_reg_set(bt, i)) {
@@ -3641,8 +3723,11 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
WARN_ONCE(1, "verifier backtracking bug");
return -EFAULT;
}
- if (bt_stack_mask(bt) != 0)
- return -ENOTSUPP;
+ if (bt_stack_mask(bt) != 0) {
+ verbose(env, "BUG stack slots %llx\n", bt_stack_mask(bt));
+ WARN_ONCE(1, "verifier backtracking bug (callback leftover stack slots)");
+ return -EFAULT;
+ }
/* clear r1-r5 in callback subprog's mask */
for (i = BPF_REG_1; i <= BPF_REG_5; i++)
bt_clear_reg(bt, i);
@@ -4079,6 +4164,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
for (;;) {
DECLARE_BITMAP(mask, 64);
u32 history = st->jmp_history_cnt;
+ struct bpf_jmp_history_entry *hist;
if (env->log.level & BPF_LOG_LEVEL2) {
verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n",
@@ -4142,7 +4228,8 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
err = 0;
skip_first = false;
} else {
- err = backtrack_insn(env, i, subseq_idx, bt);
+ hist = get_jmp_hist_entry(st, history, i);
+ err = backtrack_insn(env, i, subseq_idx, hist, bt);
}
if (err == -ENOTSUPP) {
mark_all_scalars_precise(env, env->cur_state);
@@ -4195,22 +4282,10 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr));
for_each_set_bit(i, mask, 64) {
if (i >= func->allocated_stack / BPF_REG_SIZE) {
- /* the sequence of instructions:
- * 2: (bf) r3 = r10
- * 3: (7b) *(u64 *)(r3 -8) = r0
- * 4: (79) r4 = *(u64 *)(r10 -8)
- * doesn't contain jmps. It's backtracked
- * as a single block.
- * During backtracking insn 3 is not recognized as
- * stack access, so at the end of backtracking
- * stack slot fp-8 is still marked in stack_mask.
- * However the parent state may not have accessed
- * fp-8 and it's "unallocated" stack space.
- * In such case fallback to conservative.
- */
- mark_all_scalars_precise(env, env->cur_state);
- bt_reset(bt);
- return 0;
+ verbose(env, "BUG backtracking (stack slot %d, total slots %d)\n",
+ i, func->allocated_stack / BPF_REG_SIZE);
+ WARN_ONCE(1, "verifier backtracking bug (stack slot out of bounds)");
+ return -EFAULT;
}
if (!is_spilled_scalar_reg(&func->stack[i])) {
@@ -4347,7 +4422,8 @@ static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_
dst->live = live;
}
-static void save_register_state(struct bpf_func_state *state,
+static void save_register_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *state,
int spi, struct bpf_reg_state *reg,
int size)
{
@@ -4362,7 +4438,7 @@ static void save_register_state(struct bpf_func_state *state,
/* size < 8 bytes spill */
for (; i; i--)
- scrub_spilled_slot(&state->stack[spi].slot_type[i - 1]);
+ mark_stack_slot_misc(env, &state->stack[spi].slot_type[i - 1]);
}
static bool is_bpf_st_mem(struct bpf_insn *insn)
@@ -4383,16 +4459,13 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
struct bpf_reg_state *reg = NULL;
- u32 dst_reg = insn->dst_reg;
+ int insn_flags = insn_stack_access_flags(state->frameno, spi);
- err = grow_stack_state(state, round_up(slot + 1, BPF_REG_SIZE));
- if (err)
- return err;
/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
* so it's aligned access and [off, off + size) are within stack limits
*/
if (!env->allow_ptr_leaks &&
- state->stack[spi].slot_type[0] == STACK_SPILL &&
+ is_spilled_reg(&state->stack[spi]) &&
size != BPF_REG_SIZE) {
verbose(env, "attempt to corrupt spilled pointer on stack\n");
return -EACCES;
@@ -4422,20 +4495,8 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
return err;
mark_stack_slot_scratched(env, spi);
- if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) &&
- !register_is_null(reg) && env->bpf_capable) {
- if (dst_reg != BPF_REG_FP) {
- /* The backtracking logic can only recognize explicit
- * stack slot address like [fp - 8]. Other spill of
- * scalar via different register has to be conservative.
- * Backtrack from here and mark all registers as precise
- * that contributed into 'reg' being a constant.
- */
- err = mark_chain_precision(env, value_regno);
- if (err)
- return err;
- }
- save_register_state(state, spi, reg, size);
+ if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) && env->bpf_capable) {
+ save_register_state(env, state, spi, reg, size);
/* Break the relation on a narrowing spill. */
if (fls64(reg->umax_value) > BITS_PER_BYTE * size)
state->stack[spi].spilled_ptr.id = 0;
@@ -4445,7 +4506,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
__mark_reg_known(&fake_reg, insn->imm);
fake_reg.type = SCALAR_VALUE;
- save_register_state(state, spi, &fake_reg, size);
+ save_register_state(env, state, spi, &fake_reg, size);
} else if (reg && is_spillable_regtype(reg->type)) {
/* register containing pointer is being spilled into stack */
if (size != BPF_REG_SIZE) {
@@ -4457,7 +4518,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
verbose(env, "cannot spill pointers to stack into stack frame of the caller\n");
return -EINVAL;
}
- save_register_state(state, spi, reg, size);
+ save_register_state(env, state, spi, reg, size);
} else {
u8 type = STACK_MISC;
@@ -4482,7 +4543,12 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
/* when we zero initialize stack slots mark them as such */
if ((reg && register_is_null(reg)) ||
(!reg && is_bpf_st_mem(insn) && insn->imm == 0)) {
- /* backtracking doesn't work for STACK_ZERO yet. */
+ /* STACK_ZERO case happened because register spill
+ * wasn't properly aligned at the stack slot boundary,
+ * so it's not a register spill anymore; force
+ * originating register to be precise to make
+ * STACK_ZERO correct for subsequent states
+ */
err = mark_chain_precision(env, value_regno);
if (err)
return err;
@@ -4491,9 +4557,12 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
/* Mark slots affected by this stack write. */
for (i = 0; i < size; i++)
- state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] =
- type;
+ state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = type;
+ insn_flags = 0; /* not a register spill */
}
+
+ if (insn_flags)
+ return push_jmp_history(env, env->cur_state, insn_flags);
return 0;
}
@@ -4543,10 +4612,6 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
(!value_reg && is_bpf_st_mem(insn) && insn->imm == 0))
writing_zero = true;
- err = grow_stack_state(state, round_up(-min_off, BPF_REG_SIZE));
- if (err)
- return err;
-
for (i = min_off; i < max_off; i++) {
int spi;
@@ -4645,21 +4710,10 @@ static void mark_reg_stack_read(struct bpf_verifier_env *env,
zeros++;
}
if (zeros == max_off - min_off) {
- /* any access_size read into register is zero extended,
- * so the whole register == const_zero
+ /* Any access_size read into register is zero extended,
+ * so the whole register == const_zero.
*/
- __mark_reg_const_zero(&state->regs[dst_regno]);
- /* backtracking doesn't support STACK_ZERO yet,
- * so mark it precise here, so that later
- * backtracking can stop here.
- * Backtracking may not need this if this register
- * doesn't participate in pointer adjustment.
- * Forward propagation of precise flag is not
- * necessary either. This mark is only to stop
- * backtracking. Any register that contributed
- * to const 0 was marked precise before spill.
- */
- state->regs[dst_regno].precise = true;
+ __mark_reg_const_zero(env, &state->regs[dst_regno]);
} else {
/* have read misc data from the stack */
mark_reg_unknown(env, state->regs, dst_regno);
@@ -4686,6 +4740,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
int i, slot = -off - 1, spi = slot / BPF_REG_SIZE;
struct bpf_reg_state *reg;
u8 *stype, type;
+ int insn_flags = insn_stack_access_flags(reg_state->frameno, spi);
stype = reg_state->stack[spi].slot_type;
reg = &reg_state->stack[spi].spilled_ptr;
@@ -4718,25 +4773,42 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
copy_register_state(&state->regs[dst_regno], reg);
state->regs[dst_regno].subreg_def = subreg_def;
} else {
+ int spill_cnt = 0, zero_cnt = 0;
+
for (i = 0; i < size; i++) {
type = stype[(slot - i) % BPF_REG_SIZE];
- if (type == STACK_SPILL)
+ if (type == STACK_SPILL) {
+ spill_cnt++;
continue;
+ }
if (type == STACK_MISC)
continue;
+ if (type == STACK_ZERO) {
+ zero_cnt++;
+ continue;
+ }
if (type == STACK_INVALID && env->allow_uninit_stack)
continue;
verbose(env, "invalid read from stack off %d+%d size %d\n",
off, i, size);
return -EACCES;
}
- mark_reg_unknown(env, state->regs, dst_regno);
+
+ if (spill_cnt == size &&
+ tnum_is_const(reg->var_off) && reg->var_off.value == 0) {
+ __mark_reg_const_zero(env, &state->regs[dst_regno]);
+ /* this IS register fill, so keep insn_flags */
+ } else if (zero_cnt == size) {
+ /* similarly to mark_reg_stack_read(), preserve zeroes */
+ __mark_reg_const_zero(env, &state->regs[dst_regno]);
+ insn_flags = 0; /* not restoring original register state */
+ } else {
+ mark_reg_unknown(env, state->regs, dst_regno);
+ insn_flags = 0; /* not restoring original register state */
+ }
}
state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
- return 0;
- }
-
- if (dst_regno >= 0) {
+ } else if (dst_regno >= 0) {
/* restore register state from stack */
copy_register_state(&state->regs[dst_regno], reg);
/* mark reg as written since spilled pointer state likely
@@ -4772,7 +4844,10 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
if (dst_regno >= 0)
mark_reg_stack_read(env, reg_state, off, off + size, dst_regno);
+ insn_flags = 0; /* we are not restoring spilled register */
}
+ if (insn_flags)
+ return push_jmp_history(env, env->cur_state, insn_flags);
return 0;
}
@@ -5701,20 +5776,6 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
strict);
}
-static int update_stack_depth(struct bpf_verifier_env *env,
- const struct bpf_func_state *func,
- int off)
-{
- u16 stack = env->subprog_info[func->subprogno].stack_depth;
-
- if (stack >= -off)
- return 0;
-
- /* update known max for given subprogram */
- env->subprog_info[func->subprogno].stack_depth = -off;
- return 0;
-}
-
/* starting from main bpf function walk all instructions of the function
* and recursively walk all callees that given function can call.
* Ignore jump and exit insns.
@@ -6504,13 +6565,14 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
* The minimum valid offset is -MAX_BPF_STACK for writes, and
* -state->allocated_stack for reads.
*/
-static int check_stack_slot_within_bounds(int off,
- struct bpf_func_state *state,
- enum bpf_access_type t)
+static int check_stack_slot_within_bounds(struct bpf_verifier_env *env,
+ s64 off,
+ struct bpf_func_state *state,
+ enum bpf_access_type t)
{
int min_valid_off;
- if (t == BPF_WRITE)
+ if (t == BPF_WRITE || env->allow_uninit_stack)
min_valid_off = -MAX_BPF_STACK;
else
min_valid_off = -state->allocated_stack;
@@ -6533,7 +6595,7 @@ static int check_stack_access_within_bounds(
struct bpf_reg_state *regs = cur_regs(env);
struct bpf_reg_state *reg = regs + regno;
struct bpf_func_state *state = func(env, reg);
- int min_off, max_off;
+ s64 min_off, max_off;
int err;
char *err_extra;
@@ -6546,11 +6608,8 @@ static int check_stack_access_within_bounds(
err_extra = " write to";
if (tnum_is_const(reg->var_off)) {
- min_off = reg->var_off.value + off;
- if (access_size > 0)
- max_off = min_off + access_size - 1;
- else
- max_off = min_off;
+ min_off = (s64)reg->var_off.value + off;
+ max_off = min_off + access_size;
} else {
if (reg->smax_value >= BPF_MAX_VAR_OFF ||
reg->smin_value <= -BPF_MAX_VAR_OFF) {
@@ -6559,15 +6618,12 @@ static int check_stack_access_within_bounds(
return -EACCES;
}
min_off = reg->smin_value + off;
- if (access_size > 0)
- max_off = reg->smax_value + off + access_size - 1;
- else
- max_off = min_off;
+ max_off = reg->smax_value + off + access_size;
}
- err = check_stack_slot_within_bounds(min_off, state, type);
- if (!err)
- err = check_stack_slot_within_bounds(max_off, state, type);
+ err = check_stack_slot_within_bounds(env, min_off, state, type);
+ if (!err && max_off > 0)
+ err = -EINVAL; /* out of stack access into non-negative offsets */
if (err) {
if (tnum_is_const(reg->var_off)) {
@@ -6577,11 +6633,16 @@ static int check_stack_access_within_bounds(
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "invalid variable-offset%s stack R%d var_off=%s size=%d\n",
- err_extra, regno, tn_buf, access_size);
+ verbose(env, "invalid variable-offset%s stack R%d var_off=%s off=%d size=%d\n",
+ err_extra, regno, tn_buf, off, access_size);
}
+ return err;
}
- return err;
+
+ /* Note that there is no stack access with offset zero, so the needed stack
+ * size is -min_off, not -min_off+1.
+ */
+ return grow_stack_state(env, state, -min_off /* size */);
}
/* check whether memory at (regno + off) is accessible for t = (read | write)
@@ -6596,7 +6657,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
{
struct bpf_reg_state *regs = cur_regs(env);
struct bpf_reg_state *reg = regs + regno;
- struct bpf_func_state *state;
int size, err = 0;
size = bpf_size_to_bytes(bpf_size);
@@ -6739,11 +6799,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (err)
return err;
- state = func(env, reg);
- err = update_stack_depth(env, state, off);
- if (err)
- return err;
-
if (t == BPF_READ)
err = check_stack_read(env, regno, off, size,
value_regno);
@@ -6932,13 +6987,13 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
BPF_SIZE(insn->code), BPF_WRITE, -1, true, false);
if (err)
return err;
-
return 0;
}
/* When register 'regno' is used to read the stack (either directly or through
* a helper function) make sure that it's within stack boundary and, depending
- * on the access type, that all elements of the stack are initialized.
+ * on the access type and privileges, that all elements of the stack are
+ * initialized.
*
* 'off' includes 'regno->off', but not its dynamic part (if any).
*
@@ -7046,8 +7101,11 @@ static int check_stack_range_initialized(
slot = -i - 1;
spi = slot / BPF_REG_SIZE;
- if (state->allocated_stack <= slot)
- goto err;
+ if (state->allocated_stack <= slot) {
+ verbose(env, "verifier bug: allocated_stack too small");
+ return -EFAULT;
+ }
+
stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
if (*stype == STACK_MISC)
goto mark;
@@ -7071,7 +7129,6 @@ static int check_stack_range_initialized(
goto mark;
}
-err:
if (tnum_is_const(reg->var_off)) {
verbose(env, "invalid%s read from stack R%d off %d+%d size %d\n",
err_extra, regno, min_off, i - min_off, access_size);
@@ -7096,7 +7153,7 @@ mark:
* helper may write to the entire memory range.
*/
}
- return update_stack_depth(env, state, min_off);
+ return 0;
}
static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
@@ -7192,6 +7249,12 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
}
}
+/* verify arguments to helpers or kfuncs consisting of a pointer and an access
+ * size.
+ *
+ * @regno is the register containing the access size. regno-1 is the register
+ * containing the pointer.
+ */
static int check_mem_size_reg(struct bpf_verifier_env *env,
struct bpf_reg_state *reg, u32 regno,
bool zero_size_allowed,
@@ -7893,7 +7956,7 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
/* switch to DRAINED state, but keep the depth unchanged */
/* mark current iter state as drained and assume returned NULL */
cur_iter->iter.state = BPF_ITER_STATE_DRAINED;
- __mark_reg_const_zero(&cur_fr->regs[BPF_REG_0]);
+ __mark_reg_const_zero(env, &cur_fr->regs[BPF_REG_0]);
return 0;
}
@@ -9396,7 +9459,7 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env,
return err;
callee->in_callback_fn = true;
- callee->callback_ret_range = tnum_range(0, 1);
+ callee->callback_ret_range = retval_range(0, 1);
return 0;
}
@@ -9418,7 +9481,7 @@ static int set_loop_callback_state(struct bpf_verifier_env *env,
__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_callback_fn = true;
- callee->callback_ret_range = tnum_range(0, 1);
+ callee->callback_ret_range = retval_range(0, 1);
return 0;
}
@@ -9448,7 +9511,7 @@ static int set_timer_callback_state(struct bpf_verifier_env *env,
__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_async_callback_fn = true;
- callee->callback_ret_range = tnum_range(0, 1);
+ callee->callback_ret_range = retval_range(0, 1);
return 0;
}
@@ -9476,7 +9539,7 @@ static int set_find_vma_callback_state(struct bpf_verifier_env *env,
__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_callback_fn = true;
- callee->callback_ret_range = tnum_range(0, 1);
+ callee->callback_ret_range = retval_range(0, 1);
return 0;
}
@@ -9499,7 +9562,7 @@ static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env,
__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_callback_fn = true;
- callee->callback_ret_range = tnum_range(0, 1);
+ callee->callback_ret_range = retval_range(0, 1);
return 0;
}
@@ -9531,7 +9594,7 @@ static int set_rbtree_add_callback_state(struct bpf_verifier_env *env,
__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_callback_fn = true;
- callee->callback_ret_range = tnum_range(0, 1);
+ callee->callback_ret_range = retval_range(0, 1);
return 0;
}
@@ -9560,6 +9623,11 @@ static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env)
return is_rbtree_lock_required_kfunc(kfunc_btf_id);
}
+static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg)
+{
+ return range.minval <= reg->smin_value && reg->smax_value <= range.maxval;
+}
+
static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
{
struct bpf_verifier_state *state = env->cur_state, *prev_st;
@@ -9583,15 +9651,21 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
caller = state->frame[state->curframe - 1];
if (callee->in_callback_fn) {
- /* enforce R0 return value range [0, 1]. */
- struct tnum range = callee->callback_ret_range;
-
if (r0->type != SCALAR_VALUE) {
verbose(env, "R0 not a scalar value\n");
return -EACCES;
}
- if (!tnum_in(range, r0->var_off)) {
- verbose_invalid_scalar(env, r0, &range, "callback return", "R0");
+
+ /* we are going to rely on register's precise value */
+ err = mark_reg_read(env, r0, r0->parent, REG_LIVE_READ64);
+ err = err ?: mark_chain_precision(env, BPF_REG_0);
+ if (err)
+ return err;
+
+ /* enforce R0 return value range */
+ if (!retval_range_within(callee->callback_ret_range, r0)) {
+ verbose_invalid_scalar(env, r0, callee->callback_ret_range,
+ "At callback return", "R0");
return -EINVAL;
}
if (!calls_callback(env, callee->callsite)) {
@@ -11805,7 +11879,7 @@ static int fetch_kfunc_meta(struct bpf_verifier_env *env,
return 0;
}
-static int check_return_code(struct bpf_verifier_env *env, int regno);
+static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name);
static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx_p)
@@ -11942,7 +12016,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
* to bpf_throw becomes the return value of the program.
*/
if (!env->exception_callback_subprog) {
- err = check_return_code(env, BPF_REG_1);
+ err = check_return_code(env, BPF_REG_1, "R1");
if (err < 0)
return err;
}
@@ -14972,12 +15046,13 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return 0;
}
-static int check_return_code(struct bpf_verifier_env *env, int regno)
+static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name)
{
+ const char *exit_ctx = "At program exit";
struct tnum enforce_attach_type_range = tnum_unknown;
const struct bpf_prog *prog = env->prog;
struct bpf_reg_state *reg;
- struct tnum range = tnum_range(0, 1), const_0 = tnum_const(0);
+ struct bpf_retval_range range = retval_range(0, 1);
enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
int err;
struct bpf_func_state *frame = env->cur_state->frame[0];
@@ -15019,17 +15094,9 @@ static int check_return_code(struct bpf_verifier_env *env, int regno)
if (frame->in_async_callback_fn) {
/* enforce return zero from async callbacks like timer */
- if (reg->type != SCALAR_VALUE) {
- verbose(env, "In async callback the register R%d is not a known value (%s)\n",
- regno, reg_type_str(env, reg->type));
- return -EINVAL;
- }
-
- if (!tnum_in(const_0, reg->var_off)) {
- verbose_invalid_scalar(env, reg, &const_0, "async callback", "R0");
- return -EINVAL;
- }
- return 0;
+ exit_ctx = "At async callback return";
+ range = retval_range(0, 0);
+ goto enforce_retval;
}
if (is_subprog && !frame->in_exception_callback_fn) {
@@ -15052,14 +15119,14 @@ static int check_return_code(struct bpf_verifier_env *env, int regno)
env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME ||
env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME ||
env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETSOCKNAME)
- range = tnum_range(1, 1);
+ range = retval_range(1, 1);
if (env->prog->expected_attach_type == BPF_CGROUP_INET4_BIND ||
env->prog->expected_attach_type == BPF_CGROUP_INET6_BIND)
- range = tnum_range(0, 3);
+ range = retval_range(0, 3);
break;
case BPF_PROG_TYPE_CGROUP_SKB:
if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) {
- range = tnum_range(0, 3);
+ range = retval_range(0, 3);
enforce_attach_type_range = tnum_range(2, 3);
}
break;
@@ -15072,13 +15139,13 @@ static int check_return_code(struct bpf_verifier_env *env, int regno)
case BPF_PROG_TYPE_RAW_TRACEPOINT:
if (!env->prog->aux->attach_btf_id)
return 0;
- range = tnum_const(0);
+ range = retval_range(0, 0);
break;
case BPF_PROG_TYPE_TRACING:
switch (env->prog->expected_attach_type) {
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
- range = tnum_const(0);
+ range = retval_range(0, 0);
break;
case BPF_TRACE_RAW_TP:
case BPF_MODIFY_RETURN:
@@ -15090,7 +15157,7 @@ static int check_return_code(struct bpf_verifier_env *env, int regno)
}
break;
case BPF_PROG_TYPE_SK_LOOKUP:
- range = tnum_range(SK_DROP, SK_PASS);
+ range = retval_range(SK_DROP, SK_PASS);
break;
case BPF_PROG_TYPE_LSM:
@@ -15104,12 +15171,12 @@ static int check_return_code(struct bpf_verifier_env *env, int regno)
/* Make sure programs that attach to void
* hooks don't try to modify return value.
*/
- range = tnum_range(1, 1);
+ range = retval_range(1, 1);
}
break;
case BPF_PROG_TYPE_NETFILTER:
- range = tnum_range(NF_DROP, NF_ACCEPT);
+ range = retval_range(NF_DROP, NF_ACCEPT);
break;
case BPF_PROG_TYPE_EXT:
/* freplace program can return anything as its return value
@@ -15119,15 +15186,21 @@ static int check_return_code(struct bpf_verifier_env *env, int regno)
return 0;
}
+enforce_retval:
if (reg->type != SCALAR_VALUE) {
- verbose(env, "At program exit the register R%d is not a known value (%s)\n",
- regno, reg_type_str(env, reg->type));
+ verbose(env, "%s the register R%d is not a known value (%s)\n",
+ exit_ctx, regno, reg_type_str(env, reg->type));
return -EINVAL;
}
- if (!tnum_in(range, reg->var_off)) {
- verbose_invalid_scalar(env, reg, &range, "program exit", "R0");
- if (prog->expected_attach_type == BPF_LSM_CGROUP &&
+ err = mark_chain_precision(env, regno);
+ if (err)
+ return err;
+
+ if (!retval_range_within(range, reg)) {
+ verbose_invalid_scalar(env, reg, range, exit_ctx, reg_name);
+ if (!is_subprog &&
+ prog->expected_attach_type == BPF_LSM_CGROUP &&
prog_type == BPF_PROG_TYPE_LSM &&
!prog->aux->attach_func_proto->type)
verbose(env, "Note, BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n");
@@ -16892,7 +16965,8 @@ hit:
* the precision needs to be propagated back in
* the current state.
*/
- err = err ? : push_jmp_history(env, cur);
+ if (is_jmp_point(env, env->insn_idx))
+ err = err ? : push_jmp_history(env, cur, 0);
err = err ? : propagate_precision(env, &sl->state);
if (err)
return err;
@@ -17117,6 +17191,9 @@ static int do_check(struct bpf_verifier_env *env)
u8 class;
int err;
+ /* reset current history entry on each new instruction */
+ env->cur_hist_ent = NULL;
+
env->prev_insn_idx = prev_insn_idx;
if (env->insn_idx >= insn_cnt) {
verbose(env, "invalid insn idx %d insn_cnt %d\n",
@@ -17156,7 +17233,7 @@ static int do_check(struct bpf_verifier_env *env)
}
if (is_jmp_point(env, env->insn_idx)) {
- err = push_jmp_history(env, state);
+ err = push_jmp_history(env, state, 0);
if (err)
return err;
}
@@ -17410,7 +17487,7 @@ process_bpf_exit_full:
continue;
}
- err = check_return_code(env, BPF_REG_0);
+ err = check_return_code(env, BPF_REG_0, "R0");
if (err)
return err;
process_bpf_exit:
@@ -17871,10 +17948,12 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
return -E2BIG;
}
+ if (env->prog->aux->sleepable)
+ atomic64_inc(&map->sleepable_refcnt);
/* hold the map. If the program is rejected by verifier,
* the map will be released by release_maps() or it
* will be used by the valid program until it's unloaded
- * and all maps are released in free_used_maps()
+ * and all maps are released in bpf_free_used_maps()
*/
bpf_map_inc(map);
@@ -19091,9 +19170,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
env->exception_callback_subprog = env->subprog_cnt - 1;
/* Don't update insn_cnt, as add_hidden_subprog always appends insns */
- env->subprog_info[env->exception_callback_subprog].is_cb = true;
- env->subprog_info[env->exception_callback_subprog].is_async_cb = true;
- env->subprog_info[env->exception_callback_subprog].is_exception_cb = true;
+ mark_subprog_exc_cb(env, env->exception_callback_subprog);
}
for (i = 0; i < insn_cnt; i++, insn++) {
@@ -19793,7 +19870,7 @@ static void free_states(struct bpf_verifier_env *env)
}
}
-static int do_check_common(struct bpf_verifier_env *env, int subprog, bool is_ex_cb)
+static int do_check_common(struct bpf_verifier_env *env, int subprog)
{
bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
struct bpf_verifier_state *state;
@@ -19824,9 +19901,23 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog, bool is_ex
regs = state->frame[state->curframe]->regs;
if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) {
- ret = btf_prepare_func_args(env, subprog, regs, is_ex_cb);
+ u32 nargs;
+
+ ret = btf_prepare_func_args(env, subprog, regs, &nargs);
if (ret)
goto out;
+ if (subprog_is_exc_cb(env, subprog)) {
+ state->frame[0]->in_exception_callback_fn = true;
+ /* We have already ensured that the callback returns an integer, just
+ * like all global subprogs. We need to determine it only has a single
+ * scalar argument.
+ */
+ if (nargs != 1 || regs[BPF_REG_1].type != SCALAR_VALUE) {
+ verbose(env, "exception cb only supports single integer argument\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ }
for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
if (regs[i].type == PTR_TO_CTX)
mark_reg_known_zero(env, regs, i);
@@ -19840,12 +19931,6 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog, bool is_ex
regs[i].id = ++env->id_gen;
}
}
- if (is_ex_cb) {
- state->frame[0]->in_exception_callback_fn = true;
- env->subprog_info[subprog].is_cb = true;
- env->subprog_info[subprog].is_async_cb = true;
- env->subprog_info[subprog].is_exception_cb = true;
- }
} else {
/* 1st arg to a function */
regs[BPF_REG_1].type = PTR_TO_CTX;
@@ -19925,7 +20010,7 @@ again:
env->insn_idx = env->subprog_info[i].start;
WARN_ON_ONCE(env->insn_idx == 0);
- ret = do_check_common(env, i, env->exception_callback_subprog == i);
+ ret = do_check_common(env, i);
if (ret) {
return ret;
} else if (env->log.level & BPF_LOG_LEVEL) {
@@ -19955,7 +20040,7 @@ static int do_check_main(struct bpf_verifier_env *env)
int ret;
env->insn_idx = 0;
- ret = do_check_common(env, 0, false);
+ ret = do_check_common(env, 0);
if (!ret)
env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
return ret;
@@ -20509,7 +20594,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
env->prog = *prog;
env->ops = bpf_verifier_ops[env->prog->type];
env->fd_array = make_bpfptr(attr->fd_array, uattr.is_kernel);
- is_priv = bpf_capable();
+
+ env->allow_ptr_leaks = bpf_allow_ptr_leaks(env->prog->aux->token);
+ env->allow_uninit_stack = bpf_allow_uninit_stack(env->prog->aux->token);
+ env->bypass_spec_v1 = bpf_bypass_spec_v1(env->prog->aux->token);
+ env->bypass_spec_v4 = bpf_bypass_spec_v4(env->prog->aux->token);
+ env->bpf_capable = is_priv = bpf_token_capable(env->prog->aux->token, CAP_BPF);
bpf_get_btf_vmlinux();
@@ -20541,12 +20631,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
if (attr->prog_flags & BPF_F_ANY_ALIGNMENT)
env->strict_alignment = false;
- env->allow_ptr_leaks = bpf_allow_ptr_leaks();
- env->allow_uninit_stack = bpf_allow_uninit_stack();
- env->bypass_spec_v1 = bpf_bypass_spec_v1();
- env->bypass_spec_v4 = bpf_bypass_spec_v4();
- env->bpf_capable = bpf_capable();
-
if (is_priv)
env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
env->test_reg_invariants = attr->prog_flags & BPF_F_TEST_REG_INVARIANTS;