Diffstat (limited to 'kernel/bpf'): 29 files changed, 1718 insertions, 436 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 9b9c151b5c82..9762bdddf1de 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -16,7 +16,7 @@ obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o ifeq ($(CONFIG_MMU)$(CONFIG_64BIT),yy) -obj-$(CONFIG_BPF_SYSCALL) += arena.o +obj-$(CONFIG_BPF_SYSCALL) += arena.o range_tree.o endif obj-$(CONFIG_BPF_JIT) += dispatcher.o ifeq ($(CONFIG_NET),y) @@ -52,3 +52,4 @@ obj-$(CONFIG_BPF_PRELOAD) += preload/ obj-$(CONFIG_BPF_SYSCALL) += relo_core.o obj-$(CONFIG_BPF_SYSCALL) += btf_iter.o obj-$(CONFIG_BPF_SYSCALL) += btf_relocate.o +obj-$(CONFIG_BPF_SYSCALL) += kmem_cache_iter.o diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index e52b3ad231b9..945a5680f6a5 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -3,9 +3,11 @@ #include <linux/bpf.h> #include <linux/btf.h> #include <linux/err.h> +#include "linux/filter.h" #include <linux/btf_ids.h> #include <linux/vmalloc.h> #include <linux/pagemap.h> +#include "range_tree.h" /* * bpf_arena is a sparsely populated shared memory region between bpf program and @@ -45,7 +47,7 @@ struct bpf_arena { u64 user_vm_start; u64 user_vm_end; struct vm_struct *kern_vm; - struct maple_tree mt; + struct range_tree rt; struct list_head vma_list; struct mutex lock; }; @@ -98,6 +100,9 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr) u64 vm_range; int err = -ENOMEM; + if (!bpf_jit_supports_arena()) + return ERR_PTR(-EOPNOTSUPP); + if (attr->key_size || attr->value_size || attr->max_entries == 0 || /* BPF_F_MMAPABLE must be set */ !(attr->map_flags & BPF_F_MMAPABLE) || @@ -132,7 +137,8 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr) INIT_LIST_HEAD(&arena->vma_list); bpf_map_init_from_attr(&arena->map, attr); - mt_init_flags(&arena->mt, MT_FLAGS_ALLOC_RANGE); + range_tree_init(&arena->rt); + range_tree_set(&arena->rt, 0, attr->max_entries); mutex_init(&arena->lock); return &arena->map; @@ -183,7 +189,7 @@ static void arena_map_free(struct bpf_map *map) apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena), KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL); free_vm_area(arena->kern_vm); - mtree_destroy(&arena->mt); + range_tree_destroy(&arena->rt); bpf_map_area_free(arena); } @@ -274,20 +280,20 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) /* User space requested to segfault when page is not allocated by bpf prog */ return VM_FAULT_SIGSEGV; - ret = mtree_insert(&arena->mt, vmf->pgoff, MT_ENTRY, GFP_KERNEL); + ret = range_tree_clear(&arena->rt, vmf->pgoff, 1); if (ret) return VM_FAULT_SIGSEGV; /* Account into memcg of the process that created bpf_arena */ ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, &page); if (ret) { - mtree_erase(&arena->mt, vmf->pgoff); + range_tree_set(&arena->rt, vmf->pgoff, 1); return VM_FAULT_SIGSEGV; } ret = vm_area_map_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE, &page); if (ret) { - mtree_erase(&arena->mt, vmf->pgoff); + range_tree_set(&arena->rt, vmf->pgoff, 1); __free_page(page); return VM_FAULT_SIGSEGV; } @@ -444,12 +450,16 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt guard(mutex)(&arena->lock); - if (uaddr) - ret = mtree_insert_range(&arena->mt, pgoff, pgoff + page_cnt - 1, - MT_ENTRY, GFP_KERNEL); - else - ret = mtree_alloc_range(&arena->mt, &pgoff, MT_ENTRY, - page_cnt, 0, page_cnt_max - 1, GFP_KERNEL); + if (uaddr) { + ret = is_range_tree_set(&arena->rt, pgoff, 
page_cnt); + if (ret) + goto out_free_pages; + ret = range_tree_clear(&arena->rt, pgoff, page_cnt); + } else { + ret = pgoff = range_tree_find(&arena->rt, page_cnt); + if (pgoff >= 0) + ret = range_tree_clear(&arena->rt, pgoff, page_cnt); + } if (ret) goto out_free_pages; @@ -476,7 +486,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt kvfree(pages); return clear_lo32(arena->user_vm_start) + uaddr32; out: - mtree_erase(&arena->mt, pgoff); + range_tree_set(&arena->rt, pgoff, page_cnt); out_free_pages: kvfree(pages); return 0; @@ -516,7 +526,7 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt) pgoff = compute_pgoff(arena, uaddr); /* clear range */ - mtree_store_range(&arena->mt, pgoff, pgoff + page_cnt - 1, NULL, GFP_KERNEL); + range_tree_set(&arena->rt, pgoff, page_cnt); if (page_cnt > 1) /* bulk zap if multiple pages being freed */ diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 79660e3fca4c..6cdbb4c33d31 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -947,22 +947,44 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map, struct file *map_file, int fd) { struct bpf_prog *prog = bpf_prog_get(fd); + bool is_extended; if (IS_ERR(prog)) return prog; - if (!bpf_prog_map_compatible(map, prog)) { + if (prog->type == BPF_PROG_TYPE_EXT || + !bpf_prog_map_compatible(map, prog)) { bpf_prog_put(prog); return ERR_PTR(-EINVAL); } + mutex_lock(&prog->aux->ext_mutex); + is_extended = prog->aux->is_extended; + if (!is_extended) + prog->aux->prog_array_member_cnt++; + mutex_unlock(&prog->aux->ext_mutex); + if (is_extended) { + /* Extended prog can not be tail callee. It's to prevent a + * potential infinite loop like: + * tail callee prog entry -> tail callee prog subprog -> + * freplace prog entry --tailcall-> tail callee prog entry. 
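For illustration, a minimal BPF-side sketch of the cycle that the ext_mutex/is_extended/prog_array_member_cnt bookkeeping above rules out; the map, section and function names are invented for the example and are not part of the patch:

/* freplace_tailcall_loop.bpf.c - illustrative sketch only */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
        __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
        __uint(max_entries, 1);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(__u32));
} jmp_table SEC(".maps");

/* Global subprog that an freplace program can target. */
__noinline int subfunc(struct __sk_buff *skb)
{
        return skb->len;
}

/* Suppose this program were stored in jmp_table slot 0 ... */
SEC("tc")
int entry(struct __sk_buff *skb)
{
        return subfunc(skb);
}

/* ... and this extension were attached to subfunc(). Execution would then
 * cycle: entry -> subfunc -> freplace -> bpf_tail_call -> entry -> ...
 * With this hunk the prog_array insertion fails with -EBUSY once the prog
 * is extended; the counter maintained here presumably lets the freplace
 * attach path refuse the opposite ordering.
 */
SEC("freplace/subfunc")
int freplace_prog(struct __sk_buff *skb)
{
        bpf_tail_call(skb, &jmp_table, 0);
        return 0;
}

char _license[] SEC("license") = "GPL";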
+ */ + bpf_prog_put(prog); + return ERR_PTR(-EBUSY); + } + return prog; } static void prog_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { + struct bpf_prog *prog = ptr; + + mutex_lock(&prog->aux->ext_mutex); + prog->aux->prog_array_member_cnt--; + mutex_unlock(&prog->aux->ext_mutex); /* bpf_prog is freed after one RCU or tasks trace grace period */ - bpf_prog_put(ptr); + bpf_prog_put(prog); } static u32 prog_fd_array_sys_lookup_elem(void *ptr) diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 28efd0a3f220..20f05de92e9c 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -107,7 +107,7 @@ static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key, bpf_cgrp_storage_lock(); sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, - value, map_flags, GFP_ATOMIC); + value, map_flags, false, GFP_ATOMIC); bpf_cgrp_storage_unlock(); cgroup_put(cgroup); return PTR_ERR_OR_ZERO(sdata); @@ -181,7 +181,7 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup, if (!percpu_ref_is_dying(&cgroup->self.refcnt) && (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, - value, BPF_NOEXIST, gfp_flags); + value, BPF_NOEXIST, false, gfp_flags); unlock: bpf_cgrp_storage_unlock(); diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 29da6d3838f6..a51c82dee1bd 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -16,7 +16,6 @@ #include <uapi/linux/btf.h> #include <linux/bpf_lsm.h> #include <linux/btf_ids.h> -#include <linux/fdtable.h> #include <linux/rcupdate_trace.h> DEFINE_BPF_STORAGE_CACHE(inode_cache); @@ -100,7 +99,7 @@ static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key, sdata = bpf_local_storage_update(file_inode(fd_file(f)), (struct bpf_local_storage_map *)map, - value, map_flags, GFP_ATOMIC); + value, map_flags, false, GFP_ATOMIC); return PTR_ERR_OR_ZERO(sdata); } @@ -154,7 +153,7 @@ BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { sdata = bpf_local_storage_update( inode, (struct bpf_local_storage_map *)map, value, - BPF_NOEXIST, gfp_flags); + BPF_NOEXIST, false, gfp_flags); return IS_ERR(sdata) ? 
(unsigned long)NULL : (unsigned long)sdata->data; } diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index c938dea5ddbf..7e6a0af0afc1 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -73,7 +73,7 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem) struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, - void *value, bool charge_mem, gfp_t gfp_flags) + void *value, bool charge_mem, bool swap_uptrs, gfp_t gfp_flags) { struct bpf_local_storage_elem *selem; @@ -99,9 +99,12 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, } if (selem) { - if (value) + if (value) { + /* No need to call check_and_init_map_value as memory is zero init */ copy_map_value(&smap->map, SDATA(selem)->data, value); - /* No need to call check_and_init_map_value as memory is zero init */ + if (swap_uptrs) + bpf_obj_swap_uptrs(smap->map.record, SDATA(selem)->data, value); + } return selem; } @@ -209,8 +212,12 @@ static void __bpf_selem_free(struct bpf_local_storage_elem *selem, static void bpf_selem_free_rcu(struct rcu_head *rcu) { struct bpf_local_storage_elem *selem; + struct bpf_local_storage_map *smap; selem = container_of(rcu, struct bpf_local_storage_elem, rcu); + /* The bpf_local_storage_map_free will wait for rcu_barrier */ + smap = rcu_dereference_check(SDATA(selem)->smap, 1); + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); bpf_mem_cache_raw_free(selem); } @@ -226,16 +233,25 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem, struct bpf_local_storage_map *smap, bool reuse_now) { - bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); - if (!smap->bpf_ma) { + /* Only task storage has uptrs and task storage + * has moved to bpf_mem_alloc. Meaning smap->bpf_ma == true + * for task storage, so this bpf_obj_free_fields() won't unpin + * any uptr. + */ + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); __bpf_selem_free(selem, reuse_now); return; } - if (!reuse_now) { - call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu); - } else { + if (reuse_now) { + /* reuse_now == true only happens when the storage owner + * (e.g. task_struct) is being destructed or the map itself + * is being destructed (ie map_free). In both cases, + * no bpf prog can have a hold on the selem. It is + * safe to unpin the uptrs and free the selem now. + */ + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); /* Instead of using the vanilla call_rcu(), * bpf_mem_cache_free will be able to reuse selem * immediately. @@ -243,6 +259,26 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem, migrate_disable(); bpf_mem_cache_free(&smap->selem_ma, selem); migrate_enable(); + return; + } + + call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu); +} + +static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now) +{ + struct bpf_local_storage_elem *selem; + struct bpf_local_storage_map *smap; + struct hlist_node *n; + + /* The "_safe" iteration is needed. + * The loop is not removing the selem from the list + * but bpf_selem_free will use the selem->rcu_head + * which is union-ized with the selem->free_node. 
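A reduced sketch of the constraint this comment describes: the free-list linkage shares storage with the RCU head, so the walk below has to cache the next node before the current one is handed to RCU (the names here are illustrative, not the kernel's):

#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct elem {
        union {
                struct rcu_head rcu;            /* used once the element is queued for freeing */
                struct hlist_node free_node;    /* used while parked on an on-stack free list */
        };
};

static void elem_free_rcu(struct rcu_head *rcu)
{
        kfree(container_of(rcu, struct elem, rcu));
}

static void free_all(struct hlist_head *list)
{
        struct hlist_node *n;
        struct elem *e;

        /* The "_safe" variant caches the next pointer in 'n' before the body
         * runs, so call_rcu() overwriting e->free_node through the union does
         * not break the traversal.
         */
        hlist_for_each_entry_safe(e, n, list, free_node)
                call_rcu(&e->rcu, elem_free_rcu);
}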
+ */ + hlist_for_each_entry_safe(selem, n, list, free_node) { + smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); + bpf_selem_free(selem, smap, reuse_now); } } @@ -252,7 +288,7 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem, */ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem, - bool uncharge_mem, bool reuse_now) + bool uncharge_mem, struct hlist_head *free_selem_list) { struct bpf_local_storage_map *smap; bool free_local_storage; @@ -296,7 +332,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor SDATA(selem)) RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); - bpf_selem_free(selem, smap, reuse_now); + hlist_add_head(&selem->free_node, free_selem_list); if (rcu_access_pointer(local_storage->smap) == smap) RCU_INIT_POINTER(local_storage->smap, NULL); @@ -345,6 +381,7 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, struct bpf_local_storage_map *storage_smap; struct bpf_local_storage *local_storage; bool bpf_ma, free_local_storage = false; + HLIST_HEAD(selem_free_list); unsigned long flags; if (unlikely(!selem_linked_to_storage_lockless(selem))) @@ -360,9 +397,11 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, raw_spin_lock_irqsave(&local_storage->lock, flags); if (likely(selem_linked_to_storage(selem))) free_local_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, true, reuse_now); + local_storage, selem, true, &selem_free_list); raw_spin_unlock_irqrestore(&local_storage->lock, flags); + bpf_selem_free_list(&selem_free_list, reuse_now); + if (free_local_storage) bpf_local_storage_free(local_storage, storage_smap, bpf_ma, reuse_now); } @@ -524,11 +563,12 @@ uncharge: */ struct bpf_local_storage_data * bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, - void *value, u64 map_flags, gfp_t gfp_flags) + void *value, u64 map_flags, bool swap_uptrs, gfp_t gfp_flags) { struct bpf_local_storage_data *old_sdata = NULL; struct bpf_local_storage_elem *alloc_selem, *selem = NULL; struct bpf_local_storage *local_storage; + HLIST_HEAD(old_selem_free_list); unsigned long flags; int err; @@ -550,7 +590,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (err) return ERR_PTR(err); - selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags); + selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags); if (!selem) return ERR_PTR(-ENOMEM); @@ -584,7 +624,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, /* A lookup has just been done before and concluded a new selem is * needed. The chance of an unnecessary alloc is unlikely. 
*/ - alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags); + alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags); if (!alloc_selem) return ERR_PTR(-ENOMEM); @@ -624,11 +664,12 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (old_sdata) { bpf_selem_unlink_map(SELEM(old_sdata)); bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata), - true, false); + true, &old_selem_free_list); } unlock: raw_spin_unlock_irqrestore(&local_storage->lock, flags); + bpf_selem_free_list(&old_selem_free_list, false); if (alloc_selem) { mem_uncharge(smap, owner, smap->elem_size); bpf_selem_free(alloc_selem, smap, true); @@ -706,6 +747,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) struct bpf_local_storage_map *storage_smap; struct bpf_local_storage_elem *selem; bool bpf_ma, free_storage = false; + HLIST_HEAD(free_selem_list); struct hlist_node *n; unsigned long flags; @@ -734,10 +776,12 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) * of the loop will set the free_cgroup_storage to true. */ free_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, true, true); + local_storage, selem, true, &free_selem_list); } raw_spin_unlock_irqrestore(&local_storage->lock, flags); + bpf_selem_free_list(&free_selem_list, true); + if (free_storage) bpf_local_storage_free(local_storage, storage_smap, bpf_ma, true); } @@ -883,6 +927,9 @@ void bpf_local_storage_map_free(struct bpf_map *map, synchronize_rcu(); if (smap->bpf_ma) { + rcu_barrier_tasks_trace(); + if (!rcu_trace_implies_rcu_gp()) + rcu_barrier(); bpf_mem_alloc_destroy(&smap->selem_ma); bpf_mem_alloc_destroy(&smap->storage_ma); } diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 6292ac5f9bd1..3bc61628ab25 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -339,10 +339,6 @@ BTF_ID(func, bpf_lsm_path_chmod) BTF_ID(func, bpf_lsm_path_chown) #endif /* CONFIG_SECURITY_PATH */ -#ifdef CONFIG_KEYS -BTF_ID(func, bpf_lsm_key_free) -#endif /* CONFIG_KEYS */ - BTF_ID(func, bpf_lsm_mmap_file) BTF_ID(func, bpf_lsm_netlink_send) BTF_ID(func, bpf_lsm_path_notify) diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index fda3dd2ee984..606efe32485a 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -23,7 +23,6 @@ struct bpf_struct_ops_value { struct bpf_struct_ops_map { struct bpf_map map; - struct rcu_head rcu; const struct bpf_struct_ops_desc *st_ops_desc; /* protect map_update */ struct mutex lock; @@ -32,7 +31,9 @@ struct bpf_struct_ops_map { * (in kvalue.data). */ struct bpf_link **links; - u32 links_cnt; + /* ksyms for bpf trampolines */ + struct bpf_ksym **ksyms; + u32 funcs_cnt; u32 image_pages_cnt; /* image_pages is an array of pages that has all the trampolines * that stores the func args before calling the bpf_prog. 
@@ -481,11 +482,11 @@ static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map) { u32 i; - for (i = 0; i < st_map->links_cnt; i++) { - if (st_map->links[i]) { - bpf_link_put(st_map->links[i]); - st_map->links[i] = NULL; - } + for (i = 0; i < st_map->funcs_cnt; i++) { + if (!st_map->links[i]) + break; + bpf_link_put(st_map->links[i]); + st_map->links[i] = NULL; } } @@ -586,6 +587,49 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, return 0; } +static void bpf_struct_ops_ksym_init(const char *tname, const char *mname, + void *image, unsigned int size, + struct bpf_ksym *ksym) +{ + snprintf(ksym->name, KSYM_NAME_LEN, "bpf__%s_%s", tname, mname); + INIT_LIST_HEAD_RCU(&ksym->lnode); + bpf_image_ksym_init(image, size, ksym); +} + +static void bpf_struct_ops_map_add_ksyms(struct bpf_struct_ops_map *st_map) +{ + u32 i; + + for (i = 0; i < st_map->funcs_cnt; i++) { + if (!st_map->ksyms[i]) + break; + bpf_image_ksym_add(st_map->ksyms[i]); + } +} + +static void bpf_struct_ops_map_del_ksyms(struct bpf_struct_ops_map *st_map) +{ + u32 i; + + for (i = 0; i < st_map->funcs_cnt; i++) { + if (!st_map->ksyms[i]) + break; + bpf_image_ksym_del(st_map->ksyms[i]); + } +} + +static void bpf_struct_ops_map_free_ksyms(struct bpf_struct_ops_map *st_map) +{ + u32 i; + + for (i = 0; i < st_map->funcs_cnt; i++) { + if (!st_map->ksyms[i]) + break; + kfree(st_map->ksyms[i]); + st_map->ksyms[i] = NULL; + } +} + static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, void *value, u64 flags) { @@ -601,6 +645,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, int prog_fd, err; u32 i, trampoline_start, image_off = 0; void *cur_image = NULL, *image = NULL; + struct bpf_link **plink; + struct bpf_ksym **pksym; + const char *tname, *mname; if (flags) return -EINVAL; @@ -639,14 +686,19 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, udata = &uvalue->data; kdata = &kvalue->data; + plink = st_map->links; + pksym = st_map->ksyms; + tname = btf_name_by_offset(st_map->btf, t->name_off); module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]); for_each_member(i, t, member) { const struct btf_type *mtype, *ptype; struct bpf_prog *prog; struct bpf_tramp_link *link; + struct bpf_ksym *ksym; u32 moff; moff = __btf_member_bit_offset(t, member) / 8; + mname = btf_name_by_offset(st_map->btf, member->name_off); ptype = btf_type_resolve_ptr(st_map->btf, member->type, NULL); if (ptype == module_type) { if (*(void **)(udata + moff)) @@ -714,7 +766,14 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, } bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_link_lops, prog); - st_map->links[i] = &link->link; + *plink++ = &link->link; + + ksym = kzalloc(sizeof(*ksym), GFP_USER); + if (!ksym) { + err = -ENOMEM; + goto reset_unlock; + } + *pksym++ = ksym; trampoline_start = image_off; err = bpf_struct_ops_prepare_trampoline(tlinks, link, @@ -735,6 +794,12 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, /* put prog_id to udata */ *(unsigned long *)(udata + moff) = prog->aux->id; + + /* init ksym for this trampoline */ + bpf_struct_ops_ksym_init(tname, mname, + image + trampoline_start, + image_off - trampoline_start, + ksym); } if (st_ops->validate) { @@ -783,6 +848,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, */ reset_unlock: + bpf_struct_ops_map_free_ksyms(st_map); bpf_struct_ops_map_free_image(st_map); 
bpf_struct_ops_map_put_progs(st_map); memset(uvalue, 0, map->value_size); @@ -790,6 +856,8 @@ reset_unlock: unlock: kfree(tlinks); mutex_unlock(&st_map->lock); + if (!err) + bpf_struct_ops_map_add_ksyms(st_map); return err; } @@ -849,7 +917,10 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map) if (st_map->links) bpf_struct_ops_map_put_progs(st_map); + if (st_map->ksyms) + bpf_struct_ops_map_free_ksyms(st_map); bpf_map_area_free(st_map->links); + bpf_map_area_free(st_map->ksyms); bpf_struct_ops_map_free_image(st_map); bpf_map_area_free(st_map->uvalue); bpf_map_area_free(st_map); @@ -866,6 +937,8 @@ static void bpf_struct_ops_map_free(struct bpf_map *map) if (btf_is_module(st_map->btf)) module_put(st_map->st_ops_desc->st_ops->owner); + bpf_struct_ops_map_del_ksyms(st_map); + /* The struct_ops's function may switch to another struct_ops. * * For example, bpf_tcp_cc_x->init() may switch to @@ -895,6 +968,19 @@ static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr) return 0; } +static u32 count_func_ptrs(const struct btf *btf, const struct btf_type *t) +{ + int i; + u32 count; + const struct btf_member *member; + + count = 0; + for_each_member(i, t, member) + if (btf_type_resolve_func_ptr(btf, member->type, NULL)) + count++; + return count; +} + static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) { const struct bpf_struct_ops_desc *st_ops_desc; @@ -961,11 +1047,15 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) map = &st_map->map; st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE); - st_map->links_cnt = btf_type_vlen(t); + st_map->funcs_cnt = count_func_ptrs(btf, t); st_map->links = - bpf_map_area_alloc(st_map->links_cnt * sizeof(struct bpf_links *), + bpf_map_area_alloc(st_map->funcs_cnt * sizeof(struct bpf_link *), + NUMA_NO_NODE); + + st_map->ksyms = + bpf_map_area_alloc(st_map->funcs_cnt * sizeof(struct bpf_ksym *), NUMA_NO_NODE); - if (!st_map->uvalue || !st_map->links) { + if (!st_map->uvalue || !st_map->links || !st_map->ksyms) { ret = -ENOMEM; goto errout_free; } @@ -994,7 +1084,8 @@ static u64 bpf_struct_ops_map_mem_usage(const struct bpf_map *map) usage = sizeof(*st_map) + vt->size - sizeof(struct bpf_struct_ops_value); usage += vt->size; - usage += btf_type_vlen(vt) * sizeof(struct bpf_links *); + usage += st_map->funcs_cnt * sizeof(struct bpf_link *); + usage += st_map->funcs_cnt * sizeof(struct bpf_ksym *); usage += PAGE_SIZE; return usage; } diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index adf6dfe0ba68..bf7fa15fdcc6 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -16,7 +16,6 @@ #include <linux/filter.h> #include <uapi/linux/btf.h> #include <linux/btf_ids.h> -#include <linux/fdtable.h> #include <linux/rcupdate_trace.h> DEFINE_BPF_STORAGE_CACHE(task_cache); @@ -129,6 +128,9 @@ static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, struct pid *pid; int fd, err; + if ((map_flags & BPF_F_LOCK) && btf_record_has_field(map->record, BPF_UPTR)) + return -EOPNOTSUPP; + fd = *(int *)key; pid = pidfd_get_pid(fd, &f_flags); if (IS_ERR(pid)) @@ -147,7 +149,7 @@ static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, bpf_task_storage_lock(); sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, map_flags, - GFP_ATOMIC); + true, GFP_ATOMIC); bpf_task_storage_unlock(); err = PTR_ERR_OR_ZERO(sdata); @@ -219,7 +221,7 @@ static void *__bpf_task_storage_get(struct bpf_map *map, (flags & 
BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) { sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, - BPF_NOEXIST, gfp_flags); + BPF_NOEXIST, false, gfp_flags); return IS_ERR(sdata) ? NULL : sdata->data; } diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 75e4fe83c509..e7a59e6462a9 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2808,7 +2808,7 @@ static void btf_ref_type_log(struct btf_verifier_env *env, btf_verifier_log(env, "type_id=%u", t->type); } -static struct btf_kind_operations modifier_ops = { +static const struct btf_kind_operations modifier_ops = { .check_meta = btf_ref_type_check_meta, .resolve = btf_modifier_resolve, .check_member = btf_modifier_check_member, @@ -2817,7 +2817,7 @@ static struct btf_kind_operations modifier_ops = { .show = btf_modifier_show, }; -static struct btf_kind_operations ptr_ops = { +static const struct btf_kind_operations ptr_ops = { .check_meta = btf_ref_type_check_meta, .resolve = btf_ptr_resolve, .check_member = btf_ptr_check_member, @@ -2858,7 +2858,7 @@ static void btf_fwd_type_log(struct btf_verifier_env *env, btf_verifier_log(env, "%s", btf_type_kflag(t) ? "union" : "struct"); } -static struct btf_kind_operations fwd_ops = { +static const struct btf_kind_operations fwd_ops = { .check_meta = btf_fwd_check_meta, .resolve = btf_df_resolve, .check_member = btf_df_check_member, @@ -3109,7 +3109,7 @@ static void btf_array_show(const struct btf *btf, const struct btf_type *t, __btf_array_show(btf, t, type_id, data, bits_offset, show); } -static struct btf_kind_operations array_ops = { +static const struct btf_kind_operations array_ops = { .check_meta = btf_array_check_meta, .resolve = btf_array_resolve, .check_member = btf_array_check_member, @@ -3334,7 +3334,7 @@ static int btf_find_struct(const struct btf *btf, const struct btf_type *t, } static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, - u32 off, int sz, struct btf_field_info *info) + u32 off, int sz, struct btf_field_info *info, u32 field_mask) { enum btf_field_type type; u32 res_id; @@ -3358,9 +3358,14 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, type = BPF_KPTR_REF; else if (!strcmp("percpu_kptr", __btf_name_by_offset(btf, t->name_off))) type = BPF_KPTR_PERCPU; + else if (!strcmp("uptr", __btf_name_by_offset(btf, t->name_off))) + type = BPF_UPTR; else return -EINVAL; + if (!(type & field_mask)) + return BTF_FIELD_IGNORE; + /* Get the base type */ t = btf_type_skip_modifiers(btf, t->type, &res_id); /* Only pointer to struct is allowed */ @@ -3502,7 +3507,7 @@ static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_ field_mask_test_name(BPF_REFCOUNT, "bpf_refcount"); /* Only return BPF_KPTR when all other types with matchable names fail */ - if (field_mask & BPF_KPTR && !__btf_type_is_struct(var_type)) { + if (field_mask & (BPF_KPTR | BPF_UPTR) && !__btf_type_is_struct(var_type)) { type = BPF_KPTR_REF; goto end; } @@ -3523,7 +3528,7 @@ end: * (i + 1) * elem_size * where i is the repeat index and elem_size is the size of an element. 
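To make the new BPF_UPTR plumbing concrete, a hedged sketch of how a task-storage value carrying a user pointer might be declared on the BPF side; "__uptr" stands for the btf_type_tag("uptr") annotation that btf_find_kptr() starts recognizing above, and the struct and map names are invented:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#define __uptr __attribute__((btf_type_tag("uptr")))

/* A user-space struct: must be non-empty and at most PAGE_SIZE. */
struct user_blob {
        int counters[64];
};

struct value_type {
        struct user_blob __uptr *blob;  /* user page pinned by the kernel, or NULL */
};

struct {
        __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
        __uint(map_flags, BPF_F_NO_PREALLOC);
        __type(key, int);
        __type(value, struct value_type);
} task_map SEC(".maps");

User space would point value_type.blob into its own memory and update the map through the syscall path (the swap_uptrs == true call in bpf_pid_task_storage_update_elem() above); BPF_F_LOCK updates are rejected for such maps, and the kernel-struct and size restrictions are enforced in btf_check_and_fixup_fields() further below.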
*/ -static int btf_repeat_fields(struct btf_field_info *info, +static int btf_repeat_fields(struct btf_field_info *info, int info_cnt, u32 field_cnt, u32 repeat_cnt, u32 elem_size) { u32 i, j; @@ -3535,6 +3540,7 @@ static int btf_repeat_fields(struct btf_field_info *info, case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: + case BPF_UPTR: case BPF_LIST_HEAD: case BPF_RB_ROOT: break; @@ -3543,6 +3549,12 @@ static int btf_repeat_fields(struct btf_field_info *info, } } + /* The type of struct size or variable size is u32, + * so the multiplication will not overflow. + */ + if (field_cnt * (repeat_cnt + 1) > info_cnt) + return -E2BIG; + cur = field_cnt; for (i = 0; i < repeat_cnt; i++) { memcpy(&info[cur], &info[0], field_cnt * sizeof(info[0])); @@ -3587,7 +3599,7 @@ static int btf_find_nested_struct(const struct btf *btf, const struct btf_type * info[i].off += off; if (nelems > 1) { - err = btf_repeat_fields(info, ret, nelems - 1, t->size); + err = btf_repeat_fields(info, info_cnt, ret, nelems - 1, t->size); if (err == 0) ret *= nelems; else @@ -3661,8 +3673,9 @@ static int btf_find_field_one(const struct btf *btf, case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: + case BPF_UPTR: ret = btf_find_kptr(btf, var_type, off, sz, - info_cnt ? &info[0] : &tmp); + info_cnt ? &info[0] : &tmp, field_mask); if (ret < 0) return ret; break; @@ -3681,10 +3694,10 @@ static int btf_find_field_one(const struct btf *btf, if (ret == BTF_FIELD_IGNORE) return 0; - if (nelems > info_cnt) + if (!info_cnt) return -E2BIG; if (nelems > 1) { - ret = btf_repeat_fields(info, 1, nelems - 1, sz); + ret = btf_repeat_fields(info, info_cnt, 1, nelems - 1, sz); if (ret < 0) return ret; } @@ -3985,6 +3998,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: + case BPF_UPTR: ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]); if (ret < 0) goto end; @@ -4044,12 +4058,28 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec) * Hence we only need to ensure that bpf_{list_head,rb_root} ownership * does not form cycles. 
*/ - if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & BPF_GRAPH_ROOT)) + if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & (BPF_GRAPH_ROOT | BPF_UPTR))) return 0; for (i = 0; i < rec->cnt; i++) { struct btf_struct_meta *meta; + const struct btf_type *t; u32 btf_id; + if (rec->fields[i].type == BPF_UPTR) { + /* The uptr only supports pinning one page and cannot + * point to a kernel struct + */ + if (btf_is_kernel(rec->fields[i].kptr.btf)) + return -EINVAL; + t = btf_type_by_id(rec->fields[i].kptr.btf, + rec->fields[i].kptr.btf_id); + if (!t->size) + return -EINVAL; + if (t->size > PAGE_SIZE) + return -E2BIG; + continue; + } + if (!(rec->fields[i].type & BPF_GRAPH_ROOT)) continue; btf_id = rec->fields[i].graph_root.value_btf_id; @@ -4185,7 +4215,7 @@ static void btf_struct_show(const struct btf *btf, const struct btf_type *t, __btf_struct_show(btf, t, type_id, data, bits_offset, show); } -static struct btf_kind_operations struct_ops = { +static const struct btf_kind_operations struct_ops = { .check_meta = btf_struct_check_meta, .resolve = btf_struct_resolve, .check_member = btf_struct_check_member, @@ -4353,7 +4383,7 @@ static void btf_enum_show(const struct btf *btf, const struct btf_type *t, btf_show_end_type(show); } -static struct btf_kind_operations enum_ops = { +static const struct btf_kind_operations enum_ops = { .check_meta = btf_enum_check_meta, .resolve = btf_df_resolve, .check_member = btf_enum_check_member, @@ -4456,7 +4486,7 @@ static void btf_enum64_show(const struct btf *btf, const struct btf_type *t, btf_show_end_type(show); } -static struct btf_kind_operations enum64_ops = { +static const struct btf_kind_operations enum64_ops = { .check_meta = btf_enum64_check_meta, .resolve = btf_df_resolve, .check_member = btf_enum_check_member, @@ -4534,7 +4564,7 @@ done: btf_verifier_log(env, ")"); } -static struct btf_kind_operations func_proto_ops = { +static const struct btf_kind_operations func_proto_ops = { .check_meta = btf_func_proto_check_meta, .resolve = btf_df_resolve, /* @@ -4592,7 +4622,7 @@ static int btf_func_resolve(struct btf_verifier_env *env, return 0; } -static struct btf_kind_operations func_ops = { +static const struct btf_kind_operations func_ops = { .check_meta = btf_func_check_meta, .resolve = btf_func_resolve, .check_member = btf_df_check_member, @@ -5560,7 +5590,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) goto free_aof; } - ret = btf_find_kptr(btf, t, 0, 0, &tmp); + ret = btf_find_kptr(btf, t, 0, 0, &tmp, BPF_KPTR); if (ret != BTF_FIELD_FOUND) continue; @@ -6558,7 +6588,10 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, if (prog_args_trusted(prog)) info->reg_type |= PTR_TRUSTED; - if (btf_param_match_suffix(btf, &args[arg], "__nullable")) + /* Raw tracepoint arguments always get marked as maybe NULL */ + if (bpf_prog_is_raw_tp(prog)) + info->reg_type |= PTR_MAYBE_NULL; + else if (btf_param_match_suffix(btf, &args[arg], "__nullable")) info->reg_type |= PTR_MAYBE_NULL; if (tgt_prog) { @@ -8961,6 +8994,7 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, if (!type) { bpf_log(ctx->log, "relo #%u: bad type id %u\n", relo_idx, relo->type_id); + kfree(specs); return -EINVAL; } diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index e7113d700b87..46e5db65dbc8 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -24,6 +24,23 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE); EXPORT_SYMBOL(cgroup_bpf_enabled_key); +/* + * cgroup bpf destruction 
makes heavy use of work items and there can be a lot + * of concurrent destructions. Use a separate workqueue so that cgroup bpf + * destruction work items don't end up filling up max_active of system_wq + * which may lead to deadlock. + */ +static struct workqueue_struct *cgroup_bpf_destroy_wq; + +static int __init cgroup_bpf_wq_init(void) +{ + cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy", 0, 1); + if (!cgroup_bpf_destroy_wq) + panic("Failed to alloc workqueue for cgroup bpf destroy.\n"); + return 0; +} +core_initcall(cgroup_bpf_wq_init); + /* __always_inline is necessary to prevent indirect call through run_prog * function pointer. */ @@ -334,7 +351,7 @@ static void cgroup_bpf_release_fn(struct percpu_ref *ref) struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt); INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release); - queue_work(system_wq, &cgrp->bpf.release_work); + queue_work(cgroup_bpf_destroy_wq, &cgrp->bpf.release_work); } /* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through @@ -1691,7 +1708,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { * returned value != 1 during execution. In all other cases 0 is returned. */ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, - struct ctl_table *table, int write, + const struct ctl_table *table, int write, char **buf, size_t *pcount, loff_t *ppos, enum cgroup_bpf_attach_type atype) { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5e77c58e0601..a2327c4fdc8b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -21,7 +21,7 @@ #include <linux/filter.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> -#include <linux/random.h> +#include <linux/prandom.h> #include <linux/bpf.h> #include <linux/btf.h> #include <linux/objtool.h> @@ -131,6 +131,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag INIT_LIST_HEAD_RCU(&fp->aux->ksym_prefix.lnode); #endif mutex_init(&fp->aux->used_maps_mutex); + mutex_init(&fp->aux->ext_mutex); mutex_init(&fp->aux->dst_mutex); return fp; @@ -3044,6 +3045,11 @@ bool __weak bpf_jit_supports_exceptions(void) return false; } +bool __weak bpf_jit_supports_private_stack(void) +{ + return false; +} + void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie) { } diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 9e0e3b0a18e4..7878be18e9d2 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -333,9 +333,11 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, struct xdp_frame **frames, int n, - struct net_device *dev) + struct net_device *tx_dev, + struct net_device *rx_dev) { - struct xdp_txq_info txq = { .dev = dev }; + struct xdp_txq_info txq = { .dev = tx_dev }; + struct xdp_rxq_info rxq = { .dev = rx_dev }; struct xdp_buff xdp; int i, nframes = 0; @@ -346,6 +348,7 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, xdp_convert_frame_to_buff(xdpf, &xdp); xdp.txq = &txq; + xdp.rxq = &rxq; act = bpf_prog_run_xdp(xdp_prog, &xdp); switch (act) { @@ -360,7 +363,7 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, bpf_warn_invalid_xdp_action(NULL, xdp_prog, act); fallthrough; case XDP_ABORTED: - trace_xdp_exception(dev, xdp_prog, act); + trace_xdp_exception(tx_dev, xdp_prog, act); fallthrough; case XDP_DROP: xdp_return_frame_rx_napi(xdpf); @@ -388,7 +391,7 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) } if 
(bq->xdp_prog) { - to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev); + to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev, bq->dev_rx); if (!to_send) goto out; } diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c index 70fb82bf1637..b77db7413f8c 100644 --- a/kernel/bpf/dispatcher.c +++ b/kernel/bpf/dispatcher.c @@ -154,7 +154,8 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, d->image = NULL; goto out; } - bpf_image_ksym_add(d->image, PAGE_SIZE, &d->ksym); + bpf_image_ksym_init(d->image, PAGE_SIZE, &d->ksym); + bpf_image_ksym_add(&d->ksym); } prev_num_progs = d->num_progs; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index b14b87463ee0..3ec941a0ea41 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -896,9 +896,12 @@ find_first_elem: static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) { check_and_free_fields(htab, l); + + migrate_disable(); if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr); bpf_mem_cache_free(&htab->ma, l); + migrate_enable(); } static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l) @@ -948,7 +951,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) if (htab_is_prealloc(htab)) { bpf_map_dec_elem_count(&htab->map); check_and_free_fields(htab, l); - __pcpu_freelist_push(&htab->freelist, &l->fnode); + pcpu_freelist_push(&htab->freelist, &l->fnode); } else { dec_elem_count(htab); htab_elem_free(htab, l); @@ -1018,7 +1021,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, */ pl_new = this_cpu_ptr(htab->extra_elems); l_new = *pl_new; - htab_put_fd_value(htab, old_elem); *pl_new = old_elem; } else { struct pcpu_freelist_node *l; @@ -1105,6 +1107,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, struct htab_elem *l_new = NULL, *l_old; struct hlist_nulls_head *head; unsigned long flags; + void *old_map_ptr; struct bucket *b; u32 key_size, hash; int ret; @@ -1183,12 +1186,27 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, hlist_nulls_add_head_rcu(&l_new->hash_node, head); if (l_old) { hlist_nulls_del_rcu(&l_old->hash_node); + + /* l_old has already been stashed in htab->extra_elems, free + * its special fields before it is available for reuse. Also + * save the old map pointer in htab of maps before unlock + * and release it after unlock. + */ + old_map_ptr = NULL; + if (htab_is_prealloc(htab)) { + if (map->ops->map_fd_put_ptr) + old_map_ptr = fd_htab_map_get_ptr(map, l_old); + check_and_free_fields(htab, l_old); + } + } + htab_unlock_bucket(htab, b, hash, flags); + if (l_old) { + if (old_map_ptr) + map->ops->map_fd_put_ptr(map, old_map_ptr, true); if (!htab_is_prealloc(htab)) free_htab_elem(htab, l_old); - else - check_and_free_fields(htab, l_old); } - ret = 0; + return 0; err: htab_unlock_bucket(htab, b, hash, flags); return ret; @@ -1432,15 +1450,15 @@ static long htab_map_delete_elem(struct bpf_map *map, void *key) return ret; l = lookup_elem_raw(head, hash, key, key_size); - - if (l) { + if (l) hlist_nulls_del_rcu(&l->hash_node); - free_htab_elem(htab, l); - } else { + else ret = -ENOENT; - } htab_unlock_bucket(htab, b, hash, flags); + + if (l) + free_htab_elem(htab, l); return ret; } @@ -1853,13 +1871,14 @@ again_nocopy: * may cause deadlock. See comments in function * prealloc_lru_pop(). Let us do bpf_lru_push_free() * after releasing the bucket lock. 
+ * + * For htab of maps, htab_put_fd_value() in + * free_htab_elem() may acquire a spinlock with bucket + * lock being held and it violates the lock rule, so + * invoke free_htab_elem() after unlock as well. */ - if (is_lru_map) { - l->batch_flink = node_to_free; - node_to_free = l; - } else { - free_htab_elem(htab, l); - } + l->batch_flink = node_to_free; + node_to_free = l; } dst_key += key_size; dst_val += value_size; @@ -1871,7 +1890,10 @@ again_nocopy: while (node_to_free) { l = node_to_free; node_to_free = node_to_free->batch_flink; - htab_lru_push_free(htab, l); + if (is_lru_map) + htab_lru_push_free(htab, l); + else + free_htab_elem(htab, l); } next_batch: diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1a43d06eab28..751c150f9e1c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -111,7 +111,7 @@ const struct bpf_func_proto bpf_map_pop_elem_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT, + .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE, }; BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value) @@ -124,7 +124,7 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT, + .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE, }; BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu) @@ -538,7 +538,7 @@ const struct bpf_func_proto bpf_strtol_proto = { .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED, + .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg4_size = sizeof(s64), }; @@ -566,7 +566,7 @@ const struct bpf_func_proto bpf_strtoul_proto = { .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED, + .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg4_size = sizeof(u64), }; @@ -1742,7 +1742,7 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT, + .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT | MEM_WRITE, }; BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, @@ -2522,6 +2522,25 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid) } /** + * bpf_task_from_vpid - Find a struct task_struct from its vpid by looking it up + * in the pid namespace of the current task. If a task is returned, it must + * either be stored in a map, or released with bpf_task_release(). + * @vpid: The vpid of the task being looked up. + */ +__bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid) +{ + struct task_struct *p; + + rcu_read_lock(); + p = find_task_by_vpid(vpid); + if (p) + p = bpf_task_acquire(p); + rcu_read_unlock(); + + return p; +} + +/** * bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data. 
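A short usage sketch for the bpf_task_from_vpid() kfunc added just above; the hook and program name are only examples:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

extern struct task_struct *bpf_task_from_vpid(s32 vpid) __ksym;
extern void bpf_task_release(struct task_struct *p) __ksym;

SEC("tp_btf/sys_enter")
int lookup_init_task(void *ctx)
{
        struct task_struct *p;

        p = bpf_task_from_vpid(1);      /* pid 1 as seen in the current pid namespace */
        if (!p)
                return 0;

        bpf_printk("pid=%d tgid=%d", p->pid, p->tgid);
        bpf_task_release(p);            /* an acquired task must be released or stored in a map */
        return 0;
}

char _license[] SEC("license") = "GPL";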
* @p: The dynptr whose data slice to retrieve * @offset: Offset into the dynptr @@ -2851,21 +2870,47 @@ struct bpf_iter_bits { __u64 __opaque[2]; } __aligned(8); +#define BITS_ITER_NR_WORDS_MAX 511 + struct bpf_iter_bits_kern { union { - unsigned long *bits; - unsigned long bits_copy; + __u64 *bits; + __u64 bits_copy; }; - u32 nr_bits; + int nr_bits; int bit; } __aligned(8); +/* On 64-bit hosts, unsigned long and u64 have the same size, so passing + * a u64 pointer and an unsigned long pointer to find_next_bit() will + * return the same result, as both point to the same 8-byte area. + * + * For 32-bit little-endian hosts, using a u64 pointer or unsigned long + * pointer also makes no difference. This is because the first iterated + * unsigned long is composed of bits 0-31 of the u64 and the second unsigned + * long is composed of bits 32-63 of the u64. + * + * However, for 32-bit big-endian hosts, this is not the case. The first + * iterated unsigned long will be bits 32-63 of the u64, so swap these two + * ulong values within the u64. + */ +static void swap_ulong_in_u64(u64 *bits, unsigned int nr) +{ +#if (BITS_PER_LONG == 32) && defined(__BIG_ENDIAN) + unsigned int i; + + for (i = 0; i < nr; i++) + bits[i] = (bits[i] >> 32) | ((u64)(u32)bits[i] << 32); +#endif +} + /** * bpf_iter_bits_new() - Initialize a new bits iterator for a given memory area * @it: The new bpf_iter_bits to be created * @unsafe_ptr__ign: A pointer pointing to a memory area to be iterated over * @nr_words: The size of the specified memory area, measured in 8-byte units. - * Due to the limitation of memalloc, it can't be greater than 512. + * The maximum value of @nr_words is @BITS_ITER_NR_WORDS_MAX. This limit may be + * further reduced by the BPF memory allocator implementation. * * This function initializes a new bpf_iter_bits structure for iterating over * a memory area which is specified by the @unsafe_ptr__ign and @nr_words. It @@ -2892,6 +2937,8 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w if (!unsafe_ptr__ign || !nr_words) return -EINVAL; + if (nr_words > BITS_ITER_NR_WORDS_MAX) + return -E2BIG; /* Optimization for u64 mask */ if (nr_bits == 64) { @@ -2899,10 +2946,15 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w if (err) return -EFAULT; + swap_ulong_in_u64(&kit->bits_copy, nr_words); + kit->nr_bits = nr_bits; return 0; } + if (bpf_mem_alloc_check_size(false, nr_bytes)) + return -E2BIG; + /* Fallback to memalloc */ kit->bits = bpf_mem_alloc(&bpf_global_ma, nr_bytes); if (!kit->bits) @@ -2914,6 +2966,8 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w return err; } + swap_ulong_in_u64(kit->bits, nr_words); + kit->nr_bits = nr_bits; return 0; } @@ -2930,17 +2984,16 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w __bpf_kfunc int *bpf_iter_bits_next(struct bpf_iter_bits *it) { struct bpf_iter_bits_kern *kit = (void *)it; - u32 nr_bits = kit->nr_bits; - const unsigned long *bits; - int bit; + int bit = kit->bit, nr_bits = kit->nr_bits; + const void *bits; - if (nr_bits == 0) + if (!nr_bits || bit >= nr_bits) return NULL; bits = nr_bits == 64 ? 
&kit->bits_copy : kit->bits; - bit = find_next_bit(bits, nr_bits, kit->bit + 1); + bit = find_next_bit(bits, nr_bits, bit + 1); if (bit >= nr_bits) { - kit->nr_bits = 0; + kit->bit = bit; return NULL; } @@ -3034,7 +3087,9 @@ BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU) BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL) #endif BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_task_from_vpid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_throw) +BTF_ID_FLAGS(func, bpf_send_signal_task, KF_TRUSTED_ARGS) BTF_KFUNCS_END(generic_btf_ids) static const struct btf_kfunc_id_set generic_kfunc_set = { @@ -3052,8 +3107,8 @@ BTF_ID(func, bpf_cgroup_release_dtor) #endif BTF_KFUNCS_START(common_btf_ids) -BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx) -BTF_ID_FLAGS(func, bpf_rdonly_cast) +BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx, KF_FASTCALL) +BTF_ID_FLAGS(func, bpf_rdonly_cast, KF_FASTCALL) BTF_ID_FLAGS(func, bpf_rcu_read_lock) BTF_ID_FLAGS(func, bpf_rcu_read_unlock) BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL) @@ -3090,6 +3145,10 @@ BTF_ID_FLAGS(func, bpf_iter_bits_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_bits_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_bits_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_copy_from_user_str, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_get_kmem_cache) +BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index d8fc5eba529d..9aaf5124648b 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -880,7 +880,7 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) const struct btf_type *enum_t; const char *enum_pfx; u64 *delegate_msk, msk = 0; - char *p; + char *p, *str; int val; /* ignore errors, fallback to hex */ @@ -911,7 +911,8 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) return -EINVAL; } - while ((p = strsep(¶m->string, ":"))) { + str = param->string; + while ((p = strsep(&str, ":"))) { if (strcmp(p, "any") == 0) { msk |= ~0ULL; } else if (find_btf_enum_const(info.btf, enum_t, enum_pfx, p, &val)) { diff --git a/kernel/bpf/kmem_cache_iter.c b/kernel/bpf/kmem_cache_iter.c new file mode 100644 index 000000000000..3ae2158d767f --- /dev/null +++ b/kernel/bpf/kmem_cache_iter.c @@ -0,0 +1,238 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2024 Google */ +#include <linux/bpf.h> +#include <linux/btf_ids.h> +#include <linux/slab.h> +#include <linux/kernel.h> +#include <linux/seq_file.h> + +#include "../../mm/slab.h" /* kmem_cache, slab_caches and slab_mutex */ + +/* open-coded version */ +struct bpf_iter_kmem_cache { + __u64 __opaque[1]; +} __attribute__((aligned(8))); + +struct bpf_iter_kmem_cache_kern { + struct kmem_cache *pos; +} __attribute__((aligned(8))); + +#define KMEM_CACHE_POS_START ((void *)1L) + +__bpf_kfunc_start_defs(); + +__bpf_kfunc int bpf_iter_kmem_cache_new(struct bpf_iter_kmem_cache *it) +{ + struct bpf_iter_kmem_cache_kern *kit = (void *)it; + + BUILD_BUG_ON(sizeof(*kit) > sizeof(*it)); + BUILD_BUG_ON(__alignof__(*kit) != __alignof__(*it)); + + kit->pos = KMEM_CACHE_POS_START; + return 0; +} + +__bpf_kfunc struct kmem_cache *bpf_iter_kmem_cache_next(struct 
bpf_iter_kmem_cache *it) +{ + struct bpf_iter_kmem_cache_kern *kit = (void *)it; + struct kmem_cache *prev = kit->pos; + struct kmem_cache *next; + bool destroy = false; + + if (!prev) + return NULL; + + mutex_lock(&slab_mutex); + + if (list_empty(&slab_caches)) { + mutex_unlock(&slab_mutex); + return NULL; + } + + if (prev == KMEM_CACHE_POS_START) + next = list_first_entry(&slab_caches, struct kmem_cache, list); + else if (list_last_entry(&slab_caches, struct kmem_cache, list) == prev) + next = NULL; + else + next = list_next_entry(prev, list); + + /* boot_caches have negative refcount, don't touch them */ + if (next && next->refcount > 0) + next->refcount++; + + /* Skip kmem_cache_destroy() for active entries */ + if (prev && prev != KMEM_CACHE_POS_START) { + if (prev->refcount > 1) + prev->refcount--; + else if (prev->refcount == 1) + destroy = true; + } + + mutex_unlock(&slab_mutex); + + if (destroy) + kmem_cache_destroy(prev); + + kit->pos = next; + return next; +} + +__bpf_kfunc void bpf_iter_kmem_cache_destroy(struct bpf_iter_kmem_cache *it) +{ + struct bpf_iter_kmem_cache_kern *kit = (void *)it; + struct kmem_cache *s = kit->pos; + bool destroy = false; + + if (s == NULL || s == KMEM_CACHE_POS_START) + return; + + mutex_lock(&slab_mutex); + + /* Skip kmem_cache_destroy() for active entries */ + if (s->refcount > 1) + s->refcount--; + else if (s->refcount == 1) + destroy = true; + + mutex_unlock(&slab_mutex); + + if (destroy) + kmem_cache_destroy(s); +} + +__bpf_kfunc_end_defs(); + +struct bpf_iter__kmem_cache { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct kmem_cache *, s); +}; + +union kmem_cache_iter_priv { + struct bpf_iter_kmem_cache it; + struct bpf_iter_kmem_cache_kern kit; +}; + +static void *kmem_cache_iter_seq_start(struct seq_file *seq, loff_t *pos) +{ + loff_t cnt = 0; + bool found = false; + struct kmem_cache *s; + union kmem_cache_iter_priv *p = seq->private; + + mutex_lock(&slab_mutex); + + /* Find an entry at the given position in the slab_caches list instead + * of keeping a reference (of the last visited entry, if any) out of + * slab_mutex. It might miss something if one is deleted in the middle + * while it releases the lock. But it should be rare and there's not + * much we can do about it. + */ + list_for_each_entry(s, &slab_caches, list) { + if (cnt == *pos) { + /* Make sure this entry remains in the list by getting + * a new reference count. Note that boot_cache entries + * have a negative refcount, so don't touch them. 
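A sketch of a BPF iterator program for the new "kmem_cache" target, assuming vmlinux.h exposes struct bpf_iter__kmem_cache (layout as defined above) and struct kmem_cache:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("iter/kmem_cache")
int dump_kmem_cache(struct bpf_iter__kmem_cache *ctx)
{
        struct seq_file *seq = ctx->meta->seq;
        struct kmem_cache *s = ctx->s;

        if (!s)         /* NULL on the final stop() pass */
                return 0;

        BPF_SEQ_PRINTF(seq, "%s object_size=%u\n", s->name, s->object_size);
        return 0;
}

char _license[] SEC("license") = "GPL";

The same kfuncs also back an open-coded form from sleepable programs (they are registered KF_SLEEPABLE in helpers.c above), e.g. via the bpf_for_each() convenience macro used in the selftests.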
+ */ + if (s->refcount > 0) + s->refcount++; + found = true; + break; + } + cnt++; + } + mutex_unlock(&slab_mutex); + + if (!found) + s = NULL; + + p->kit.pos = s; + return s; +} + +static void kmem_cache_iter_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_iter__kmem_cache ctx = { + .meta = &meta, + .s = v, + }; + union kmem_cache_iter_priv *p = seq->private; + struct bpf_prog *prog; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, true); + if (prog && !ctx.s) + bpf_iter_run_prog(prog, &ctx); + + bpf_iter_kmem_cache_destroy(&p->it); +} + +static void *kmem_cache_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + union kmem_cache_iter_priv *p = seq->private; + + ++*pos; + + return bpf_iter_kmem_cache_next(&p->it); +} + +static int kmem_cache_iter_seq_show(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_iter__kmem_cache ctx = { + .meta = &meta, + .s = v, + }; + struct bpf_prog *prog; + int ret = 0; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, false); + if (prog) + ret = bpf_iter_run_prog(prog, &ctx); + + return ret; +} + +static const struct seq_operations kmem_cache_iter_seq_ops = { + .start = kmem_cache_iter_seq_start, + .next = kmem_cache_iter_seq_next, + .stop = kmem_cache_iter_seq_stop, + .show = kmem_cache_iter_seq_show, +}; + +BTF_ID_LIST_GLOBAL_SINGLE(bpf_kmem_cache_btf_id, struct, kmem_cache) + +static const struct bpf_iter_seq_info kmem_cache_iter_seq_info = { + .seq_ops = &kmem_cache_iter_seq_ops, + .seq_priv_size = sizeof(union kmem_cache_iter_priv), +}; + +static void bpf_iter_kmem_cache_show_fdinfo(const struct bpf_iter_aux_info *aux, + struct seq_file *seq) +{ + seq_puts(seq, "kmem_cache iter\n"); +} + +DEFINE_BPF_ITER_FUNC(kmem_cache, struct bpf_iter_meta *meta, + struct kmem_cache *s) + +static struct bpf_iter_reg bpf_kmem_cache_reg_info = { + .target = "kmem_cache", + .feature = BPF_ITER_RESCHED, + .show_fdinfo = bpf_iter_kmem_cache_show_fdinfo, + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__kmem_cache, s), + PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, + }, + .seq_info = &kmem_cache_iter_seq_info, +}; + +static int __init bpf_kmem_cache_iter_init(void) +{ + bpf_kmem_cache_reg_info.ctx_arg_info[0].btf_id = bpf_kmem_cache_btf_id[0]; + return bpf_iter_reg_target(&bpf_kmem_cache_reg_info); +} + +late_initcall(bpf_kmem_cache_iter_init); diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 5aebfc3051e3..4a858fdb6476 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -688,8 +688,7 @@ static void print_reg_state(struct bpf_verifier_env *env, if (t == SCALAR_VALUE && reg->precise) verbose(env, "P"); if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) { - /* reg->off should be 0 for SCALAR_VALUE */ - verbose_snum(env, reg->var_off.value + reg->off); + verbose_snum(env, reg->var_off.value); return; } diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 0218a5132ab5..9b60eda0f727 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -655,7 +655,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) if (!key || key->prefixlen > trie->max_prefixlen) goto find_leftmost; - node_stack = kmalloc_array(trie->max_prefixlen, + node_stack = kmalloc_array(trie->max_prefixlen + 1, sizeof(struct lpm_trie_node *), GFP_ATOMIC | __GFP_NOWARN); if (!node_stack) diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index b3858a76e0b3..889374722d0a 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -35,6 +35,8 
@@ */ #define LLIST_NODE_SZ sizeof(struct llist_node) +#define BPF_MEM_ALLOC_SIZE_MAX 4096 + /* similar to kmalloc, but sizeof == 8 bucket is gone */ static u8 size_index[24] __ro_after_init = { 3, /* 8 */ @@ -65,7 +67,7 @@ static u8 size_index[24] __ro_after_init = { static int bpf_mem_cache_idx(size_t size) { - if (!size || size > 4096) + if (!size || size > BPF_MEM_ALLOC_SIZE_MAX) return -1; if (size <= 192) @@ -252,11 +254,8 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic) static void free_one(void *obj, bool percpu) { - if (percpu) { + if (percpu) free_percpu(((void __percpu **)obj)[1]); - kfree(obj); - return; - } kfree(obj); } @@ -1005,3 +1004,13 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags) return !ret ? NULL : ret + LLIST_NODE_SZ; } + +int bpf_mem_alloc_check_size(bool percpu, size_t size) +{ + /* The size of percpu allocation doesn't have LLIST_NODE_SZ overhead */ + if ((percpu && size > BPF_MEM_ALLOC_SIZE_MAX) || + (!percpu && size > BPF_MEM_ALLOC_SIZE_MAX - LLIST_NODE_SZ)) + return -E2BIG; + + return 0; +} diff --git a/kernel/bpf/range_tree.c b/kernel/bpf/range_tree.c new file mode 100644 index 000000000000..5bdf9aadca3a --- /dev/null +++ b/kernel/bpf/range_tree.c @@ -0,0 +1,272 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#include <linux/interval_tree_generic.h> +#include <linux/slab.h> +#include <linux/bpf_mem_alloc.h> +#include <linux/bpf.h> +#include "range_tree.h" + +/* + * struct range_tree is a data structure used to allocate contiguous memory + * ranges in bpf arena. It's a large bitmap. The contiguous sequence of bits is + * represented by struct range_node or 'rn' for short. + * rn->rn_rbnode links it into an interval tree while + * rn->rb_range_size links it into a second rbtree sorted by size of the range. + * __find_range() performs binary search and best fit algorithm to find the + * range less or equal requested size. + * range_tree_clear/set() clears or sets a range of bits in this bitmap. The + * adjacent ranges are merged or split at the same time. + * + * The split/merge logic is based/borrowed from XFS's xbitmap32 added + * in commit 6772fcc8890a ("xfs: convert xbitmap to interval tree"). + * + * The implementation relies on external lock to protect rbtree-s. + * The alloc/free of range_node-s is done via bpf_mem_alloc. + * + * bpf arena is using range_tree to represent unallocated slots. + * At init time: + * range_tree_set(rt, 0, max); + * Then: + * start = range_tree_find(rt, len); + * if (start >= 0) + * range_tree_clear(rt, start, len); + * to find free range and mark slots as allocated and later: + * range_tree_set(rt, start, len); + * to mark as unallocated after use. 
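Restating the protocol described above as a hypothetical in-kernel caller (locking omitted; as noted, the tree relies on an external lock, and the declarations are assumed to come from range_tree.h):

#include <linux/types.h>
#include "range_tree.h"

static int range_tree_demo(u32 nr_slots, u32 len)
{
        struct range_tree rt;
        s64 start;
        int err;

        range_tree_init(&rt);
        err = range_tree_set(&rt, 0, nr_slots);         /* all slots start out free */
        if (err)
                return err;

        start = range_tree_find(&rt, len);              /* best-fit search */
        if (start < 0) {
                err = start;                            /* -ENOENT: nothing fits */
                goto out;
        }
        err = range_tree_clear(&rt, start, len);        /* mark [start, start + len) as allocated */

        /* ... use the range ... */

        if (!err)
                err = range_tree_set(&rt, start, len);  /* return the range to the free pool */
out:
        range_tree_destroy(&rt);
        return err;
}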
+ */ +struct range_node { + struct rb_node rn_rbnode; + struct rb_node rb_range_size; + u32 rn_start; + u32 rn_last; /* inclusive */ + u32 __rn_subtree_last; +}; + +static struct range_node *rb_to_range_node(struct rb_node *rb) +{ + return rb_entry(rb, struct range_node, rb_range_size); +} + +static u32 rn_size(struct range_node *rn) +{ + return rn->rn_last - rn->rn_start + 1; +} + +/* Find range that fits best to requested size */ +static inline struct range_node *__find_range(struct range_tree *rt, u32 len) +{ + struct rb_node *rb = rt->range_size_root.rb_root.rb_node; + struct range_node *best = NULL; + + while (rb) { + struct range_node *rn = rb_to_range_node(rb); + + if (len <= rn_size(rn)) { + best = rn; + rb = rb->rb_right; + } else { + rb = rb->rb_left; + } + } + + return best; +} + +s64 range_tree_find(struct range_tree *rt, u32 len) +{ + struct range_node *rn; + + rn = __find_range(rt, len); + if (!rn) + return -ENOENT; + return rn->rn_start; +} + +/* Insert the range into rbtree sorted by the range size */ +static inline void __range_size_insert(struct range_node *rn, + struct rb_root_cached *root) +{ + struct rb_node **link = &root->rb_root.rb_node, *rb = NULL; + u64 size = rn_size(rn); + bool leftmost = true; + + while (*link) { + rb = *link; + if (size > rn_size(rb_to_range_node(rb))) { + link = &rb->rb_left; + } else { + link = &rb->rb_right; + leftmost = false; + } + } + + rb_link_node(&rn->rb_range_size, rb, link); + rb_insert_color_cached(&rn->rb_range_size, root, leftmost); +} + +#define START(node) ((node)->rn_start) +#define LAST(node) ((node)->rn_last) + +INTERVAL_TREE_DEFINE(struct range_node, rn_rbnode, u32, + __rn_subtree_last, START, LAST, + static inline __maybe_unused, + __range_it) + +static inline __maybe_unused void +range_it_insert(struct range_node *rn, struct range_tree *rt) +{ + __range_size_insert(rn, &rt->range_size_root); + __range_it_insert(rn, &rt->it_root); +} + +static inline __maybe_unused void +range_it_remove(struct range_node *rn, struct range_tree *rt) +{ + rb_erase_cached(&rn->rb_range_size, &rt->range_size_root); + RB_CLEAR_NODE(&rn->rb_range_size); + __range_it_remove(rn, &rt->it_root); +} + +static inline __maybe_unused struct range_node * +range_it_iter_first(struct range_tree *rt, u32 start, u32 last) +{ + return __range_it_iter_first(&rt->it_root, start, last); +} + +/* Clear the range in this range tree */ +int range_tree_clear(struct range_tree *rt, u32 start, u32 len) +{ + u32 last = start + len - 1; + struct range_node *new_rn; + struct range_node *rn; + + while ((rn = range_it_iter_first(rt, start, last))) { + if (rn->rn_start < start && rn->rn_last > last) { + u32 old_last = rn->rn_last; + + /* Overlaps with the entire clearing range */ + range_it_remove(rn, rt); + rn->rn_last = start - 1; + range_it_insert(rn, rt); + + /* Add a range */ + migrate_disable(); + new_rn = bpf_mem_alloc(&bpf_global_ma, sizeof(struct range_node)); + migrate_enable(); + if (!new_rn) + return -ENOMEM; + new_rn->rn_start = last + 1; + new_rn->rn_last = old_last; + range_it_insert(new_rn, rt); + } else if (rn->rn_start < start) { + /* Overlaps with the left side of the clearing range */ + range_it_remove(rn, rt); + rn->rn_last = start - 1; + range_it_insert(rn, rt); + } else if (rn->rn_last > last) { + /* Overlaps with the right side of the clearing range */ + range_it_remove(rn, rt); + rn->rn_start = last + 1; + range_it_insert(rn, rt); + break; + } else { + /* in the middle of the clearing range */ + range_it_remove(rn, rt); + migrate_disable(); + 
bpf_mem_free(&bpf_global_ma, rn); + migrate_enable(); + } + } + return 0; +} + +/* Is the whole range set ? */ +int is_range_tree_set(struct range_tree *rt, u32 start, u32 len) +{ + u32 last = start + len - 1; + struct range_node *left; + + /* Is this whole range set ? */ + left = range_it_iter_first(rt, start, last); + if (left && left->rn_start <= start && left->rn_last >= last) + return 0; + return -ESRCH; +} + +/* Set the range in this range tree */ +int range_tree_set(struct range_tree *rt, u32 start, u32 len) +{ + u32 last = start + len - 1; + struct range_node *right; + struct range_node *left; + int err; + + /* Is this whole range already set ? */ + left = range_it_iter_first(rt, start, last); + if (left && left->rn_start <= start && left->rn_last >= last) + return 0; + + /* Clear out everything in the range we want to set. */ + err = range_tree_clear(rt, start, len); + if (err) + return err; + + /* Do we have a left-adjacent range ? */ + left = range_it_iter_first(rt, start - 1, start - 1); + if (left && left->rn_last + 1 != start) + return -EFAULT; + + /* Do we have a right-adjacent range ? */ + right = range_it_iter_first(rt, last + 1, last + 1); + if (right && right->rn_start != last + 1) + return -EFAULT; + + if (left && right) { + /* Combine left and right adjacent ranges */ + range_it_remove(left, rt); + range_it_remove(right, rt); + left->rn_last = right->rn_last; + range_it_insert(left, rt); + migrate_disable(); + bpf_mem_free(&bpf_global_ma, right); + migrate_enable(); + } else if (left) { + /* Combine with the left range */ + range_it_remove(left, rt); + left->rn_last = last; + range_it_insert(left, rt); + } else if (right) { + /* Combine with the right range */ + range_it_remove(right, rt); + right->rn_start = start; + range_it_insert(right, rt); + } else { + migrate_disable(); + left = bpf_mem_alloc(&bpf_global_ma, sizeof(struct range_node)); + migrate_enable(); + if (!left) + return -ENOMEM; + left->rn_start = start; + left->rn_last = last; + range_it_insert(left, rt); + } + return 0; +} + +void range_tree_destroy(struct range_tree *rt) +{ + struct range_node *rn; + + while ((rn = range_it_iter_first(rt, 0, -1U))) { + range_it_remove(rn, rt); + migrate_disable(); + bpf_mem_free(&bpf_global_ma, rn); + migrate_enable(); + } +} + +void range_tree_init(struct range_tree *rt) +{ + rt->it_root = RB_ROOT_CACHED; + rt->range_size_root = RB_ROOT_CACHED; +} diff --git a/kernel/bpf/range_tree.h b/kernel/bpf/range_tree.h new file mode 100644 index 000000000000..ff0b9110eb71 --- /dev/null +++ b/kernel/bpf/range_tree.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ +#ifndef _RANGE_TREE_H +#define _RANGE_TREE_H 1 + +struct range_tree { + /* root of interval tree */ + struct rb_root_cached it_root; + /* root of rbtree of interval sizes */ + struct rb_root_cached range_size_root; +}; + +void range_tree_init(struct range_tree *rt); +void range_tree_destroy(struct range_tree *rt); + +int range_tree_clear(struct range_tree *rt, u32 start, u32 len); +int range_tree_set(struct range_tree *rt, u32 start, u32 len); +int is_range_tree_set(struct range_tree *rt, u32 start, u32 len); +s64 range_tree_find(struct range_tree *rt, u32 len); + +#endif diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index e20b90c36131..e1cfe890e0be 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -29,7 +29,7 @@ struct bpf_ringbuf { u64 mask; struct page **pages; int nr_pages; - spinlock_t spinlock ____cacheline_aligned_in_smp; + raw_spinlock_t spinlock ____cacheline_aligned_in_smp; /* For user-space producer ring buffers, an atomic_t busy bit is used * to synchronize access to the ring buffers in the kernel, rather than * the spinlock that is used for kernel-producer ring buffers. This is @@ -173,7 +173,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) if (!rb) return NULL; - spin_lock_init(&rb->spinlock); + raw_spin_lock_init(&rb->spinlock); atomic_set(&rb->busy, 0); init_waitqueue_head(&rb->waitq); init_irq_work(&rb->work, bpf_ringbuf_notify); @@ -421,10 +421,10 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) cons_pos = smp_load_acquire(&rb->consumer_pos); if (in_nmi()) { - if (!spin_trylock_irqsave(&rb->spinlock, flags)) + if (!raw_spin_trylock_irqsave(&rb->spinlock, flags)) return NULL; } else { - spin_lock_irqsave(&rb->spinlock, flags); + raw_spin_lock_irqsave(&rb->spinlock, flags); } pend_pos = rb->pending_pos; @@ -450,7 +450,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) */ if (new_prod_pos - cons_pos > rb->mask || new_prod_pos - pend_pos > rb->mask) { - spin_unlock_irqrestore(&rb->spinlock, flags); + raw_spin_unlock_irqrestore(&rb->spinlock, flags); return NULL; } @@ -462,7 +462,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) /* pairs with consumer's smp_load_acquire() */ smp_store_release(&rb->producer_pos, new_prod_pos); - spin_unlock_irqrestore(&rb->spinlock, flags); + raw_spin_unlock_irqrestore(&rb->spinlock, flags); return (void *)hdr + BPF_RINGBUF_HDR_SZ; } @@ -632,7 +632,7 @@ const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto = { .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT, + .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT | MEM_WRITE, }; BPF_CALL_2(bpf_ringbuf_submit_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a8f1808a1ca5..5684e8ce132d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -35,6 +35,7 @@ #include <linux/rcupdate_trace.h> #include <linux/memcontrol.h> #include <linux/trace_events.h> +#include <linux/tracepoint.h> #include <net/netfilter/nf_bpf_link.h> #include <net/netkit.h> @@ -155,6 +156,89 @@ static void maybe_wait_bpf_programs(struct bpf_map *map) synchronize_rcu(); } +static void unpin_uptr_kaddr(void *kaddr) +{ + if (kaddr) + unpin_user_page(virt_to_page(kaddr)); +} + +static void __bpf_obj_unpin_uptrs(struct btf_record *rec, u32 cnt, void *obj) +{ + const struct btf_field *field; + void **uptr_addr; + 
int i; + + for (i = 0, field = rec->fields; i < cnt; i++, field++) { + if (field->type != BPF_UPTR) + continue; + + uptr_addr = obj + field->offset; + unpin_uptr_kaddr(*uptr_addr); + } +} + +static void bpf_obj_unpin_uptrs(struct btf_record *rec, void *obj) +{ + if (!btf_record_has_field(rec, BPF_UPTR)) + return; + + __bpf_obj_unpin_uptrs(rec, rec->cnt, obj); +} + +static int bpf_obj_pin_uptrs(struct btf_record *rec, void *obj) +{ + const struct btf_field *field; + const struct btf_type *t; + unsigned long start, end; + struct page *page; + void **uptr_addr; + int i, err; + + if (!btf_record_has_field(rec, BPF_UPTR)) + return 0; + + for (i = 0, field = rec->fields; i < rec->cnt; i++, field++) { + if (field->type != BPF_UPTR) + continue; + + uptr_addr = obj + field->offset; + start = *(unsigned long *)uptr_addr; + if (!start) + continue; + + t = btf_type_by_id(field->kptr.btf, field->kptr.btf_id); + /* t->size was checked for zero before */ + if (check_add_overflow(start, t->size - 1, &end)) { + err = -EFAULT; + goto unpin_all; + } + + /* The uptr's struct cannot span across two pages */ + if ((start & PAGE_MASK) != (end & PAGE_MASK)) { + err = -EOPNOTSUPP; + goto unpin_all; + } + + err = pin_user_pages_fast(start, 1, FOLL_LONGTERM | FOLL_WRITE, &page); + if (err != 1) + goto unpin_all; + + if (PageHighMem(page)) { + err = -EOPNOTSUPP; + unpin_user_page(page); + goto unpin_all; + } + + *uptr_addr = page_address(page) + offset_in_page(start); + } + + return 0; + +unpin_all: + __bpf_obj_unpin_uptrs(rec, i, obj); + return err; +} + static int bpf_map_update_value(struct bpf_map *map, struct file *map_file, void *key, void *value, __u64 flags) { @@ -199,9 +283,14 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file, map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) { err = map->ops->map_push_elem(map, value, flags); } else { - rcu_read_lock(); - err = map->ops->map_update_elem(map, key, value, flags); - rcu_read_unlock(); + err = bpf_obj_pin_uptrs(map->record, value); + if (!err) { + rcu_read_lock(); + err = map->ops->map_update_elem(map, key, value, flags); + rcu_read_unlock(); + if (err) + bpf_obj_unpin_uptrs(map->record, value); + } } bpf_enable_instrumentation(); @@ -548,6 +637,7 @@ void btf_record_free(struct btf_record *rec) case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: + case BPF_UPTR: if (rec->fields[i].kptr.module) module_put(rec->fields[i].kptr.module); if (btf_is_kernel(rec->fields[i].kptr.btf)) @@ -597,6 +687,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: + case BPF_UPTR: if (btf_is_kernel(fields[i].kptr.btf)) btf_get(fields[i].kptr.btf); if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) { @@ -714,6 +805,10 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) field->kptr.dtor(xchgd_field); } break; + case BPF_UPTR: + /* The caller ensured that no one is using the uptr */ + unpin_uptr_kaddr(*(void **)field_ptr); + break; case BPF_LIST_HEAD: if (WARN_ON_ONCE(rec->spin_lock_off < 0)) continue; @@ -1105,7 +1200,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, map->record = btf_parse_fields(btf, value_type, BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | - BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE, + BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR, map->value_size); if (!IS_ERR_OR_NULL(map->record)) { int i; @@ -1161,6 +1256,12 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token 
*token, goto free_map_tab; } break; + case BPF_UPTR: + if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE) { + ret = -EOPNOTSUPP; + goto free_map_tab; + } + break; case BPF_LIST_HEAD: case BPF_RB_ROOT: if (map->map_type != BPF_MAP_TYPE_HASH && @@ -2933,17 +3034,33 @@ static int bpf_obj_get(const union bpf_attr *attr) attr->file_flags); } -void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, - const struct bpf_link_ops *ops, struct bpf_prog *prog) +/* bpf_link_init_sleepable() allows to specify whether BPF link itself has + * "sleepable" semantics, which normally would mean that BPF link's attach + * hook can dereference link or link's underlying program for some time after + * detachment due to RCU Tasks Trace-based lifetime protection scheme. + * BPF program itself can be non-sleepable, yet, because it's transitively + * reachable through BPF link, its freeing has to be delayed until after RCU + * Tasks Trace GP. + */ +void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type, + const struct bpf_link_ops *ops, struct bpf_prog *prog, + bool sleepable) { WARN_ON(ops->dealloc && ops->dealloc_deferred); atomic64_set(&link->refcnt, 1); link->type = type; + link->sleepable = sleepable; link->id = 0; link->ops = ops; link->prog = prog; } +void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, + const struct bpf_link_ops *ops, struct bpf_prog *prog) +{ + bpf_link_init_sleepable(link, type, ops, prog, false); +} + static void bpf_link_free_id(int id) { if (!id) @@ -2976,12 +3093,24 @@ void bpf_link_inc(struct bpf_link *link) atomic64_inc(&link->refcnt); } +static void bpf_link_dealloc(struct bpf_link *link) +{ + /* now that we know that bpf_link itself can't be reached, put underlying BPF program */ + if (link->prog) + bpf_prog_put(link->prog); + + /* free bpf_link and its containing memory */ + if (link->ops->dealloc_deferred) + link->ops->dealloc_deferred(link); + else + link->ops->dealloc(link); +} + static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu) { struct bpf_link *link = container_of(rcu, struct bpf_link, rcu); - /* free bpf_link and its containing memory */ - link->ops->dealloc_deferred(link); + bpf_link_dealloc(link); } static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu) @@ -2996,26 +3125,27 @@ static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu) static void bpf_link_free(struct bpf_link *link) { const struct bpf_link_ops *ops = link->ops; - bool sleepable = false; bpf_link_free_id(link->id); - if (link->prog) { - sleepable = link->prog->sleepable; - /* detach BPF program, clean up used resources */ + /* detach BPF program, clean up used resources */ + if (link->prog) ops->release(link); - bpf_prog_put(link->prog); - } if (ops->dealloc_deferred) { - /* schedule BPF link deallocation; if underlying BPF program - * is sleepable, we need to first wait for RCU tasks trace - * sync, then go through "classic" RCU grace period + /* Schedule BPF link deallocation, which will only then + * trigger putting BPF program refcount. 
+ * If underlying BPF program is sleepable or BPF link's target + * attach hookpoint is sleepable or otherwise requires RCU GPs + * to ensure link and its underlying BPF program is not + * reachable anymore, we need to first wait for RCU tasks + * trace sync, and then go through "classic" RCU grace period */ - if (sleepable) + if (link->sleepable || (link->prog && link->prog->sleepable)) call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp); else call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp); - } else if (ops->dealloc) - ops->dealloc(link); + } else if (ops->dealloc) { + bpf_link_dealloc(link); + } } static void bpf_link_put_deferred(struct work_struct *work) @@ -3069,13 +3199,17 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_link *link = filp->private_data; const struct bpf_prog *prog = link->prog; + enum bpf_link_type type = link->type; char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; - seq_printf(m, - "link_type:\t%s\n" - "link_id:\t%u\n", - bpf_link_type_strs[link->type], - link->id); + if (type < ARRAY_SIZE(bpf_link_type_strs) && bpf_link_type_strs[type]) { + seq_printf(m, "link_type:\t%s\n", bpf_link_type_strs[type]); + } else { + WARN_ONCE(1, "missing BPF_LINK_TYPE(...) for link type %u\n", type); + seq_printf(m, "link_type:\t<%u>\n", type); + } + seq_printf(m, "link_id:\t%u\n", link->id); + if (prog) { bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, @@ -3214,7 +3348,8 @@ static void bpf_tracing_link_release(struct bpf_link *link) container_of(link, struct bpf_tracing_link, link.link); WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link, - tr_link->trampoline)); + tr_link->trampoline, + tr_link->tgt_prog)); bpf_trampoline_put(tr_link->trampoline); @@ -3354,7 +3489,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, * in prog->aux * * - if prog->aux->dst_trampoline is NULL, the program has already been - * attached to a target and its initial target was cleared (below) + * attached to a target and its initial target was cleared (below) * * - if tgt_prog != NULL, the caller specified tgt_prog_fd + * target_btf_id using the link_create API. 
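Before the next hunk, a usage-level view of the BPF_UPTR support added to syscall.c above may help: user space stores a plain user pointer into a task-storage map value, the kernel pins the backing page on map update (bpf_obj_pin_uptrs()), and the BPF program then dereferences it directly. The sketch below is illustrative only; the __uptr type tag is defined locally here as an assumption, and every struct/map/section name is made up rather than taken from this diff.

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

/* Assumption for this sketch: the "uptr" BTF type tag is what the kernel
 * side above recognizes as BPF_UPTR; define the shorthand locally.
 */
#ifndef __uptr
#define __uptr __attribute__((btf_type_tag("uptr")))
#endif

struct user_buf {
	int level;
	char msg[64];
};

struct value_type {
	/* Set by user space via bpf_map_update_elem(); the kernel pins the
	 * backing page before the update lands, and rejects structs that
	 * would span two pages.
	 */
	struct user_buf __uptr *buf;
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE); /* the only map type BPF_UPTR is accepted for */
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct value_type);
} datamap SEC(".maps");

SEC("tp_btf/sys_enter")
int probe(void *ctx)
{
	struct value_type *v;

	v = bpf_task_storage_get(&datamap, bpf_get_current_task_btf(), 0, 0);
	/* loads of a uptr are marked PTR_TO_MEM | PTR_MAYBE_NULL, so check for NULL */
	if (!v || !v->buf)
		return 0;

	v->buf->level = 1;	/* direct access to the pinned user memory */
	return 0;
}

char _license[] SEC("license") = "GPL";

Note that the program can only load and dereference the uptr; storing to the uptr field itself from BPF is rejected ("store to uptr disallowed"), so the pointer is always established and torn down on the user-space map update path.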
@@ -3429,7 +3564,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, if (err) goto out_unlock; - err = bpf_trampoline_link_prog(&link->link, tr); + err = bpf_trampoline_link_prog(&link->link, tr, tgt_prog); if (err) { bpf_link_cleanup(&link_primer); link = NULL; @@ -3565,15 +3700,16 @@ static void bpf_perf_link_dealloc(struct bpf_link *link) } static int bpf_perf_link_fill_common(const struct perf_event *event, - char __user *uname, u32 ulen, + char __user *uname, u32 *ulenp, u64 *probe_offset, u64 *probe_addr, u32 *fd_type, unsigned long *missed) { const char *buf; - u32 prog_id; + u32 prog_id, ulen; size_t len; int err; + ulen = *ulenp; if (!ulen ^ !uname) return -EINVAL; @@ -3581,10 +3717,17 @@ static int bpf_perf_link_fill_common(const struct perf_event *event, probe_offset, probe_addr, missed); if (err) return err; + + if (buf) { + len = strlen(buf); + *ulenp = len + 1; + } else { + *ulenp = 1; + } if (!uname) return 0; + if (buf) { - len = strlen(buf); err = bpf_copy_to_user(uname, buf, ulen, len); if (err) return err; @@ -3609,7 +3752,7 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event, uname = u64_to_user_ptr(info->perf_event.kprobe.func_name); ulen = info->perf_event.kprobe.name_len; - err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr, + err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &addr, &type, &missed); if (err) return err; @@ -3617,7 +3760,7 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event, info->perf_event.type = BPF_PERF_EVENT_KRETPROBE; else info->perf_event.type = BPF_PERF_EVENT_KPROBE; - + info->perf_event.kprobe.name_len = ulen; info->perf_event.kprobe.offset = offset; info->perf_event.kprobe.missed = missed; if (!kallsyms_show_value(current_cred())) @@ -3639,7 +3782,7 @@ static int bpf_perf_link_fill_uprobe(const struct perf_event *event, uname = u64_to_user_ptr(info->perf_event.uprobe.file_name); ulen = info->perf_event.uprobe.name_len; - err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr, + err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &addr, &type, NULL); if (err) return err; @@ -3648,6 +3791,7 @@ static int bpf_perf_link_fill_uprobe(const struct perf_event *event, info->perf_event.type = BPF_PERF_EVENT_URETPROBE; else info->perf_event.type = BPF_PERF_EVENT_UPROBE; + info->perf_event.uprobe.name_len = ulen; info->perf_event.uprobe.offset = offset; info->perf_event.uprobe.cookie = event->bpf_cookie; return 0; @@ -3673,12 +3817,18 @@ static int bpf_perf_link_fill_tracepoint(const struct perf_event *event, { char __user *uname; u32 ulen; + int err; uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name); ulen = info->perf_event.tracepoint.name_len; + err = bpf_perf_link_fill_common(event, uname, &ulen, NULL, NULL, NULL, NULL); + if (err) + return err; + info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT; + info->perf_event.tracepoint.name_len = ulen; info->perf_event.tracepoint.cookie = event->bpf_cookie; - return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL, NULL); + return 0; } static int bpf_perf_link_fill_perf_event(const struct perf_event *event, @@ -3816,8 +3966,9 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, err = -ENOMEM; goto out_put_btp; } - bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, - &bpf_raw_tp_link_lops, prog); + bpf_link_init_sleepable(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, + &bpf_raw_tp_link_lops, prog, + tracepoint_is_faultable(btp->tp)); link->btp = btp; link->cookie = cookie; @@ 
-3983,10 +4134,14 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI && attach_type != BPF_TRACE_UPROBE_MULTI) return -EINVAL; + if (prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION && + attach_type != BPF_TRACE_UPROBE_SESSION) + return -EINVAL; if (attach_type != BPF_PERF_EVENT && attach_type != BPF_TRACE_KPROBE_MULTI && attach_type != BPF_TRACE_KPROBE_SESSION && - attach_type != BPF_TRACE_UPROBE_MULTI) + attach_type != BPF_TRACE_UPROBE_MULTI && + attach_type != BPF_TRACE_UPROBE_SESSION) return -EINVAL; return 0; case BPF_PROG_TYPE_SCHED_CLS: @@ -5239,7 +5394,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI || attr->link_create.attach_type == BPF_TRACE_KPROBE_SESSION) ret = bpf_kprobe_multi_link_attach(attr, prog); - else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI) + else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI || + attr->link_create.attach_type == BPF_TRACE_UPROBE_SESSION) ret = bpf_uprobe_multi_link_attach(attr, prog); break; default: @@ -5877,7 +6033,7 @@ static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = { .arg1_type = ARG_PTR_TO_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED, + .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg4_size = sizeof(u64), }; diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 02aa9db8d796..98d9b4c0daff 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -5,7 +5,6 @@ #include <linux/namei.h> #include <linux/pid_namespace.h> #include <linux/fs.h> -#include <linux/fdtable.h> #include <linux/filter.h> #include <linux/bpf_mem_alloc.h> #include <linux/btf_ids.h> @@ -99,7 +98,7 @@ static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *co rcu_read_lock(); pid = find_pid_ns(common->pid, common->ns); if (pid) { - task = get_pid_task(pid, PIDTYPE_TGID); + task = get_pid_task(pid, PIDTYPE_PID); *tid = common->pid; } rcu_read_unlock(); @@ -286,17 +285,14 @@ again: curr_fd = 0; } - rcu_read_lock(); - f = task_lookup_next_fdget_rcu(curr_task, &curr_fd); + f = fget_task_next(curr_task, &curr_fd); if (f) { /* set info->fd */ info->fd = curr_fd; - rcu_read_unlock(); return f; } /* the current task is done, go to the next task */ - rcu_read_unlock(); put_task_struct(curr_task); if (info->common.type == BPF_TASK_ITER_TID) { diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c index dcbec1a0dfb3..26057aa13503 100644 --- a/kernel/bpf/token.c +++ b/kernel/bpf/token.c @@ -1,6 +1,5 @@ #include <linux/bpf.h> #include <linux/vmalloc.h> -#include <linux/fdtable.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/kernel.h> diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index f8302a5ca400..c4b1a98ff726 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -115,10 +115,14 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog) (ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC); } -void bpf_image_ksym_add(void *data, unsigned int size, struct bpf_ksym *ksym) +void bpf_image_ksym_init(void *data, unsigned int size, struct bpf_ksym *ksym) { ksym->start = (unsigned long) data; ksym->end = ksym->start + size; +} + +void bpf_image_ksym_add(struct bpf_ksym *ksym) +{ bpf_ksym_add(ksym); perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start, 
PAGE_SIZE, false, ksym->name); @@ -377,7 +381,8 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, int size) ksym = &im->ksym; INIT_LIST_HEAD_RCU(&ksym->lnode); snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", key); - bpf_image_ksym_add(image, size, ksym); + bpf_image_ksym_init(image, size, ksym); + bpf_image_ksym_add(ksym); return im; out_free_image: @@ -523,7 +528,27 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) } } -static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) +static int bpf_freplace_check_tgt_prog(struct bpf_prog *tgt_prog) +{ + struct bpf_prog_aux *aux = tgt_prog->aux; + + guard(mutex)(&aux->ext_mutex); + if (aux->prog_array_member_cnt) + /* Program extensions can not extend target prog when the target + * prog has been updated to any prog_array map as tail callee. + * It's to prevent a potential infinite loop like: + * tgt prog entry -> tgt prog subprog -> freplace prog entry + * --tailcall-> tgt prog entry. + */ + return -EBUSY; + + aux->is_extended = true; + return 0; +} + +static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, + struct bpf_trampoline *tr, + struct bpf_prog *tgt_prog) { enum bpf_tramp_prog_type kind; struct bpf_tramp_link *link_exiting; @@ -544,6 +569,9 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_tr /* Cannot attach extension if fentry/fexit are in use. */ if (cnt) return -EBUSY; + err = bpf_freplace_check_tgt_prog(tgt_prog); + if (err) + return err; tr->extension_prog = link->link.prog; return bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL, link->link.prog->bpf_func); @@ -570,17 +598,21 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_tr return err; } -int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) +int bpf_trampoline_link_prog(struct bpf_tramp_link *link, + struct bpf_trampoline *tr, + struct bpf_prog *tgt_prog) { int err; mutex_lock(&tr->mutex); - err = __bpf_trampoline_link_prog(link, tr); + err = __bpf_trampoline_link_prog(link, tr, tgt_prog); mutex_unlock(&tr->mutex); return err; } -static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) +static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, + struct bpf_trampoline *tr, + struct bpf_prog *tgt_prog) { enum bpf_tramp_prog_type kind; int err; @@ -591,6 +623,8 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_ err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, tr->extension_prog->bpf_func, NULL); tr->extension_prog = NULL; + guard(mutex)(&tgt_prog->aux->ext_mutex); + tgt_prog->aux->is_extended = false; return err; } hlist_del_init(&link->tramp_hlist); @@ -599,12 +633,14 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_ } /* bpf_trampoline_unlink_prog() should never fail. 
*/ -int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) +int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, + struct bpf_trampoline *tr, + struct bpf_prog *tgt_prog) { int err; mutex_lock(&tr->mutex); - err = __bpf_trampoline_unlink_prog(link, tr); + err = __bpf_trampoline_unlink_prog(link, tr, tgt_prog); mutex_unlock(&tr->mutex); return err; } @@ -619,7 +655,7 @@ static void bpf_shim_tramp_link_release(struct bpf_link *link) if (!shim_link->trampoline) return; - WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline)); + WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline, NULL)); bpf_trampoline_put(shim_link->trampoline); } @@ -733,7 +769,7 @@ int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, goto err; } - err = __bpf_trampoline_link_prog(&shim_link->link, tr); + err = __bpf_trampoline_link_prog(&shim_link->link, tr, NULL); if (err) goto err; @@ -868,6 +904,8 @@ static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tram if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { bpf_prog_inc_misses_counter(prog); + if (prog->aux->recursion_detected) + prog->aux->recursion_detected(prog); return 0; } return bpf_prog_start_time(); @@ -944,6 +982,8 @@ u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { bpf_prog_inc_misses_counter(prog); + if (prog->aux->recursion_detected) + prog->aux->recursion_detected(prog); return 0; } return bpf_prog_start_time(); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9a7ed527e47e..1c4ebb326785 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -194,6 +194,8 @@ struct bpf_verifier_stack_elem { #define BPF_GLOBAL_PERCPU_MA_MAX_SIZE 512 +#define BPF_PRIV_STACK_MIN_SIZE 64 + static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx); static int release_reference(struct bpf_verifier_env *env, int ref_obj_id); static void invalidate_non_owning_refs(struct bpf_verifier_env *env); @@ -418,6 +420,25 @@ static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg) return rec; } +static bool mask_raw_tp_reg_cond(const struct bpf_verifier_env *env, struct bpf_reg_state *reg) { + return reg->type == (PTR_TO_BTF_ID | PTR_TRUSTED | PTR_MAYBE_NULL) && + bpf_prog_is_raw_tp(env->prog) && !reg->ref_obj_id; +} + +static bool mask_raw_tp_reg(const struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + if (!mask_raw_tp_reg_cond(env, reg)) + return false; + reg->type &= ~PTR_MAYBE_NULL; + return true; +} + +static void unmask_raw_tp_reg(struct bpf_reg_state *reg, bool result) +{ + if (result) + reg->type |= PTR_MAYBE_NULL; +} + static bool subprog_is_global(const struct bpf_verifier_env *env, int subprog) { struct bpf_func_info_aux *aux = env->prog->aux->func_info_aux; @@ -1265,6 +1286,7 @@ static int copy_reference_state(struct bpf_func_state *dst, const struct bpf_fun if (!dst->refs) return -ENOMEM; + dst->active_locks = src->active_locks; dst->acquired_refs = src->acquired_refs; return 0; } @@ -1335,13 +1357,32 @@ static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx) if (err) return err; id = ++env->id_gen; + state->refs[new_ofs].type = REF_TYPE_PTR; state->refs[new_ofs].id = id; state->refs[new_ofs].insn_idx = insn_idx; - state->refs[new_ofs].callback_ref = state->in_callback_fn ? 
state->frameno : 0; return id; } +static int acquire_lock_state(struct bpf_verifier_env *env, int insn_idx, enum ref_state_type type, + int id, void *ptr) +{ + struct bpf_func_state *state = cur_func(env); + int new_ofs = state->acquired_refs; + int err; + + err = resize_reference_state(state, state->acquired_refs + 1); + if (err) + return err; + state->refs[new_ofs].type = type; + state->refs[new_ofs].id = id; + state->refs[new_ofs].insn_idx = insn_idx; + state->refs[new_ofs].ptr = ptr; + + state->active_locks++; + return 0; +} + /* release function corresponding to acquire_reference_state(). Idempotent. */ static int release_reference_state(struct bpf_func_state *state, int ptr_id) { @@ -1349,10 +1390,9 @@ static int release_reference_state(struct bpf_func_state *state, int ptr_id) last_idx = state->acquired_refs - 1; for (i = 0; i < state->acquired_refs; i++) { + if (state->refs[i].type != REF_TYPE_PTR) + continue; if (state->refs[i].id == ptr_id) { - /* Cannot release caller references in callbacks */ - if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno) - return -EINVAL; if (last_idx && i != last_idx) memcpy(&state->refs[i], &state->refs[last_idx], sizeof(*state->refs)); @@ -1364,6 +1404,45 @@ static int release_reference_state(struct bpf_func_state *state, int ptr_id) return -EINVAL; } +static int release_lock_state(struct bpf_func_state *state, int type, int id, void *ptr) +{ + int i, last_idx; + + last_idx = state->acquired_refs - 1; + for (i = 0; i < state->acquired_refs; i++) { + if (state->refs[i].type != type) + continue; + if (state->refs[i].id == id && state->refs[i].ptr == ptr) { + if (last_idx && i != last_idx) + memcpy(&state->refs[i], &state->refs[last_idx], + sizeof(*state->refs)); + memset(&state->refs[last_idx], 0, sizeof(*state->refs)); + state->acquired_refs--; + state->active_locks--; + return 0; + } + } + return -EINVAL; +} + +static struct bpf_reference_state *find_lock_state(struct bpf_verifier_env *env, enum ref_state_type type, + int id, void *ptr) +{ + struct bpf_func_state *state = cur_func(env); + int i; + + for (i = 0; i < state->acquired_refs; i++) { + struct bpf_reference_state *s = &state->refs[i]; + + if (s->type == REF_TYPE_PTR || s->type != type) + continue; + + if (s->id == id && s->ptr == ptr) + return s; + } + return NULL; +} + static void free_func_state(struct bpf_func_state *state) { if (!state) @@ -1373,13 +1452,6 @@ static void free_func_state(struct bpf_func_state *state) kfree(state); } -static void clear_jmp_history(struct bpf_verifier_state *state) -{ - kfree(state->jmp_history); - state->jmp_history = NULL; - state->jmp_history_cnt = 0; -} - static void free_verifier_state(struct bpf_verifier_state *state, bool free_self) { @@ -1389,7 +1461,6 @@ static void free_verifier_state(struct bpf_verifier_state *state, free_func_state(state->frame[i]); state->frame[i] = NULL; } - clear_jmp_history(state); if (free_self) kfree(state); } @@ -1415,13 +1486,6 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, struct bpf_func_state *dst; int i, err; - dst_state->jmp_history = copy_array(dst_state->jmp_history, src->jmp_history, - src->jmp_history_cnt, sizeof(*dst_state->jmp_history), - GFP_USER); - if (!dst_state->jmp_history) - return -ENOMEM; - dst_state->jmp_history_cnt = src->jmp_history_cnt; - /* if dst has more stack frames then src frame, free them, this is also * necessary in case of exceptional exits using bpf_throw. 
*/ @@ -1434,12 +1498,12 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->active_preempt_lock = src->active_preempt_lock; dst_state->in_sleepable = src->in_sleepable; dst_state->curframe = src->curframe; - dst_state->active_lock.ptr = src->active_lock.ptr; - dst_state->active_lock.id = src->active_lock.id; dst_state->branches = src->branches; dst_state->parent = src->parent; dst_state->first_insn_idx = src->first_insn_idx; dst_state->last_insn_idx = src->last_insn_idx; + dst_state->insn_hist_start = src->insn_hist_start; + dst_state->insn_hist_end = src->insn_hist_end; dst_state->dfs_depth = src->dfs_depth; dst_state->callback_unroll_depth = src->callback_unroll_depth; dst_state->used_as_loop_entry = src->used_as_loop_entry; @@ -2492,9 +2556,14 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, * The caller state doesn't matter. * This is async callback. It starts in a fresh stack. * Initialize it similar to do_check_common(). + * But we do need to make sure to not clobber insn_hist, so we keep + * chaining insn_hist_start/insn_hist_end indices as for a normal + * child state. */ elem->st.branches = 1; elem->st.in_sleepable = is_sleepable; + elem->st.insn_hist_start = env->cur_state->insn_hist_end; + elem->st.insn_hist_end = elem->st.insn_hist_start; frame = kzalloc(sizeof(*frame), GFP_KERNEL); if (!frame) goto err; @@ -2750,10 +2819,16 @@ static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env, b->module = mod; b->offset = offset; + /* sort() reorders entries by value, so b may no longer point + * to the right entry after this + */ sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), kfunc_btf_cmp_by_off, NULL); + } else { + btf = b->btf; } - return b->btf; + + return btf; } void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab) @@ -3468,11 +3543,10 @@ static void linked_regs_unpack(u64 val, struct linked_regs *s) } /* for any branch, call, exit record the history of jmps in the given state */ -static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, - int insn_flags, u64 linked_regs) +static int push_insn_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, + int insn_flags, u64 linked_regs) { - u32 cnt = cur->jmp_history_cnt; - struct bpf_jmp_history_entry *p; + struct bpf_insn_hist_entry *p; size_t alloc_size; /* combine instruction flags if we already recorded this instruction */ @@ -3492,29 +3566,32 @@ static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_st return 0; } - cnt++; - alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p))); - p = krealloc(cur->jmp_history, alloc_size, GFP_USER); - if (!p) - return -ENOMEM; - cur->jmp_history = p; + if (cur->insn_hist_end + 1 > env->insn_hist_cap) { + alloc_size = size_mul(cur->insn_hist_end + 1, sizeof(*p)); + p = kvrealloc(env->insn_hist, alloc_size, GFP_USER); + if (!p) + return -ENOMEM; + env->insn_hist = p; + env->insn_hist_cap = alloc_size / sizeof(*p); + } - p = &cur->jmp_history[cnt - 1]; + p = &env->insn_hist[cur->insn_hist_end]; p->idx = env->insn_idx; p->prev_idx = env->prev_insn_idx; p->flags = insn_flags; p->linked_regs = linked_regs; - cur->jmp_history_cnt = cnt; + + cur->insn_hist_end++; env->cur_hist_ent = p; return 0; } -static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_state *st, - u32 hist_end, int insn_idx) +static struct bpf_insn_hist_entry *get_insn_hist_entry(struct bpf_verifier_env *env, + u32 hist_start, u32 hist_end, int 
insn_idx) { - if (hist_end > 0 && st->jmp_history[hist_end - 1].idx == insn_idx) - return &st->jmp_history[hist_end - 1]; + if (hist_end > hist_start && env->insn_hist[hist_end - 1].idx == insn_idx) + return &env->insn_hist[hist_end - 1]; return NULL; } @@ -3531,25 +3608,26 @@ static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_stat * history entry recording a jump from last instruction of parent state and * first instruction of given state. */ -static int get_prev_insn_idx(struct bpf_verifier_state *st, int i, - u32 *history) +static int get_prev_insn_idx(const struct bpf_verifier_env *env, + struct bpf_verifier_state *st, + int insn_idx, u32 hist_start, u32 *hist_endp) { - u32 cnt = *history; + u32 hist_end = *hist_endp; + u32 cnt = hist_end - hist_start; - if (i == st->first_insn_idx) { + if (insn_idx == st->first_insn_idx) { if (cnt == 0) return -ENOENT; - if (cnt == 1 && st->jmp_history[0].idx == i) + if (cnt == 1 && env->insn_hist[hist_start].idx == insn_idx) return -ENOENT; } - if (cnt && st->jmp_history[cnt - 1].idx == i) { - i = st->jmp_history[cnt - 1].prev_idx; - (*history)--; + if (cnt && env->insn_hist[hist_end - 1].idx == insn_idx) { + (*hist_endp)--; + return env->insn_hist[hist_end - 1].prev_idx; } else { - i--; + return insn_idx - 1; } - return i; } static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn) @@ -3721,7 +3799,7 @@ static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask) /* If any register R in hist->linked_regs is marked as precise in bt, * do bt_set_frame_{reg,slot}(bt, R) for all registers in hist->linked_regs. */ -static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_history_entry *hist) +static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_insn_hist_entry *hist) { struct linked_regs linked_regs; bool some_precise = false; @@ -3766,7 +3844,7 @@ static bool calls_callback(struct bpf_verifier_env *env, int insn_idx); * - *was* processed previously during backtracking. */ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, - struct bpf_jmp_history_entry *hist, struct backtrack_state *bt) + struct bpf_insn_hist_entry *hist, struct backtrack_state *bt) { const struct bpf_insn_cbs cbs = { .cb_call = disasm_kfunc_name, @@ -4185,7 +4263,7 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_ * SCALARS, as well as any other registers and slots that contribute to * a tracked state of given registers/stack slots, depending on specific BPF * assembly instructions (see backtrack_insns() for exact instruction handling - * logic). This backtracking relies on recorded jmp_history and is able to + * logic). This backtracking relies on recorded insn_hist and is able to * traverse entire chain of parent states. This process ends only when all the * necessary registers/slots and their transitive dependencies are marked as * precise. 
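To make the jmp_history to insn_hist conversion in these verifier hunks easier to follow: instead of each verifier state owning a privately reallocated history array, all states now share one grow-only buffer in bpf_verifier_env, and each state keeps only an [insn_hist_start, insn_hist_end) window into it, with child states chaining their window off the parent's end. The stand-alone sketch below shows that pattern with made-up names; it is not a copy of the kernel structs, and it grows by doubling purely for brevity.

#include <stdlib.h>

struct hist_entry { int idx, prev_idx; };

struct env {
	struct hist_entry *hist;	/* single shared, grow-only buffer */
	size_t cap;
};

struct state {
	size_t hist_start, hist_end;	/* this state's window into env->hist */
};

static int push_entry(struct env *env, struct state *st, int idx, int prev_idx)
{
	if (st->hist_end + 1 > env->cap) {
		size_t new_cap = env->cap ? env->cap * 2 : 16;
		struct hist_entry *p = realloc(env->hist, new_cap * sizeof(*p));

		if (!p)
			return -1;
		env->hist = p;
		env->cap = new_cap;
	}
	env->hist[st->hist_end++] = (struct hist_entry){ .idx = idx, .prev_idx = prev_idx };
	return 0;
}

static struct hist_entry *last_entry_for(struct env *env, struct state *st, int idx)
{
	/* like get_insn_hist_entry(): only look inside this state's own window */
	if (st->hist_end > st->hist_start && env->hist[st->hist_end - 1].idx == idx)
		return &env->hist[st->hist_end - 1];
	return NULL;
}

/* A child state (e.g. an async callback) starts where the parent ended,
 * so the parent's entries are never clobbered:
 *   child.hist_start = child.hist_end = parent.hist_end;
 */

The payoff visible in the hunks above is that copy_verifier_state() no longer copies any history array; it only copies the two window indices, and freeing a state no longer frees history memory at all.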
@@ -4302,8 +4380,9 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) for (;;) { DECLARE_BITMAP(mask, 64); - u32 history = st->jmp_history_cnt; - struct bpf_jmp_history_entry *hist; + u32 hist_start = st->insn_hist_start; + u32 hist_end = st->insn_hist_end; + struct bpf_insn_hist_entry *hist; if (env->log.level & BPF_LOG_LEVEL2) { verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n", @@ -4342,7 +4421,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) err = 0; skip_first = false; } else { - hist = get_jmp_hist_entry(st, history, i); + hist = get_insn_hist_entry(env, hist_start, hist_end, i); err = backtrack_insn(env, i, subseq_idx, hist, bt); } if (err == -ENOTSUPP) { @@ -4359,7 +4438,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) */ return 0; subseq_idx = i; - i = get_prev_insn_idx(st, i, &history); + i = get_prev_insn_idx(env, st, i, hist_start, &hist_end); if (i == -ENOENT) break; if (i >= env->prog->len) { @@ -4725,7 +4804,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, } if (insn_flags) - return push_jmp_history(env, env->cur_state, insn_flags, 0); + return push_insn_history(env, env->cur_state, insn_flags, 0); return 0; } @@ -5032,7 +5111,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, insn_flags = 0; /* we are not restoring spilled register */ } if (insn_flags) - return push_jmp_history(env, env->cur_state, insn_flags, 0); + return push_insn_history(env, env->cur_state, insn_flags, 0); return 0; } @@ -5417,7 +5496,7 @@ static bool in_sleepable(struct bpf_verifier_env *env) static bool in_rcu_cs(struct bpf_verifier_env *env) { return env->cur_state->active_rcu_lock || - env->cur_state->active_lock.ptr || + cur_func(env)->active_locks || !in_sleepable(env); } @@ -5485,6 +5564,22 @@ static u32 btf_ld_kptr_type(struct bpf_verifier_env *env, struct btf_field *kptr return ret; } +static int mark_uptr_ld_reg(struct bpf_verifier_env *env, u32 regno, + struct btf_field *field) +{ + struct bpf_reg_state *reg; + const struct btf_type *t; + + t = btf_type_by_id(field->kptr.btf, field->kptr.btf_id); + mark_reg_known_zero(env, cur_regs(env), regno); + reg = reg_state(env, regno); + reg->type = PTR_TO_MEM | PTR_MAYBE_NULL; + reg->mem_size = t->size; + reg->id = ++env->id_gen; + + return 0; +} + static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, int value_regno, int insn_idx, struct btf_field *kptr_field) @@ -5513,9 +5608,15 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, verbose(env, "store to referenced kptr disallowed\n"); return -EACCES; } + if (class != BPF_LDX && kptr_field->type == BPF_UPTR) { + verbose(env, "store to uptr disallowed\n"); + return -EACCES; + } if (class == BPF_LDX) { - val_reg = reg_state(env, value_regno); + if (kptr_field->type == BPF_UPTR) + return mark_uptr_ld_reg(env, value_regno, kptr_field); + /* We can simply mark the value_regno receiving the pointer * value from map as PTR_TO_BTF_ID, with the correct type. 
*/ @@ -5573,21 +5674,26 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: + case BPF_UPTR: if (src != ACCESS_DIRECT) { - verbose(env, "kptr cannot be accessed indirectly by helper\n"); + verbose(env, "%s cannot be accessed indirectly by helper\n", + btf_field_type_name(field->type)); return -EACCES; } if (!tnum_is_const(reg->var_off)) { - verbose(env, "kptr access cannot have variable offset\n"); + verbose(env, "%s access cannot have variable offset\n", + btf_field_type_name(field->type)); return -EACCES; } if (p != off + reg->var_off.value) { - verbose(env, "kptr access misaligned expected=%u off=%llu\n", + verbose(env, "%s access misaligned expected=%u off=%llu\n", + btf_field_type_name(field->type), p, off + reg->var_off.value); return -EACCES; } if (size != bpf_size_to_bytes(BPF_DW)) { - verbose(env, "kptr access size must be BPF_DW\n"); + verbose(env, "%s access size must be BPF_DW\n", + btf_field_type_name(field->type)); return -EACCES; } break; @@ -5982,6 +6088,34 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, strict); } +static enum priv_stack_mode bpf_enable_priv_stack(struct bpf_prog *prog) +{ + if (!bpf_jit_supports_private_stack()) + return NO_PRIV_STACK; + + /* bpf_prog_check_recur() checks all prog types that use bpf trampoline + * while kprobe/tp/perf_event/raw_tp don't use trampoline hence checked + * explicitly. + */ + switch (prog->type) { + case BPF_PROG_TYPE_KPROBE: + case BPF_PROG_TYPE_TRACEPOINT: + case BPF_PROG_TYPE_PERF_EVENT: + case BPF_PROG_TYPE_RAW_TRACEPOINT: + return PRIV_STACK_ADAPTIVE; + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_LSM: + case BPF_PROG_TYPE_STRUCT_OPS: + if (prog->aux->priv_stack_requested || bpf_prog_check_recur(prog)) + return PRIV_STACK_ADAPTIVE; + fallthrough; + default: + break; + } + + return NO_PRIV_STACK; +} + static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth) { if (env->prog->jit_requested) @@ -5999,17 +6133,20 @@ static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth) * Since recursion is prevented by check_cfg() this algorithm * only needs a local stack of MAX_CALL_FRAMES to remember callsites */ -static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx) +static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx, + bool priv_stack_supported) { struct bpf_subprog_info *subprog = env->subprog_info; struct bpf_insn *insn = env->prog->insnsi; - int depth = 0, frame = 0, i, subprog_end; + int depth = 0, frame = 0, i, subprog_end, subprog_depth; bool tail_call_reachable = false; int ret_insn[MAX_CALL_FRAMES]; int ret_prog[MAX_CALL_FRAMES]; int j; i = subprog[idx].start; + if (!priv_stack_supported) + subprog[idx].priv_stack_mode = NO_PRIV_STACK; process_func: /* protect against potential stack overflow that might happen when * bpf2bpf calls get combined with tailcalls. Limit the caller's stack @@ -6036,11 +6173,31 @@ process_func: depth); return -EACCES; } - depth += round_up_stack_depth(env, subprog[idx].stack_depth); - if (depth > MAX_BPF_STACK) { - verbose(env, "combined stack size of %d calls is %d. Too large\n", - frame + 1, depth); - return -EACCES; + + subprog_depth = round_up_stack_depth(env, subprog[idx].stack_depth); + if (priv_stack_supported) { + /* Request private stack support only if the subprog stack + * depth is no less than BPF_PRIV_STACK_MIN_SIZE. This is to + * avoid jit penalty if the stack usage is small. 
+ */ + if (subprog[idx].priv_stack_mode == PRIV_STACK_UNKNOWN && + subprog_depth >= BPF_PRIV_STACK_MIN_SIZE) + subprog[idx].priv_stack_mode = PRIV_STACK_ADAPTIVE; + } + + if (subprog[idx].priv_stack_mode == PRIV_STACK_ADAPTIVE) { + if (subprog_depth > MAX_BPF_STACK) { + verbose(env, "stack size of subprog %d is %d. Too large\n", + idx, subprog_depth); + return -EACCES; + } + } else { + depth += subprog_depth; + if (depth > MAX_BPF_STACK) { + verbose(env, "combined stack size of %d calls is %d. Too large\n", + frame + 1, depth); + return -EACCES; + } } continue_func: subprog_end = subprog[idx + 1].start; @@ -6097,6 +6254,8 @@ continue_func: } i = next_insn; idx = sidx; + if (!priv_stack_supported) + subprog[idx].priv_stack_mode = NO_PRIV_STACK; if (subprog[idx].has_tail_call) tail_call_reachable = true; @@ -6130,7 +6289,8 @@ continue_func: */ if (frame == 0) return 0; - depth -= round_up_stack_depth(env, subprog[idx].stack_depth); + if (subprog[idx].priv_stack_mode != PRIV_STACK_ADAPTIVE) + depth -= round_up_stack_depth(env, subprog[idx].stack_depth); frame--; i = ret_insn[frame]; idx = ret_prog[frame]; @@ -6139,17 +6299,45 @@ continue_func: static int check_max_stack_depth(struct bpf_verifier_env *env) { + enum priv_stack_mode priv_stack_mode = PRIV_STACK_UNKNOWN; struct bpf_subprog_info *si = env->subprog_info; + bool priv_stack_supported; int ret; for (int i = 0; i < env->subprog_cnt; i++) { + if (si[i].has_tail_call) { + priv_stack_mode = NO_PRIV_STACK; + break; + } + } + + if (priv_stack_mode == PRIV_STACK_UNKNOWN) + priv_stack_mode = bpf_enable_priv_stack(env->prog); + + /* All async_cb subprogs use normal kernel stack. If a particular + * subprog appears in both main prog and async_cb subtree, that + * subprog will use normal kernel stack to avoid potential nesting. + * The reverse subprog traversal ensures when main prog subtree is + * checked, the subprogs appearing in async_cb subtrees are already + * marked as using normal kernel stack, so stack size checking can + * be done properly. 
+ */ + for (int i = env->subprog_cnt - 1; i >= 0; i--) { if (!i || si[i].is_async_cb) { - ret = check_max_stack_depth_subprog(env, i); + priv_stack_supported = !i && priv_stack_mode == PRIV_STACK_ADAPTIVE; + ret = check_max_stack_depth_subprog(env, i, priv_stack_supported); if (ret < 0) return ret; } - continue; } + + for (int i = 0; i < env->subprog_cnt; i++) { + if (si[i].priv_stack_mode == PRIV_STACK_ADAPTIVE) { + env->prog->aux->jits_use_priv_stack = true; + break; + } + } + return 0; } @@ -6333,10 +6521,10 @@ static void coerce_reg_to_size_sx(struct bpf_reg_state *reg, int size) /* both of s64_max/s64_min positive or negative */ if ((s64_max >= 0) == (s64_min >= 0)) { - reg->smin_value = reg->s32_min_value = s64_min; - reg->smax_value = reg->s32_max_value = s64_max; - reg->umin_value = reg->u32_min_value = s64_min; - reg->umax_value = reg->u32_max_value = s64_max; + reg->s32_min_value = reg->smin_value = s64_min; + reg->s32_max_value = reg->smax_value = s64_max; + reg->u32_min_value = reg->umin_value = s64_min; + reg->u32_max_value = reg->umax_value = s64_max; reg->var_off = tnum_range(s64_min, s64_max); return; } @@ -6589,6 +6777,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, const char *field_name = NULL; enum bpf_type_flag flag = 0; u32 btf_id = 0; + bool mask; int ret; if (!env->allow_ptr_leaks) { @@ -6660,7 +6849,21 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (ret < 0) return ret; - + /* For raw_tp progs, we allow dereference of PTR_MAYBE_NULL + * trusted PTR_TO_BTF_ID, these are the ones that are possibly + * arguments to the raw_tp. Since internal checks in for trusted + * reg in check_ptr_to_btf_access would consider PTR_MAYBE_NULL + * modifier as problematic, mask it out temporarily for the + * check. Don't apply this to pointers with ref_obj_id > 0, as + * those won't be raw_tp args. + * + * We may end up applying this relaxation to other trusted + * PTR_TO_BTF_ID with maybe null flag, since we cannot + * distinguish PTR_MAYBE_NULL tagged for arguments vs normal + * tagging, but that should expand allowed behavior, and not + * cause regression for existing behavior. + */ + mask = mask_raw_tp_reg(env, reg); if (ret != PTR_TO_BTF_ID) { /* just mark; */ @@ -6721,8 +6924,13 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, clear_trusted_flags(&flag); } - if (atype == BPF_READ && value_regno >= 0) + if (atype == BPF_READ && value_regno >= 0) { mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag); + /* We've assigned a new type to regno, so don't undo masking. */ + if (regno == value_regno) + mask = false; + } + unmask_raw_tp_reg(reg, mask); return 0; } @@ -6798,20 +7006,10 @@ static int check_stack_slot_within_bounds(struct bpf_verifier_env *env, struct bpf_func_state *state, enum bpf_access_type t) { - struct bpf_insn_aux_data *aux = &env->insn_aux_data[env->insn_idx]; - int min_valid_off, max_bpf_stack; - - /* If accessing instruction is a spill/fill from bpf_fastcall pattern, - * add room for all caller saved registers below MAX_BPF_STACK. - * In case if bpf_fastcall rewrite won't happen maximal stack depth - * would be checked by check_max_stack_depth_subprog(). 
- */ - max_bpf_stack = MAX_BPF_STACK; - if (aux->fastcall_pattern) - max_bpf_stack += CALLER_SAVED_REGS * BPF_REG_SIZE; + int min_valid_off; if (t == BPF_WRITE || env->allow_uninit_stack) - min_valid_off = -max_bpf_stack; + min_valid_off = -MAX_BPF_STACK; else min_valid_off = -state->allocated_stack; @@ -6953,7 +7151,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return err; if (tnum_is_const(reg->var_off)) kptr_field = btf_record_find(reg->map_ptr->record, - off + reg->var_off.value, BPF_KPTR); + off + reg->var_off.value, BPF_KPTR | BPF_UPTR); if (kptr_field) { err = check_map_kptr_access(env, regno, value_regno, insn_idx, kptr_field); } else if (t == BPF_READ && value_regno >= 0) { @@ -7107,7 +7305,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (base_type(reg->type) == PTR_TO_BTF_ID && - !type_may_be_null(reg->type)) { + (mask_raw_tp_reg_cond(env, reg) || !type_may_be_null(reg->type))) { err = check_ptr_to_btf_access(env, regs, regno, off, size, t, value_regno); } else if (reg->type == CONST_PTR_TO_MAP) { @@ -7432,7 +7630,8 @@ mark: } static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, - int access_size, bool zero_size_allowed, + int access_size, enum bpf_access_type access_type, + bool zero_size_allowed, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; @@ -7444,7 +7643,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_packet_access(env, regno, reg->off, access_size, zero_size_allowed); case PTR_TO_MAP_KEY: - if (meta && meta->raw_mode) { + if (access_type == BPF_WRITE) { verbose(env, "R%d cannot write into %s\n", regno, reg_type_str(env, reg->type)); return -EACCES; @@ -7452,15 +7651,13 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_mem_region_access(env, regno, reg->off, access_size, reg->map_ptr->key_size, false); case PTR_TO_MAP_VALUE: - if (check_map_access_type(env, regno, reg->off, access_size, - meta && meta->raw_mode ? BPF_WRITE : - BPF_READ)) + if (check_map_access_type(env, regno, reg->off, access_size, access_type)) return -EACCES; return check_map_access(env, regno, reg->off, access_size, zero_size_allowed, ACCESS_HELPER); case PTR_TO_MEM: if (type_is_rdonly_mem(reg->type)) { - if (meta && meta->raw_mode) { + if (access_type == BPF_WRITE) { verbose(env, "R%d cannot write into %s\n", regno, reg_type_str(env, reg->type)); return -EACCES; @@ -7471,7 +7668,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, zero_size_allowed); case PTR_TO_BUF: if (type_is_rdonly_mem(reg->type)) { - if (meta && meta->raw_mode) { + if (access_type == BPF_WRITE) { verbose(env, "R%d cannot write into %s\n", regno, reg_type_str(env, reg->type)); return -EACCES; @@ -7499,7 +7696,6 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, * Dynamically check it now. */ if (!env->ops->convert_ctx_access) { - enum bpf_access_type atype = meta && meta->raw_mode ? BPF_WRITE : BPF_READ; int offset = access_size - 1; /* Allow zero-byte read from PTR_TO_CTX */ @@ -7507,7 +7703,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return zero_size_allowed ? 
0 : -EACCES; return check_mem_access(env, env->insn_idx, regno, offset, BPF_B, - atype, -1, false, false); + access_type, -1, false, false); } fallthrough; @@ -7532,6 +7728,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, */ static int check_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, + enum bpf_access_type access_type, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { @@ -7547,15 +7744,12 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, */ meta->msize_max_value = reg->umax_value; - /* The register is SCALAR_VALUE; the access check - * happens using its boundaries. + /* The register is SCALAR_VALUE; the access check happens using + * its boundaries. For unprivileged variable accesses, disable + * raw mode so that the program is required to initialize all + * the memory that the helper could just partially fill up. */ if (!tnum_is_const(reg->var_off)) - /* For unprivileged variable accesses, disable raw - * mode so that the program is required to - * initialize all the memory that the helper could - * just partially fill up. - */ meta = NULL; if (reg->smin_value < 0) { @@ -7575,9 +7769,8 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, regno); return -EACCES; } - err = check_helper_mem_access(env, regno - 1, - reg->umax_value, - zero_size_allowed, meta); + err = check_helper_mem_access(env, regno - 1, reg->umax_value, + access_type, zero_size_allowed, meta); if (!err) err = mark_chain_precision(env, regno); return err; @@ -7588,13 +7781,11 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg { bool may_be_null = type_may_be_null(reg->type); struct bpf_reg_state saved_reg; - struct bpf_call_arg_meta meta; int err; if (register_is_null(reg)) return 0; - memset(&meta, 0, sizeof(meta)); /* Assuming that the register contains a value check if the memory * access is safe. Temporarily save and restore the register's state as * the conversion shouldn't be visible to a caller. @@ -7604,10 +7795,8 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg mark_ptr_not_null_reg(reg); } - err = check_helper_mem_access(env, regno, mem_size, true, &meta); - /* Check access for BPF_WRITE */ - meta.raw_mode = true; - err = err ?: check_helper_mem_access(env, regno, mem_size, true, &meta); + err = check_helper_mem_access(env, regno, mem_size, BPF_READ, true, NULL); + err = err ?: check_helper_mem_access(env, regno, mem_size, BPF_WRITE, true, NULL); if (may_be_null) *reg = saved_reg; @@ -7633,13 +7822,12 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg mark_ptr_not_null_reg(mem_reg); } - err = check_mem_size_reg(env, reg, regno, true, &meta); - /* Check access for BPF_WRITE */ - meta.raw_mode = true; - err = err ?: check_mem_size_reg(env, reg, regno, true, &meta); + err = check_mem_size_reg(env, reg, regno, BPF_READ, true, &meta); + err = err ?: check_mem_size_reg(env, reg, regno, BPF_WRITE, true, &meta); if (may_be_null) *mem_reg = saved_reg; + return err; } @@ -7662,19 +7850,20 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg * Since only one bpf_spin_lock is allowed the checks are simpler than * reg_is_refcounted() logic. The verifier needs to remember only * one spin_lock instead of array of acquired_refs. 
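With check_helper_mem_access() and check_mem_size_reg() now taking an explicit enum bpf_access_type, the access direction comes from the helper's declared argument type rather than from meta->raw_mode. A hedged sketch of a helper prototype that would exercise the BPF_WRITE path above; the helper itself is made up for illustration, only the proto fields and the MEM_UNINIT/MEM_WRITE flags are the ones these checks actually look at:

#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/string.h>

/* Hypothetical helper that fills a caller-supplied buffer. */
BPF_CALL_2(bpf_fill_buf, void *, dst, u32, size)
{
	memset(dst, 0, size);
	return 0;
}

static const struct bpf_func_proto bpf_fill_buf_proto = {
	.func		= bpf_fill_buf,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	/* MEM_UNINIT: the buffer may be uninitialized on entry;
	 * MEM_WRITE: the helper writes into it, so the verifier runs the
	 * checks above with BPF_WRITE for this argument and for the
	 * ARG_CONST_SIZE_OR_ZERO argument that sizes it. */
	.arg1_type	= ARG_PTR_TO_MEM | MEM_UNINIT | MEM_WRITE,
	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
};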
- * cur_state->active_lock remembers which map value element or allocated + * cur_func(env)->active_locks remembers which map value element or allocated * object got locked and clears it after bpf_spin_unlock. */ static int process_spin_lock(struct bpf_verifier_env *env, int regno, bool is_lock) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; - struct bpf_verifier_state *cur = env->cur_state; bool is_const = tnum_is_const(reg->var_off); + struct bpf_func_state *cur = cur_func(env); u64 val = reg->var_off.value; struct bpf_map *map = NULL; struct btf *btf = NULL; struct btf_record *rec; + int err; if (!is_const) { verbose(env, @@ -7706,16 +7895,23 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, return -EINVAL; } if (is_lock) { - if (cur->active_lock.ptr) { + void *ptr; + + if (map) + ptr = map; + else + ptr = btf; + + if (cur->active_locks) { verbose(env, "Locking two bpf_spin_locks are not allowed\n"); return -EINVAL; } - if (map) - cur->active_lock.ptr = map; - else - cur->active_lock.ptr = btf; - cur->active_lock.id = reg->id; + err = acquire_lock_state(env, env->insn_idx, REF_TYPE_LOCK, reg->id, ptr); + if (err < 0) { + verbose(env, "Failed to acquire lock state\n"); + return err; + } } else { void *ptr; @@ -7724,20 +7920,17 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, else ptr = btf; - if (!cur->active_lock.ptr) { + if (!cur->active_locks) { verbose(env, "bpf_spin_unlock without taking a lock\n"); return -EINVAL; } - if (cur->active_lock.ptr != ptr || - cur->active_lock.id != reg->id) { + + if (release_lock_state(cur_func(env), REF_TYPE_LOCK, reg->id, ptr)) { verbose(env, "bpf_spin_unlock of different lock\n"); return -EINVAL; } invalidate_non_owning_refs(env); - - cur->active_lock.ptr = NULL; - cur->active_lock.id = 0; } return 0; } @@ -8810,6 +9003,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, enum bpf_reg_type type = reg->type; u32 *arg_btf_id = NULL; int err = 0; + bool mask; if (arg_type == ARG_DONTCARE) return 0; @@ -8850,11 +9044,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, base_type(arg_type) == ARG_PTR_TO_SPIN_LOCK) arg_btf_id = fn->arg_btf_id[arg]; + mask = mask_raw_tp_reg(env, reg); err = check_reg_type(env, regno, arg_type, arg_btf_id, meta); - if (err) - return err; - err = check_func_arg_reg_off(env, reg, regno, arg_type); + err = err ?: check_func_arg_reg_off(env, reg, regno, arg_type); + unmask_raw_tp_reg(reg, mask); if (err) return err; @@ -8942,9 +9136,8 @@ skip_type_check: verbose(env, "invalid map_ptr to access map->key\n"); return -EACCES; } - err = check_helper_mem_access(env, regno, - meta->map_ptr->key_size, false, - NULL); + err = check_helper_mem_access(env, regno, meta->map_ptr->key_size, + BPF_READ, false, NULL); break; case ARG_PTR_TO_MAP_VALUE: if (type_may_be_null(arg_type) && register_is_null(reg)) @@ -8959,9 +9152,9 @@ skip_type_check: return -EACCES; } meta->raw_mode = arg_type & MEM_UNINIT; - err = check_helper_mem_access(env, regno, - meta->map_ptr->value_size, false, - meta); + err = check_helper_mem_access(env, regno, meta->map_ptr->value_size, + arg_type & MEM_WRITE ? 
BPF_WRITE : BPF_READ, + false, meta); break; case ARG_PTR_TO_PERCPU_BTF_ID: if (!reg->btf_id) { @@ -9003,7 +9196,9 @@ skip_type_check: */ meta->raw_mode = arg_type & MEM_UNINIT; if (arg_type & MEM_FIXED_SIZE) { - err = check_helper_mem_access(env, regno, fn->arg_size[arg], false, meta); + err = check_helper_mem_access(env, regno, fn->arg_size[arg], + arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ, + false, meta); if (err) return err; if (arg_type & MEM_ALIGNED) @@ -9011,10 +9206,16 @@ skip_type_check: } break; case ARG_CONST_SIZE: - err = check_mem_size_reg(env, reg, regno, false, meta); + err = check_mem_size_reg(env, reg, regno, + fn->arg_type[arg - 1] & MEM_WRITE ? + BPF_WRITE : BPF_READ, + false, meta); break; case ARG_CONST_SIZE_OR_ZERO: - err = check_mem_size_reg(env, reg, regno, true, meta); + err = check_mem_size_reg(env, reg, regno, + fn->arg_type[arg - 1] & MEM_WRITE ? + BPF_WRITE : BPF_READ, + true, meta); break; case ARG_PTR_TO_DYNPTR: err = process_dynptr_func(env, regno, insn_idx, arg_type, 0); @@ -9642,14 +9843,17 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, return ret; } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) { struct bpf_call_arg_meta meta; + bool mask; int err; if (register_is_null(reg) && type_may_be_null(arg->arg_type)) continue; memset(&meta, 0, sizeof(meta)); /* leave func_id as zero */ + mask = mask_raw_tp_reg(env, reg); err = check_reg_type(env, regno, arg->arg_type, &arg->btf_id, &meta); err = err ?: check_func_arg_reg_off(env, reg, regno, arg->arg_type); + unmask_raw_tp_reg(reg, mask); if (err) return err; } else { @@ -9788,7 +9992,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, const char *sub_name = subprog_name(env, subprog); /* Only global subprogs cannot be called with a lock held. */ - if (env->cur_state->active_lock.ptr) { + if (cur_func(env)->active_locks) { verbose(env, "global function calls are not allowed while holding a lock,\n" "use static function instead\n"); return -EINVAL; @@ -9917,7 +10121,7 @@ static int set_loop_callback_state(struct bpf_verifier_env *env, { /* bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx, * u64 flags); - * callback_fn(u32 index, void *callback_ctx); + * callback_fn(u64 index, void *callback_ctx); */ callee->regs[BPF_REG_1].type = SCALAR_VALUE; callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3]; @@ -10129,17 +10333,10 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) caller->regs[BPF_REG_0] = *r0; } - /* callback_fn frame should have released its own additions to parent's - * reference state at this point, or check_reference_leak would - * complain, hence it must be the same as the caller. There is no need - * to copy it back. - */ - if (!callee->in_callback_fn) { - /* Transfer references to the caller */ - err = copy_reference_state(caller, callee); - if (err) - return err; - } + /* Transfer references to the caller */ + err = copy_reference_state(caller, callee); + if (err) + return err; /* for callbacks like bpf_loop or bpf_for_each_map_elem go back to callsite, * there function call logic would reschedule callback visit. 
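The set_loop_callback_state() comment fix above documents that bpf_loop() hands its callback a 64-bit iteration index. A small program-side example of that contract, assuming the usual libbpf headers and an arbitrary tc attach point:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

static int count_cb(__u64 index, void *ctx)
{
	__u64 *sum = ctx;

	*sum += index;
	return 0;	/* 0 = keep looping, 1 = break out early */
}

SEC("tc")
int sum_indices(struct __sk_buff *skb)
{
	__u64 sum = 0;

	bpf_loop(8, count_cb, &sum, 0);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";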
If iteration @@ -10309,11 +10506,11 @@ static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exi bool refs_lingering = false; int i; - if (!exception_exit && state->frameno && !state->in_callback_fn) + if (!exception_exit && state->frameno) return 0; for (i = 0; i < state->acquired_refs; i++) { - if (!exception_exit && state->in_callback_fn && state->refs[i].callback_ref != state->frameno) + if (state->refs[i].type != REF_TYPE_PTR) continue; verbose(env, "Unreleased reference id=%d alloc_insn=%d\n", state->refs[i].id, state->refs[i].insn_idx); @@ -10322,6 +10519,34 @@ static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exi return refs_lingering ? -EINVAL : 0; } +static int check_resource_leak(struct bpf_verifier_env *env, bool exception_exit, bool check_lock, const char *prefix) +{ + int err; + + if (check_lock && cur_func(env)->active_locks) { + verbose(env, "%s cannot be used inside bpf_spin_lock-ed region\n", prefix); + return -EINVAL; + } + + err = check_reference_leak(env, exception_exit); + if (err) { + verbose(env, "%s would lead to reference leak\n", prefix); + return err; + } + + if (check_lock && env->cur_state->active_rcu_lock) { + verbose(env, "%s cannot be used inside bpf_rcu_read_lock-ed region\n", prefix); + return -EINVAL; + } + + if (check_lock && env->cur_state->active_preempt_lock) { + verbose(env, "%s cannot be used inside bpf_preempt_disable-ed region\n", prefix); + return -EINVAL; + } + + return 0; +} + static int check_bpf_snprintf_call(struct bpf_verifier_env *env, struct bpf_reg_state *regs) { @@ -10590,11 +10815,9 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn switch (func_id) { case BPF_FUNC_tail_call: - err = check_reference_leak(env, false); - if (err) { - verbose(env, "tail_call would lead to reference leak\n"); + err = check_resource_leak(env, false, true, "tail_call"); + if (err) return err; - } break; case BPF_FUNC_get_local_storage: /* check that flags argument in get_local_storage(map, flags) is 0, @@ -11259,6 +11482,7 @@ enum special_kfunc_type { KF_bpf_preempt_enable, KF_bpf_iter_css_task_new, KF_bpf_session_cookie, + KF_bpf_get_kmem_cache, }; BTF_SET_START(special_kfunc_set) @@ -11324,6 +11548,7 @@ BTF_ID(func, bpf_session_cookie) #else BTF_ID_UNUSED #endif +BTF_ID(func, bpf_get_kmem_cache) static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) { @@ -11519,10 +11744,9 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { - struct bpf_verifier_state *state = env->cur_state; struct btf_record *rec = reg_btf_record(reg); - if (!state->active_lock.ptr) { + if (!cur_func(env)->active_locks) { verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n"); return -EFAULT; } @@ -11619,6 +11843,7 @@ static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_o */ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { + struct bpf_reference_state *s; void *ptr; u32 id; @@ -11635,10 +11860,10 @@ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_ } id = reg->id; - if (!env->cur_state->active_lock.ptr) + if (!cur_func(env)->active_locks) return -EINVAL; - if (env->cur_state->active_lock.ptr != ptr || - env->cur_state->active_lock.id != id) { + s = find_lock_state(env, REF_TYPE_LOCK, id, ptr); + if (!s) { verbose(env, "held lock and object are not in the 
same allocation\n"); return -EINVAL; } @@ -11949,6 +12174,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ enum bpf_arg_type arg_type = ARG_DONTCARE; u32 regno = i + 1, ref_id, type_size; bool is_ret_buf_sz = false; + bool mask = false; int kf_arg_type; t = btf_type_skip_modifiers(btf, args[i].type, NULL); @@ -12007,12 +12233,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EINVAL; } + mask = mask_raw_tp_reg(env, reg); if ((is_kfunc_trusted_args(meta) || is_kfunc_rcu(meta)) && (register_is_null(reg) || type_may_be_null(reg->type)) && !is_kfunc_arg_nullable(meta->btf, &args[i])) { verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i); + unmask_raw_tp_reg(reg, mask); return -EACCES; } + unmask_raw_tp_reg(reg, mask); if (reg->ref_obj_id) { if (is_kfunc_release(meta) && meta->ref_obj_id) { @@ -12070,16 +12299,24 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta)) break; + /* Allow passing maybe NULL raw_tp arguments to + * kfuncs for compatibility. Don't apply this to + * arguments with ref_obj_id > 0. + */ + mask = mask_raw_tp_reg(env, reg); if (!is_trusted_reg(reg)) { if (!is_kfunc_rcu(meta)) { verbose(env, "R%d must be referenced or trusted\n", regno); + unmask_raw_tp_reg(reg, mask); return -EINVAL; } if (!is_rcu_reg(reg)) { verbose(env, "R%d must be a rcu pointer\n", regno); + unmask_raw_tp_reg(reg, mask); return -EINVAL; } } + unmask_raw_tp_reg(reg, mask); fallthrough; case KF_ARG_PTR_TO_CTX: case KF_ARG_PTR_TO_DYNPTR: @@ -12102,7 +12339,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (is_kfunc_release(meta) && reg->ref_obj_id) arg_type |= OBJ_RELEASE; + mask = mask_raw_tp_reg(env, reg); ret = check_func_arg_reg_off(env, reg, regno, arg_type); + unmask_raw_tp_reg(reg, mask); if (ret < 0) return ret; @@ -12279,6 +12518,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ ref_tname = btf_name_by_offset(btf, ref_t->name_off); fallthrough; case KF_ARG_PTR_TO_BTF_ID: + mask = mask_raw_tp_reg(env, reg); /* Only base_type is checked, further checks are done here */ if ((base_type(reg->type) != PTR_TO_BTF_ID || (bpf_type_has_unsafe_modifiers(reg->type) && !is_rcu_reg(reg))) && @@ -12287,9 +12527,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "expected %s or socket\n", reg_type_str(env, base_type(reg->type) | (type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS))); + unmask_raw_tp_reg(reg, mask); return -EINVAL; } ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i); + unmask_raw_tp_reg(reg, mask); if (ret < 0) return ret; break; @@ -12834,6 +13076,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, regs[BPF_REG_0].type = PTR_TO_BTF_ID; regs[BPF_REG_0].btf_id = ptr_type_id; + if (meta.func_id == special_kfunc_list[KF_bpf_get_kmem_cache]) + regs[BPF_REG_0].type |= PTR_UNTRUSTED; + if (is_iter_next_kfunc(&meta)) { struct bpf_reg_state *cur_iter; @@ -13259,7 +13504,7 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env, */ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn, - const struct bpf_reg_state *ptr_reg, + struct bpf_reg_state *ptr_reg, const struct bpf_reg_state *off_reg) { struct bpf_verifier_state *vstate = env->cur_state; @@ -13273,6 +13518,7 @@ static int 
adjust_ptr_min_max_vals(struct bpf_verifier_env *env, struct bpf_sanitize_info info = {}; u8 opcode = BPF_OP(insn->code); u32 dst = insn->dst_reg; + bool mask; int ret; dst_reg = ®s[dst]; @@ -13299,11 +13545,14 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, return -EACCES; } + mask = mask_raw_tp_reg(env, ptr_reg); if (ptr_reg->type & PTR_MAYBE_NULL) { verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n", dst, reg_type_str(env, ptr_reg->type)); + unmask_raw_tp_reg(ptr_reg, mask); return -EACCES; } + unmask_raw_tp_reg(ptr_reg, mask); switch (base_type(ptr_reg->type)) { case PTR_TO_CTX: @@ -14264,12 +14513,13 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * r1 += 0x1 * if r2 < 1000 goto ... * use r1 in memory access - * So remember constant delta between r2 and r1 and update r1 after - * 'if' condition. + * So for 64-bit alu remember constant delta between r2 and r1 and + * update r1 after 'if' condition. */ - if (env->bpf_capable && BPF_OP(insn->code) == BPF_ADD && - dst_reg->id && is_reg_const(src_reg, alu32)) { - u64 val = reg_const_value(src_reg, alu32); + if (env->bpf_capable && + BPF_OP(insn->code) == BPF_ADD && !alu32 && + dst_reg->id && is_reg_const(src_reg, false)) { + u64 val = reg_const_value(src_reg, false); if ((dst_reg->id & BPF_ADD_CONST) || /* prevent overflow in sync_linked_regs() later */ @@ -15326,8 +15576,12 @@ static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_s continue; if ((!(reg->id & BPF_ADD_CONST) && !(known_reg->id & BPF_ADD_CONST)) || reg->off == known_reg->off) { + s32 saved_subreg_def = reg->subreg_def; + copy_register_state(reg, known_reg); + reg->subreg_def = saved_subreg_def; } else { + s32 saved_subreg_def = reg->subreg_def; s32 saved_off = reg->off; fake_reg.type = SCALAR_VALUE; @@ -15340,6 +15594,7 @@ static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_s * otherwise another sync_linked_regs() will be incorrect. */ reg->off = saved_off; + reg->subreg_def = saved_subreg_def; scalar32_min_max_add(reg, &fake_reg); scalar_min_max_add(reg, &fake_reg); @@ -15481,7 +15736,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (dst_reg->type == SCALAR_VALUE && dst_reg->id) collect_linked_regs(this_branch, dst_reg->id, &linked_regs); if (linked_regs.cnt > 1) { - err = push_jmp_history(env, this_branch, 0, linked_regs_pack(&linked_regs)); + err = push_insn_history(env, this_branch, 0, linked_regs_pack(&linked_regs)); if (err) return err; } @@ -15745,26 +16000,9 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) * gen_ld_abs() may terminate the program at runtime, leading to * reference leak. 
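Limiting the remembered constant delta to 64-bit BPF_ADD (the new !alu32 test above) is consistent with how alu32 behaves: a 32-bit add only defines the low word and zero-extends the destination, so the full 64-bit values of the linked registers need not differ by the constant afterwards. A userspace illustration of a 32-bit add breaking a 64-bit delta:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t r2 = 0x1ffffffffULL;				/* upper bits set, low word all ones */
	uint64_t r1_alu64 = r2 + 1;				/* 64-bit add: r1 == r2 + 1 holds    */
	uint64_t r1_alu32 = (uint32_t)((uint32_t)r2 + 1);	/* 32-bit add wraps, zero-extends    */

	printf("alu64: %llx, alu32: %llx, r2+1: %llx\n",
	       (unsigned long long)r1_alu64,
	       (unsigned long long)r1_alu32,			/* 0: the +1 delta no longer holds */
	       (unsigned long long)(r2 + 1));
	return 0;
}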
*/ - err = check_reference_leak(env, false); - if (err) { - verbose(env, "BPF_LD_[ABS|IND] cannot be mixed with socket references\n"); + err = check_resource_leak(env, false, true, "BPF_LD_[ABS|IND]"); + if (err) return err; - } - - if (env->cur_state->active_lock.ptr) { - verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n"); - return -EINVAL; - } - - if (env->cur_state->active_rcu_lock) { - verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_rcu_read_lock-ed region\n"); - return -EINVAL; - } - - if (env->cur_state->active_preempt_lock) { - verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_preempt_disable-ed region\n"); - return -EINVAL; - } if (regs[ctx_reg].type != PTR_TO_CTX) { verbose(env, @@ -15910,6 +16148,16 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char return -ENOTSUPP; } break; + case BPF_PROG_TYPE_KPROBE: + switch (env->prog->expected_attach_type) { + case BPF_TRACE_KPROBE_SESSION: + case BPF_TRACE_UPROBE_SESSION: + range = retval_range(0, 1); + break; + default: + return 0; + } + break; case BPF_PROG_TYPE_SK_LOOKUP: range = retval_range(SK_DROP, SK_PASS); break; @@ -16176,10 +16424,7 @@ static u32 kfunc_fastcall_clobber_mask(struct bpf_kfunc_call_arg_meta *meta) /* Same as verifier_inlines_helper_call() but for kfuncs, see comment above */ static bool is_fastcall_kfunc_call(struct bpf_kfunc_call_arg_meta *meta) { - if (meta->btf == btf_vmlinux) - return meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] || - meta->func_id == special_kfunc_list[KF_bpf_rdonly_cast]; - return false; + return meta->kfunc_flags & KF_FASTCALL; } /* LLVM define a bpf_fastcall function attribute. @@ -17514,8 +17759,20 @@ static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur, return false; for (i = 0; i < old->acquired_refs; i++) { - if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap)) + if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap) || + old->refs[i].type != cur->refs[i].type) return false; + switch (old->refs[i].type) { + case REF_TYPE_PTR: + break; + case REF_TYPE_LOCK: + if (old->refs[i].ptr != cur->refs[i].ptr) + return false; + break; + default: + WARN_ONCE(1, "Unhandled enum type for reference state: %d\n", old->refs[i].type); + return false; + } } return true; @@ -17593,19 +17850,6 @@ static bool states_equal(struct bpf_verifier_env *env, if (old->speculative && !cur->speculative) return false; - if (old->active_lock.ptr != cur->active_lock.ptr) - return false; - - /* Old and cur active_lock's have to be either both present - * or both absent. 
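The new BPF_PROG_TYPE_KPROBE cases above constrain session programs to a [0, 1] return value. A minimal program-side sketch, assuming a libbpf new enough to know the kprobe.session section and using an arbitrary traced symbol; per the session semantics, the entry-side return value decides whether the paired return probe runs:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

char LICENSE[] SEC("license") = "GPL";

SEC("kprobe.session/do_nanosleep")
int handle(void *ctx)
{
	/* 0 lets the return-side invocation happen, 1 suppresses it;
	 * any other value is now rejected by check_return_code(). */
	return 0;
}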
- */ - if (!!old->active_lock.id != !!cur->active_lock.id) - return false; - - if (old->active_lock.id && - !check_ids(old->active_lock.id, cur->active_lock.id, &env->idmap_scratch)) - return false; - if (old->active_rcu_lock != cur->active_rcu_lock) return false; @@ -17877,9 +18121,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state_list *sl, **pprev; struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry; int i, j, n, err, states_cnt = 0; - bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx); - bool add_new_state = force_new_state; - bool force_exact; + bool force_new_state, add_new_state, force_exact; + + force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) || + /* Avoid accumulating infinitely long jmp history */ + cur->insn_hist_end - cur->insn_hist_start > 40; /* bpf progs typically have pruning point every 4 instructions * http://vger.kernel.org/bpfconf2019.html#session-1 @@ -17889,6 +18135,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * In tests that amounts to up to 50% reduction into total verifier * memory consumption and 20% verifier time speedup. */ + add_new_state = force_new_state; if (env->jmps_processed - env->prev_jmps_processed >= 2 && env->insn_processed - env->prev_insn_processed >= 8) add_new_state = true; @@ -18076,7 +18323,7 @@ hit: * the current state. */ if (is_jmp_point(env, env->insn_idx)) - err = err ? : push_jmp_history(env, cur, 0, 0); + err = err ? : push_insn_history(env, cur, 0, 0); err = err ? : propagate_precision(env, &sl->state); if (err) return err; @@ -18175,8 +18422,8 @@ next: cur->parent = new; cur->first_insn_idx = insn_idx; + cur->insn_hist_start = cur->insn_hist_end; cur->dfs_depth = new->dfs_depth + 1; - clear_jmp_history(cur); new_sl->next = *explored_state(env, insn_idx); *explored_state(env, insn_idx) = new_sl; /* connect new state to parentage chain. Current frame needs all @@ -18344,7 +18591,7 @@ static int do_check(struct bpf_verifier_env *env) } if (is_jmp_point(env, env->insn_idx)) { - err = push_jmp_history(env, state, 0, 0); + err = push_insn_history(env, state, 0, 0); if (err) return err; } @@ -18504,7 +18751,7 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } - if (env->cur_state->active_lock.ptr) { + if (cur_func(env)->active_locks) { if ((insn->src_reg == BPF_REG_0 && insn->imm != BPF_FUNC_spin_unlock) || (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && (insn->off != 0 || !is_bpf_graph_api_kfunc(insn->imm)))) { @@ -18553,30 +18800,14 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } process_bpf_exit_full: - if (env->cur_state->active_lock.ptr && !env->cur_state->curframe) { - verbose(env, "bpf_spin_unlock is missing\n"); - return -EINVAL; - } - - if (env->cur_state->active_rcu_lock && !env->cur_state->curframe) { - verbose(env, "bpf_rcu_read_unlock is missing\n"); - return -EINVAL; - } - - if (env->cur_state->active_preempt_lock && !env->cur_state->curframe) { - verbose(env, "%d bpf_preempt_enable%s missing\n", - env->cur_state->active_preempt_lock, - env->cur_state->active_preempt_lock == 1 ? " is" : "(s) are"); - return -EINVAL; - } - /* We must do check_reference_leak here before * prepare_func_exit to handle the case when * state->curframe > 0, it may be a callback * function, for which reference_state must * match caller reference state when it exits. 
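The do_check() hunk above keeps the long-standing rule that, while a bpf_spin_lock is held, the only calls accepted are bpf_spin_unlock() and the graph-API kfuncs; it merely reads the lock state from cur_func(env)->active_locks now. A program-side sketch of what that means in practice (map layout and attach point are illustrative):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct elem {
	struct bpf_spin_lock lock;
	__u64 counter;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, struct elem);
} counters SEC(".maps");

SEC("tc")
int bump(struct __sk_buff *skb)
{
	__u32 key = 0;
	struct elem *v = bpf_map_lookup_elem(&counters, &key);

	if (!v)
		return 0;
	bpf_spin_lock(&v->lock);
	v->counter++;			/* plain map-value access is fine           */
	/* bpf_ktime_get_ns();		   any other helper call here is rejected */
	bpf_spin_unlock(&v->lock);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";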
*/ - err = check_reference_leak(env, exception_exit); + err = check_resource_leak(env, exception_exit, !env->cur_state->curframe, + "BPF_EXIT instruction"); if (err) return err; @@ -19835,6 +20066,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) * for this case. */ case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED: + case PTR_TO_BTF_ID | PTR_TRUSTED | PTR_MAYBE_NULL: if (type == BPF_READ) { if (BPF_MODE(insn->code) == BPF_MEM) insn->code = BPF_LDX | BPF_PROBE_MEM | @@ -20039,6 +20271,9 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->name[0] = 'F'; func[i]->aux->stack_depth = env->subprog_info[i].stack_depth; + if (env->subprog_info[i].priv_stack_mode == PRIV_STACK_ADAPTIVE) + func[i]->aux->jits_use_priv_stack = true; + func[i]->jit_requested = 1; func[i]->blinding_requested = prog->blinding_requested; func[i]->aux->kfunc_tab = prog->aux->kfunc_tab; @@ -21201,7 +21436,7 @@ patch_map_ops_generic: delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; - continue; + goto next_insn; } /* Implement bpf_kptr_xchg inline */ @@ -21807,6 +22042,11 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) } } + if (prog->aux->priv_stack_requested && !bpf_jit_supports_private_stack()) { + verbose(env, "Private stack not supported by jit\n"); + return -EACCES; + } + /* btf_ctx_access() used this to provide argument type info */ prog->aux->ctx_arg_info = st_ops_desc->arg_info[member_idx].info; @@ -22310,7 +22550,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 /* 'struct bpf_verifier_env' can be global, but since it's not small, * allocate/free it every time bpf_check() is called */ - env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); + env = kvzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); if (!env) return -ENOMEM; @@ -22545,7 +22785,8 @@ err_unlock: if (!is_priv) mutex_unlock(&bpf_verifier_lock); vfree(env->insn_aux_data); + kvfree(env->insn_hist); err_free_env: - kfree(env); + kvfree(env); return ret; } |
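The final hunks switch struct bpf_verifier_env over to kvzalloc()/kvfree(). As a reminder-style sketch of that allocation pattern (the struct below is a stand-in, not the real verifier state): kvzalloc() tries the slab allocator first and falls back to vmalloc() when the request is too large or memory is fragmented, and kvfree() releases either backing store.

#include <linux/mm.h>
#include <linux/slab.h>

struct big_state {
	char blob[64 * 1024];	/* stand-in for a large struct such as bpf_verifier_env */
};

static int big_state_example(void)
{
	struct big_state *st;

	st = kvzalloc(sizeof(*st), GFP_KERNEL);	/* kmalloc first, vmalloc fallback */
	if (!st)
		return -ENOMEM;
	/* ... use st ... */
	kvfree(st);				/* correct for either backing allocation */
	return 0;
}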