diff options
Diffstat (limited to 'kernel/bpf')
31 files changed, 3942 insertions, 1470 deletions
| diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 02242614dcc7..1d3892168d32 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -6,7 +6,8 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse  endif  CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o +obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o  obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o  obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o  obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 484706959556..2058e89b5ddd 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -307,8 +307,8 @@ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key  }  /* Called from syscall or from eBPF program */ -static int array_map_update_elem(struct bpf_map *map, void *key, void *value, -				 u64 map_flags) +static long array_map_update_elem(struct bpf_map *map, void *key, void *value, +				  u64 map_flags)  {  	struct bpf_array *array = container_of(map, struct bpf_array, map);  	u32 index = *(u32 *)key; @@ -386,7 +386,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,  }  /* Called from syscall or from eBPF program */ -static int array_map_delete_elem(struct bpf_map *map, void *key) +static long array_map_delete_elem(struct bpf_map *map, void *key)  {  	return -EINVAL;  } @@ -686,8 +686,8 @@ static const struct bpf_iter_seq_info iter_seq_info = {  	.seq_priv_size		= sizeof(struct bpf_iter_seq_array_map_info),  }; -static int bpf_for_each_array_elem(struct bpf_map *map, 
bpf_callback_t callback_fn, -				   void *callback_ctx, u64 flags) +static long bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn, +				    void *callback_ctx, u64 flags)  {  	u32 i, key, num_elems = 0;  	struct bpf_array *array; @@ -721,6 +721,28 @@ static int bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_  	return num_elems;  } +static u64 array_map_mem_usage(const struct bpf_map *map) +{ +	struct bpf_array *array = container_of(map, struct bpf_array, map); +	bool percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; +	u32 elem_size = array->elem_size; +	u64 entries = map->max_entries; +	u64 usage = sizeof(*array); + +	if (percpu) { +		usage += entries * sizeof(void *); +		usage += entries * elem_size * num_possible_cpus(); +	} else { +		if (map->map_flags & BPF_F_MMAPABLE) { +			usage = PAGE_ALIGN(usage); +			usage += PAGE_ALIGN(entries * elem_size); +		} else { +			usage += entries * elem_size; +		} +	} +	return usage; +} +  BTF_ID_LIST_SINGLE(array_map_btf_ids, struct, bpf_array)  const struct bpf_map_ops array_map_ops = {  	.map_meta_equal = array_map_meta_equal, @@ -742,6 +764,7 @@ const struct bpf_map_ops array_map_ops = {  	.map_update_batch = generic_map_update_batch,  	.map_set_for_each_callback_args = map_set_for_each_callback_args,  	.map_for_each_callback = bpf_for_each_array_elem, +	.map_mem_usage = array_map_mem_usage,  	.map_btf_id = &array_map_btf_ids[0],  	.iter_seq_info = &iter_seq_info,  }; @@ -762,6 +785,7 @@ const struct bpf_map_ops percpu_array_map_ops = {  	.map_update_batch = generic_map_update_batch,  	.map_set_for_each_callback_args = map_set_for_each_callback_args,  	.map_for_each_callback = bpf_for_each_array_elem, +	.map_mem_usage = array_map_mem_usage,  	.map_btf_id = &array_map_btf_ids[0],  	.iter_seq_info = &iter_seq_info,  }; @@ -847,7 +871,7 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,  	return 0;  } -static int fd_array_map_delete_elem(struct bpf_map 
*map, void *key) +static long fd_array_map_delete_elem(struct bpf_map *map, void *key)  {  	struct bpf_array *array = container_of(map, struct bpf_array, map);  	void *old_ptr; @@ -1156,6 +1180,7 @@ const struct bpf_map_ops prog_array_map_ops = {  	.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,  	.map_release_uref = prog_array_map_clear,  	.map_seq_show_elem = prog_array_map_seq_show_elem, +	.map_mem_usage = array_map_mem_usage,  	.map_btf_id = &array_map_btf_ids[0],  }; @@ -1257,6 +1282,7 @@ const struct bpf_map_ops perf_event_array_map_ops = {  	.map_fd_put_ptr = perf_event_fd_array_put_ptr,  	.map_release = perf_event_fd_array_release,  	.map_check_btf = map_check_no_btf, +	.map_mem_usage = array_map_mem_usage,  	.map_btf_id = &array_map_btf_ids[0],  }; @@ -1291,6 +1317,7 @@ const struct bpf_map_ops cgroup_array_map_ops = {  	.map_fd_get_ptr = cgroup_fd_array_get_ptr,  	.map_fd_put_ptr = cgroup_fd_array_put_ptr,  	.map_check_btf = map_check_no_btf, +	.map_mem_usage = array_map_mem_usage,  	.map_btf_id = &array_map_btf_ids[0],  };  #endif @@ -1379,5 +1406,6 @@ const struct bpf_map_ops array_of_maps_map_ops = {  	.map_lookup_batch = generic_map_lookup_batch,  	.map_update_batch = generic_map_update_batch,  	.map_check_btf = map_check_no_btf, +	.map_mem_usage = array_map_mem_usage,  	.map_btf_id = &array_map_btf_ids[0],  }; diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c index 48ee750849f2..540331b610a9 100644 --- a/kernel/bpf/bloom_filter.c +++ b/kernel/bpf/bloom_filter.c @@ -16,13 +16,6 @@ struct bpf_bloom_filter {  	struct bpf_map map;  	u32 bitset_mask;  	u32 hash_seed; -	/* If the size of the values in the bloom filter is u32 aligned, -	 * then it is more performant to use jhash2 as the underlying hash -	 * function, else we use jhash. This tracks the number of u32s -	 * in an u32-aligned value size. If the value size is not u32 aligned, -	 * this will be 0. 
-	 */ -	u32 aligned_u32_count;  	u32 nr_hash_funcs;  	unsigned long bitset[];  }; @@ -32,16 +25,15 @@ static u32 hash(struct bpf_bloom_filter *bloom, void *value,  {  	u32 h; -	if (bloom->aligned_u32_count) -		h = jhash2(value, bloom->aligned_u32_count, -			   bloom->hash_seed + index); +	if (likely(value_size % 4 == 0)) +		h = jhash2(value, value_size / 4, bloom->hash_seed + index);  	else  		h = jhash(value, value_size, bloom->hash_seed + index);  	return h & bloom->bitset_mask;  } -static int bloom_map_peek_elem(struct bpf_map *map, void *value) +static long bloom_map_peek_elem(struct bpf_map *map, void *value)  {  	struct bpf_bloom_filter *bloom =  		container_of(map, struct bpf_bloom_filter, map); @@ -56,7 +48,7 @@ static int bloom_map_peek_elem(struct bpf_map *map, void *value)  	return 0;  } -static int bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags) +static long bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags)  {  	struct bpf_bloom_filter *bloom =  		container_of(map, struct bpf_bloom_filter, map); @@ -73,12 +65,12 @@ static int bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags)  	return 0;  } -static int bloom_map_pop_elem(struct bpf_map *map, void *value) +static long bloom_map_pop_elem(struct bpf_map *map, void *value)  {  	return -EOPNOTSUPP;  } -static int bloom_map_delete_elem(struct bpf_map *map, void *value) +static long bloom_map_delete_elem(struct bpf_map *map, void *value)  {  	return -EOPNOTSUPP;  } @@ -152,11 +144,6 @@ static struct bpf_map *bloom_map_alloc(union bpf_attr *attr)  	bloom->nr_hash_funcs = nr_hash_funcs;  	bloom->bitset_mask = bitset_mask; -	/* Check whether the value size is u32-aligned */ -	if ((attr->value_size & (sizeof(u32) - 1)) == 0) -		bloom->aligned_u32_count = -			attr->value_size / sizeof(u32); -  	if (!(attr->map_flags & BPF_F_ZERO_SEED))  		bloom->hash_seed = get_random_u32(); @@ -177,8 +164,8 @@ static void *bloom_map_lookup_elem(struct bpf_map *map, void *key)  	return 
ERR_PTR(-EINVAL);  } -static int bloom_map_update_elem(struct bpf_map *map, void *key, -				 void *value, u64 flags) +static long bloom_map_update_elem(struct bpf_map *map, void *key, +				  void *value, u64 flags)  {  	/* The eBPF program should use map_push_elem instead */  	return -EINVAL; @@ -193,6 +180,17 @@ static int bloom_map_check_btf(const struct bpf_map *map,  	return btf_type_is_void(key_type) ? 0 : -EINVAL;  } +static u64 bloom_map_mem_usage(const struct bpf_map *map) +{ +	struct bpf_bloom_filter *bloom; +	u64 bitset_bytes; + +	bloom = container_of(map, struct bpf_bloom_filter, map); +	bitset_bytes = BITS_TO_BYTES((u64)bloom->bitset_mask + 1); +	bitset_bytes = roundup(bitset_bytes, sizeof(unsigned long)); +	return sizeof(*bloom) + bitset_bytes; +} +  BTF_ID_LIST_SINGLE(bpf_bloom_map_btf_ids, struct, bpf_bloom_filter)  const struct bpf_map_ops bloom_filter_map_ops = {  	.map_meta_equal = bpf_map_meta_equal, @@ -206,5 +204,6 @@ const struct bpf_map_ops bloom_filter_map_ops = {  	.map_update_elem = bloom_map_update_elem,  	.map_delete_elem = bloom_map_delete_elem,  	.map_check_btf = bloom_map_check_btf, +	.map_mem_usage = bloom_map_mem_usage,  	.map_btf_id = &bpf_bloom_map_btf_ids[0],  }; diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 6cdf6d9ed91d..d44fe8dd9732 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -46,8 +46,6 @@ static struct bpf_local_storage __rcu **cgroup_storage_ptr(void *owner)  void bpf_cgrp_storage_free(struct cgroup *cgroup)  {  	struct bpf_local_storage *local_storage; -	bool free_cgroup_storage = false; -	unsigned long flags;  	rcu_read_lock();  	local_storage = rcu_dereference(cgroup->bpf_cgrp_storage); @@ -57,14 +55,9 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup)  	}  	bpf_cgrp_storage_lock(); -	raw_spin_lock_irqsave(&local_storage->lock, flags); -	free_cgroup_storage = bpf_local_storage_unlink_nolock(local_storage); -	
raw_spin_unlock_irqrestore(&local_storage->lock, flags); +	bpf_local_storage_destroy(local_storage);  	bpf_cgrp_storage_unlock();  	rcu_read_unlock(); - -	if (free_cgroup_storage) -		kfree_rcu(local_storage, rcu);  }  static struct bpf_local_storage_data * @@ -100,8 +93,8 @@ static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key)  	return sdata ? sdata->data : NULL;  } -static int bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key, -					  void *value, u64 map_flags) +static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key, +					 void *value, u64 map_flags)  {  	struct bpf_local_storage_data *sdata;  	struct cgroup *cgroup; @@ -128,11 +121,11 @@ static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map)  	if (!sdata)  		return -ENOENT; -	bpf_selem_unlink(SELEM(sdata), true); +	bpf_selem_unlink(SELEM(sdata), false);  	return 0;  } -static int bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key) +static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key)  {  	struct cgroup *cgroup;  	int err, fd; @@ -156,7 +149,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)  static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)  { -	return bpf_local_storage_map_alloc(attr, &cgroup_cache); +	return bpf_local_storage_map_alloc(attr, &cgroup_cache, true);  }  static void cgroup_storage_map_free(struct bpf_map *map) @@ -221,6 +214,7 @@ const struct bpf_map_ops cgrp_storage_map_ops = {  	.map_update_elem = bpf_cgrp_storage_update_elem,  	.map_delete_elem = bpf_cgrp_storage_delete_elem,  	.map_check_btf = bpf_local_storage_map_check_btf, +	.map_mem_usage = bpf_local_storage_map_mem_usage,  	.map_btf_id = &bpf_local_storage_map_btf_id[0],  	.map_owner_storage_ptr = cgroup_storage_ptr,  }; @@ -230,7 +224,7 @@ const struct bpf_func_proto bpf_cgrp_storage_get_proto = {  	.gpl_only	= false,  	.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,  	.arg1_type	= 
ARG_CONST_MAP_PTR, -	.arg2_type	= ARG_PTR_TO_BTF_ID, +	.arg2_type	= ARG_PTR_TO_BTF_ID_OR_NULL,  	.arg2_btf_id	= &bpf_cgroup_btf_id[0],  	.arg3_type	= ARG_PTR_TO_MAP_VALUE_OR_NULL,  	.arg4_type	= ARG_ANYTHING, @@ -241,6 +235,6 @@ const struct bpf_func_proto bpf_cgrp_storage_delete_proto = {  	.gpl_only	= false,  	.ret_type	= RET_INTEGER,  	.arg1_type	= ARG_CONST_MAP_PTR, -	.arg2_type	= ARG_PTR_TO_BTF_ID, +	.arg2_type	= ARG_PTR_TO_BTF_ID_OR_NULL,  	.arg2_btf_id	= &bpf_cgroup_btf_id[0],  }; diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 05f4c66c9089..b0ef45db207c 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -57,7 +57,6 @@ static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,  void bpf_inode_storage_free(struct inode *inode)  {  	struct bpf_local_storage *local_storage; -	bool free_inode_storage = false;  	struct bpf_storage_blob *bsb;  	bsb = bpf_inode(inode); @@ -72,51 +71,40 @@ void bpf_inode_storage_free(struct inode *inode)  		return;  	} -	raw_spin_lock_bh(&local_storage->lock); -	free_inode_storage = bpf_local_storage_unlink_nolock(local_storage); -	raw_spin_unlock_bh(&local_storage->lock); +	bpf_local_storage_destroy(local_storage);  	rcu_read_unlock(); - -	if (free_inode_storage) -		kfree_rcu(local_storage, rcu);  }  static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)  {  	struct bpf_local_storage_data *sdata; -	struct file *f; -	int fd; +	struct fd f = fdget_raw(*(int *)key); -	fd = *(int *)key; -	f = fget_raw(fd); -	if (!f) +	if (!f.file)  		return ERR_PTR(-EBADF); -	sdata = inode_storage_lookup(f->f_inode, map, true); -	fput(f); +	sdata = inode_storage_lookup(file_inode(f.file), map, true); +	fdput(f);  	return sdata ? 
sdata->data : NULL;  } -static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key, -					 void *value, u64 map_flags) +static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key, +					     void *value, u64 map_flags)  {  	struct bpf_local_storage_data *sdata; -	struct file *f; -	int fd; +	struct fd f = fdget_raw(*(int *)key); -	fd = *(int *)key; -	f = fget_raw(fd); -	if (!f) +	if (!f.file)  		return -EBADF; -	if (!inode_storage_ptr(f->f_inode)) { -		fput(f); +	if (!inode_storage_ptr(file_inode(f.file))) { +		fdput(f);  		return -EBADF;  	} -	sdata = bpf_local_storage_update(f->f_inode, +	sdata = bpf_local_storage_update(file_inode(f.file),  					 (struct bpf_local_storage_map *)map,  					 value, map_flags, GFP_ATOMIC); -	fput(f); +	fdput(f);  	return PTR_ERR_OR_ZERO(sdata);  } @@ -128,23 +116,21 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map)  	if (!sdata)  		return -ENOENT; -	bpf_selem_unlink(SELEM(sdata), true); +	bpf_selem_unlink(SELEM(sdata), false);  	return 0;  } -static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key) +static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)  { -	struct file *f; -	int fd, err; +	struct fd f = fdget_raw(*(int *)key); +	int err; -	fd = *(int *)key; -	f = fget_raw(fd); -	if (!f) +	if (!f.file)  		return -EBADF; -	err = inode_storage_delete(f->f_inode, map); -	fput(f); +	err = inode_storage_delete(file_inode(f.file), map); +	fdput(f);  	return err;  } @@ -205,7 +191,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key,  static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr)  { -	return bpf_local_storage_map_alloc(attr, &inode_cache); +	return bpf_local_storage_map_alloc(attr, &inode_cache, false);  }  static void inode_storage_map_free(struct bpf_map *map) @@ -223,6 +209,7 @@ const struct bpf_map_ops inode_storage_map_ops = {  	.map_update_elem = bpf_fd_inode_storage_update_elem,  	
.map_delete_elem = bpf_fd_inode_storage_delete_elem,  	.map_check_btf = bpf_local_storage_map_check_btf, +	.map_mem_usage = bpf_local_storage_map_mem_usage,  	.map_btf_id = &bpf_local_storage_map_btf_id[0],  	.map_owner_storage_ptr = inode_storage_ptr,  }; @@ -234,7 +221,7 @@ const struct bpf_func_proto bpf_inode_storage_get_proto = {  	.gpl_only	= false,  	.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,  	.arg1_type	= ARG_CONST_MAP_PTR, -	.arg2_type	= ARG_PTR_TO_BTF_ID, +	.arg2_type	= ARG_PTR_TO_BTF_ID_OR_NULL,  	.arg2_btf_id	= &bpf_inode_storage_btf_ids[0],  	.arg3_type	= ARG_PTR_TO_MAP_VALUE_OR_NULL,  	.arg4_type	= ARG_ANYTHING, @@ -245,6 +232,6 @@ const struct bpf_func_proto bpf_inode_storage_delete_proto = {  	.gpl_only	= false,  	.ret_type	= RET_INTEGER,  	.arg1_type	= ARG_CONST_MAP_PTR, -	.arg2_type	= ARG_PTR_TO_BTF_ID, +	.arg2_type	= ARG_PTR_TO_BTF_ID_OR_NULL,  	.arg2_btf_id	= &bpf_inode_storage_btf_ids[0],  }; diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 5dc307bdeaeb..96856f130cbf 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -776,3 +776,73 @@ const struct bpf_func_proto bpf_loop_proto = {  	.arg3_type	= ARG_PTR_TO_STACK_OR_NULL,  	.arg4_type	= ARG_ANYTHING,  }; + +struct bpf_iter_num_kern { +	int cur; /* current value, inclusive */ +	int end; /* final value, exclusive */ +} __aligned(8); + +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", +		  "Global functions as their definitions will be in vmlinux BTF"); + +__bpf_kfunc int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end) +{ +	struct bpf_iter_num_kern *s = (void *)it; + +	BUILD_BUG_ON(sizeof(struct bpf_iter_num_kern) != sizeof(struct bpf_iter_num)); +	BUILD_BUG_ON(__alignof__(struct bpf_iter_num_kern) != __alignof__(struct bpf_iter_num)); + +	BTF_TYPE_EMIT(struct btf_iter_num); + +	/* start == end is legit, it's an empty range and we'll just get NULL +	 * on first (and any subsequent) bpf_iter_num_next() call +	 */ +	if (start > end) { +		s->cur = 
s->end = 0; +		return -EINVAL; +	} + +	/* avoid overflows, e.g., if start == INT_MIN and end == INT_MAX */ +	if ((s64)end - (s64)start > BPF_MAX_LOOPS) { +		s->cur = s->end = 0; +		return -E2BIG; +	} + +	/* user will call bpf_iter_num_next() first, +	 * which will set s->cur to exactly start value; +	 * underflow shouldn't matter +	 */ +	s->cur = start - 1; +	s->end = end; + +	return 0; +} + +__bpf_kfunc int *bpf_iter_num_next(struct bpf_iter_num* it) +{ +	struct bpf_iter_num_kern *s = (void *)it; + +	/* check failed initialization or if we are done (same behavior); +	 * need to be careful about overflow, so convert to s64 for checks, +	 * e.g., if s->cur == s->end == INT_MAX, we can't just do +	 * s->cur + 1 >= s->end +	 */ +	if ((s64)(s->cur + 1) >= s->end) { +		s->cur = s->end = 0; +		return NULL; +	} + +	s->cur++; + +	return &s->cur; +} + +__bpf_kfunc void bpf_iter_num_destroy(struct bpf_iter_num *it) +{ +	struct bpf_iter_num_kern *s = (void *)it; + +	s->cur = s->end = 0; +} + +__diag_pop(); diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 35f4138a54dc..47d9948d768f 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -51,11 +51,21 @@ owner_storage(struct bpf_local_storage_map *smap, void *owner)  	return map->ops->map_owner_storage_ptr(owner);  } +static bool selem_linked_to_storage_lockless(const struct bpf_local_storage_elem *selem) +{ +	return !hlist_unhashed_lockless(&selem->snode); +} +  static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem)  {  	return !hlist_unhashed(&selem->snode);  } +static bool selem_linked_to_map_lockless(const struct bpf_local_storage_elem *selem) +{ +	return !hlist_unhashed_lockless(&selem->map_node); +} +  static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)  {  	return !hlist_unhashed(&selem->map_node); @@ -70,11 +80,28 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,  	if (charge_mem && 
mem_charge(smap, owner, smap->elem_size))  		return NULL; -	selem = bpf_map_kzalloc(&smap->map, smap->elem_size, -				gfp_flags | __GFP_NOWARN); +	if (smap->bpf_ma) { +		migrate_disable(); +		selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags); +		migrate_enable(); +		if (selem) +			/* Keep the original bpf_map_kzalloc behavior +			 * before started using the bpf_mem_cache_alloc. +			 * +			 * No need to use zero_map_value. The bpf_selem_free() +			 * only does bpf_mem_cache_free when there is +			 * no other bpf prog is using the selem. +			 */ +			memset(SDATA(selem)->data, 0, smap->map.value_size); +	} else { +		selem = bpf_map_kzalloc(&smap->map, smap->elem_size, +					gfp_flags | __GFP_NOWARN); +	} +  	if (selem) {  		if (value)  			copy_map_value(&smap->map, SDATA(selem)->data, value); +		/* No need to call check_and_init_map_value as memory is zero init */  		return selem;  	} @@ -84,7 +111,8 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,  	return NULL;  } -void bpf_local_storage_free_rcu(struct rcu_head *rcu) +/* rcu tasks trace callback for bpf_ma == false */ +static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)  {  	struct bpf_local_storage *local_storage; @@ -98,7 +126,66 @@ void bpf_local_storage_free_rcu(struct rcu_head *rcu)  		kfree_rcu(local_storage, rcu);  } -static void bpf_selem_free_rcu(struct rcu_head *rcu) +static void bpf_local_storage_free_rcu(struct rcu_head *rcu) +{ +	struct bpf_local_storage *local_storage; + +	local_storage = container_of(rcu, struct bpf_local_storage, rcu); +	bpf_mem_cache_raw_free(local_storage); +} + +static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) +{ +	if (rcu_trace_implies_rcu_gp()) +		bpf_local_storage_free_rcu(rcu); +	else +		call_rcu(rcu, bpf_local_storage_free_rcu); +} + +/* Handle bpf_ma == false */ +static void __bpf_local_storage_free(struct bpf_local_storage *local_storage, +				     bool vanilla_rcu) +{ +	if (vanilla_rcu) +		
kfree_rcu(local_storage, rcu); +	else +		call_rcu_tasks_trace(&local_storage->rcu, +				     __bpf_local_storage_free_trace_rcu); +} + +static void bpf_local_storage_free(struct bpf_local_storage *local_storage, +				   struct bpf_local_storage_map *smap, +				   bool bpf_ma, bool reuse_now) +{ +	if (!local_storage) +		return; + +	if (!bpf_ma) { +		__bpf_local_storage_free(local_storage, reuse_now); +		return; +	} + +	if (!reuse_now) { +		call_rcu_tasks_trace(&local_storage->rcu, +				     bpf_local_storage_free_trace_rcu); +		return; +	} + +	if (smap) { +		migrate_disable(); +		bpf_mem_cache_free(&smap->storage_ma, local_storage); +		migrate_enable(); +	} else { +		/* smap could be NULL if the selem that triggered +		 * this 'local_storage' creation had been long gone. +		 * In this case, directly do call_rcu(). +		 */ +		call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu); +	} +} + +/* rcu tasks trace callback for bpf_ma == false */ +static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)  {  	struct bpf_local_storage_elem *selem; @@ -109,13 +196,63 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)  		kfree_rcu(selem, rcu);  } +/* Handle bpf_ma == false */ +static void __bpf_selem_free(struct bpf_local_storage_elem *selem, +			     bool vanilla_rcu) +{ +	if (vanilla_rcu) +		kfree_rcu(selem, rcu); +	else +		call_rcu_tasks_trace(&selem->rcu, __bpf_selem_free_trace_rcu); +} + +static void bpf_selem_free_rcu(struct rcu_head *rcu) +{ +	struct bpf_local_storage_elem *selem; + +	selem = container_of(rcu, struct bpf_local_storage_elem, rcu); +	bpf_mem_cache_raw_free(selem); +} + +static void bpf_selem_free_trace_rcu(struct rcu_head *rcu) +{ +	if (rcu_trace_implies_rcu_gp()) +		bpf_selem_free_rcu(rcu); +	else +		call_rcu(rcu, bpf_selem_free_rcu); +} + +void bpf_selem_free(struct bpf_local_storage_elem *selem, +		    struct bpf_local_storage_map *smap, +		    bool reuse_now) +{ +	bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); + +	if 
(!smap->bpf_ma) { +		__bpf_selem_free(selem, reuse_now); +		return; +	} + +	if (!reuse_now) { +		call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu); +	} else { +		/* Instead of using the vanilla call_rcu(), +		 * bpf_mem_cache_free will be able to reuse selem +		 * immediately. +		 */ +		migrate_disable(); +		bpf_mem_cache_free(&smap->selem_ma, selem); +		migrate_enable(); +	} +} +  /* local_storage->lock must be held and selem->local_storage == local_storage.   * The caller must ensure selem->smap is still valid to be   * dereferenced for its smap->elem_size and smap->cache_idx.   */  static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,  					    struct bpf_local_storage_elem *selem, -					    bool uncharge_mem, bool use_trace_rcu) +					    bool uncharge_mem, bool reuse_now)  {  	struct bpf_local_storage_map *smap;  	bool free_local_storage; @@ -159,40 +296,75 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor  	    SDATA(selem))  		RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); -	if (use_trace_rcu) -		call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu); -	else -		kfree_rcu(selem, rcu); +	bpf_selem_free(selem, smap, reuse_now); + +	if (rcu_access_pointer(local_storage->smap) == smap) +		RCU_INIT_POINTER(local_storage->smap, NULL);  	return free_local_storage;  } -static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, -				       bool use_trace_rcu) +static bool check_storage_bpf_ma(struct bpf_local_storage *local_storage, +				 struct bpf_local_storage_map *storage_smap, +				 struct bpf_local_storage_elem *selem) +{ + +	struct bpf_local_storage_map *selem_smap; + +	/* local_storage->smap may be NULL. If it is, get the bpf_ma +	 * from any selem in the local_storage->list. The bpf_ma of all +	 * local_storage and selem should have the same value +	 * for the same map type. 
+	 * +	 * If the local_storage->list is already empty, the caller will not +	 * care about the bpf_ma value also because the caller is not +	 * responsibile to free the local_storage. +	 */ + +	if (storage_smap) +		return storage_smap->bpf_ma; + +	if (!selem) { +		struct hlist_node *n; + +		n = rcu_dereference_check(hlist_first_rcu(&local_storage->list), +					  bpf_rcu_lock_held()); +		if (!n) +			return false; + +		selem = hlist_entry(n, struct bpf_local_storage_elem, snode); +	} +	selem_smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); + +	return selem_smap->bpf_ma; +} + +static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, +				     bool reuse_now)  { +	struct bpf_local_storage_map *storage_smap;  	struct bpf_local_storage *local_storage; -	bool free_local_storage = false; +	bool bpf_ma, free_local_storage = false;  	unsigned long flags; -	if (unlikely(!selem_linked_to_storage(selem))) +	if (unlikely(!selem_linked_to_storage_lockless(selem)))  		/* selem has already been unlinked from sk */  		return;  	local_storage = rcu_dereference_check(selem->local_storage,  					      bpf_rcu_lock_held()); +	storage_smap = rcu_dereference_check(local_storage->smap, +					     bpf_rcu_lock_held()); +	bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, selem); +  	raw_spin_lock_irqsave(&local_storage->lock, flags);  	if (likely(selem_linked_to_storage(selem)))  		free_local_storage = bpf_selem_unlink_storage_nolock( -			local_storage, selem, true, use_trace_rcu); +			local_storage, selem, true, reuse_now);  	raw_spin_unlock_irqrestore(&local_storage->lock, flags); -	if (free_local_storage) { -		if (use_trace_rcu) -			call_rcu_tasks_trace(&local_storage->rcu, -				     bpf_local_storage_free_rcu); -		else -			kfree_rcu(local_storage, rcu); -	} +	if (free_local_storage) +		bpf_local_storage_free(local_storage, storage_smap, bpf_ma, reuse_now);  }  void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, @@ 
-202,13 +374,13 @@ void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,  	hlist_add_head_rcu(&selem->snode, &local_storage->list);  } -void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) +static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)  {  	struct bpf_local_storage_map *smap;  	struct bpf_local_storage_map_bucket *b;  	unsigned long flags; -	if (unlikely(!selem_linked_to_map(selem))) +	if (unlikely(!selem_linked_to_map_lockless(selem)))  		/* selem has already be unlinked from smap */  		return; @@ -232,14 +404,14 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap,  	raw_spin_unlock_irqrestore(&b->lock, flags);  } -void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu) +void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now)  {  	/* Always unlink from map before unlinking from local_storage  	 * because selem will be freed after successfully unlinked from  	 * the local_storage.  	 
*/  	bpf_selem_unlink_map(selem); -	__bpf_selem_unlink_storage(selem, use_trace_rcu); +	bpf_selem_unlink_storage(selem, reuse_now);  }  /* If cacheit_lockit is false, this lookup function is lockless */ @@ -312,13 +484,21 @@ int bpf_local_storage_alloc(void *owner,  	if (err)  		return err; -	storage = bpf_map_kzalloc(&smap->map, sizeof(*storage), -				  gfp_flags | __GFP_NOWARN); +	if (smap->bpf_ma) { +		migrate_disable(); +		storage = bpf_mem_cache_alloc_flags(&smap->storage_ma, gfp_flags); +		migrate_enable(); +	} else { +		storage = bpf_map_kzalloc(&smap->map, sizeof(*storage), +					  gfp_flags | __GFP_NOWARN); +	} +  	if (!storage) {  		err = -ENOMEM;  		goto uncharge;  	} +	RCU_INIT_POINTER(storage->smap, smap);  	INIT_HLIST_HEAD(&storage->list);  	raw_spin_lock_init(&storage->lock);  	storage->owner = owner; @@ -358,7 +538,7 @@ int bpf_local_storage_alloc(void *owner,  	return 0;  uncharge: -	kfree(storage); +	bpf_local_storage_free(storage, smap, smap->bpf_ma, true);  	mem_uncharge(smap, owner, sizeof(*storage));  	return err;  } @@ -402,7 +582,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,  		err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags);  		if (err) { -			kfree(selem); +			bpf_selem_free(selem, smap, true);  			mem_uncharge(smap, owner, smap->elem_size);  			return ERR_PTR(err);  		} @@ -420,7 +600,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,  		err = check_flags(old_sdata, map_flags);  		if (err)  			return ERR_PTR(err); -		if (old_sdata && selem_linked_to_storage(SELEM(old_sdata))) { +		if (old_sdata && selem_linked_to_storage_lockless(SELEM(old_sdata))) {  			copy_map_value_locked(&smap->map, old_sdata->data,  					      value, false);  			return old_sdata; @@ -485,7 +665,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,  	if (old_sdata) {  		bpf_selem_unlink_map(SELEM(old_sdata));  		bpf_selem_unlink_storage_nolock(local_storage, 
SELEM(old_sdata), -						false, true); +						false, false);  	}  unlock: @@ -496,7 +676,7 @@ unlock_err:  	raw_spin_unlock_irqrestore(&local_storage->lock, flags);  	if (selem) {  		mem_uncharge(smap, owner, smap->elem_size); -		kfree(selem); +		bpf_selem_free(selem, smap, true);  	}  	return ERR_PTR(err);  } @@ -552,40 +732,6 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr)  	return 0;  } -static struct bpf_local_storage_map *__bpf_local_storage_map_alloc(union bpf_attr *attr) -{ -	struct bpf_local_storage_map *smap; -	unsigned int i; -	u32 nbuckets; - -	smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE); -	if (!smap) -		return ERR_PTR(-ENOMEM); -	bpf_map_init_from_attr(&smap->map, attr); - -	nbuckets = roundup_pow_of_two(num_possible_cpus()); -	/* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */ -	nbuckets = max_t(u32, 2, nbuckets); -	smap->bucket_log = ilog2(nbuckets); - -	smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets), -					 nbuckets, GFP_USER | __GFP_NOWARN); -	if (!smap->buckets) { -		bpf_map_area_free(smap); -		return ERR_PTR(-ENOMEM); -	} - -	for (i = 0; i < nbuckets; i++) { -		INIT_HLIST_HEAD(&smap->buckets[i].list); -		raw_spin_lock_init(&smap->buckets[i].lock); -	} - -	smap->elem_size = offsetof(struct bpf_local_storage_elem, -				   sdata.data[attr->value_size]); - -	return smap; -} -  int bpf_local_storage_map_check_btf(const struct bpf_map *map,  				    const struct btf *btf,  				    const struct btf_type *key_type, @@ -603,11 +749,16 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map,  	return 0;  } -bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage) +void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)  { +	struct bpf_local_storage_map *storage_smap;  	struct bpf_local_storage_elem *selem; -	bool free_storage = false; +	bool bpf_ma, free_storage = false;  	struct hlist_node *n; +	unsigned long flags; + +	storage_smap = 
rcu_dereference_check(local_storage->smap, bpf_rcu_lock_held()); +	bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, NULL);  	/* Neither the bpf_prog nor the bpf_map's syscall  	 * could be modifying the local_storage->list now. @@ -618,6 +769,7 @@ bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage)  	 * when unlinking elem from the local_storage->list and  	 * the map's bucket->list.  	 */ +	raw_spin_lock_irqsave(&local_storage->lock, flags);  	hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {  		/* Always unlink from map before unlinking from  		 * local_storage. @@ -630,24 +782,89 @@ bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage)  		 * of the loop will set the free_cgroup_storage to true.  		 */  		free_storage = bpf_selem_unlink_storage_nolock( -			local_storage, selem, false, false); +			local_storage, selem, false, true);  	} +	raw_spin_unlock_irqrestore(&local_storage->lock, flags); -	return free_storage; +	if (free_storage) +		bpf_local_storage_free(local_storage, storage_smap, bpf_ma, true);  } +u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map) +{ +	struct bpf_local_storage_map *smap = (struct bpf_local_storage_map *)map; +	u64 usage = sizeof(*smap); + +	/* The dynamically callocated selems are not counted currently. */ +	usage += sizeof(*smap->buckets) * (1ULL << smap->bucket_log); +	return usage; +} + +/* When bpf_ma == true, the bpf_mem_alloc is used to allocate and free memory. + * A deadlock free allocator is useful for storage that the bpf prog can easily + * get a hold of the owner PTR_TO_BTF_ID in any context. eg. bpf_get_current_task_btf. + * The task and cgroup storage fall into this case. The bpf_mem_alloc reuses + * memory immediately. To be reuse-immediate safe, the owner destruction + * code path needs to go through a rcu grace period before calling + * bpf_local_storage_destroy(). + * + * When bpf_ma == false, the kmalloc and kfree are used. 
+ */  struct bpf_map *  bpf_local_storage_map_alloc(union bpf_attr *attr, -			    struct bpf_local_storage_cache *cache) +			    struct bpf_local_storage_cache *cache, +			    bool bpf_ma)  {  	struct bpf_local_storage_map *smap; +	unsigned int i; +	u32 nbuckets; +	int err; + +	smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE); +	if (!smap) +		return ERR_PTR(-ENOMEM); +	bpf_map_init_from_attr(&smap->map, attr); + +	nbuckets = roundup_pow_of_two(num_possible_cpus()); +	/* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */ +	nbuckets = max_t(u32, 2, nbuckets); +	smap->bucket_log = ilog2(nbuckets); + +	smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets), +					 nbuckets, GFP_USER | __GFP_NOWARN); +	if (!smap->buckets) { +		err = -ENOMEM; +		goto free_smap; +	} + +	for (i = 0; i < nbuckets; i++) { +		INIT_HLIST_HEAD(&smap->buckets[i].list); +		raw_spin_lock_init(&smap->buckets[i].lock); +	} + +	smap->elem_size = offsetof(struct bpf_local_storage_elem, +				   sdata.data[attr->value_size]); -	smap = __bpf_local_storage_map_alloc(attr); -	if (IS_ERR(smap)) -		return ERR_CAST(smap); +	smap->bpf_ma = bpf_ma; +	if (bpf_ma) { +		err = bpf_mem_alloc_init(&smap->selem_ma, smap->elem_size, false); +		if (err) +			goto free_smap; + +		err = bpf_mem_alloc_init(&smap->storage_ma, sizeof(struct bpf_local_storage), false); +		if (err) { +			bpf_mem_alloc_destroy(&smap->selem_ma); +			goto free_smap; +		} +	}  	smap->cache_idx = bpf_local_storage_cache_idx_get(cache);  	return &smap->map; + +free_smap: +	kvfree(smap->buckets); +	bpf_map_area_free(smap); +	return ERR_PTR(err);  }  void bpf_local_storage_map_free(struct bpf_map *map, @@ -689,7 +906,7 @@ void bpf_local_storage_map_free(struct bpf_map *map,  				migrate_disable();  				this_cpu_inc(*busy_counter);  			} -			bpf_selem_unlink(selem, false); +			bpf_selem_unlink(selem, true);  			if (busy_counter) {  				this_cpu_dec(*busy_counter);  				migrate_enable(); @@ -713,6 +930,10 @@ 
void bpf_local_storage_map_free(struct bpf_map *map,  	 */  	synchronize_rcu(); +	if (smap->bpf_ma) { +		bpf_mem_alloc_destroy(&smap->selem_ma); +		bpf_mem_alloc_destroy(&smap->storage_ma); +	}  	kvfree(smap->buckets);  	bpf_map_area_free(smap);  } diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index ece9870cab68..d3f0a4825fa6 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -11,11 +11,13 @@  #include <linux/refcount.h>  #include <linux/mutex.h>  #include <linux/btf_ids.h> +#include <linux/rcupdate_wait.h>  enum bpf_struct_ops_state {  	BPF_STRUCT_OPS_STATE_INIT,  	BPF_STRUCT_OPS_STATE_INUSE,  	BPF_STRUCT_OPS_STATE_TOBEFREE, +	BPF_STRUCT_OPS_STATE_READY,  };  #define BPF_STRUCT_OPS_COMMON_VALUE			\ @@ -58,6 +60,13 @@ struct bpf_struct_ops_map {  	struct bpf_struct_ops_value kvalue;  }; +struct bpf_struct_ops_link { +	struct bpf_link link; +	struct bpf_map __rcu *map; +}; + +static DEFINE_MUTEX(update_mutex); +  #define VALUE_PREFIX "bpf_struct_ops_"  #define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1) @@ -249,6 +258,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,  	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;  	struct bpf_struct_ops_value *uvalue, *kvalue;  	enum bpf_struct_ops_state state; +	s64 refcnt;  	if (unlikely(*(u32 *)key != 0))  		return -ENOENT; @@ -267,7 +277,14 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,  	uvalue = value;  	memcpy(uvalue, st_map->uvalue, map->value_size);  	uvalue->state = state; -	refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt)); + +	/* This value offers the user space a general estimate of how +	 * many sockets are still utilizing this struct_ops for TCP +	 * congestion control. The number might not be exact, but it +	 * should sufficiently meet our present goals. 
+	 */ +	refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt); +	refcount_set(&uvalue->refcnt, max_t(s64, refcnt, 0));  	return 0;  } @@ -349,8 +366,8 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,  					   model, flags, tlinks, NULL);  } -static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, -					  void *value, u64 flags) +static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, +					   void *value, u64 flags)  {  	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;  	const struct bpf_struct_ops *st_ops = st_map->st_ops; @@ -491,12 +508,29 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,  		*(unsigned long *)(udata + moff) = prog->aux->id;  	} -	refcount_set(&kvalue->refcnt, 1); -	bpf_map_inc(map); +	if (st_map->map.map_flags & BPF_F_LINK) { +		err = st_ops->validate(kdata); +		if (err) +			goto reset_unlock; +		set_memory_rox((long)st_map->image, 1); +		/* Let bpf_link handle registration & unregistration. +		 * +		 * Pair with smp_load_acquire() during lookup_elem(). +		 */ +		smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_READY); +		goto unlock; +	}  	set_memory_rox((long)st_map->image, 1);  	err = st_ops->reg(kdata);  	if (likely(!err)) { +		/* This refcnt increment on the map here after +		 * 'st_ops->reg()' is secure since the state of the +		 * map must be set to INIT at this moment, and thus +		 * bpf_struct_ops_map_delete_elem() can't unregister +		 * or transition it to TOBEFREE concurrently. +		 */ +		bpf_map_inc(map);  		/* Pair with smp_load_acquire() during lookup_elem().  		 * It ensures the above udata updates (e.g. prog->aux->id)  		 * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set. 
@@ -512,7 +546,6 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,  	 */  	set_memory_nx((long)st_map->image, 1);  	set_memory_rw((long)st_map->image, 1); -	bpf_map_put(map);  reset_unlock:  	bpf_struct_ops_map_put_progs(st_map); @@ -524,20 +557,22 @@ unlock:  	return err;  } -static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key) +static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)  {  	enum bpf_struct_ops_state prev_state;  	struct bpf_struct_ops_map *st_map;  	st_map = (struct bpf_struct_ops_map *)map; +	if (st_map->map.map_flags & BPF_F_LINK) +		return -EOPNOTSUPP; +  	prev_state = cmpxchg(&st_map->kvalue.state,  			     BPF_STRUCT_OPS_STATE_INUSE,  			     BPF_STRUCT_OPS_STATE_TOBEFREE);  	switch (prev_state) {  	case BPF_STRUCT_OPS_STATE_INUSE:  		st_map->st_ops->unreg(&st_map->kvalue.data); -		if (refcount_dec_and_test(&st_map->kvalue.refcnt)) -			bpf_map_put(map); +		bpf_map_put(map);  		return 0;  	case BPF_STRUCT_OPS_STATE_TOBEFREE:  		return -EINPROGRESS; @@ -570,7 +605,7 @@ static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,  	kfree(value);  } -static void bpf_struct_ops_map_free(struct bpf_map *map) +static void __bpf_struct_ops_map_free(struct bpf_map *map)  {  	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; @@ -582,10 +617,32 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)  	bpf_map_area_free(st_map);  } +static void bpf_struct_ops_map_free(struct bpf_map *map) +{ +	/* The struct_ops's function may switch to another struct_ops. +	 * +	 * For example, bpf_tcp_cc_x->init() may switch to +	 * another tcp_cc_y by calling +	 * setsockopt(TCP_CONGESTION, "tcp_cc_y"). +	 * During the switch,  bpf_struct_ops_put(tcp_cc_x) is called +	 * and its refcount may reach 0 which then free its +	 * trampoline image while tcp_cc_x is still running. +	 * +	 * A vanilla rcu gp is to wait for all bpf-tcp-cc prog +	 * to finish. 
bpf-tcp-cc prog is non sleepable. +	 * A rcu_tasks gp is to wait for the last few insn +	 * in the tramopline image to finish before releasing +	 * the trampoline image. +	 */ +	synchronize_rcu_mult(call_rcu, call_rcu_tasks); + +	__bpf_struct_ops_map_free(map); +} +  static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)  {  	if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 || -	    attr->map_flags || !attr->btf_vmlinux_value_type_id) +	    (attr->map_flags & ~BPF_F_LINK) || !attr->btf_vmlinux_value_type_id)  		return -EINVAL;  	return 0;  } @@ -609,6 +666,9 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)  	if (attr->value_size != vt->size)  		return ERR_PTR(-EINVAL); +	if (attr->map_flags & BPF_F_LINK && (!st_ops->validate || !st_ops->update)) +		return ERR_PTR(-EOPNOTSUPP); +  	t = st_ops->type;  	st_map_size = sizeof(*st_map) + @@ -630,7 +690,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)  				   NUMA_NO_NODE);  	st_map->image = bpf_jit_alloc_exec(PAGE_SIZE);  	if (!st_map->uvalue || !st_map->links || !st_map->image) { -		bpf_struct_ops_map_free(map); +		__bpf_struct_ops_map_free(map);  		return ERR_PTR(-ENOMEM);  	} @@ -641,6 +701,21 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)  	return map;  } +static u64 bpf_struct_ops_map_mem_usage(const struct bpf_map *map) +{ +	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; +	const struct bpf_struct_ops *st_ops = st_map->st_ops; +	const struct btf_type *vt = st_ops->value_type; +	u64 usage; + +	usage = sizeof(*st_map) + +			vt->size - sizeof(struct bpf_struct_ops_value); +	usage += vt->size; +	usage += btf_type_vlen(vt) * sizeof(struct bpf_links *); +	usage += PAGE_SIZE; +	return usage; +} +  BTF_ID_LIST_SINGLE(bpf_struct_ops_map_btf_ids, struct, bpf_struct_ops_map)  const struct bpf_map_ops bpf_struct_ops_map_ops = {  	.map_alloc_check = bpf_struct_ops_map_alloc_check, @@ -651,6 +726,7 @@ 
const struct bpf_map_ops bpf_struct_ops_map_ops = {  	.map_delete_elem = bpf_struct_ops_map_delete_elem,  	.map_update_elem = bpf_struct_ops_map_update_elem,  	.map_seq_show_elem = bpf_struct_ops_map_seq_show_elem, +	.map_mem_usage = bpf_struct_ops_map_mem_usage,  	.map_btf_id = &bpf_struct_ops_map_btf_ids[0],  }; @@ -660,41 +736,175 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {  bool bpf_struct_ops_get(const void *kdata)  {  	struct bpf_struct_ops_value *kvalue; +	struct bpf_struct_ops_map *st_map; +	struct bpf_map *map;  	kvalue = container_of(kdata, struct bpf_struct_ops_value, data); +	st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue); -	return refcount_inc_not_zero(&kvalue->refcnt); +	map = __bpf_map_inc_not_zero(&st_map->map, false); +	return !IS_ERR(map);  } -static void bpf_struct_ops_put_rcu(struct rcu_head *head) +void bpf_struct_ops_put(const void *kdata)  { +	struct bpf_struct_ops_value *kvalue;  	struct bpf_struct_ops_map *st_map; -	st_map = container_of(head, struct bpf_struct_ops_map, rcu); +	kvalue = container_of(kdata, struct bpf_struct_ops_value, data); +	st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue); +  	bpf_map_put(&st_map->map);  } -void bpf_struct_ops_put(const void *kdata) +static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)  { -	struct bpf_struct_ops_value *kvalue; +	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; -	kvalue = container_of(kdata, struct bpf_struct_ops_value, data); -	if (refcount_dec_and_test(&kvalue->refcnt)) { -		struct bpf_struct_ops_map *st_map; +	return map->map_type == BPF_MAP_TYPE_STRUCT_OPS && +		map->map_flags & BPF_F_LINK && +		/* Pair with smp_store_release() during map_update */ +		smp_load_acquire(&st_map->kvalue.state) == BPF_STRUCT_OPS_STATE_READY; +} -		st_map = container_of(kvalue, struct bpf_struct_ops_map, -				      kvalue); -		/* The struct_ops's function may switch to another struct_ops. 
-		 * -		 * For example, bpf_tcp_cc_x->init() may switch to -		 * another tcp_cc_y by calling -		 * setsockopt(TCP_CONGESTION, "tcp_cc_y"). -		 * During the switch,  bpf_struct_ops_put(tcp_cc_x) is called -		 * and its map->refcnt may reach 0 which then free its -		 * trampoline image while tcp_cc_x is still running. -		 * -		 * Thus, a rcu grace period is needed here. +static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link) +{ +	struct bpf_struct_ops_link *st_link; +	struct bpf_struct_ops_map *st_map; + +	st_link = container_of(link, struct bpf_struct_ops_link, link); +	st_map = (struct bpf_struct_ops_map *) +		rcu_dereference_protected(st_link->map, true); +	if (st_map) { +		/* st_link->map can be NULL if +		 * bpf_struct_ops_link_create() fails to register.  		 */ -		call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu); +		st_map->st_ops->unreg(&st_map->kvalue.data); +		bpf_map_put(&st_map->map);  	} +	kfree(st_link); +} + +static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link, +					    struct seq_file *seq) +{ +	struct bpf_struct_ops_link *st_link; +	struct bpf_map *map; + +	st_link = container_of(link, struct bpf_struct_ops_link, link); +	rcu_read_lock(); +	map = rcu_dereference(st_link->map); +	seq_printf(seq, "map_id:\t%d\n", map->id); +	rcu_read_unlock();  } + +static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link, +					       struct bpf_link_info *info) +{ +	struct bpf_struct_ops_link *st_link; +	struct bpf_map *map; + +	st_link = container_of(link, struct bpf_struct_ops_link, link); +	rcu_read_lock(); +	map = rcu_dereference(st_link->map); +	info->struct_ops.map_id = map->id; +	rcu_read_unlock(); +	return 0; +} + +static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map *new_map, +					  struct bpf_map *expected_old_map) +{ +	struct bpf_struct_ops_map *st_map, *old_st_map; +	struct bpf_map *old_map; +	struct bpf_struct_ops_link *st_link; +	int err = 0; + +	st_link = 
container_of(link, struct bpf_struct_ops_link, link); +	st_map = container_of(new_map, struct bpf_struct_ops_map, map); + +	if (!bpf_struct_ops_valid_to_reg(new_map)) +		return -EINVAL; + +	mutex_lock(&update_mutex); + +	old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex)); +	if (expected_old_map && old_map != expected_old_map) { +		err = -EPERM; +		goto err_out; +	} + +	old_st_map = container_of(old_map, struct bpf_struct_ops_map, map); +	/* The new and old struct_ops must be the same type. */ +	if (st_map->st_ops != old_st_map->st_ops) { +		err = -EINVAL; +		goto err_out; +	} + +	err = st_map->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data); +	if (err) +		goto err_out; + +	bpf_map_inc(new_map); +	rcu_assign_pointer(st_link->map, new_map); +	bpf_map_put(old_map); + +err_out: +	mutex_unlock(&update_mutex); + +	return err; +} + +static const struct bpf_link_ops bpf_struct_ops_map_lops = { +	.dealloc = bpf_struct_ops_map_link_dealloc, +	.show_fdinfo = bpf_struct_ops_map_link_show_fdinfo, +	.fill_link_info = bpf_struct_ops_map_link_fill_link_info, +	.update_map = bpf_struct_ops_map_link_update, +}; + +int bpf_struct_ops_link_create(union bpf_attr *attr) +{ +	struct bpf_struct_ops_link *link = NULL; +	struct bpf_link_primer link_primer; +	struct bpf_struct_ops_map *st_map; +	struct bpf_map *map; +	int err; + +	map = bpf_map_get(attr->link_create.map_fd); +	if (IS_ERR(map)) +		return PTR_ERR(map); + +	st_map = (struct bpf_struct_ops_map *)map; + +	if (!bpf_struct_ops_valid_to_reg(map)) { +		err = -EINVAL; +		goto err_out; +	} + +	link = kzalloc(sizeof(*link), GFP_USER); +	if (!link) { +		err = -ENOMEM; +		goto err_out; +	} +	bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL); + +	err = bpf_link_prime(&link->link, &link_primer); +	if (err) +		goto err_out; + +	err = st_map->st_ops->reg(st_map->kvalue.data); +	if (err) { +		bpf_link_cleanup(&link_primer); +		link = NULL; +		goto err_out; +	} +	
RCU_INIT_POINTER(link->map, map); + +	return bpf_link_settle(&link_primer); + +err_out: +	bpf_map_put(map); +	kfree(link); +	return err; +} + diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 1e486055a523..adf6dfe0ba68 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -72,8 +72,6 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map,  void bpf_task_storage_free(struct task_struct *task)  {  	struct bpf_local_storage *local_storage; -	bool free_task_storage = false; -	unsigned long flags;  	rcu_read_lock(); @@ -84,14 +82,9 @@ void bpf_task_storage_free(struct task_struct *task)  	}  	bpf_task_storage_lock(); -	raw_spin_lock_irqsave(&local_storage->lock, flags); -	free_task_storage = bpf_local_storage_unlink_nolock(local_storage); -	raw_spin_unlock_irqrestore(&local_storage->lock, flags); +	bpf_local_storage_destroy(local_storage);  	bpf_task_storage_unlock();  	rcu_read_unlock(); - -	if (free_task_storage) -		kfree_rcu(local_storage, rcu);  }  static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key) @@ -127,8 +120,8 @@ out:  	return ERR_PTR(err);  } -static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, -					    void *value, u64 map_flags) +static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, +					     void *value, u64 map_flags)  {  	struct bpf_local_storage_data *sdata;  	struct task_struct *task; @@ -175,12 +168,12 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map,  	if (!nobusy)  		return -EBUSY; -	bpf_selem_unlink(SELEM(sdata), true); +	bpf_selem_unlink(SELEM(sdata), false);  	return 0;  } -static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key) +static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)  {  	struct task_struct *task;  	unsigned int f_flags; @@ -316,7 +309,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void 
*next_key)  static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr)  { -	return bpf_local_storage_map_alloc(attr, &task_cache); +	return bpf_local_storage_map_alloc(attr, &task_cache, true);  }  static void task_storage_map_free(struct bpf_map *map) @@ -335,6 +328,7 @@ const struct bpf_map_ops task_storage_map_ops = {  	.map_update_elem = bpf_pid_task_storage_update_elem,  	.map_delete_elem = bpf_pid_task_storage_delete_elem,  	.map_check_btf = bpf_local_storage_map_check_btf, +	.map_mem_usage = bpf_local_storage_map_mem_usage,  	.map_btf_id = &bpf_local_storage_map_btf_id[0],  	.map_owner_storage_ptr = task_storage_ptr,  }; @@ -344,7 +338,7 @@ const struct bpf_func_proto bpf_task_storage_get_recur_proto = {  	.gpl_only = false,  	.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,  	.arg1_type = ARG_CONST_MAP_PTR, -	.arg2_type = ARG_PTR_TO_BTF_ID, +	.arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,  	.arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],  	.arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,  	.arg4_type = ARG_ANYTHING, @@ -355,7 +349,7 @@ const struct bpf_func_proto bpf_task_storage_get_proto = {  	.gpl_only = false,  	.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,  	.arg1_type = ARG_CONST_MAP_PTR, -	.arg2_type = ARG_PTR_TO_BTF_ID, +	.arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,  	.arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],  	.arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,  	.arg4_type = ARG_ANYTHING, @@ -366,7 +360,7 @@ const struct bpf_func_proto bpf_task_storage_delete_recur_proto = {  	.gpl_only = false,  	.ret_type = RET_INTEGER,  	.arg1_type = ARG_CONST_MAP_PTR, -	.arg2_type = ARG_PTR_TO_BTF_ID, +	.arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,  	.arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],  }; @@ -375,6 +369,6 @@ const struct bpf_func_proto bpf_task_storage_delete_proto = {  	.gpl_only = false,  	.ret_type = RET_INTEGER,  	.arg1_type = ARG_CONST_MAP_PTR, -	.arg2_type = ARG_PTR_TO_BTF_ID, +	.arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,  	.arg2_btf_id = 
&btf_tracing_ids[BTF_TRACING_TYPE_TASK],  }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 73780748404c..6b682b8e4b50 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -25,6 +25,9 @@  #include <linux/bsearch.h>  #include <linux/kobject.h>  #include <linux/sysfs.h> + +#include <net/netfilter/nf_bpf_link.h> +  #include <net/sock.h>  #include "../tools/lib/bpf/relo_core.h" @@ -207,6 +210,12 @@ enum btf_kfunc_hook {  	BTF_KFUNC_HOOK_TRACING,  	BTF_KFUNC_HOOK_SYSCALL,  	BTF_KFUNC_HOOK_FMODRET, +	BTF_KFUNC_HOOK_CGROUP_SKB, +	BTF_KFUNC_HOOK_SCHED_ACT, +	BTF_KFUNC_HOOK_SK_SKB, +	BTF_KFUNC_HOOK_SOCKET_FILTER, +	BTF_KFUNC_HOOK_LWT, +	BTF_KFUNC_HOOK_NETFILTER,  	BTF_KFUNC_HOOK_MAX,  }; @@ -572,8 +581,8 @@ static s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p)  			*btf_p = btf;  			return ret;  		} -		spin_lock_bh(&btf_idr_lock);  		btf_put(btf); +		spin_lock_bh(&btf_idr_lock);  	}  	spin_unlock_bh(&btf_idr_lock);  	return ret; @@ -1661,10 +1670,8 @@ static void btf_struct_metas_free(struct btf_struct_metas *tab)  	if (!tab)  		return; -	for (i = 0; i < tab->cnt; i++) { +	for (i = 0; i < tab->cnt; i++)  		btf_record_free(tab->types[i].record); -		kfree(tab->types[i].field_offs); -	}  	kfree(tab);  } @@ -3226,12 +3233,6 @@ static void btf_struct_log(struct btf_verifier_env *env,  	btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));  } -enum btf_field_info_type { -	BTF_FIELD_SPIN_LOCK, -	BTF_FIELD_TIMER, -	BTF_FIELD_KPTR, -}; -  enum {  	BTF_FIELD_IGNORE = 0,  	BTF_FIELD_FOUND  = 1, @@ -3283,9 +3284,9 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,  	/* Reject extra tags */  	if (btf_type_is_type_tag(btf_type_by_id(btf, t->type)))  		return -EINVAL; -	if (!strcmp("kptr", __btf_name_by_offset(btf, t->name_off))) +	if (!strcmp("kptr_untrusted", __btf_name_by_offset(btf, t->name_off)))  		type = BPF_KPTR_UNREF; -	else if (!strcmp("kptr_ref", __btf_name_by_offset(btf, t->name_off))) +	else if 
(!strcmp("kptr", __btf_name_by_offset(btf, t->name_off)))  		type = BPF_KPTR_REF;  	else  		return -EINVAL; @@ -3394,6 +3395,7 @@ static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask,  	field_mask_test_name(BPF_LIST_NODE, "bpf_list_node");  	field_mask_test_name(BPF_RB_ROOT,   "bpf_rb_root");  	field_mask_test_name(BPF_RB_NODE,   "bpf_rb_node"); +	field_mask_test_name(BPF_REFCOUNT,  "bpf_refcount");  	/* Only return BPF_KPTR when all other types with matchable names fail */  	if (field_mask & BPF_KPTR) { @@ -3442,6 +3444,7 @@ static int btf_find_struct_field(const struct btf *btf,  		case BPF_TIMER:  		case BPF_LIST_NODE:  		case BPF_RB_NODE: +		case BPF_REFCOUNT:  			ret = btf_find_struct(btf, member_type, off, sz, field_type,  					      idx < info_cnt ? &info[idx] : &tmp);  			if (ret < 0) @@ -3507,6 +3510,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,  		case BPF_TIMER:  		case BPF_LIST_NODE:  		case BPF_RB_NODE: +		case BPF_REFCOUNT:  			ret = btf_find_struct(btf, var_type, off, sz, field_type,  					      idx < info_cnt ? &info[idx] : &tmp);  			if (ret < 0) @@ -3557,7 +3561,10 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,  {  	struct module *mod = NULL;  	const struct btf_type *t; -	struct btf *kernel_btf; +	/* If a matching btf type is found in kernel or module BTFs, kptr_ref +	 * is that BTF, otherwise it's program BTF +	 */ +	struct btf *kptr_btf;  	int ret;  	s32 id; @@ -3566,7 +3573,20 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,  	 */  	t = btf_type_by_id(btf, info->kptr.type_id);  	id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info), -			     &kernel_btf); +			     &kptr_btf); +	if (id == -ENOENT) { +		/* btf_parse_kptr should only be called w/ btf = program BTF */ +		WARN_ON_ONCE(btf_is_kernel(btf)); + +		/* Type exists only in program BTF. 
Assume that it's a MEM_ALLOC +		 * kptr allocated via bpf_obj_new +		 */ +		field->kptr.dtor = NULL; +		id = info->kptr.type_id; +		kptr_btf = (struct btf *)btf; +		btf_get(kptr_btf); +		goto found_dtor; +	}  	if (id < 0)  		return id; @@ -3583,20 +3603,20 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,  		 * can be used as a referenced pointer and be stored in a map at  		 * the same time.  		 */ -		dtor_btf_id = btf_find_dtor_kfunc(kernel_btf, id); +		dtor_btf_id = btf_find_dtor_kfunc(kptr_btf, id);  		if (dtor_btf_id < 0) {  			ret = dtor_btf_id;  			goto end_btf;  		} -		dtor_func = btf_type_by_id(kernel_btf, dtor_btf_id); +		dtor_func = btf_type_by_id(kptr_btf, dtor_btf_id);  		if (!dtor_func) {  			ret = -ENOENT;  			goto end_btf;  		} -		if (btf_is_module(kernel_btf)) { -			mod = btf_try_get_module(kernel_btf); +		if (btf_is_module(kptr_btf)) { +			mod = btf_try_get_module(kptr_btf);  			if (!mod) {  				ret = -ENXIO;  				goto end_btf; @@ -3606,7 +3626,7 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,  		/* We already verified dtor_func to be btf_type_is_func  		 * in register_btf_id_dtor_kfuncs.  		 
*/ -		dtor_func_name = __btf_name_by_offset(kernel_btf, dtor_func->name_off); +		dtor_func_name = __btf_name_by_offset(kptr_btf, dtor_func->name_off);  		addr = kallsyms_lookup_name(dtor_func_name);  		if (!addr) {  			ret = -EINVAL; @@ -3615,14 +3635,15 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,  		field->kptr.dtor = (void *)addr;  	} +found_dtor:  	field->kptr.btf_id = id; -	field->kptr.btf = kernel_btf; +	field->kptr.btf = kptr_btf;  	field->kptr.module = mod;  	return 0;  end_mod:  	module_put(mod);  end_btf: -	btf_put(kernel_btf); +	btf_put(kptr_btf);  	return ret;  } @@ -3684,12 +3705,24 @@ static int btf_parse_rb_root(const struct btf *btf, struct btf_field *field,  					    __alignof__(struct bpf_rb_node));  } +static int btf_field_cmp(const void *_a, const void *_b, const void *priv) +{ +	const struct btf_field *a = (const struct btf_field *)_a; +	const struct btf_field *b = (const struct btf_field *)_b; + +	if (a->offset < b->offset) +		return -1; +	else if (a->offset > b->offset) +		return 1; +	return 0; +} +  struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t,  				    u32 field_mask, u32 value_size)  {  	struct btf_field_info info_arr[BTF_FIELDS_MAX]; +	u32 next_off = 0, field_type_size;  	struct btf_record *rec; -	u32 next_off = 0;  	int ret, i, cnt;  	ret = btf_find_field(btf, t, field_mask, info_arr, ARRAY_SIZE(info_arr)); @@ -3708,8 +3741,10 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type  	rec->spin_lock_off = -EINVAL;  	rec->timer_off = -EINVAL; +	rec->refcount_off = -EINVAL;  	for (i = 0; i < cnt; i++) { -		if (info_arr[i].off + btf_field_type_size(info_arr[i].type) > value_size) { +		field_type_size = btf_field_type_size(info_arr[i].type); +		if (info_arr[i].off + field_type_size > value_size) {  			WARN_ONCE(1, "verifier bug off %d size %d", info_arr[i].off, value_size);  			ret = -EFAULT;  			goto end; @@ -3718,11 +3753,12 @@ struct 
btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type  			ret = -EEXIST;  			goto end;  		} -		next_off = info_arr[i].off + btf_field_type_size(info_arr[i].type); +		next_off = info_arr[i].off + field_type_size;  		rec->field_mask |= info_arr[i].type;  		rec->fields[i].offset = info_arr[i].off;  		rec->fields[i].type = info_arr[i].type; +		rec->fields[i].size = field_type_size;  		switch (info_arr[i].type) {  		case BPF_SPIN_LOCK: @@ -3735,6 +3771,11 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type  			/* Cache offset for faster lookup at runtime */  			rec->timer_off = rec->fields[i].offset;  			break; +		case BPF_REFCOUNT: +			WARN_ON_ONCE(rec->refcount_off >= 0); +			/* Cache offset for faster lookup at runtime */ +			rec->refcount_off = rec->fields[i].offset; +			break;  		case BPF_KPTR_UNREF:  		case BPF_KPTR_REF:  			ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]); @@ -3768,30 +3809,16 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type  		goto end;  	} -	/* need collection identity for non-owning refs before allowing this -	 * -	 * Consider a node type w/ both list and rb_node fields: -	 *   struct node { -	 *     struct bpf_list_node l; -	 *     struct bpf_rb_node r; -	 *   } -	 * -	 * Used like so: -	 *   struct node *n = bpf_obj_new(....); -	 *   bpf_list_push_front(&list_head, &n->l); -	 *   bpf_rbtree_remove(&rb_root, &n->r); -	 * -	 * It should not be possible to rbtree_remove the node since it hasn't -	 * been added to a tree. But push_front converts n to a non-owning -	 * reference, and rbtree_remove accepts the non-owning reference to -	 * a type w/ bpf_rb_node field. 
-	 */ -	if (btf_record_has_field(rec, BPF_LIST_NODE) && +	if (rec->refcount_off < 0 && +	    btf_record_has_field(rec, BPF_LIST_NODE) &&  	    btf_record_has_field(rec, BPF_RB_NODE)) {  		ret = -EINVAL;  		goto end;  	} +	sort_r(rec->fields, rec->cnt, sizeof(struct btf_field), btf_field_cmp, +	       NULL, rec); +  	return rec;  end:  	btf_record_free(rec); @@ -3873,61 +3900,6 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)  	return 0;  } -static int btf_field_offs_cmp(const void *_a, const void *_b, const void *priv) -{ -	const u32 a = *(const u32 *)_a; -	const u32 b = *(const u32 *)_b; - -	if (a < b) -		return -1; -	else if (a > b) -		return 1; -	return 0; -} - -static void btf_field_offs_swap(void *_a, void *_b, int size, const void *priv) -{ -	struct btf_field_offs *foffs = (void *)priv; -	u32 *off_base = foffs->field_off; -	u32 *a = _a, *b = _b; -	u8 *sz_a, *sz_b; - -	sz_a = foffs->field_sz + (a - off_base); -	sz_b = foffs->field_sz + (b - off_base); - -	swap(*a, *b); -	swap(*sz_a, *sz_b); -} - -struct btf_field_offs *btf_parse_field_offs(struct btf_record *rec) -{ -	struct btf_field_offs *foffs; -	u32 i, *off; -	u8 *sz; - -	BUILD_BUG_ON(ARRAY_SIZE(foffs->field_off) != ARRAY_SIZE(foffs->field_sz)); -	if (IS_ERR_OR_NULL(rec)) -		return NULL; - -	foffs = kzalloc(sizeof(*foffs), GFP_KERNEL | __GFP_NOWARN); -	if (!foffs) -		return ERR_PTR(-ENOMEM); - -	off = foffs->field_off; -	sz = foffs->field_sz; -	for (i = 0; i < rec->cnt; i++) { -		off[i] = rec->fields[i].offset; -		sz[i] = btf_field_type_size(rec->fields[i].type); -	} -	foffs->cnt = rec->cnt; - -	if (foffs->cnt == 1) -		return foffs; -	sort_r(foffs->field_off, foffs->cnt, sizeof(foffs->field_off[0]), -	       btf_field_offs_cmp, btf_field_offs_swap, foffs); -	return foffs; -} -  static void __btf_struct_show(const struct btf *btf, const struct btf_type *t,  			      u32 type_id, void *data, u8 bits_offset,  			      struct btf_show *show) @@ -5332,6 +5304,7 @@ static const 
char *alloc_obj_fields[] = {  	"bpf_list_node",  	"bpf_rb_root",  	"bpf_rb_node", +	"bpf_refcount",  };  static struct btf_struct_metas * @@ -5370,7 +5343,6 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)  	for (i = 1; i < n; i++) {  		struct btf_struct_metas *new_tab;  		const struct btf_member *member; -		struct btf_field_offs *foffs;  		struct btf_struct_meta *type;  		struct btf_record *record;  		const struct btf_type *t; @@ -5406,23 +5378,13 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)  		type = &tab->types[tab->cnt];  		type->btf_id = i;  		record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE | -						  BPF_RB_ROOT | BPF_RB_NODE, t->size); +						  BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT, t->size);  		/* The record cannot be unset, treat it as an error if so */  		if (IS_ERR_OR_NULL(record)) {  			ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT;  			goto free;  		} -		foffs = btf_parse_field_offs(record); -		/* We need the field_offs to be valid for a valid record, -		 * either both should be set or both should be unset. 
-		 */ -		if (IS_ERR_OR_NULL(foffs)) { -			btf_record_free(record); -			ret = -EFAULT; -			goto free; -		}  		type->record = record; -		type->field_offs = foffs;  		tab->cnt++;  	}  	return tab; @@ -5489,38 +5451,45 @@ static int btf_check_type_tags(struct btf_verifier_env *env,  	return 0;  } -static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size, -			     u32 log_level, char __user *log_ubuf, u32 log_size) +static int finalize_log(struct bpf_verifier_log *log, bpfptr_t uattr, u32 uattr_size) +{ +	u32 log_true_size; +	int err; + +	err = bpf_vlog_finalize(log, &log_true_size); + +	if (uattr_size >= offsetofend(union bpf_attr, btf_log_true_size) && +	    copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, btf_log_true_size), +				  &log_true_size, sizeof(log_true_size))) +		err = -EFAULT; + +	return err; +} + +static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)  { +	bpfptr_t btf_data = make_bpfptr(attr->btf, uattr.is_kernel); +	char __user *log_ubuf = u64_to_user_ptr(attr->btf_log_buf);  	struct btf_struct_metas *struct_meta_tab;  	struct btf_verifier_env *env = NULL; -	struct bpf_verifier_log *log;  	struct btf *btf = NULL;  	u8 *data; -	int err; +	int err, ret; -	if (btf_data_size > BTF_MAX_SIZE) +	if (attr->btf_size > BTF_MAX_SIZE)  		return ERR_PTR(-E2BIG);  	env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);  	if (!env)  		return ERR_PTR(-ENOMEM); -	log = &env->log; -	if (log_level || log_ubuf || log_size) { -		/* user requested verbose verifier output -		 * and supplied buffer to store the verification trace -		 */ -		log->level = log_level; -		log->ubuf = log_ubuf; -		log->len_total = log_size; - -		/* log attributes have to be sane */ -		if (!bpf_verifier_log_attr_valid(log)) { -			err = -EINVAL; -			goto errout; -		} -	} +	/* user could have requested verbose verifier output +	 * and supplied buffer to store the verification trace +	 */ +	err = bpf_vlog_init(&env->log, attr->btf_log_level, +			    
log_ubuf, attr->btf_log_size); +	if (err) +		goto errout_free;  	btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);  	if (!btf) { @@ -5529,16 +5498,16 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,  	}  	env->btf = btf; -	data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN); +	data = kvmalloc(attr->btf_size, GFP_KERNEL | __GFP_NOWARN);  	if (!data) {  		err = -ENOMEM;  		goto errout;  	}  	btf->data = data; -	btf->data_size = btf_data_size; +	btf->data_size = attr->btf_size; -	if (copy_from_bpfptr(data, btf_data, btf_data_size)) { +	if (copy_from_bpfptr(data, btf_data, attr->btf_size)) {  		err = -EFAULT;  		goto errout;  	} @@ -5561,7 +5530,7 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,  	if (err)  		goto errout; -	struct_meta_tab = btf_parse_struct_metas(log, btf); +	struct_meta_tab = btf_parse_struct_metas(&env->log, btf);  	if (IS_ERR(struct_meta_tab)) {  		err = PTR_ERR(struct_meta_tab);  		goto errout; @@ -5578,10 +5547,9 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,  		}  	} -	if (log->level && bpf_verifier_log_full(log)) { -		err = -ENOSPC; -		goto errout_meta; -	} +	err = finalize_log(&env->log, uattr, uattr_size); +	if (err) +		goto errout_free;  	btf_verifier_env_free(env);  	refcount_set(&btf->refcnt, 1); @@ -5590,6 +5558,11 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,  errout_meta:  	btf_free_struct_meta_tab(btf);  errout: +	/* overwrite err with -ENOSPC or -EFAULT */ +	ret = finalize_log(&env->log, uattr, uattr_size); +	if (ret) +		err = ret; +errout_free:  	btf_verifier_env_free(env);  	if (btf)  		btf_free(btf); @@ -5684,6 +5657,10 @@ again:  	 * int socket_filter_bpf_prog(struct __sk_buff *skb)  	 * { // no fields of skb are ever used }  	 */ +	if (strcmp(ctx_tname, "__sk_buff") == 0 && strcmp(tname, "sk_buff") == 0) +		return ctx_type; +	if (strcmp(ctx_tname, "xdp_md") == 0 && strcmp(tname, "xdp_buff") == 0) +		return ctx_type;  	if 
(strcmp(ctx_tname, tname)) {  		/* bpf_user_pt_regs_t is a typedef, so resolve it to  		 * underlying struct and check name again @@ -5891,12 +5868,8 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog)  static bool is_int_ptr(struct btf *btf, const struct btf_type *t)  { -	/* t comes in already as a pointer */ -	t = btf_type_by_id(btf, t->type); - -	/* allow const */ -	if (BTF_INFO_KIND(t->info) == BTF_KIND_CONST) -		t = btf_type_by_id(btf, t->type); +	/* skip modifiers */ +	t = btf_type_skip_modifiers(btf, t->type, NULL);  	return btf_type_is_int(t);  } @@ -6147,7 +6120,8 @@ enum bpf_struct_walk_result {  static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,  			   const struct btf_type *t, int off, int size, -			   u32 *next_btf_id, enum bpf_type_flag *flag) +			   u32 *next_btf_id, enum bpf_type_flag *flag, +			   const char **field_name)  {  	u32 i, moff, mtrue_end, msize = 0, total_nelems = 0;  	const struct btf_type *mtype, *elem_type = NULL; @@ -6155,6 +6129,7 @@ static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,  	const char *tname, *mname, *tag_value;  	u32 vlen, elem_id, mid; +	*flag = 0;  again:  	tname = __btf_name_by_offset(btf, t->name_off);  	if (!btf_type_is_struct(t)) { @@ -6186,11 +6161,13 @@ again:  		if (off < moff)  			goto error; -		/* Only allow structure for now, can be relaxed for -		 * other types later. 
-		 */ +		/* allow structure and integer */  		t = btf_type_skip_modifiers(btf, array_elem->type,  					    NULL); + +		if (btf_type_is_int(t)) +			return WALK_SCALAR; +  		if (!btf_type_is_struct(t))  			goto error; @@ -6321,6 +6298,15 @@ error:  		 * of this field or inside of this struct  		 */  		if (btf_type_is_struct(mtype)) { +			if (BTF_INFO_KIND(mtype->info) == BTF_KIND_UNION && +			    btf_type_vlen(mtype) != 1) +				/* +				 * walking unions yields untrusted pointers +				 * with exception of __bpf_md_ptr and other +				 * unions with a single member +				 */ +				*flag |= PTR_UNTRUSTED; +  			/* our field must be inside that union or struct */  			t = mtype; @@ -6365,7 +6351,9 @@ error:  			stype = btf_type_skip_modifiers(btf, mtype->type, &id);  			if (btf_type_is_struct(stype)) {  				*next_btf_id = id; -				*flag = tmp_flag; +				*flag |= tmp_flag; +				if (field_name) +					*field_name = mname;  				return WALK_PTR;  			}  		} @@ -6392,7 +6380,8 @@ error:  int btf_struct_access(struct bpf_verifier_log *log,  		      const struct bpf_reg_state *reg,  		      int off, int size, enum bpf_access_type atype __maybe_unused, -		      u32 *next_btf_id, enum bpf_type_flag *flag) +		      u32 *next_btf_id, enum bpf_type_flag *flag, +		      const char **field_name)  {  	const struct btf *btf = reg->btf;  	enum bpf_type_flag tmp_flag = 0; @@ -6424,7 +6413,7 @@ int btf_struct_access(struct bpf_verifier_log *log,  	t = btf_type_by_id(btf, id);  	do { -		err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag); +		err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag, field_name);  		switch (err) {  		case WALK_PTR: @@ -6499,7 +6488,7 @@ again:  	type = btf_type_by_id(btf, id);  	if (!type)  		return false; -	err = btf_struct_walk(log, btf, type, off, 1, &id, &flag); +	err = btf_struct_walk(log, btf, type, off, 1, &id, &flag, NULL);  	if (err != WALK_STRUCT)  		return false; @@ -7180,15 +7169,12 @@ static int __btf_new_fd(struct btf *btf)  	return 
anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC);  } -int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr) +int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)  {  	struct btf *btf;  	int ret; -	btf = btf_parse(make_bpfptr(attr->btf, uattr.is_kernel), -			attr->btf_size, attr->btf_log_level, -			u64_to_user_ptr(attr->btf_log_buf), -			attr->btf_log_size); +	btf = btf_parse(attr, uattr, uattr_size);  	if (IS_ERR(btf))  		return PTR_ERR(btf); @@ -7578,6 +7564,108 @@ BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE)  BTF_TRACING_TYPE_xxx  #undef BTF_TRACING_TYPE +static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name, +				 const struct btf_type *func, u32 func_flags) +{ +	u32 flags = func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY); +	const char *name, *sfx, *iter_name; +	const struct btf_param *arg; +	const struct btf_type *t; +	char exp_name[128]; +	u32 nr_args; + +	/* exactly one of KF_ITER_{NEW,NEXT,DESTROY} can be set */ +	if (!flags || (flags & (flags - 1))) +		return -EINVAL; + +	/* any BPF iter kfunc should have `struct bpf_iter_<type> *` first arg */ +	nr_args = btf_type_vlen(func); +	if (nr_args < 1) +		return -EINVAL; + +	arg = &btf_params(func)[0]; +	t = btf_type_skip_modifiers(btf, arg->type, NULL); +	if (!t || !btf_type_is_ptr(t)) +		return -EINVAL; +	t = btf_type_skip_modifiers(btf, t->type, NULL); +	if (!t || !__btf_type_is_struct(t)) +		return -EINVAL; + +	name = btf_name_by_offset(btf, t->name_off); +	if (!name || strncmp(name, ITER_PREFIX, sizeof(ITER_PREFIX) - 1)) +		return -EINVAL; + +	/* sizeof(struct bpf_iter_<type>) should be a multiple of 8 to +	 * fit nicely in stack slots +	 */ +	if (t->size == 0 || (t->size % 8)) +		return -EINVAL; + +	/* validate bpf_iter_<type>_{new,next,destroy}(struct bpf_iter_<type> *) +	 * naming pattern +	 */ +	iter_name = name + sizeof(ITER_PREFIX) - 1; +	if (flags & KF_ITER_NEW) +		sfx = "new"; +	else if (flags & KF_ITER_NEXT) +		sfx = 
"next"; +	else /* (flags & KF_ITER_DESTROY) */ +		sfx = "destroy"; + +	snprintf(exp_name, sizeof(exp_name), "bpf_iter_%s_%s", iter_name, sfx); +	if (strcmp(func_name, exp_name)) +		return -EINVAL; + +	/* only iter constructor should have extra arguments */ +	if (!(flags & KF_ITER_NEW) && nr_args != 1) +		return -EINVAL; + +	if (flags & KF_ITER_NEXT) { +		/* bpf_iter_<type>_next() should return pointer */ +		t = btf_type_skip_modifiers(btf, func->type, NULL); +		if (!t || !btf_type_is_ptr(t)) +			return -EINVAL; +	} + +	if (flags & KF_ITER_DESTROY) { +		/* bpf_iter_<type>_destroy() should return void */ +		t = btf_type_by_id(btf, func->type); +		if (!t || !btf_type_is_void(t)) +			return -EINVAL; +	} + +	return 0; +} + +static int btf_check_kfunc_protos(struct btf *btf, u32 func_id, u32 func_flags) +{ +	const struct btf_type *func; +	const char *func_name; +	int err; + +	/* any kfunc should be FUNC -> FUNC_PROTO */ +	func = btf_type_by_id(btf, func_id); +	if (!func || !btf_type_is_func(func)) +		return -EINVAL; + +	/* sanity check kfunc name */ +	func_name = btf_name_by_offset(btf, func->name_off); +	if (!func_name || !func_name[0]) +		return -EINVAL; + +	func = btf_type_by_id(btf, func->type); +	if (!func || !btf_type_is_func_proto(func)) +		return -EINVAL; + +	if (func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY)) { +		err = btf_check_iter_kfuncs(btf, func_name, func, func_flags); +		if (err) +			return err; +	} + +	return 0; +} +  /* Kernel Function (kfunc) BTF ID set registration API */  static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, @@ -7705,6 +7793,21 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)  		return BTF_KFUNC_HOOK_TRACING;  	case BPF_PROG_TYPE_SYSCALL:  		return BTF_KFUNC_HOOK_SYSCALL; +	case BPF_PROG_TYPE_CGROUP_SKB: +		return BTF_KFUNC_HOOK_CGROUP_SKB; +	case BPF_PROG_TYPE_SCHED_ACT: +		return BTF_KFUNC_HOOK_SCHED_ACT; +	case BPF_PROG_TYPE_SK_SKB: +		return BTF_KFUNC_HOOK_SK_SKB; +	case 
BPF_PROG_TYPE_SOCKET_FILTER: +		return BTF_KFUNC_HOOK_SOCKET_FILTER; +	case BPF_PROG_TYPE_LWT_OUT: +	case BPF_PROG_TYPE_LWT_IN: +	case BPF_PROG_TYPE_LWT_XMIT: +	case BPF_PROG_TYPE_LWT_SEG6LOCAL: +		return BTF_KFUNC_HOOK_LWT; +	case BPF_PROG_TYPE_NETFILTER: +		return BTF_KFUNC_HOOK_NETFILTER;  	default:  		return BTF_KFUNC_HOOK_MAX;  	} @@ -7741,7 +7844,7 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,  				       const struct btf_kfunc_id_set *kset)  {  	struct btf *btf; -	int ret; +	int ret, i;  	btf = btf_get_module_btf(kset->owner);  	if (!btf) { @@ -7758,7 +7861,15 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,  	if (IS_ERR(btf))  		return PTR_ERR(btf); +	for (i = 0; i < kset->set->cnt; i++) { +		ret = btf_check_kfunc_protos(btf, kset->set->pairs[i].id, +					     kset->set->pairs[i].flags); +		if (ret) +			goto err_out; +	} +  	ret = btf_populate_kfunc_set(btf, hook, kset->set); +err_out:  	btf_put(btf);  	return ret;  } @@ -8249,12 +8360,10 @@ check_modules:  		btf_get(mod_btf);  		spin_unlock_bh(&btf_idr_lock);  		cands = bpf_core_add_cands(cands, mod_btf, btf_nr_types(main_btf)); -		if (IS_ERR(cands)) { -			btf_put(mod_btf); +		btf_put(mod_btf); +		if (IS_ERR(cands))  			return ERR_CAST(cands); -		}  		spin_lock_bh(&btf_idr_lock); -		btf_put(mod_btf);  	}  	spin_unlock_bh(&btf_idr_lock);  	/* cands is a pointer to kmalloced memory here if cands->cnt > 0 @@ -8336,16 +8445,15 @@ out:  bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,  				const struct bpf_reg_state *reg, -				int off) +				const char *field_name, u32 btf_id, const char *suffix)  {  	struct btf *btf = reg->btf;  	const struct btf_type *walk_type, *safe_type;  	const char *tname;  	char safe_tname[64];  	long ret, safe_id; -	const struct btf_member *member, *m_walk = NULL; +	const struct btf_member *member;  	u32 i; -	const char *walk_name;  	walk_type = btf_type_by_id(btf, reg->btf_id);  	if (!walk_type) @@ -8353,7 +8461,7 @@ bool 
btf_nested_type_is_trusted(struct bpf_verifier_log *log,  	tname = btf_name_by_offset(btf, walk_type->name_off); -	ret = snprintf(safe_tname, sizeof(safe_tname), "%s__safe_fields", tname); +	ret = snprintf(safe_tname, sizeof(safe_tname), "%s%s", tname, suffix);  	if (ret < 0)  		return false; @@ -8365,30 +8473,17 @@ bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,  	if (!safe_type)  		return false; -	for_each_member(i, walk_type, member) { -		u32 moff; - -		/* We're looking for the PTR_TO_BTF_ID member in the struct -		 * type we're walking which matches the specified offset. -		 * Below, we'll iterate over the fields in the safe variant of -		 * the struct and see if any of them has a matching type / -		 * name. -		 */ -		moff = __btf_member_bit_offset(walk_type, member) / 8; -		if (off == moff) { -			m_walk = member; -			break; -		} -	} -	if (m_walk == NULL) -		return false; - -	walk_name = __btf_name_by_offset(btf, m_walk->name_off);  	for_each_member(i, safe_type, member) {  		const char *m_name = __btf_name_by_offset(btf, member->name_off); +		const struct btf_type *mtype = btf_type_by_id(btf, member->type); +		u32 id; + +		if (!btf_type_is_ptr(mtype)) +			continue; +		btf_type_skip_modifiers(btf, mtype->type, &id);  		/* If we match on both type and name, the field is considered trusted. 
*/ -		if (m_walk->type == member->type && !strcmp(walk_name, m_name)) +		if (btf_id == id && !strcmp(field_name, m_name))  			return true;  	} diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index bf2fdb33fb31..517b6a5928cc 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -173,11 +173,11 @@ void bpf_cgroup_atype_put(int cgroup_atype)  {  	int i = cgroup_atype - CGROUP_LSM_START; -	mutex_lock(&cgroup_mutex); +	cgroup_lock();  	if (--cgroup_lsm_atype[i].refcnt <= 0)  		cgroup_lsm_atype[i].attach_btf_id = 0;  	WARN_ON_ONCE(cgroup_lsm_atype[i].refcnt < 0); -	mutex_unlock(&cgroup_mutex); +	cgroup_unlock();  }  #else  static enum cgroup_bpf_attach_type @@ -282,7 +282,7 @@ static void cgroup_bpf_release(struct work_struct *work)  	unsigned int atype; -	mutex_lock(&cgroup_mutex); +	cgroup_lock();  	for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) {  		struct hlist_head *progs = &cgrp->bpf.progs[atype]; @@ -315,7 +315,7 @@ static void cgroup_bpf_release(struct work_struct *work)  		bpf_cgroup_storage_free(storage);  	} -	mutex_unlock(&cgroup_mutex); +	cgroup_unlock();  	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))  		cgroup_bpf_put(p); @@ -729,9 +729,9 @@ static int cgroup_bpf_attach(struct cgroup *cgrp,  {  	int ret; -	mutex_lock(&cgroup_mutex); +	cgroup_lock();  	ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags); -	mutex_unlock(&cgroup_mutex); +	cgroup_unlock();  	return ret;  } @@ -831,7 +831,7 @@ static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog,  	cg_link = container_of(link, struct bpf_cgroup_link, link); -	mutex_lock(&cgroup_mutex); +	cgroup_lock();  	/* link might have been auto-released by dying cgroup, so fail */  	if (!cg_link->cgroup) {  		ret = -ENOLINK; @@ -843,7 +843,7 @@ static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog,  	}  	ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog);  out_unlock: -	mutex_unlock(&cgroup_mutex); +	
cgroup_unlock();  	return ret;  } @@ -1009,9 +1009,9 @@ static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,  {  	int ret; -	mutex_lock(&cgroup_mutex); +	cgroup_lock();  	ret = __cgroup_bpf_detach(cgrp, prog, NULL, type); -	mutex_unlock(&cgroup_mutex); +	cgroup_unlock();  	return ret;  } @@ -1120,9 +1120,9 @@ static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,  {  	int ret; -	mutex_lock(&cgroup_mutex); +	cgroup_lock();  	ret = __cgroup_bpf_query(cgrp, attr, uattr); -	mutex_unlock(&cgroup_mutex); +	cgroup_unlock();  	return ret;  } @@ -1189,11 +1189,11 @@ static void bpf_cgroup_link_release(struct bpf_link *link)  	if (!cg_link->cgroup)  		return; -	mutex_lock(&cgroup_mutex); +	cgroup_lock();  	/* re-check cgroup under lock again */  	if (!cg_link->cgroup) { -		mutex_unlock(&cgroup_mutex); +		cgroup_unlock();  		return;  	} @@ -1205,7 +1205,7 @@ static void bpf_cgroup_link_release(struct bpf_link *link)  	cg = cg_link->cgroup;  	cg_link->cgroup = NULL; -	mutex_unlock(&cgroup_mutex); +	cgroup_unlock();  	cgroup_put(cg);  } @@ -1232,10 +1232,10 @@ static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link,  		container_of(link, struct bpf_cgroup_link, link);  	u64 cg_id = 0; -	mutex_lock(&cgroup_mutex); +	cgroup_lock();  	if (cg_link->cgroup)  		cg_id = cgroup_id(cg_link->cgroup); -	mutex_unlock(&cgroup_mutex); +	cgroup_unlock();  	seq_printf(seq,  		   "cgroup_id:\t%llu\n" @@ -1251,10 +1251,10 @@ static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link,  		container_of(link, struct bpf_cgroup_link, link);  	u64 cg_id = 0; -	mutex_lock(&cgroup_mutex); +	cgroup_lock();  	if (cg_link->cgroup)  		cg_id = cgroup_id(cg_link->cgroup); -	mutex_unlock(&cgroup_mutex); +	cgroup_unlock();  	info->cgroup.cgroup_id = cg_id;  	info->cgroup.attach_type = cg_link->type; @@ -1921,14 +1921,17 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,  	if (ret < 0)  		goto out; -	if (ctx.optlen > max_optlen 
|| ctx.optlen < 0) { +	if (optval && (ctx.optlen > max_optlen || ctx.optlen < 0)) {  		ret = -EFAULT;  		goto out;  	}  	if (ctx.optlen != 0) { -		if (copy_to_user(optval, ctx.optval, ctx.optlen) || -		    put_user(ctx.optlen, optlen)) { +		if (optval && copy_to_user(optval, ctx.optval, ctx.optlen)) { +			ret = -EFAULT; +			goto out; +		} +		if (put_user(ctx.optlen, optlen)) {  			ret = -EFAULT;  			goto out;  		} @@ -2223,10 +2226,12 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type,  				BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),  				treg, si->dst_reg,  				offsetof(struct bpf_sysctl_kern, ppos)); -			*insn++ = BPF_STX_MEM( -				BPF_SIZEOF(u32), treg, si->src_reg, +			*insn++ = BPF_RAW_INSN( +				BPF_CLASS(si->code) | BPF_MEM | BPF_SIZEOF(u32), +				treg, si->src_reg,  				bpf_ctx_narrow_access_offset( -					0, sizeof(u32), sizeof(loff_t))); +					0, sizeof(u32), sizeof(loff_t)), +				si->imm);  			*insn++ = BPF_LDX_MEM(  				BPF_DW, treg, si->dst_reg,  				offsetof(struct bpf_sysctl_kern, tmp_reg)); @@ -2376,10 +2381,17 @@ static bool cg_sockopt_is_valid_access(int off, int size,  	return true;  } -#define CG_SOCKOPT_ACCESS_FIELD(T, F)					\ -	T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F),			\ -	  si->dst_reg, si->src_reg,					\ -	  offsetof(struct bpf_sockopt_kern, F)) +#define CG_SOCKOPT_READ_FIELD(F)					\ +	BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F),	\ +		    si->dst_reg, si->src_reg,				\ +		    offsetof(struct bpf_sockopt_kern, F)) + +#define CG_SOCKOPT_WRITE_FIELD(F)					\ +	BPF_RAW_INSN((BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F) |	\ +		      BPF_MEM | BPF_CLASS(si->code)),			\ +		     si->dst_reg, si->src_reg,				\ +		     offsetof(struct bpf_sockopt_kern, F),		\ +		     si->imm)  static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,  					 const struct bpf_insn *si, @@ -2391,25 +2403,25 @@ static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,  	switch (si->off) {  	case offsetof(struct 
bpf_sockopt, sk): -		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk); +		*insn++ = CG_SOCKOPT_READ_FIELD(sk);  		break;  	case offsetof(struct bpf_sockopt, level):  		if (type == BPF_WRITE) -			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level); +			*insn++ = CG_SOCKOPT_WRITE_FIELD(level);  		else -			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level); +			*insn++ = CG_SOCKOPT_READ_FIELD(level);  		break;  	case offsetof(struct bpf_sockopt, optname):  		if (type == BPF_WRITE) -			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname); +			*insn++ = CG_SOCKOPT_WRITE_FIELD(optname);  		else -			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname); +			*insn++ = CG_SOCKOPT_READ_FIELD(optname);  		break;  	case offsetof(struct bpf_sockopt, optlen):  		if (type == BPF_WRITE) -			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen); +			*insn++ = CG_SOCKOPT_WRITE_FIELD(optlen);  		else -			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen); +			*insn++ = CG_SOCKOPT_READ_FIELD(optlen);  		break;  	case offsetof(struct bpf_sockopt, retval):  		BUILD_BUG_ON(offsetof(struct bpf_cg_run_ctx, run_ctx) != 0); @@ -2429,9 +2441,11 @@ static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,  			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),  					      treg, treg,  					      offsetof(struct task_struct, bpf_ctx)); -			*insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval), -					      treg, si->src_reg, -					      offsetof(struct bpf_cg_run_ctx, retval)); +			*insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_MEM | +					       BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval), +					       treg, si->src_reg, +					       offsetof(struct bpf_cg_run_ctx, retval), +					       si->imm);  			*insn++ = BPF_LDX_MEM(BPF_DW, treg, si->dst_reg,  					      offsetof(struct bpf_sockopt_kern, tmp_reg));  		} else { @@ -2447,10 +2461,10 @@ static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,  		}  		break;  	case 
offsetof(struct bpf_sockopt, optval): -		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval); +		*insn++ = CG_SOCKOPT_READ_FIELD(optval);  		break;  	case offsetof(struct bpf_sockopt, optval_end): -		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end); +		*insn++ = CG_SOCKOPT_READ_FIELD(optval_end);  		break;  	} @@ -2529,10 +2543,6 @@ cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)  		return &bpf_get_current_pid_tgid_proto;  	case BPF_FUNC_get_current_comm:  		return &bpf_get_current_comm_proto; -	case BPF_FUNC_get_current_cgroup_id: -		return &bpf_get_current_cgroup_id_proto; -	case BPF_FUNC_get_current_ancestor_cgroup_id: -		return &bpf_get_current_ancestor_cgroup_id_proto;  #ifdef CONFIG_CGROUP_NET_CLASSID  	case BPF_FUNC_get_cgroup_classid:  		return &bpf_get_cgroup_classid_curr_proto; diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c index 06989d278846..810378f04fbc 100644 --- a/kernel/bpf/cgroup_iter.c +++ b/kernel/bpf/cgroup_iter.c @@ -58,7 +58,7 @@ static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos)  {  	struct cgroup_iter_priv *p = seq->private; -	mutex_lock(&cgroup_mutex); +	cgroup_lock();  	/* cgroup_iter doesn't support read across multiple sessions. 
*/  	if (*pos > 0) { @@ -89,7 +89,7 @@ static void cgroup_iter_seq_stop(struct seq_file *seq, void *v)  {  	struct cgroup_iter_priv *p = seq->private; -	mutex_unlock(&cgroup_mutex); +	cgroup_unlock();  	/* pass NULL to the prog for post-processing */  	if (!v) { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index e2d256c82072..7421487422d4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1187,6 +1187,7 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog,  	s16 off = insn->off;  	s32 imm = insn->imm;  	u8 *addr; +	int err;  	*func_addr_fixed = insn->src_reg != BPF_PSEUDO_CALL;  	if (!*func_addr_fixed) { @@ -1201,6 +1202,11 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog,  			addr = (u8 *)prog->aux->func[off]->bpf_func;  		else  			return -EINVAL; +	} else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && +		   bpf_jit_supports_far_kfunc_call()) { +		err = bpf_get_kfunc_addr(prog, insn->imm, insn->off, &addr); +		if (err) +			return err;  	} else {  		/* Address of a BPF helper call. Since part of the core  		 * kernel, it's always at a fixed location. __bpf_call_base @@ -2732,6 +2738,11 @@ bool __weak bpf_jit_supports_kfunc_call(void)  	return false;  } +bool __weak bpf_jit_supports_far_kfunc_call(void) +{ +	return false; +} +  /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call   * skb_copy_bits(), so provide a weak definition of it for NET-less config.   
*/ diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index d2110c1f6fa6..8ec18faa74ac 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -540,7 +540,7 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,  	}  } -static int cpu_map_delete_elem(struct bpf_map *map, void *key) +static long cpu_map_delete_elem(struct bpf_map *map, void *key)  {  	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);  	u32 key_cpu = *(u32 *)key; @@ -553,8 +553,8 @@ static int cpu_map_delete_elem(struct bpf_map *map, void *key)  	return 0;  } -static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, -			       u64 map_flags) +static long cpu_map_update_elem(struct bpf_map *map, void *key, void *value, +				u64 map_flags)  {  	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);  	struct bpf_cpumap_val cpumap_value = {}; @@ -667,12 +667,21 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)  	return 0;  } -static int cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags) +static long cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags)  {  	return __bpf_xdp_redirect_map(map, index, flags, 0,  				      __cpu_map_lookup_elem);  } +static u64 cpu_map_mem_usage(const struct bpf_map *map) +{ +	u64 usage = sizeof(struct bpf_cpu_map); + +	/* Currently the dynamically allocated elements are not counted */ +	usage += (u64)map->max_entries * sizeof(struct bpf_cpu_map_entry *); +	return usage; +} +  BTF_ID_LIST_SINGLE(cpu_map_btf_ids, struct, bpf_cpu_map)  const struct bpf_map_ops cpu_map_ops = {  	.map_meta_equal		= bpf_map_meta_equal, @@ -683,6 +692,7 @@ const struct bpf_map_ops cpu_map_ops = {  	.map_lookup_elem	= cpu_map_lookup_elem,  	.map_get_next_key	= cpu_map_get_next_key,  	.map_check_btf		= map_check_no_btf, +	.map_mem_usage		= cpu_map_mem_usage,  	.map_btf_id		= &cpu_map_btf_ids[0],  	.map_redirect		= cpu_map_redirect,  }; diff --git a/kernel/bpf/cpumask.c 
b/kernel/bpf/cpumask.c index 52b981512a35..7efdf5d770ca 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -9,6 +9,7 @@  /**   * struct bpf_cpumask - refcounted BPF cpumask wrapper structure   * @cpumask:	The actual cpumask embedded in the struct. + * @rcu:	The RCU head used to free the cpumask with RCU safety.   * @usage:	Object reference counter. When the refcount goes to 0, the   *		memory is released back to the BPF allocator, which provides   *		RCU safety. @@ -24,6 +25,7 @@   */  struct bpf_cpumask {  	cpumask_t cpumask; +	struct rcu_head rcu;  	refcount_t usage;  }; @@ -55,7 +57,7 @@ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_create(void)  	/* cpumask must be the first element so struct bpf_cpumask be cast to struct cpumask. */  	BUILD_BUG_ON(offsetof(struct bpf_cpumask, cpumask) != 0); -	cpumask = bpf_mem_alloc(&bpf_cpumask_ma, sizeof(*cpumask)); +	cpumask = bpf_mem_cache_alloc(&bpf_cpumask_ma);  	if (!cpumask)  		return NULL; @@ -80,32 +82,14 @@ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask)  	return cpumask;  } -/** - * bpf_cpumask_kptr_get() - Attempt to acquire a reference to a BPF cpumask - *			    stored in a map. - * @cpumaskp: A pointer to a BPF cpumask map value. - * - * Attempts to acquire a reference to a BPF cpumask stored in a map value. The - * cpumask returned by this function must either be embedded in a map as a - * kptr, or freed with bpf_cpumask_release(). This function may return NULL if - * no BPF cpumask was found in the specified map value. - */ -__bpf_kfunc struct bpf_cpumask *bpf_cpumask_kptr_get(struct bpf_cpumask **cpumaskp) +static void cpumask_free_cb(struct rcu_head *head)  {  	struct bpf_cpumask *cpumask; -	/* The BPF memory allocator frees memory backing its caches in an RCU -	 * callback. Thus, we can safely use RCU to ensure that the cpumask is -	 * safe to read. 
-	 */ -	rcu_read_lock(); - -	cpumask = READ_ONCE(*cpumaskp); -	if (cpumask && !refcount_inc_not_zero(&cpumask->usage)) -		cpumask = NULL; - -	rcu_read_unlock(); -	return cpumask; +	cpumask = container_of(head, struct bpf_cpumask, rcu); +	migrate_disable(); +	bpf_mem_cache_free(&bpf_cpumask_ma, cpumask); +	migrate_enable();  }  /** @@ -118,14 +102,8 @@ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_kptr_get(struct bpf_cpumask **cpumas   */  __bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask)  { -	if (!cpumask) -		return; - -	if (refcount_dec_and_test(&cpumask->usage)) { -		migrate_disable(); -		bpf_mem_free(&bpf_cpumask_ma, cpumask); -		migrate_enable(); -	} +	if (refcount_dec_and_test(&cpumask->usage)) +		call_rcu(&cpumask->rcu, cpumask_free_cb);  }  /** @@ -424,29 +402,28 @@ __diag_pop();  BTF_SET8_START(cpumask_kfunc_btf_ids)  BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL) -BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE)  BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL) -BTF_ID_FLAGS(func, bpf_cpumask_first, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_set_cpu, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_clear_cpu, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_test_cpu, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_test_and_set_cpu, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_test_and_clear_cpu, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_setall, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_clear, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_and, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_or, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_xor, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_equal, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_intersects, 
KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_subset, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_empty, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_full, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_any, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_any_and, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_first, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_set_cpu, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_clear_cpu, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_test_cpu, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_test_and_set_cpu, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_test_and_clear_cpu, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_setall, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_clear, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_and, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_or, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_xor, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_equal, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_intersects, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_subset, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_empty, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_full, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_any, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_any_and, KF_RCU)  BTF_SET8_END(cpumask_kfunc_btf_ids)  static const struct btf_kfunc_id_set cpumask_kfunc_set = { @@ -468,7 +445,7 @@ static int __init cpumask_kfunc_init(void)  		},  	}; -	ret = bpf_mem_alloc_init(&bpf_cpumask_ma, 0, false); +	ret = bpf_mem_alloc_init(&bpf_cpumask_ma, sizeof(struct bpf_cpumask), false);  	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &cpumask_kfunc_set);  	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &cpumask_kfunc_set);  	return  ret ?: register_btf_id_dtor_kfuncs(cpumask_dtors, diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 2675fefc6cb6..802692fa3905 100644 --- a/kernel/bpf/devmap.c +++ 
b/kernel/bpf/devmap.c @@ -809,7 +809,7 @@ static void __dev_map_entry_free(struct rcu_head *rcu)  	kfree(dev);  } -static int dev_map_delete_elem(struct bpf_map *map, void *key) +static long dev_map_delete_elem(struct bpf_map *map, void *key)  {  	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);  	struct bpf_dtab_netdev *old_dev; @@ -819,12 +819,14 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key)  		return -EINVAL;  	old_dev = unrcu_pointer(xchg(&dtab->netdev_map[k], NULL)); -	if (old_dev) +	if (old_dev) {  		call_rcu(&old_dev->rcu, __dev_map_entry_free); +		atomic_dec((atomic_t *)&dtab->items); +	}  	return 0;  } -static int dev_map_hash_delete_elem(struct bpf_map *map, void *key) +static long dev_map_hash_delete_elem(struct bpf_map *map, void *key)  {  	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);  	struct bpf_dtab_netdev *old_dev; @@ -895,8 +897,8 @@ err_out:  	return ERR_PTR(-EINVAL);  } -static int __dev_map_update_elem(struct net *net, struct bpf_map *map, -				 void *key, void *value, u64 map_flags) +static long __dev_map_update_elem(struct net *net, struct bpf_map *map, +				  void *key, void *value, u64 map_flags)  {  	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);  	struct bpf_dtab_netdev *dev, *old_dev; @@ -931,19 +933,21 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map,  	old_dev = unrcu_pointer(xchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev)));  	if (old_dev)  		call_rcu(&old_dev->rcu, __dev_map_entry_free); +	else +		atomic_inc((atomic_t *)&dtab->items);  	return 0;  } -static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, -			       u64 map_flags) +static long dev_map_update_elem(struct bpf_map *map, void *key, void *value, +				u64 map_flags)  {  	return __dev_map_update_elem(current->nsproxy->net_ns,  				     map, key, value, map_flags);  } -static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, -				     void 
*key, void *value, u64 map_flags) +static long __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, +				       void *key, void *value, u64 map_flags)  {  	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);  	struct bpf_dtab_netdev *dev, *old_dev; @@ -995,27 +999,41 @@ out_err:  	return err;  } -static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, -				   u64 map_flags) +static long dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, +				     u64 map_flags)  {  	return __dev_map_hash_update_elem(current->nsproxy->net_ns,  					 map, key, value, map_flags);  } -static int dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) +static long dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)  {  	return __bpf_xdp_redirect_map(map, ifindex, flags,  				      BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,  				      __dev_map_lookup_elem);  } -static int dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) +static long dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)  {  	return __bpf_xdp_redirect_map(map, ifindex, flags,  				      BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,  				      __dev_map_hash_lookup_elem);  } +static u64 dev_map_mem_usage(const struct bpf_map *map) +{ +	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); +	u64 usage = sizeof(struct bpf_dtab); + +	if (map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) +		usage += (u64)dtab->n_buckets * sizeof(struct hlist_head); +	else +		usage += (u64)map->max_entries * sizeof(struct bpf_dtab_netdev *); +	usage += atomic_read((atomic_t *)&dtab->items) * +			 (u64)sizeof(struct bpf_dtab_netdev); +	return usage; +} +  BTF_ID_LIST_SINGLE(dev_map_btf_ids, struct, bpf_dtab)  const struct bpf_map_ops dev_map_ops = {  	.map_meta_equal = bpf_map_meta_equal, @@ -1026,6 +1044,7 @@ const struct bpf_map_ops dev_map_ops = {  	.map_update_elem = dev_map_update_elem,  	.map_delete_elem = dev_map_delete_elem,  	
.map_check_btf = map_check_no_btf, +	.map_mem_usage = dev_map_mem_usage,  	.map_btf_id = &dev_map_btf_ids[0],  	.map_redirect = dev_map_redirect,  }; @@ -1039,6 +1058,7 @@ const struct bpf_map_ops dev_map_hash_ops = {  	.map_update_elem = dev_map_hash_update_elem,  	.map_delete_elem = dev_map_hash_delete_elem,  	.map_check_btf = map_check_no_btf, +	.map_mem_usage = dev_map_mem_usage,  	.map_btf_id = &dev_map_btf_ids[0],  	.map_redirect = dev_hash_map_redirect,  }; @@ -1109,9 +1129,11 @@ static int dev_map_notification(struct notifier_block *notifier,  				if (!dev || netdev != dev->dev)  					continue;  				odev = unrcu_pointer(cmpxchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev), NULL)); -				if (dev == odev) +				if (dev == odev) {  					call_rcu(&dev->rcu,  						 __dev_map_entry_free); +					atomic_dec((atomic_t *)&dtab->items); +				}  			}  		}  		rcu_read_unlock(); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 5dfcb5ad0d06..00c253b84bf5 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -249,7 +249,18 @@ static void htab_free_prealloced_fields(struct bpf_htab *htab)  		struct htab_elem *elem;  		elem = get_htab_elem(htab, i); -		bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); +		if (htab_is_percpu(htab)) { +			void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size); +			int cpu; + +			for_each_possible_cpu(cpu) { +				bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu)); +				cond_resched(); +			} +		} else { +			bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); +			cond_resched(); +		}  		cond_resched();  	}  } @@ -596,6 +607,8 @@ free_htab:  static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)  { +	if (likely(key_len % 4 == 0)) +		return jhash2(key, key_len / 4, hashrnd);  	return jhash(key, key_len, hashrnd);  } @@ -759,9 +772,17 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map,  static void 
check_and_free_fields(struct bpf_htab *htab,  				  struct htab_elem *elem)  { -	void *map_value = elem->key + round_up(htab->map.key_size, 8); +	if (htab_is_percpu(htab)) { +		void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size); +		int cpu; + +		for_each_possible_cpu(cpu) +			bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu)); +	} else { +		void *map_value = elem->key + round_up(htab->map.key_size, 8); -	bpf_obj_free_fields(htab->map.record, map_value); +		bpf_obj_free_fields(htab->map.record, map_value); +	}  }  /* It is called from the bpf_lru_list when the LRU needs to delete @@ -858,9 +879,9 @@ find_first_elem:  static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)  { +	check_and_free_fields(htab, l);  	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)  		bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr); -	check_and_free_fields(htab, l);  	bpf_mem_cache_free(&htab->ma, l);  } @@ -918,14 +939,13 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,  {  	if (!onallcpus) {  		/* copy true value_size bytes */ -		memcpy(this_cpu_ptr(pptr), value, htab->map.value_size); +		copy_map_value(&htab->map, this_cpu_ptr(pptr), value);  	} else {  		u32 size = round_up(htab->map.value_size, 8);  		int off = 0, cpu;  		for_each_possible_cpu(cpu) { -			bpf_long_memcpy(per_cpu_ptr(pptr, cpu), -					value + off, size); +			copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value + off);  			off += size;  		}  	} @@ -940,16 +960,14 @@ static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,  	 * (onallcpus=false always when coming from bpf prog).  	 
*/  	if (!onallcpus) { -		u32 size = round_up(htab->map.value_size, 8);  		int current_cpu = raw_smp_processor_id();  		int cpu;  		for_each_possible_cpu(cpu) {  			if (cpu == current_cpu) -				bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value, -						size); -			else -				memset(per_cpu_ptr(pptr, cpu), 0, size); +				copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value); +			else /* Since elem is preallocated, we cannot touch special fields */ +				zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu));  		}  	} else {  		pcpu_copy_value(htab, pptr, value, onallcpus); @@ -1057,8 +1075,8 @@ static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,  }  /* Called from syscall or from eBPF program */ -static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, -				u64 map_flags) +static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, +				 u64 map_flags)  {  	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);  	struct htab_elem *l_new = NULL, *l_old; @@ -1159,8 +1177,8 @@ static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)  	bpf_lru_push_free(&htab->lru, &elem->lru_node);  } -static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, -				    u64 map_flags) +static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, +				     u64 map_flags)  {  	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);  	struct htab_elem *l_new, *l_old = NULL; @@ -1226,9 +1244,9 @@ err:  	return ret;  } -static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, -					 void *value, u64 map_flags, -					 bool onallcpus) +static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key, +					  void *value, u64 map_flags, +					  bool onallcpus)  {  	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);  	struct htab_elem *l_new = NULL, *l_old; @@ -1281,9 +1299,9 @@ err:  	return ret;  } -static int 
__htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, -					     void *value, u64 map_flags, -					     bool onallcpus) +static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, +					      void *value, u64 map_flags, +					      bool onallcpus)  {  	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);  	struct htab_elem *l_new = NULL, *l_old; @@ -1348,21 +1366,21 @@ err:  	return ret;  } -static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, -				       void *value, u64 map_flags) +static long htab_percpu_map_update_elem(struct bpf_map *map, void *key, +					void *value, u64 map_flags)  {  	return __htab_percpu_map_update_elem(map, key, value, map_flags, false);  } -static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, -					   void *value, u64 map_flags) +static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, +					    void *value, u64 map_flags)  {  	return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,  						 false);  }  /* Called from syscall or from eBPF program */ -static int htab_map_delete_elem(struct bpf_map *map, void *key) +static long htab_map_delete_elem(struct bpf_map *map, void *key)  {  	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);  	struct hlist_nulls_head *head; @@ -1398,7 +1416,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)  	return ret;  } -static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) +static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)  {  	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);  	struct hlist_nulls_head *head; @@ -1575,9 +1593,8 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,  			pptr = htab_elem_get_ptr(l, key_size);  			for_each_possible_cpu(cpu) { -				bpf_long_memcpy(value + off, -						per_cpu_ptr(pptr, cpu), -						roundup_value_size); +				copy_map_value_long(&htab->map, value + 
off, per_cpu_ptr(pptr, cpu)); +				check_and_init_map_value(&htab->map, value + off);  				off += roundup_value_size;  			}  		} else { @@ -1772,8 +1789,8 @@ again_nocopy:  			pptr = htab_elem_get_ptr(l, map->key_size);  			for_each_possible_cpu(cpu) { -				bpf_long_memcpy(dst_val + off, -						per_cpu_ptr(pptr, cpu), size); +				copy_map_value_long(&htab->map, dst_val + off, per_cpu_ptr(pptr, cpu)); +				check_and_init_map_value(&htab->map, dst_val + off);  				off += size;  			}  		} else { @@ -2046,9 +2063,9 @@ static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem)  				roundup_value_size = round_up(map->value_size, 8);  				pptr = htab_elem_get_ptr(elem, map->key_size);  				for_each_possible_cpu(cpu) { -					bpf_long_memcpy(info->percpu_value_buf + off, -							per_cpu_ptr(pptr, cpu), -							roundup_value_size); +					copy_map_value_long(map, info->percpu_value_buf + off, +							    per_cpu_ptr(pptr, cpu)); +					check_and_init_map_value(map, info->percpu_value_buf + off);  					off += roundup_value_size;  				}  				ctx.value = info->percpu_value_buf; @@ -2119,8 +2136,8 @@ static const struct bpf_iter_seq_info iter_seq_info = {  	.seq_priv_size		= sizeof(struct bpf_iter_seq_hash_map_info),  }; -static int bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn, -				  void *callback_ctx, u64 flags) +static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn, +				   void *callback_ctx, u64 flags)  {  	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);  	struct hlist_nulls_head *head; @@ -2175,6 +2192,44 @@ out:  	return num_elems;  } +static u64 htab_map_mem_usage(const struct bpf_map *map) +{ +	struct bpf_htab *htab = container_of(map, struct bpf_htab, map); +	u32 value_size = round_up(htab->map.value_size, 8); +	bool prealloc = htab_is_prealloc(htab); +	bool percpu = htab_is_percpu(htab); +	bool lru = htab_is_lru(htab); +	u64 num_entries; +	u64 usage = sizeof(struct bpf_htab); 
+ +	usage += sizeof(struct bucket) * htab->n_buckets; +	usage += sizeof(int) * num_possible_cpus() * HASHTAB_MAP_LOCK_COUNT; +	if (prealloc) { +		num_entries = map->max_entries; +		if (htab_has_extra_elems(htab)) +			num_entries += num_possible_cpus(); + +		usage += htab->elem_size * num_entries; + +		if (percpu) +			usage += value_size * num_possible_cpus() * num_entries; +		else if (!lru) +			usage += sizeof(struct htab_elem *) * num_possible_cpus(); +	} else { +#define LLIST_NODE_SZ sizeof(struct llist_node) + +		num_entries = htab->use_percpu_counter ? +					  percpu_counter_sum(&htab->pcount) : +					  atomic_read(&htab->count); +		usage += (htab->elem_size + LLIST_NODE_SZ) * num_entries; +		if (percpu) { +			usage += (LLIST_NODE_SZ + sizeof(void *)) * num_entries; +			usage += value_size * num_possible_cpus() * num_entries; +		} +	} +	return usage; +} +  BTF_ID_LIST_SINGLE(htab_map_btf_ids, struct, bpf_htab)  const struct bpf_map_ops htab_map_ops = {  	.map_meta_equal = bpf_map_meta_equal, @@ -2191,6 +2246,7 @@ const struct bpf_map_ops htab_map_ops = {  	.map_seq_show_elem = htab_map_seq_show_elem,  	.map_set_for_each_callback_args = map_set_for_each_callback_args,  	.map_for_each_callback = bpf_for_each_hash_elem, +	.map_mem_usage = htab_map_mem_usage,  	BATCH_OPS(htab),  	.map_btf_id = &htab_map_btf_ids[0],  	.iter_seq_info = &iter_seq_info, @@ -2212,6 +2268,7 @@ const struct bpf_map_ops htab_lru_map_ops = {  	.map_seq_show_elem = htab_map_seq_show_elem,  	.map_set_for_each_callback_args = map_set_for_each_callback_args,  	.map_for_each_callback = bpf_for_each_hash_elem, +	.map_mem_usage = htab_map_mem_usage,  	BATCH_OPS(htab_lru),  	.map_btf_id = &htab_map_btf_ids[0],  	.iter_seq_info = &iter_seq_info, @@ -2292,8 +2349,8 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)  	 */  	pptr = htab_elem_get_ptr(l, map->key_size);  	for_each_possible_cpu(cpu) { -		bpf_long_memcpy(value + off, -				per_cpu_ptr(pptr, cpu), size); +		
copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu)); +		check_and_init_map_value(map, value + off);  		off += size;  	}  	ret = 0; @@ -2363,6 +2420,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {  	.map_seq_show_elem = htab_percpu_map_seq_show_elem,  	.map_set_for_each_callback_args = map_set_for_each_callback_args,  	.map_for_each_callback = bpf_for_each_hash_elem, +	.map_mem_usage = htab_map_mem_usage,  	BATCH_OPS(htab_percpu),  	.map_btf_id = &htab_map_btf_ids[0],  	.iter_seq_info = &iter_seq_info, @@ -2382,6 +2440,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {  	.map_seq_show_elem = htab_percpu_map_seq_show_elem,  	.map_set_for_each_callback_args = map_set_for_each_callback_args,  	.map_for_each_callback = bpf_for_each_hash_elem, +	.map_mem_usage = htab_map_mem_usage,  	BATCH_OPS(htab_lru_percpu),  	.map_btf_id = &htab_map_btf_ids[0],  	.iter_seq_info = &iter_seq_info, @@ -2519,6 +2578,7 @@ const struct bpf_map_ops htab_of_maps_map_ops = {  	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,  	.map_gen_lookup = htab_of_map_gen_lookup,  	.map_check_btf = map_check_no_btf, +	.map_mem_usage = htab_map_mem_usage,  	BATCH_OPS(htab),  	.map_btf_id = &htab_map_btf_ids[0],  }; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 5b278a38ae58..8d368fa353f9 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -18,6 +18,7 @@  #include <linux/pid_namespace.h>  #include <linux/poison.h>  #include <linux/proc_ns.h> +#include <linux/sched/task.h>  #include <linux/security.h>  #include <linux/btf_ids.h>  #include <linux/bpf_mem_alloc.h> @@ -257,7 +258,7 @@ BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size)  		goto err_clear;  	/* Verifier guarantees that size > 0 */ -	strscpy(buf, task->comm, size); +	strscpy_pad(buf, task->comm, size);  	return 0;  err_clear:  	memset(buf, 0, size); @@ -571,7 +572,7 @@ static const struct bpf_func_proto bpf_strncmp_proto = {  	.func		= bpf_strncmp,  	.gpl_only	= false,  	.ret_type	= 
RET_INTEGER, -	.arg1_type	= ARG_PTR_TO_MEM, +	.arg1_type	= ARG_PTR_TO_MEM | MEM_RDONLY,  	.arg2_type	= ARG_CONST_SIZE,  	.arg3_type	= ARG_PTR_TO_CONST_STR,  }; @@ -1264,10 +1265,11 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla  {  	struct bpf_hrtimer *t;  	int ret = 0; +	enum hrtimer_mode mode;  	if (in_nmi())  		return -EOPNOTSUPP; -	if (flags) +	if (flags > BPF_F_TIMER_ABS)  		return -EINVAL;  	__bpf_spin_lock_irqsave(&timer->lock);  	t = timer->timer; @@ -1275,7 +1277,13 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla  		ret = -EINVAL;  		goto out;  	} -	hrtimer_start(&t->timer, ns_to_ktime(nsecs), HRTIMER_MODE_REL_SOFT); + +	if (flags & BPF_F_TIMER_ABS) +		mode = HRTIMER_MODE_ABS_SOFT; +	else +		mode = HRTIMER_MODE_REL_SOFT; + +	hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode);  out:  	__bpf_spin_unlock_irqrestore(&timer->lock);  	return ret; @@ -1420,11 +1428,21 @@ static bool bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr)  	return ptr->size & DYNPTR_RDONLY_BIT;  } +void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr) +{ +	ptr->size |= DYNPTR_RDONLY_BIT; +} +  static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type)  {  	ptr->size |= type << DYNPTR_TYPE_SHIFT;  } +static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *ptr) +{ +	return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT; +} +  u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr)  {  	return ptr->size & DYNPTR_SIZE_MASK; @@ -1497,6 +1515,7 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = {  BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src,  	   u32, offset, u64, flags)  { +	enum bpf_dynptr_type type;  	int err;  	if (!src->data || flags) @@ -1506,13 +1525,25 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern  	if (err)  		return err; -	/* Source and destination 
may possibly overlap, hence use memmove to -	 * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr -	 * pointing to overlapping PTR_TO_MAP_VALUE regions. -	 */ -	memmove(dst, src->data + src->offset + offset, len); +	type = bpf_dynptr_get_type(src); -	return 0; +	switch (type) { +	case BPF_DYNPTR_TYPE_LOCAL: +	case BPF_DYNPTR_TYPE_RINGBUF: +		/* Source and destination may possibly overlap, hence use memmove to +		 * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr +		 * pointing to overlapping PTR_TO_MAP_VALUE regions. +		 */ +		memmove(dst, src->data + src->offset + offset, len); +		return 0; +	case BPF_DYNPTR_TYPE_SKB: +		return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len); +	case BPF_DYNPTR_TYPE_XDP: +		return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len); +	default: +		WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type); +		return -EFAULT; +	}  }  static const struct bpf_func_proto bpf_dynptr_read_proto = { @@ -1529,22 +1560,40 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = {  BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src,  	   u32, len, u64, flags)  { +	enum bpf_dynptr_type type;  	int err; -	if (!dst->data || flags || bpf_dynptr_is_rdonly(dst)) +	if (!dst->data || bpf_dynptr_is_rdonly(dst))  		return -EINVAL;  	err = bpf_dynptr_check_off_len(dst, offset, len);  	if (err)  		return err; -	/* Source and destination may possibly overlap, hence use memmove to -	 * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr -	 * pointing to overlapping PTR_TO_MAP_VALUE regions. -	 */ -	memmove(dst->data + dst->offset + offset, src, len); +	type = bpf_dynptr_get_type(dst); -	return 0; +	switch (type) { +	case BPF_DYNPTR_TYPE_LOCAL: +	case BPF_DYNPTR_TYPE_RINGBUF: +		if (flags) +			return -EINVAL; +		/* Source and destination may possibly overlap, hence use memmove to +		 * copy the data. E.g. 
bpf_dynptr_from_mem may create two dynptr +		 * pointing to overlapping PTR_TO_MAP_VALUE regions. +		 */ +		memmove(dst->data + dst->offset + offset, src, len); +		return 0; +	case BPF_DYNPTR_TYPE_SKB: +		return __bpf_skb_store_bytes(dst->data, dst->offset + offset, src, len, +					     flags); +	case BPF_DYNPTR_TYPE_XDP: +		if (flags) +			return -EINVAL; +		return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len); +	default: +		WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type); +		return -EFAULT; +	}  }  static const struct bpf_func_proto bpf_dynptr_write_proto = { @@ -1560,6 +1609,7 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = {  BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len)  { +	enum bpf_dynptr_type type;  	int err;  	if (!ptr->data) @@ -1572,7 +1622,20 @@ BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u3  	if (bpf_dynptr_is_rdonly(ptr))  		return 0; -	return (unsigned long)(ptr->data + ptr->offset + offset); +	type = bpf_dynptr_get_type(ptr); + +	switch (type) { +	case BPF_DYNPTR_TYPE_LOCAL: +	case BPF_DYNPTR_TYPE_RINGBUF: +		return (unsigned long)(ptr->data + ptr->offset + offset); +	case BPF_DYNPTR_TYPE_SKB: +	case BPF_DYNPTR_TYPE_XDP: +		/* skb and xdp dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */ +		return 0; +	default: +		WARN_ONCE(true, "bpf_dynptr_data: unknown dynptr type %d\n", type); +		return 0; +	}  }  static const struct bpf_func_proto bpf_dynptr_data_proto = { @@ -1693,6 +1756,10 @@ bpf_base_func_proto(enum bpf_func_id func_id)  		return &bpf_cgrp_storage_get_proto;  	case BPF_FUNC_cgrp_storage_delete:  		return &bpf_cgrp_storage_delete_proto; +	case BPF_FUNC_get_current_cgroup_id: +		return &bpf_get_current_cgroup_id_proto; +	case BPF_FUNC_get_current_ancestor_cgroup_id: +		return &bpf_get_current_ancestor_cgroup_id_proto;  #endif  	default:  		break; @@ -1731,6 +1798,8 @@ bpf_base_func_proto(enum 
bpf_func_id func_id)  	}  } +void __bpf_obj_drop_impl(void *p, const struct btf_record *rec); +  void bpf_list_head_free(const struct btf_field *field, void *list_head,  			struct bpf_spin_lock *spin_lock)  { @@ -1761,13 +1830,8 @@ unlock:  		/* The contained type can also have resources, including a  		 * bpf_list_head which needs to be freed.  		 */ -		bpf_obj_free_fields(field->graph_root.value_rec, obj); -		/* bpf_mem_free requires migrate_disable(), since we can be -		 * called from map free path as well apart from BPF program (as -		 * part of map ops doing bpf_obj_free_fields). -		 */  		migrate_disable(); -		bpf_mem_free(&bpf_global_ma, obj); +		__bpf_obj_drop_impl(obj, field->graph_root.value_rec);  		migrate_enable();  	}  } @@ -1804,10 +1868,9 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,  		obj = pos;  		obj -= field->graph_root.node_offset; -		bpf_obj_free_fields(field->graph_root.value_rec, obj);  		migrate_disable(); -		bpf_mem_free(&bpf_global_ma, obj); +		__bpf_obj_drop_impl(obj, field->graph_root.value_rec);  		migrate_enable();  	}  } @@ -1826,45 +1889,96 @@ __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)  	if (!p)  		return NULL;  	if (meta) -		bpf_obj_init(meta->field_offs, p); +		bpf_obj_init(meta->record, p);  	return p;  } +/* Must be called under migrate_disable(), as required by bpf_mem_free */ +void __bpf_obj_drop_impl(void *p, const struct btf_record *rec) +{ +	if (rec && rec->refcount_off >= 0 && +	    !refcount_dec_and_test((refcount_t *)(p + rec->refcount_off))) { +		/* Object is refcounted and refcount_dec didn't result in 0 +		 * refcount. 
Return without freeing the object +		 */ +		return; +	} + +	if (rec) +		bpf_obj_free_fields(rec, p); +	bpf_mem_free(&bpf_global_ma, p); +} +  __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)  {  	struct btf_struct_meta *meta = meta__ign;  	void *p = p__alloc; -	if (meta) -		bpf_obj_free_fields(meta->record, p); -	bpf_mem_free(&bpf_global_ma, p); +	__bpf_obj_drop_impl(p, meta ? meta->record : NULL);  } -static void __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head, bool tail) +__bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign) +{ +	struct btf_struct_meta *meta = meta__ign; +	struct bpf_refcount *ref; + +	/* Could just cast directly to refcount_t *, but need some code using +	 * bpf_refcount type so that it is emitted in vmlinux BTF +	 */ +	ref = (struct bpf_refcount *)(p__refcounted_kptr + meta->record->refcount_off); + +	refcount_inc((refcount_t *)ref); +	return (void *)p__refcounted_kptr; +} + +static int __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head, +			  bool tail, struct btf_record *rec, u64 off)  {  	struct list_head *n = (void *)node, *h = (void *)head; +	/* If list_head was 0-initialized by map, bpf_obj_init_field wasn't +	 * called on its fields, so init here +	 */  	if (unlikely(!h->next))  		INIT_LIST_HEAD(h); -	if (unlikely(!n->next)) -		INIT_LIST_HEAD(n); +	if (!list_empty(n)) { +		/* Only called from BPF prog, no need to migrate_disable */ +		__bpf_obj_drop_impl(n - off, rec); +		return -EINVAL; +	} +  	tail ? list_add_tail(n, h) : list_add(n, h); + +	return 0;  } -__bpf_kfunc void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node) +__bpf_kfunc int bpf_list_push_front_impl(struct bpf_list_head *head, +					 struct bpf_list_node *node, +					 void *meta__ign, u64 off)  { -	return __bpf_list_add(node, head, false); +	struct btf_struct_meta *meta = meta__ign; + +	return __bpf_list_add(node, head, false, +			      meta ? 
meta->record : NULL, off);  } -__bpf_kfunc void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node) +__bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head, +					struct bpf_list_node *node, +					void *meta__ign, u64 off)  { -	return __bpf_list_add(node, head, true); +	struct btf_struct_meta *meta = meta__ign; + +	return __bpf_list_add(node, head, true, +			      meta ? meta->record : NULL, off);  }  static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail)  {  	struct list_head *n, *h = (void *)head; +	/* If list_head was 0-initialized by map, bpf_obj_init_field wasn't +	 * called on its fields, so init here +	 */  	if (unlikely(!h->next))  		INIT_LIST_HEAD(h);  	if (list_empty(h)) @@ -1890,6 +2004,9 @@ __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,  	struct rb_root_cached *r = (struct rb_root_cached *)root;  	struct rb_node *n = (struct rb_node *)node; +	if (RB_EMPTY_NODE(n)) +		return NULL; +  	rb_erase_cached(n, r);  	RB_CLEAR_NODE(n);  	return (struct bpf_rb_node *)n; @@ -1898,14 +2015,20 @@ __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,  /* Need to copy rbtree_add_cached's logic here because our 'less' is a BPF   * program   */ -static void __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node, -			     void *less) +static int __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node, +			    void *less, struct btf_record *rec, u64 off)  {  	struct rb_node **link = &((struct rb_root_cached *)root)->rb_root.rb_node; +	struct rb_node *parent = NULL, *n = (struct rb_node *)node;  	bpf_callback_t cb = (bpf_callback_t)less; -	struct rb_node *parent = NULL;  	bool leftmost = true; +	if (!RB_EMPTY_NODE(n)) { +		/* Only called from BPF prog, no need to migrate_disable */ +		__bpf_obj_drop_impl(n - off, rec); +		return -EINVAL; +	} +  	while (*link) {  		parent = *link;  		if (cb((uintptr_t)node, (uintptr_t)parent, 0, 0, 0)) { 
@@ -1916,15 +2039,18 @@ static void __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,  		}  	} -	rb_link_node((struct rb_node *)node, parent, link); -	rb_insert_color_cached((struct rb_node *)node, -			       (struct rb_root_cached *)root, leftmost); +	rb_link_node(n, parent, link); +	rb_insert_color_cached(n, (struct rb_root_cached *)root, leftmost); +	return 0;  } -__bpf_kfunc void bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node, -				bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b)) +__bpf_kfunc int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node, +				    bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b), +				    void *meta__ign, u64 off)  { -	__bpf_rbtree_add(root, node, (void *)less); +	struct btf_struct_meta *meta = meta__ign; + +	return __bpf_rbtree_add(root, node, (void *)less, meta ? meta->record : NULL, off);  }  __bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) @@ -1942,73 +2068,8 @@ __bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root)   */  __bpf_kfunc struct task_struct *bpf_task_acquire(struct task_struct *p)  { -	return get_task_struct(p); -} - -/** - * bpf_task_acquire_not_zero - Acquire a reference to a rcu task object. A task - * acquired by this kfunc which is not stored in a map as a kptr, must be - * released by calling bpf_task_release(). - * @p: The task on which a reference is being acquired. - */ -__bpf_kfunc struct task_struct *bpf_task_acquire_not_zero(struct task_struct *p) -{ -	/* For the time being this function returns NULL, as it's not currently -	 * possible to safely acquire a reference to a task with RCU protection -	 * using get_task_struct() and put_task_struct(). This is due to the -	 * slightly odd mechanics of p->rcu_users, and how task RCU protection -	 * works. -	 * -	 * A struct task_struct is refcounted by two different refcount_t -	 * fields: -	 * -	 * 1. 
p->usage:     The "true" refcount field which tracks a task's -	 *		    lifetime. The task is freed as soon as this -	 *		    refcount drops to 0. -	 * -	 * 2. p->rcu_users: An "RCU users" refcount field which is statically -	 *		    initialized to 2, and is co-located in a union with -	 *		    a struct rcu_head field (p->rcu). p->rcu_users -	 *		    essentially encapsulates a single p->usage -	 *		    refcount, and when p->rcu_users goes to 0, an RCU -	 *		    callback is scheduled on the struct rcu_head which -	 *		    decrements the p->usage refcount. -	 * -	 * There are two important implications to this task refcounting logic -	 * described above. The first is that -	 * refcount_inc_not_zero(&p->rcu_users) cannot be used anywhere, as -	 * after the refcount goes to 0, the RCU callback being scheduled will -	 * cause the memory backing the refcount to again be nonzero due to the -	 * fields sharing a union. The other is that we can't rely on RCU to -	 * guarantee that a task is valid in a BPF program. This is because a -	 * task could have already transitioned to being in the TASK_DEAD -	 * state, had its rcu_users refcount go to 0, and its rcu callback -	 * invoked in which it drops its single p->usage reference. At this -	 * point the task will be freed as soon as the last p->usage reference -	 * goes to 0, without waiting for another RCU gp to elapse. The only -	 * way that a BPF program can guarantee that a task is valid is in this -	 * scenario is to hold a p->usage refcount itself. -	 * -	 * Until we're able to resolve this issue, either by pulling -	 * p->rcu_users and p->rcu out of the union, or by getting rid of -	 * p->usage and just using p->rcu_users for refcounting, we'll just -	 * return NULL here. -	 */ -	return NULL; -} - -/** - * bpf_task_kptr_get - Acquire a reference on a struct task_struct kptr. A task - * kptr acquired by this kfunc which is not subsequently stored in a map, must - * be released by calling bpf_task_release(). 
- * @pp: A pointer to a task kptr on which a reference is being acquired. - */ -__bpf_kfunc struct task_struct *bpf_task_kptr_get(struct task_struct **pp) -{ -	/* We must return NULL here until we have clarity on how to properly -	 * leverage RCU for ensuring a task's lifetime. See the comment above -	 * in bpf_task_acquire_not_zero() for more details. -	 */ +	if (refcount_inc_not_zero(&p->rcu_users)) +		return p;  	return NULL;  } @@ -2018,10 +2079,7 @@ __bpf_kfunc struct task_struct *bpf_task_kptr_get(struct task_struct **pp)   */  __bpf_kfunc void bpf_task_release(struct task_struct *p)  { -	if (!p) -		return; - -	put_task_struct(p); +	put_task_struct_rcu_user(p);  }  #ifdef CONFIG_CGROUPS @@ -2033,39 +2091,7 @@ __bpf_kfunc void bpf_task_release(struct task_struct *p)   */  __bpf_kfunc struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp)  { -	cgroup_get(cgrp); -	return cgrp; -} - -/** - * bpf_cgroup_kptr_get - Acquire a reference on a struct cgroup kptr. A cgroup - * kptr acquired by this kfunc which is not subsequently stored in a map, must - * be released by calling bpf_cgroup_release(). - * @cgrpp: A pointer to a cgroup kptr on which a reference is being acquired. - */ -__bpf_kfunc struct cgroup *bpf_cgroup_kptr_get(struct cgroup **cgrpp) -{ -	struct cgroup *cgrp; - -	rcu_read_lock(); -	/* Another context could remove the cgroup from the map and release it -	 * at any time, including after we've done the lookup above. This is -	 * safe because we're in an RCU read region, so the cgroup is -	 * guaranteed to remain valid until at least the rcu_read_unlock() -	 * below. -	 */ -	cgrp = READ_ONCE(*cgrpp); - -	if (cgrp && !cgroup_tryget(cgrp)) -		/* If the cgroup had been removed from the map and freed as -		 * described above, cgroup_tryget() will return false. The -		 * cgroup will be freed at some point after the current RCU gp -		 * has ended, so just return NULL to the user. 
-		 */ -		cgrp = NULL; -	rcu_read_unlock(); - -	return cgrp; +	return cgroup_tryget(cgrp) ? cgrp : NULL;  }  /** @@ -2077,9 +2103,6 @@ __bpf_kfunc struct cgroup *bpf_cgroup_kptr_get(struct cgroup **cgrpp)   */  __bpf_kfunc void bpf_cgroup_release(struct cgroup *cgrp)  { -	if (!cgrp) -		return; -  	cgroup_put(cgrp);  } @@ -2097,10 +2120,28 @@ __bpf_kfunc struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level)  	if (level > cgrp->level || level < 0)  		return NULL; +	/* cgrp's refcnt could be 0 here, but ancestors can still be accessed */  	ancestor = cgrp->ancestors[level]; -	cgroup_get(ancestor); +	if (!cgroup_tryget(ancestor)) +		return NULL;  	return ancestor;  } + +/** + * bpf_cgroup_from_id - Find a cgroup from its ID. A cgroup returned by this + * kfunc which is not subsequently stored in a map, must be released by calling + * bpf_cgroup_release(). + * @cgid: cgroup id. + */ +__bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid) +{ +	struct cgroup *cgrp; + +	cgrp = cgroup_get_from_id(cgid); +	if (IS_ERR(cgrp)) +		return NULL; +	return cgrp; +}  #endif /* CONFIG_CGROUPS */  /** @@ -2116,12 +2157,146 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)  	rcu_read_lock();  	p = find_task_by_pid_ns(pid, &init_pid_ns);  	if (p) -		bpf_task_acquire(p); +		p = bpf_task_acquire(p);  	rcu_read_unlock();  	return p;  } +/** + * bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data. + * @ptr: The dynptr whose data slice to retrieve + * @offset: Offset into the dynptr + * @buffer: User-provided buffer to copy contents into + * @buffer__szk: Size (in bytes) of the buffer. This is the length of the + *		 requested slice. This must be a constant. + * + * For non-skb and non-xdp type dynptrs, there is no difference between + * bpf_dynptr_slice and bpf_dynptr_data. + * + * If the intention is to write to the data slice, please use + * bpf_dynptr_slice_rdwr. + * + * The user must check that the returned pointer is not null before using it. 
+ * + * Please note that in the case of skb and xdp dynptrs, bpf_dynptr_slice + * does not change the underlying packet data pointers, so a call to + * bpf_dynptr_slice will not invalidate any ctx->data/data_end pointers in + * the bpf program. + * + * Return: NULL if the call failed (eg invalid dynptr), pointer to a read-only + * data slice (can be either direct pointer to the data or a pointer to the user + * provided buffer, with its contents containing the data, if unable to obtain + * direct pointer) + */ +__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset, +				   void *buffer, u32 buffer__szk) +{ +	enum bpf_dynptr_type type; +	u32 len = buffer__szk; +	int err; + +	if (!ptr->data) +		return NULL; + +	err = bpf_dynptr_check_off_len(ptr, offset, len); +	if (err) +		return NULL; + +	type = bpf_dynptr_get_type(ptr); + +	switch (type) { +	case BPF_DYNPTR_TYPE_LOCAL: +	case BPF_DYNPTR_TYPE_RINGBUF: +		return ptr->data + ptr->offset + offset; +	case BPF_DYNPTR_TYPE_SKB: +		return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer); +	case BPF_DYNPTR_TYPE_XDP: +	{ +		void *xdp_ptr = bpf_xdp_pointer(ptr->data, ptr->offset + offset, len); +		if (xdp_ptr) +			return xdp_ptr; + +		bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer, len, false); +		return buffer; +	} +	default: +		WARN_ONCE(true, "unknown dynptr type %d\n", type); +		return NULL; +	} +} + +/** + * bpf_dynptr_slice_rdwr() - Obtain a writable pointer to the dynptr data. + * @ptr: The dynptr whose data slice to retrieve + * @offset: Offset into the dynptr + * @buffer: User-provided buffer to copy contents into + * @buffer__szk: Size (in bytes) of the buffer. This is the length of the + *		 requested slice. This must be a constant. + * + * For non-skb and non-xdp type dynptrs, there is no difference between + * bpf_dynptr_slice and bpf_dynptr_data. 
+ * + * The returned pointer is writable and may point to either directly the dynptr + * data at the requested offset or to the buffer if unable to obtain a direct + * data pointer to (example: the requested slice is to the paged area of an skb + * packet). In the case where the returned pointer is to the buffer, the user + * is responsible for persisting writes through calling bpf_dynptr_write(). This + * usually looks something like this pattern: + * + * struct eth_hdr *eth = bpf_dynptr_slice_rdwr(&dynptr, 0, buffer, sizeof(buffer)); + * if (!eth) + *	return TC_ACT_SHOT; + * + * // mutate eth header // + * + * if (eth == buffer) + *	bpf_dynptr_write(&ptr, 0, buffer, sizeof(buffer), 0); + * + * Please note that, as in the example above, the user must check that the + * returned pointer is not null before using it. + * + * Please also note that in the case of skb and xdp dynptrs, bpf_dynptr_slice_rdwr + * does not change the underlying packet data pointers, so a call to + * bpf_dynptr_slice_rdwr will not invalidate any ctx->data/data_end pointers in + * the bpf program. + * + * Return: NULL if the call failed (eg invalid dynptr), pointer to a + * data slice (can be either direct pointer to the data or a pointer to the user + * provided buffer, with its contents containing the data, if unable to obtain + * direct pointer) + */ +__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 offset, +					void *buffer, u32 buffer__szk) +{ +	if (!ptr->data || bpf_dynptr_is_rdonly(ptr)) +		return NULL; + +	/* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice. +	 * +	 * For skb-type dynptrs, it is safe to write into the returned pointer +	 * if the bpf program allows skb data writes. There are two possiblities +	 * that may occur when calling bpf_dynptr_slice_rdwr: +	 * +	 * 1) The requested slice is in the head of the skb. 
In this case, the +	 * returned pointer is directly to skb data, and if the skb is cloned, the +	 * verifier will have uncloned it (see bpf_unclone_prologue()) already. +	 * The pointer can be directly written into. +	 * +	 * 2) Some portion of the requested slice is in the paged buffer area. +	 * In this case, the requested data will be copied out into the buffer +	 * and the returned pointer will be a pointer to the buffer. The skb +	 * will not be pulled. To persist the write, the user will need to call +	 * bpf_dynptr_write(), which will pull the skb and commit the write. +	 * +	 * Similarly for xdp programs, if the requested slice is not across xdp +	 * fragments, then a direct pointer will be returned, otherwise the data +	 * will be copied out into the buffer and the user will need to call +	 * bpf_dynptr_write() to commit changes. +	 */ +	return bpf_dynptr_slice(ptr, offset, buffer, buffer__szk); +} +  __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj)  {  	return obj; @@ -2150,23 +2325,22 @@ BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)  #endif  BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)  BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_list_push_front) -BTF_ID_FLAGS(func, bpf_list_push_back) +BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE) +BTF_ID_FLAGS(func, bpf_list_push_front_impl) +BTF_ID_FLAGS(func, bpf_list_push_back_impl)  BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)  BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL) -BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_task_acquire_not_zero, KF_ACQUIRE | KF_RCU | KF_RET_NULL) -BTF_ID_FLAGS(func, bpf_task_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)  BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE) -BTF_ID_FLAGS(func, bpf_rbtree_add) 
+BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_rbtree_add_impl)  BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)  #ifdef CONFIG_CGROUPS -BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cgroup_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)  BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_TRUSTED_ARGS | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL)  #endif  BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)  BTF_SET8_END(generic_btf_ids) @@ -2190,6 +2364,11 @@ BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx)  BTF_ID_FLAGS(func, bpf_rdonly_cast)  BTF_ID_FLAGS(func, bpf_rcu_read_lock)  BTF_ID_FLAGS(func, bpf_rcu_read_unlock) +BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW) +BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)  BTF_SET8_END(common_btf_ids)  static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index e90d9f63edc5..a04f505aefe9 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -141,8 +141,8 @@ static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *key)  	return &READ_ONCE(storage->buf)->data[0];  } -static int cgroup_storage_update_elem(struct bpf_map *map, void *key, -				      void *value, u64 flags) +static long cgroup_storage_update_elem(struct bpf_map *map, void *key, +				       void *value, u64 flags)  {  	struct bpf_cgroup_storage *storage;  	struct bpf_storage_buffer *new; @@ -333,14 +333,14 @@ static void 
cgroup_storage_map_free(struct bpf_map *_map)  	struct list_head *storages = &map->list;  	struct bpf_cgroup_storage *storage, *stmp; -	mutex_lock(&cgroup_mutex); +	cgroup_lock();  	list_for_each_entry_safe(storage, stmp, storages, list_map) {  		bpf_cgroup_storage_unlink(storage);  		bpf_cgroup_storage_free(storage);  	} -	mutex_unlock(&cgroup_mutex); +	cgroup_unlock();  	WARN_ON(!RB_EMPTY_ROOT(&map->root));  	WARN_ON(!list_empty(&map->list)); @@ -348,7 +348,7 @@ static void cgroup_storage_map_free(struct bpf_map *_map)  	bpf_map_area_free(map);  } -static int cgroup_storage_delete_elem(struct bpf_map *map, void *key) +static long cgroup_storage_delete_elem(struct bpf_map *map, void *key)  {  	return -EINVAL;  } @@ -446,6 +446,12 @@ static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key,  	rcu_read_unlock();  } +static u64 cgroup_storage_map_usage(const struct bpf_map *map) +{ +	/* Currently the dynamically allocated elements are not counted. */ +	return sizeof(struct bpf_cgroup_storage_map); +} +  BTF_ID_LIST_SINGLE(cgroup_storage_map_btf_ids, struct,  		   bpf_cgroup_storage_map)  const struct bpf_map_ops cgroup_storage_map_ops = { @@ -457,6 +463,7 @@ const struct bpf_map_ops cgroup_storage_map_ops = {  	.map_delete_elem = cgroup_storage_delete_elem,  	.map_check_btf = cgroup_storage_check_btf,  	.map_seq_show_elem = cgroup_storage_seq_show_elem, +	.map_mem_usage = cgroup_storage_map_usage,  	.map_btf_id = &cgroup_storage_map_btf_ids[0],  }; diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c new file mode 100644 index 000000000000..046ddff37a76 --- /dev/null +++ b/kernel/bpf/log.c @@ -0,0 +1,330 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook + * Copyright (c) 2018 Covalent IO, Inc. 
http://covalent.io + */ +#include <uapi/linux/btf.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/bpf.h> +#include <linux/bpf_verifier.h> +#include <linux/math64.h> + +static bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log) +{ +	/* ubuf and len_total should both be specified (or not) together */ +	if (!!log->ubuf != !!log->len_total) +		return false; +	/* log buf without log_level is meaningless */ +	if (log->ubuf && log->level == 0) +		return false; +	if (log->level & ~BPF_LOG_MASK) +		return false; +	if (log->len_total > UINT_MAX >> 2) +		return false; +	return true; +} + +int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level, +		  char __user *log_buf, u32 log_size) +{ +	log->level = log_level; +	log->ubuf = log_buf; +	log->len_total = log_size; + +	/* log attributes have to be sane */ +	if (!bpf_verifier_log_attr_valid(log)) +		return -EINVAL; + +	return 0; +} + +static void bpf_vlog_update_len_max(struct bpf_verifier_log *log, u32 add_len) +{ +	/* add_len includes terminal \0, so no need for +1. */ +	u64 len = log->end_pos + add_len; + +	/* log->len_max could be larger than our current len due to +	 * bpf_vlog_reset() calls, so we maintain the max of any length at any +	 * previous point +	 */ +	if (len > UINT_MAX) +		log->len_max = UINT_MAX; +	else if (len > log->len_max) +		log->len_max = len; +} + +void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, +		       va_list args) +{ +	u64 cur_pos; +	u32 new_n, n; + +	n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args); + +	WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1, +		  "verifier log line truncated - local buffer too short\n"); + +	if (log->level == BPF_LOG_KERNEL) { +		bool newline = n > 0 && log->kbuf[n - 1] == '\n'; + +		pr_err("BPF: %s%s", log->kbuf, newline ? 
"" : "\n"); +		return; +	} + +	n += 1; /* include terminating zero */ +	bpf_vlog_update_len_max(log, n); + +	if (log->level & BPF_LOG_FIXED) { +		/* check if we have at least something to put into user buf */ +		new_n = 0; +		if (log->end_pos < log->len_total) { +			new_n = min_t(u32, log->len_total - log->end_pos, n); +			log->kbuf[new_n - 1] = '\0'; +		} + +		cur_pos = log->end_pos; +		log->end_pos += n - 1; /* don't count terminating '\0' */ + +		if (log->ubuf && new_n && +		    copy_to_user(log->ubuf + cur_pos, log->kbuf, new_n)) +			goto fail; +	} else { +		u64 new_end, new_start; +		u32 buf_start, buf_end, new_n; + +		new_end = log->end_pos + n; +		if (new_end - log->start_pos >= log->len_total) +			new_start = new_end - log->len_total; +		else +			new_start = log->start_pos; + +		log->start_pos = new_start; +		log->end_pos = new_end - 1; /* don't count terminating '\0' */ + +		if (!log->ubuf) +			return; + +		new_n = min(n, log->len_total); +		cur_pos = new_end - new_n; +		div_u64_rem(cur_pos, log->len_total, &buf_start); +		div_u64_rem(new_end, log->len_total, &buf_end); +		/* new_end and buf_end are exclusive indices, so if buf_end is +		 * exactly zero, then it actually points right to the end of +		 * ubuf and there is no wrap around +		 */ +		if (buf_end == 0) +			buf_end = log->len_total; + +		/* if buf_start > buf_end, we wrapped around; +		 * if buf_start == buf_end, then we fill ubuf completely; we +		 * can't have buf_start == buf_end to mean that there is +		 * nothing to write, because we always write at least +		 * something, even if terminal '\0' +		 */ +		if (buf_start < buf_end) { +			/* message fits within contiguous chunk of ubuf */ +			if (copy_to_user(log->ubuf + buf_start, +					 log->kbuf + n - new_n, +					 buf_end - buf_start)) +				goto fail; +		} else { +			/* message wraps around the end of ubuf, copy in two chunks */ +			if (copy_to_user(log->ubuf + buf_start, +					 log->kbuf + n - new_n, +					 log->len_total - buf_start)) +				
goto fail; +			if (copy_to_user(log->ubuf, +					 log->kbuf + n - buf_end, +					 buf_end)) +				goto fail; +		} +	} + +	return; +fail: +	log->ubuf = NULL; +} + +void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos) +{ +	char zero = 0; +	u32 pos; + +	if (WARN_ON_ONCE(new_pos > log->end_pos)) +		return; + +	if (!bpf_verifier_log_needed(log) || log->level == BPF_LOG_KERNEL) +		return; + +	/* if position to which we reset is beyond current log window, +	 * then we didn't preserve any useful content and should adjust +	 * start_pos to end up with an empty log (start_pos == end_pos) +	 */ +	log->end_pos = new_pos; +	if (log->end_pos < log->start_pos) +		log->start_pos = log->end_pos; + +	if (!log->ubuf) +		return; + +	if (log->level & BPF_LOG_FIXED) +		pos = log->end_pos + 1; +	else +		div_u64_rem(new_pos, log->len_total, &pos); + +	if (pos < log->len_total && put_user(zero, log->ubuf + pos)) +		log->ubuf = NULL; +} + +static void bpf_vlog_reverse_kbuf(char *buf, int len) +{ +	int i, j; + +	for (i = 0, j = len - 1; i < j; i++, j--) +		swap(buf[i], buf[j]); +} + +static int bpf_vlog_reverse_ubuf(struct bpf_verifier_log *log, int start, int end) +{ +	/* we split log->kbuf into two equal parts for both ends of array */ +	int n = sizeof(log->kbuf) / 2, nn; +	char *lbuf = log->kbuf, *rbuf = log->kbuf + n; + +	/* Read ubuf's section [start, end) two chunks at a time, from left +	 * and right side; within each chunk, swap all the bytes; after that +	 * reverse the order of lbuf and rbuf and write result back to ubuf. +	 * This way we'll end up with swapped contents of specified +	 * [start, end) ubuf segment. 
+	 */ +	while (end - start > 1) { +		nn = min(n, (end - start ) / 2); + +		if (copy_from_user(lbuf, log->ubuf + start, nn)) +			return -EFAULT; +		if (copy_from_user(rbuf, log->ubuf + end - nn, nn)) +			return -EFAULT; + +		bpf_vlog_reverse_kbuf(lbuf, nn); +		bpf_vlog_reverse_kbuf(rbuf, nn); + +		/* we write lbuf to the right end of ubuf, while rbuf to the +		 * left one to end up with properly reversed overall ubuf +		 */ +		if (copy_to_user(log->ubuf + start, rbuf, nn)) +			return -EFAULT; +		if (copy_to_user(log->ubuf + end - nn, lbuf, nn)) +			return -EFAULT; + +		start += nn; +		end -= nn; +	} + +	return 0; +} + +int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual) +{ +	u32 sublen; +	int err; + +	*log_size_actual = 0; +	if (!log || log->level == 0 || log->level == BPF_LOG_KERNEL) +		return 0; + +	if (!log->ubuf) +		goto skip_log_rotate; +	/* If we never truncated log, there is nothing to move around. */ +	if (log->start_pos == 0) +		goto skip_log_rotate; + +	/* Otherwise we need to rotate log contents to make it start from the +	 * buffer beginning and be a continuous zero-terminated string. Note +	 * that if log->start_pos != 0 then we definitely filled up entire log +	 * buffer with no gaps, and we just need to shift buffer contents to +	 * the left by (log->start_pos % log->len_total) bytes. +	 * +	 * Unfortunately, user buffer could be huge and we don't want to +	 * allocate temporary kernel memory of the same size just to shift +	 * contents in a straightforward fashion. Instead, we'll be clever and +	 * do in-place array rotation. This is a leetcode-style problem, which +	 * could be solved by three rotations. 
+	 * +	 * Let's say we have log buffer that has to be shifted left by 7 bytes +	 * (spaces and vertical bar is just for demonstrative purposes): +	 *   E F G H I J K | A B C D +	 * +	 * First, we reverse entire array: +	 *   D C B A | K J I H G F E +	 * +	 * Then we rotate first 4 bytes (DCBA) and separately last 7 bytes +	 * (KJIHGFE), resulting in a properly rotated array: +	 *   A B C D | E F G H I J K +	 * +	 * We'll utilize log->kbuf to read user memory chunk by chunk, swap +	 * bytes, and write them back. Doing it byte-by-byte would be +	 * unnecessarily inefficient. Altogether we are going to read and +	 * write each byte twice, for total 4 memory copies between kernel and +	 * user space. +	 */ + +	/* length of the chopped off part that will be the beginning; +	 * len(ABCD) in the example above +	 */ +	div_u64_rem(log->start_pos, log->len_total, &sublen); +	sublen = log->len_total - sublen; + +	err = bpf_vlog_reverse_ubuf(log, 0, log->len_total); +	err = err ?: bpf_vlog_reverse_ubuf(log, 0, sublen); +	err = err ?: bpf_vlog_reverse_ubuf(log, sublen, log->len_total); +	if (err) +		log->ubuf = NULL; + +skip_log_rotate: +	*log_size_actual = log->len_max; + +	/* properly initialized log has either both ubuf!=NULL and len_total>0 +	 * or ubuf==NULL and len_total==0, so if this condition doesn't hold, +	 * we got a fault somewhere along the way, so report it back +	 */ +	if (!!log->ubuf != !!log->len_total) +		return -EFAULT; + +	/* did truncation actually happen? */ +	if (log->ubuf && log->len_max > log->len_total) +		return -ENOSPC; + +	return 0; +} + +/* log_level controls verbosity level of eBPF verifier. + * bpf_verifier_log_write() is used to dump the verification trace to the log, + * so the user can figure out what's wrong with the program + */ +__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, +					   const char *fmt, ...) 
+{ +	va_list args; + +	if (!bpf_verifier_log_needed(&env->log)) +		return; + +	va_start(args, fmt); +	bpf_verifier_vlog(&env->log, fmt, args); +	va_end(args); +} +EXPORT_SYMBOL_GPL(bpf_verifier_log_write); + +__printf(2, 3) void bpf_log(struct bpf_verifier_log *log, +			    const char *fmt, ...) +{ +	va_list args; + +	if (!bpf_verifier_log_needed(log)) +		return; + +	va_start(args, fmt); +	bpf_verifier_vlog(log, fmt, args); +	va_end(args); +} +EXPORT_SYMBOL_GPL(bpf_log); diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index d833496e9e42..e0d3ddf2037a 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -300,8 +300,8 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,  }  /* Called from syscall or from eBPF program */ -static int trie_update_elem(struct bpf_map *map, -			    void *_key, void *value, u64 flags) +static long trie_update_elem(struct bpf_map *map, +			     void *_key, void *value, u64 flags)  {  	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);  	struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL; @@ -431,7 +431,7 @@ out:  }  /* Called from syscall or from eBPF program */ -static int trie_delete_elem(struct bpf_map *map, void *_key) +static long trie_delete_elem(struct bpf_map *map, void *_key)  {  	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);  	struct bpf_lpm_trie_key *key = _key; @@ -720,6 +720,16 @@ static int trie_check_btf(const struct bpf_map *map,  	       -EINVAL : 0;  } +static u64 trie_mem_usage(const struct bpf_map *map) +{ +	struct lpm_trie *trie = container_of(map, struct lpm_trie, map); +	u64 elem_size; + +	elem_size = sizeof(struct lpm_trie_node) + trie->data_size + +			    trie->map.value_size; +	return elem_size * READ_ONCE(trie->n_entries); +} +  BTF_ID_LIST_SINGLE(trie_map_btf_ids, struct, lpm_trie)  const struct bpf_map_ops trie_map_ops = {  	.map_meta_equal = bpf_map_meta_equal, @@ -733,5 +743,6 @@ const struct bpf_map_ops 
trie_map_ops = {  	.map_update_batch = generic_map_update_batch,  	.map_delete_batch = generic_map_delete_batch,  	.map_check_btf = trie_check_btf, +	.map_mem_usage = trie_mem_usage,  	.map_btf_id = &trie_map_btf_ids[0],  }; diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 38136ec4e095..2c5c64c2a53b 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -56,18 +56,6 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)  		ret = PTR_ERR(inner_map_meta->record);  		goto free;  	} -	if (inner_map_meta->record) { -		struct btf_field_offs *field_offs; -		/* If btf_record is !IS_ERR_OR_NULL, then field_offs is always -		 * valid. -		 */ -		field_offs = kmemdup(inner_map->field_offs, sizeof(*inner_map->field_offs), GFP_KERNEL | __GFP_NOWARN); -		if (!field_offs) { -			ret = -ENOMEM; -			goto free_rec; -		} -		inner_map_meta->field_offs = field_offs; -	}  	/* Note: We must use the same BTF, as we also used btf_record_dup above  	 * which relies on BTF being same for both maps, as some members like  	 * record->fields.list_head have pointers like value_rec pointing into @@ -88,8 +76,6 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)  	fdput(f);  	return inner_map_meta; -free_rec: -	btf_record_free(inner_map_meta->record);  free:  	kfree(inner_map_meta);  put: @@ -99,7 +85,6 @@ put:  void bpf_map_meta_free(struct bpf_map *map_meta)  { -	kfree(map_meta->field_offs);  	bpf_map_free_record(map_meta);  	btf_put(map_meta->btf);  	kfree(map_meta); diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 5fcdacbb8439..410637c225fb 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -121,15 +121,8 @@ static struct llist_node notrace *__llist_del_first(struct llist_head *head)  	return entry;  } -static void *__alloc(struct bpf_mem_cache *c, int node) +static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags)  { -	/* Allocate, but don't deplete atomic reserves that typical -	 * GFP_ATOMIC would do. 
irq_work runs on this cpu and kmalloc -	 * will allocate from the current numa node which is what we -	 * want here. -	 */ -	gfp_t flags = GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT; -  	if (c->percpu_size) {  		void **obj = kmalloc_node(c->percpu_size, flags, node);  		void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags); @@ -185,7 +178,12 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)  		 */  		obj = __llist_del_first(&c->free_by_rcu);  		if (!obj) { -			obj = __alloc(c, node); +			/* Allocate, but don't deplete atomic reserves that typical +			 * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc +			 * will allocate from the current numa node which is what we +			 * want here. +			 */ +			obj = __alloc(c, node, GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT);  			if (!obj)  				break;  		} @@ -676,3 +674,46 @@ void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)  	unit_free(this_cpu_ptr(ma->cache), ptr);  } + +/* Directly does a kfree() without putting 'ptr' back to the free_llist + * for reuse and without waiting for a rcu_tasks_trace gp. + * The caller must first go through the rcu_tasks_trace gp for 'ptr' + * before calling bpf_mem_cache_raw_free(). + * It could be used when the rcu_tasks_trace callback does not have + * a hold on the original bpf_mem_alloc object that allocated the + * 'ptr'. This should only be used in the uncommon code path. + * Otherwise, the bpf_mem_alloc's free_llist cannot be refilled + * and may affect performance. + */ +void bpf_mem_cache_raw_free(void *ptr) +{ +	if (!ptr) +		return; + +	kfree(ptr - LLIST_NODE_SZ); +} + +/* When flags == GFP_KERNEL, it signals that the caller will not cause + * deadlock when using kmalloc. bpf_mem_cache_alloc_flags() will use + * kmalloc if the free_llist is empty. 
+ */ +void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags) +{ +	struct bpf_mem_cache *c; +	void *ret; + +	c = this_cpu_ptr(ma->cache); + +	ret = unit_alloc(c); +	if (!ret && flags == GFP_KERNEL) { +		struct mem_cgroup *memcg, *old_memcg; + +		memcg = get_memcg(c); +		old_memcg = set_active_memcg(memcg); +		ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT); +		set_active_memcg(old_memcg); +		mem_cgroup_put(memcg); +	} + +	return !ret ? NULL : ret + LLIST_NODE_SZ; +} diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 0c85e06f7ea7..d9c9f45e3529 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -563,6 +563,12 @@ void bpf_map_offload_map_free(struct bpf_map *map)  	bpf_map_area_free(offmap);  } +u64 bpf_map_offload_map_mem_usage(const struct bpf_map *map) +{ +	/* The memory dynamically allocated in netdev dev_ops is not counted */ +	return sizeof(struct bpf_offloaded_map); +} +  int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value)  {  	struct bpf_offloaded_map *offmap = map_to_offmap(map); diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 8a5e060de63b..601609164ef3 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -95,7 +95,7 @@ static void queue_stack_map_free(struct bpf_map *map)  	bpf_map_area_free(qs);  } -static int __queue_map_get(struct bpf_map *map, void *value, bool delete) +static long __queue_map_get(struct bpf_map *map, void *value, bool delete)  {  	struct bpf_queue_stack *qs = bpf_queue_stack(map);  	unsigned long flags; @@ -124,7 +124,7 @@ out:  } -static int __stack_map_get(struct bpf_map *map, void *value, bool delete) +static long __stack_map_get(struct bpf_map *map, void *value, bool delete)  {  	struct bpf_queue_stack *qs = bpf_queue_stack(map);  	unsigned long flags; @@ -156,32 +156,32 @@ out:  }  /* Called from syscall or from eBPF program */ -static int queue_map_peek_elem(struct bpf_map 
*map, void *value) +static long queue_map_peek_elem(struct bpf_map *map, void *value)  {  	return __queue_map_get(map, value, false);  }  /* Called from syscall or from eBPF program */ -static int stack_map_peek_elem(struct bpf_map *map, void *value) +static long stack_map_peek_elem(struct bpf_map *map, void *value)  {  	return __stack_map_get(map, value, false);  }  /* Called from syscall or from eBPF program */ -static int queue_map_pop_elem(struct bpf_map *map, void *value) +static long queue_map_pop_elem(struct bpf_map *map, void *value)  {  	return __queue_map_get(map, value, true);  }  /* Called from syscall or from eBPF program */ -static int stack_map_pop_elem(struct bpf_map *map, void *value) +static long stack_map_pop_elem(struct bpf_map *map, void *value)  {  	return __stack_map_get(map, value, true);  }  /* Called from syscall or from eBPF program */ -static int queue_stack_map_push_elem(struct bpf_map *map, void *value, -				     u64 flags) +static long queue_stack_map_push_elem(struct bpf_map *map, void *value, +				      u64 flags)  {  	struct bpf_queue_stack *qs = bpf_queue_stack(map);  	unsigned long irq_flags; @@ -227,14 +227,14 @@ static void *queue_stack_map_lookup_elem(struct bpf_map *map, void *key)  }  /* Called from syscall or from eBPF program */ -static int queue_stack_map_update_elem(struct bpf_map *map, void *key, -				       void *value, u64 flags) +static long queue_stack_map_update_elem(struct bpf_map *map, void *key, +					void *value, u64 flags)  {  	return -EINVAL;  }  /* Called from syscall or from eBPF program */ -static int queue_stack_map_delete_elem(struct bpf_map *map, void *key) +static long queue_stack_map_delete_elem(struct bpf_map *map, void *key)  {  	return -EINVAL;  } @@ -246,6 +246,14 @@ static int queue_stack_map_get_next_key(struct bpf_map *map, void *key,  	return -EINVAL;  } +static u64 queue_stack_map_mem_usage(const struct bpf_map *map) +{ +	u64 usage = sizeof(struct bpf_queue_stack); + +	usage += 
((u64)map->max_entries + 1) * map->value_size; +	return usage; +} +  BTF_ID_LIST_SINGLE(queue_map_btf_ids, struct, bpf_queue_stack)  const struct bpf_map_ops queue_map_ops = {  	.map_meta_equal = bpf_map_meta_equal, @@ -259,6 +267,7 @@ const struct bpf_map_ops queue_map_ops = {  	.map_pop_elem = queue_map_pop_elem,  	.map_peek_elem = queue_map_peek_elem,  	.map_get_next_key = queue_stack_map_get_next_key, +	.map_mem_usage = queue_stack_map_mem_usage,  	.map_btf_id = &queue_map_btf_ids[0],  }; @@ -274,5 +283,6 @@ const struct bpf_map_ops stack_map_ops = {  	.map_pop_elem = stack_map_pop_elem,  	.map_peek_elem = stack_map_peek_elem,  	.map_get_next_key = queue_stack_map_get_next_key, +	.map_mem_usage = queue_stack_map_mem_usage,  	.map_btf_id = &queue_map_btf_ids[0],  }; diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 82c61612f382..cbf2d8d784b8 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -59,7 +59,7 @@ static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key)  }  /* Called from syscall only */ -static int reuseport_array_delete_elem(struct bpf_map *map, void *key) +static long reuseport_array_delete_elem(struct bpf_map *map, void *key)  {  	struct reuseport_array *array = reuseport_array(map);  	u32 index = *(u32 *)key; @@ -335,6 +335,13 @@ static int reuseport_array_get_next_key(struct bpf_map *map, void *key,  	return 0;  } +static u64 reuseport_array_mem_usage(const struct bpf_map *map) +{ +	struct reuseport_array *array; + +	return struct_size(array, ptrs, map->max_entries); +} +  BTF_ID_LIST_SINGLE(reuseport_array_map_btf_ids, struct, reuseport_array)  const struct bpf_map_ops reuseport_array_ops = {  	.map_meta_equal = bpf_map_meta_equal, @@ -344,5 +351,6 @@ const struct bpf_map_ops reuseport_array_ops = {  	.map_lookup_elem = reuseport_array_lookup_elem,  	.map_get_next_key = reuseport_array_get_next_key,  	.map_delete_elem = reuseport_array_delete_elem, +	.map_mem_usage = 
reuseport_array_mem_usage,  	.map_btf_id = &reuseport_array_map_btf_ids[0],  }; diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 8732e0aadf36..875ac9b698d9 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -19,6 +19,7 @@  	(offsetof(struct bpf_ringbuf, consumer_pos) >> PAGE_SHIFT)  /* consumer page and producer page */  #define RINGBUF_POS_PAGES 2 +#define RINGBUF_NR_META_PAGES (RINGBUF_PGOFF + RINGBUF_POS_PAGES)  #define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4) @@ -96,7 +97,7 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node)  {  	const gfp_t flags = GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL |  			    __GFP_NOWARN | __GFP_ZERO; -	int nr_meta_pages = RINGBUF_PGOFF + RINGBUF_POS_PAGES; +	int nr_meta_pages = RINGBUF_NR_META_PAGES;  	int nr_data_pages = data_sz >> PAGE_SHIFT;  	int nr_pages = nr_meta_pages + nr_data_pages;  	struct page **pages, *page; @@ -241,13 +242,13 @@ static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key)  	return ERR_PTR(-ENOTSUPP);  } -static int ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value, -				   u64 flags) +static long ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value, +				    u64 flags)  {  	return -ENOTSUPP;  } -static int ringbuf_map_delete_elem(struct bpf_map *map, void *key) +static long ringbuf_map_delete_elem(struct bpf_map *map, void *key)  {  	return -ENOTSUPP;  } @@ -336,6 +337,21 @@ static __poll_t ringbuf_map_poll_user(struct bpf_map *map, struct file *filp,  	return 0;  } +static u64 ringbuf_map_mem_usage(const struct bpf_map *map) +{ +	struct bpf_ringbuf *rb; +	int nr_data_pages; +	int nr_meta_pages; +	u64 usage = sizeof(struct bpf_ringbuf_map); + +	rb = container_of(map, struct bpf_ringbuf_map, map)->rb; +	usage += (u64)rb->nr_pages << PAGE_SHIFT; +	nr_meta_pages = RINGBUF_NR_META_PAGES; +	nr_data_pages = map->max_entries >> PAGE_SHIFT; +	usage += (nr_meta_pages + 2 * nr_data_pages) * sizeof(struct page *); +	return 
usage; +} +  BTF_ID_LIST_SINGLE(ringbuf_map_btf_ids, struct, bpf_ringbuf_map)  const struct bpf_map_ops ringbuf_map_ops = {  	.map_meta_equal = bpf_map_meta_equal, @@ -347,6 +363,7 @@ const struct bpf_map_ops ringbuf_map_ops = {  	.map_update_elem = ringbuf_map_update_elem,  	.map_delete_elem = ringbuf_map_delete_elem,  	.map_get_next_key = ringbuf_map_get_next_key, +	.map_mem_usage = ringbuf_map_mem_usage,  	.map_btf_id = &ringbuf_map_btf_ids[0],  }; @@ -361,6 +378,7 @@ const struct bpf_map_ops user_ringbuf_map_ops = {  	.map_update_elem = ringbuf_map_update_elem,  	.map_delete_elem = ringbuf_map_delete_elem,  	.map_get_next_key = ringbuf_map_get_next_key, +	.map_mem_usage = ringbuf_map_mem_usage,  	.map_btf_id = &user_ringbuf_map_btf_ids[0],  }; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index aecea7451b61..b25fce425b2c 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -618,14 +618,14 @@ static int stack_map_get_next_key(struct bpf_map *map, void *key,  	return 0;  } -static int stack_map_update_elem(struct bpf_map *map, void *key, void *value, -				 u64 map_flags) +static long stack_map_update_elem(struct bpf_map *map, void *key, void *value, +				  u64 map_flags)  {  	return -EINVAL;  }  /* Called from syscall or from eBPF program */ -static int stack_map_delete_elem(struct bpf_map *map, void *key) +static long stack_map_delete_elem(struct bpf_map *map, void *key)  {  	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);  	struct stack_map_bucket *old_bucket; @@ -654,6 +654,19 @@ static void stack_map_free(struct bpf_map *map)  	put_callchain_buffers();  } +static u64 stack_map_mem_usage(const struct bpf_map *map) +{ +	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); +	u64 value_size = map->value_size; +	u64 n_buckets = smap->n_buckets; +	u64 enties = map->max_entries; +	u64 usage = sizeof(*smap); + +	usage += n_buckets * sizeof(struct stack_map_bucket *); +	usage += enties * 
(sizeof(struct stack_map_bucket) + value_size); +	return usage; +} +  BTF_ID_LIST_SINGLE(stack_trace_map_btf_ids, struct, bpf_stack_map)  const struct bpf_map_ops stack_trace_map_ops = {  	.map_meta_equal = bpf_map_meta_equal, @@ -664,5 +677,6 @@ const struct bpf_map_ops stack_trace_map_ops = {  	.map_update_elem = stack_map_update_elem,  	.map_delete_elem = stack_map_delete_elem,  	.map_check_btf = map_check_no_btf, +	.map_mem_usage = stack_map_mem_usage,  	.map_btf_id = &stack_trace_map_btf_ids[0],  }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index adc83cb82f37..14f39c1e573e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -35,6 +35,7 @@  #include <linux/rcupdate_trace.h>  #include <linux/memcontrol.h>  #include <linux/trace_events.h> +#include <net/netfilter/nf_bpf_link.h>  #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \  			  (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -105,6 +106,7 @@ const struct bpf_map_ops bpf_map_offload_ops = {  	.map_alloc = bpf_map_offload_map_alloc,  	.map_free = bpf_map_offload_map_free,  	.map_check_btf = map_check_no_btf, +	.map_mem_usage = bpf_map_offload_map_mem_usage,  };  static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) @@ -128,6 +130,8 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)  	}  	if (attr->map_ifindex)  		ops = &bpf_map_offload_ops; +	if (!ops->map_mem_usage) +		return ERR_PTR(-EINVAL);  	map = ops->map_alloc(attr);  	if (IS_ERR(map))  		return map; @@ -517,14 +521,14 @@ static int btf_field_cmp(const void *a, const void *b)  }  struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset, -				  enum btf_field_type type) +				  u32 field_mask)  {  	struct btf_field *field; -	if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & type)) +	if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & field_mask))  		return NULL;  	field = bsearch(&offset, rec->fields, rec->cnt, sizeof(rec->fields[0]), btf_field_cmp); -	if 
(!field || !(field->type & type)) +	if (!field || !(field->type & field_mask))  		return NULL;  	return field;  } @@ -549,6 +553,7 @@ void btf_record_free(struct btf_record *rec)  		case BPF_RB_NODE:  		case BPF_SPIN_LOCK:  		case BPF_TIMER: +		case BPF_REFCOUNT:  			/* Nothing to release */  			break;  		default: @@ -596,6 +601,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)  		case BPF_RB_NODE:  		case BPF_SPIN_LOCK:  		case BPF_TIMER: +		case BPF_REFCOUNT:  			/* Nothing to acquire */  			break;  		default: @@ -647,6 +653,8 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj)  	bpf_timer_cancel_and_free(obj + rec->timer_off);  } +extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec); +  void bpf_obj_free_fields(const struct btf_record *rec, void *obj)  {  	const struct btf_field *fields; @@ -656,8 +664,10 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)  		return;  	fields = rec->fields;  	for (i = 0; i < rec->cnt; i++) { +		struct btf_struct_meta *pointee_struct_meta;  		const struct btf_field *field = &fields[i];  		void *field_ptr = obj + field->offset; +		void *xchgd_field;  		switch (fields[i].type) {  		case BPF_SPIN_LOCK: @@ -669,7 +679,22 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)  			WRITE_ONCE(*(u64 *)field_ptr, 0);  			break;  		case BPF_KPTR_REF: -			field->kptr.dtor((void *)xchg((unsigned long *)field_ptr, 0)); +			xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0); +			if (!xchgd_field) +				break; + +			if (!btf_is_kernel(field->kptr.btf)) { +				pointee_struct_meta = btf_find_struct_meta(field->kptr.btf, +									   field->kptr.btf_id); +				WARN_ON_ONCE(!pointee_struct_meta); +				migrate_disable(); +				__bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ? 
+								 pointee_struct_meta->record : +								 NULL); +				migrate_enable(); +			} else { +				field->kptr.dtor(xchgd_field); +			}  			break;  		case BPF_LIST_HEAD:  			if (WARN_ON_ONCE(rec->spin_lock_off < 0)) @@ -683,6 +708,7 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)  			break;  		case BPF_LIST_NODE:  		case BPF_RB_NODE: +		case BPF_REFCOUNT:  			break;  		default:  			WARN_ON_ONCE(1); @@ -695,14 +721,13 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)  static void bpf_map_free_deferred(struct work_struct *work)  {  	struct bpf_map *map = container_of(work, struct bpf_map, work); -	struct btf_field_offs *foffs = map->field_offs;  	struct btf_record *rec = map->record;  	security_bpf_map_free(map);  	bpf_map_release_memcg(map);  	/* implementation dependent freeing */  	map->ops->map_free(map); -	/* Delay freeing of field_offs and btf_record for maps, as map_free +	/* Delay freeing of btf_record for maps, as map_free  	 * callback usually needs access to them. It is better to do it here  	 * than require each callback to do the free itself manually.  	 * @@ -711,7 +736,6 @@ static void bpf_map_free_deferred(struct work_struct *work)  	 * eventually calls bpf_map_free_meta, since inner_map_meta is only a  	 * template bpf_map struct used during verification.  	 */ -	kfree(foffs);  	btf_record_free(rec);  } @@ -771,17 +795,10 @@ static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f)  }  #ifdef CONFIG_PROC_FS -/* Provides an approximation of the map's memory footprint. - * Used only to provide a backward compatibility and display - * a reasonable "memlock" info. 
- */ -static unsigned long bpf_map_memory_footprint(const struct bpf_map *map) +/* Show the memory usage of a bpf map */ +static u64 bpf_map_memory_usage(const struct bpf_map *map)  { -	unsigned long size; - -	size = round_up(map->key_size + bpf_map_value_size(map), 8); - -	return round_up(map->max_entries * size, PAGE_SIZE); +	return map->ops->map_mem_usage(map);  }  static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) @@ -803,7 +820,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)  		   "max_entries:\t%u\n"  		   "map_flags:\t%#x\n"  		   "map_extra:\t%#llx\n" -		   "memlock:\t%lu\n" +		   "memlock:\t%llu\n"  		   "map_id:\t%u\n"  		   "frozen:\t%u\n",  		   map->map_type, @@ -812,7 +829,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)  		   map->max_entries,  		   map->map_flags,  		   (unsigned long long)map->map_extra, -		   bpf_map_memory_footprint(map), +		   bpf_map_memory_usage(map),  		   map->id,  		   READ_ONCE(map->frozen));  	if (type) { @@ -1019,7 +1036,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,  	map->record = btf_parse_fields(btf, value_type,  				       BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | -				       BPF_RB_ROOT, +				       BPF_RB_ROOT | BPF_REFCOUNT,  				       map->value_size);  	if (!IS_ERR_OR_NULL(map->record)) {  		int i; @@ -1058,10 +1075,17 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,  				break;  			case BPF_KPTR_UNREF:  			case BPF_KPTR_REF: +			case BPF_REFCOUNT:  				if (map->map_type != BPF_MAP_TYPE_HASH && +				    map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&  				    map->map_type != BPF_MAP_TYPE_LRU_HASH && +				    map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH &&  				    map->map_type != BPF_MAP_TYPE_ARRAY && -				    map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY) { +				    map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY && +				    map->map_type != BPF_MAP_TYPE_SK_STORAGE && +				    
map->map_type != BPF_MAP_TYPE_INODE_STORAGE && +				    map->map_type != BPF_MAP_TYPE_TASK_STORAGE && +				    map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) {  					ret = -EOPNOTSUPP;  					goto free_map_tab;  				} @@ -1104,7 +1128,6 @@ free_map_tab:  static int map_create(union bpf_attr *attr)  {  	int numa_node = bpf_map_attr_numa_node(attr); -	struct btf_field_offs *foffs;  	struct bpf_map *map;  	int f_flags;  	int err; @@ -1184,17 +1207,9 @@ static int map_create(union bpf_attr *attr)  			attr->btf_vmlinux_value_type_id;  	} - -	foffs = btf_parse_field_offs(map->record); -	if (IS_ERR(foffs)) { -		err = PTR_ERR(foffs); -		goto free_map; -	} -	map->field_offs = foffs; -  	err = security_bpf_map_alloc(map);  	if (err) -		goto free_map_field_offs; +		goto free_map;  	err = bpf_map_alloc_id(map);  	if (err) @@ -1218,8 +1233,6 @@ static int map_create(union bpf_attr *attr)  free_map_sec:  	security_bpf_map_free(map); -free_map_field_offs: -	kfree(map->field_offs);  free_map:  	btf_put(map->btf);  	map->ops->map_free(map); @@ -1285,8 +1298,10 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)  	return map;  } -/* map_idr_lock should have been held */ -static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref) +/* map_idr_lock should have been held or the map should have been + * protected by rcu read lock. 
+ */ +struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)  {  	int refold; @@ -2049,6 +2064,7 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)  {  	bpf_prog_kallsyms_del_all(prog);  	btf_put(prog->aux->btf); +	module_put(prog->aux->mod);  	kvfree(prog->aux->jited_linfo);  	kvfree(prog->aux->linfo);  	kfree(prog->aux->kfunc_tab); @@ -2439,7 +2455,6 @@ static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)  	case BPF_PROG_TYPE_LWT_SEG6LOCAL:  	case BPF_PROG_TYPE_SK_SKB:  	case BPF_PROG_TYPE_SK_MSG: -	case BPF_PROG_TYPE_LIRC_MODE2:  	case BPF_PROG_TYPE_FLOW_DISSECTOR:  	case BPF_PROG_TYPE_CGROUP_DEVICE:  	case BPF_PROG_TYPE_CGROUP_SOCK: @@ -2448,6 +2463,7 @@ static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)  	case BPF_PROG_TYPE_CGROUP_SYSCTL:  	case BPF_PROG_TYPE_SOCK_OPS:  	case BPF_PROG_TYPE_EXT: /* extends any prog */ +	case BPF_PROG_TYPE_NETFILTER:  		return true;  	case BPF_PROG_TYPE_CGROUP_SKB:  		/* always unpriv */ @@ -2477,9 +2493,9 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)  }  /* last field in 'union bpf_attr' used by this command */ -#define	BPF_PROG_LOAD_LAST_FIELD core_relo_rec_size +#define	BPF_PROG_LOAD_LAST_FIELD log_true_size -static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) +static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)  {  	enum bpf_prog_type type = attr->prog_type;  	struct bpf_prog *prog, *dst_prog = NULL; @@ -2629,7 +2645,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)  		goto free_prog_sec;  	/* run eBPF verifier */ -	err = bpf_check(&prog, attr, uattr); +	err = bpf_check(&prog, attr, uattr, uattr_size);  	if (err < 0)  		goto free_used_maps; @@ -2804,16 +2820,19 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)  	const struct bpf_prog *prog = link->prog;  	char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; -	bin2hex(prog_tag, prog->tag, sizeof(prog->tag));  	
seq_printf(m,  		   "link_type:\t%s\n" -		   "link_id:\t%u\n" -		   "prog_tag:\t%s\n" -		   "prog_id:\t%u\n", +		   "link_id:\t%u\n",  		   bpf_link_type_strs[link->type], -		   link->id, -		   prog_tag, -		   prog->aux->id); +		   link->id); +	if (prog) { +		bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); +		seq_printf(m, +			   "prog_tag:\t%s\n" +			   "prog_id:\t%u\n", +			   prog_tag, +			   prog->aux->id); +	}  	if (link->ops->show_fdinfo)  		link->ops->show_fdinfo(link, m);  } @@ -3095,6 +3114,11 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,  		if (err)  			goto out_unlock; +		if (tgt_info.tgt_mod) { +			module_put(prog->aux->mod); +			prog->aux->mod = tgt_info.tgt_mod; +		} +  		tr = bpf_trampoline_get(key, &tgt_info);  		if (!tr) {  			err = -ENOMEM; @@ -4288,7 +4312,8 @@ static int bpf_link_get_info_by_fd(struct file *file,  	info.type = link->type;  	info.id = link->id; -	info.prog_id = link->prog->aux->id; +	if (link->prog) +		info.prog_id = link->prog->aux->id;  	if (link->ops->fill_link_info) {  		err = link->ops->fill_link_info(link, &info); @@ -4338,9 +4363,9 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,  	return err;  } -#define BPF_BTF_LOAD_LAST_FIELD btf_log_level +#define BPF_BTF_LOAD_LAST_FIELD btf_log_true_size -static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr) +static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)  {  	if (CHECK_ATTR(BPF_BTF_LOAD))  		return -EINVAL; @@ -4348,7 +4373,7 @@ static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr)  	if (!bpf_capable())  		return -EPERM; -	return btf_new_fd(attr, uattr); +	return btf_new_fd(attr, uattr, uattr_size);  }  #define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id @@ -4551,6 +4576,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)  	if (CHECK_ATTR(BPF_LINK_CREATE))  		return -EINVAL; +	if (attr->link_create.attach_type == BPF_STRUCT_OPS) +		return bpf_struct_ops_link_create(attr); +  	
prog = bpf_prog_get(attr->link_create.prog_fd);  	if (IS_ERR(prog))  		return PTR_ERR(prog); @@ -4562,6 +4590,7 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)  	switch (prog->type) {  	case BPF_PROG_TYPE_EXT: +	case BPF_PROG_TYPE_NETFILTER:  		break;  	case BPF_PROG_TYPE_PERF_EVENT:  	case BPF_PROG_TYPE_TRACEPOINT: @@ -4628,6 +4657,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)  	case BPF_PROG_TYPE_XDP:  		ret = bpf_xdp_link_attach(attr, prog);  		break; +	case BPF_PROG_TYPE_NETFILTER: +		ret = bpf_nf_link_attach(attr, prog); +		break;  #endif  	case BPF_PROG_TYPE_PERF_EVENT:  	case BPF_PROG_TYPE_TRACEPOINT: @@ -4649,6 +4681,35 @@ out:  	return ret;  } +static int link_update_map(struct bpf_link *link, union bpf_attr *attr) +{ +	struct bpf_map *new_map, *old_map = NULL; +	int ret; + +	new_map = bpf_map_get(attr->link_update.new_map_fd); +	if (IS_ERR(new_map)) +		return PTR_ERR(new_map); + +	if (attr->link_update.flags & BPF_F_REPLACE) { +		old_map = bpf_map_get(attr->link_update.old_map_fd); +		if (IS_ERR(old_map)) { +			ret = PTR_ERR(old_map); +			goto out_put; +		} +	} else if (attr->link_update.old_map_fd) { +		ret = -EINVAL; +		goto out_put; +	} + +	ret = link->ops->update_map(link, new_map, old_map); + +	if (old_map) +		bpf_map_put(old_map); +out_put: +	bpf_map_put(new_map); +	return ret; +} +  #define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd  static int link_update(union bpf_attr *attr) @@ -4669,6 +4730,11 @@ static int link_update(union bpf_attr *attr)  	if (IS_ERR(link))  		return PTR_ERR(link); +	if (link->ops->update_map) { +		ret = link_update_map(link, attr); +		goto out_put_link; +	} +  	new_prog = bpf_prog_get(attr->link_update.new_prog_fd);  	if (IS_ERR(new_prog)) {  		ret = PTR_ERR(new_prog); @@ -4989,7 +5055,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)  		err = map_freeze(&attr);  		break;  	case BPF_PROG_LOAD: -		err = bpf_prog_load(&attr, uattr); +		err = bpf_prog_load(&attr, 
uattr, size);  		break;  	case BPF_OBJ_PIN:  		err = bpf_obj_pin(&attr); @@ -5034,7 +5100,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)  		err = bpf_raw_tracepoint_open(&attr);  		break;  	case BPF_BTF_LOAD: -		err = bpf_btf_load(&attr, uattr); +		err = bpf_btf_load(&attr, uattr, size);  		break;  	case BPF_BTF_GET_FD_BY_ID:  		err = bpf_btf_get_fd_by_id(&attr); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index d0ed7d6f5eec..ac021bc43a66 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -9,7 +9,6 @@  #include <linux/btf.h>  #include <linux/rcupdate_trace.h>  #include <linux/rcupdate_wait.h> -#include <linux/module.h>  #include <linux/static_call.h>  #include <linux/bpf_verifier.h>  #include <linux/bpf_lsm.h> @@ -45,8 +44,8 @@ static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd  		lockdep_assert_held_once(&tr->mutex);  		/* Instead of updating the trampoline here, we propagate -		 * -EAGAIN to register_ftrace_direct_multi(). Then we can -		 * retry register_ftrace_direct_multi() after updating the +		 * -EAGAIN to register_ftrace_direct(). Then we can +		 * retry register_ftrace_direct() after updating the  		 * trampoline.  		 
*/  		if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) && @@ -172,38 +171,16 @@ out:  	return tr;  } -static int bpf_trampoline_module_get(struct bpf_trampoline *tr) -{ -	struct module *mod; -	int err = 0; - -	preempt_disable(); -	mod = __module_text_address((unsigned long) tr->func.addr); -	if (mod && !try_module_get(mod)) -		err = -ENOENT; -	preempt_enable(); -	tr->mod = mod; -	return err; -} - -static void bpf_trampoline_module_put(struct bpf_trampoline *tr) -{ -	module_put(tr->mod); -	tr->mod = NULL; -} -  static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)  {  	void *ip = tr->func.addr;  	int ret;  	if (tr->func.ftrace_managed) -		ret = unregister_ftrace_direct_multi(tr->fops, (long)old_addr); +		ret = unregister_ftrace_direct(tr->fops, (long)old_addr, false);  	else  		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL); -	if (!ret) -		bpf_trampoline_module_put(tr);  	return ret;  } @@ -215,9 +192,9 @@ static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_ad  	if (tr->func.ftrace_managed) {  		if (lock_direct_mutex) -			ret = modify_ftrace_direct_multi(tr->fops, (long)new_addr); +			ret = modify_ftrace_direct(tr->fops, (long)new_addr);  		else -			ret = modify_ftrace_direct_multi_nolock(tr->fops, (long)new_addr); +			ret = modify_ftrace_direct_nolock(tr->fops, (long)new_addr);  	} else {  		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr);  	} @@ -238,18 +215,13 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)  		tr->func.ftrace_managed = true;  	} -	if (bpf_trampoline_module_get(tr)) -		return -ENOENT; -  	if (tr->func.ftrace_managed) {  		ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1); -		ret = register_ftrace_direct_multi(tr->fops, (long)new_addr); +		ret = register_ftrace_direct(tr->fops, (long)new_addr);  	} else {  		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);  	} -	if (ret) -		bpf_trampoline_module_put(tr);  	return ret;  } diff --git 
a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d517d13878cf..fbcf5a4e2fcd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -24,6 +24,7 @@  #include <linux/bpf_lsm.h>  #include <linux/btf_ids.h>  #include <linux/poison.h> +#include <linux/module.h>  #include "disasm.h" @@ -194,6 +195,8 @@ static void invalidate_non_owning_refs(struct bpf_verifier_env *env);  static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env);  static int ref_set_non_owning(struct bpf_verifier_env *env,  			      struct bpf_reg_state *reg); +static void specialize_kfunc(struct bpf_verifier_env *env, +			     u32 func_id, u16 offset, unsigned long *addr);  static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)  { @@ -268,7 +271,50 @@ struct bpf_call_arg_meta {  	u32 ret_btf_id;  	u32 subprogno;  	struct btf_field *kptr_field; -	u8 uninit_dynptr_regno; +}; + +struct btf_and_id { +	struct btf *btf; +	u32 btf_id; +}; + +struct bpf_kfunc_call_arg_meta { +	/* In parameters */ +	struct btf *btf; +	u32 func_id; +	u32 kfunc_flags; +	const struct btf_type *func_proto; +	const char *func_name; +	/* Out parameters */ +	u32 ref_obj_id; +	u8 release_regno; +	bool r0_rdonly; +	u32 ret_btf_id; +	u64 r0_size; +	u32 subprogno; +	struct { +		u64 value; +		bool found; +	} arg_constant; +	union { +		struct btf_and_id arg_obj_drop; +		struct btf_and_id arg_refcount_acquire; +	}; +	struct { +		struct btf_field *field; +	} arg_list_head; +	struct { +		struct btf_field *field; +	} arg_rbtree_root; +	struct { +		enum bpf_dynptr_type type; +		u32 id; +	} initialized_dynptr; +	struct { +		u8 spi; +		u8 frameno; +	} iter; +	u64 mem_size;  };  struct btf *btf_vmlinux; @@ -296,61 +342,6 @@ find_linfo(const struct bpf_verifier_env *env, u32 insn_off)  	return &linfo[i - 1];  } -void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, -		       va_list args) -{ -	unsigned int n; - -	n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args); - -	WARN_ONCE(n 
>= BPF_VERIFIER_TMP_LOG_SIZE - 1, -		  "verifier log line truncated - local buffer too short\n"); - -	if (log->level == BPF_LOG_KERNEL) { -		bool newline = n > 0 && log->kbuf[n - 1] == '\n'; - -		pr_err("BPF: %s%s", log->kbuf, newline ? "" : "\n"); -		return; -	} - -	n = min(log->len_total - log->len_used - 1, n); -	log->kbuf[n] = '\0'; -	if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1)) -		log->len_used += n; -	else -		log->ubuf = NULL; -} - -static void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos) -{ -	char zero = 0; - -	if (!bpf_verifier_log_needed(log)) -		return; - -	log->len_used = new_pos; -	if (put_user(zero, log->ubuf + new_pos)) -		log->ubuf = NULL; -} - -/* log_level controls verbosity level of eBPF verifier. - * bpf_verifier_log_write() is used to dump the verification trace to the log, - * so the user can figure out what's wrong with the program - */ -__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, -					   const char *fmt, ...) -{ -	va_list args; - -	if (!bpf_verifier_log_needed(&env->log)) -		return; - -	va_start(args, fmt); -	bpf_verifier_vlog(&env->log, fmt, args); -	va_end(args); -} -EXPORT_SYMBOL_GPL(bpf_verifier_log_write); -  __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)  {  	struct bpf_verifier_env *env = private_data; @@ -364,20 +355,6 @@ __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)  	va_end(args);  } -__printf(2, 3) void bpf_log(struct bpf_verifier_log *log, -			    const char *fmt, ...) 
-{ -	va_list args; - -	if (!bpf_verifier_log_needed(log)) -		return; - -	va_start(args, fmt); -	bpf_verifier_vlog(log, fmt, args); -	va_end(args); -} -EXPORT_SYMBOL_GPL(bpf_log); -  static const char *ltrim(const char *s)  {  	while (isspace(*s)) @@ -447,13 +424,23 @@ static bool type_is_sk_pointer(enum bpf_reg_type type)  		type == PTR_TO_XDP_SOCK;  } +static bool type_may_be_null(u32 type) +{ +	return type & PTR_MAYBE_NULL; +} +  static bool reg_type_not_null(enum bpf_reg_type type)  { +	if (type_may_be_null(type)) +		return false; + +	type = base_type(type);  	return type == PTR_TO_SOCKET ||  		type == PTR_TO_TCP_SOCK ||  		type == PTR_TO_MAP_VALUE ||  		type == PTR_TO_MAP_KEY || -		type == PTR_TO_SOCK_COMMON; +		type == PTR_TO_SOCK_COMMON || +		type == PTR_TO_MEM;  }  static bool type_is_ptr_alloc_obj(u32 type) @@ -491,11 +478,6 @@ static bool type_is_rdonly_mem(u32 type)  	return type & MEM_RDONLY;  } -static bool type_may_be_null(u32 type) -{ -	return type & PTR_MAYBE_NULL; -} -  static bool is_acquire_function(enum bpf_func_id func_id,  				const struct bpf_map *map)  { @@ -633,6 +615,7 @@ static char slot_type_char[] = {  	[STACK_MISC]	= 'm',  	[STACK_ZERO]	= '0',  	[STACK_DYNPTR]	= 'd', +	[STACK_ITER]	= 'i',  };  static void print_liveness(struct bpf_verifier_env *env, @@ -675,37 +658,91 @@ static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_sl         return spi - nr_slots + 1 >= 0 && spi < allocated_slots;  } -static int dynptr_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +static int stack_slot_obj_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg, +			          const char *obj_kind, int nr_slots)  {  	int off, spi;  	if (!tnum_is_const(reg->var_off)) { -		verbose(env, "dynptr has to be at a constant offset\n"); +		verbose(env, "%s has to be at a constant offset\n", obj_kind);  		return -EINVAL;  	}  	off = reg->off + reg->var_off.value;  	if (off % BPF_REG_SIZE) { -		verbose(env, "cannot pass 
in dynptr at an offset=%d\n", off); +		verbose(env, "cannot pass in %s at an offset=%d\n", obj_kind, off);  		return -EINVAL;  	}  	spi = __get_spi(off); -	if (spi < 1) { -		verbose(env, "cannot pass in dynptr at an offset=%d\n", off); +	if (spi + 1 < nr_slots) { +		verbose(env, "cannot pass in %s at an offset=%d\n", obj_kind, off);  		return -EINVAL;  	} -	if (!is_spi_bounds_valid(func(env, reg), spi, BPF_DYNPTR_NR_SLOTS)) +	if (!is_spi_bounds_valid(func(env, reg), spi, nr_slots))  		return -ERANGE;  	return spi;  } -static const char *kernel_type_name(const struct btf* btf, u32 id) +static int dynptr_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ +	return stack_slot_obj_get_spi(env, reg, "dynptr", BPF_DYNPTR_NR_SLOTS); +} + +static int iter_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int nr_slots) +{ +	return stack_slot_obj_get_spi(env, reg, "iter", nr_slots); +} + +static const char *btf_type_name(const struct btf *btf, u32 id)  {  	return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);  } +static const char *dynptr_type_str(enum bpf_dynptr_type type) +{ +	switch (type) { +	case BPF_DYNPTR_TYPE_LOCAL: +		return "local"; +	case BPF_DYNPTR_TYPE_RINGBUF: +		return "ringbuf"; +	case BPF_DYNPTR_TYPE_SKB: +		return "skb"; +	case BPF_DYNPTR_TYPE_XDP: +		return "xdp"; +	case BPF_DYNPTR_TYPE_INVALID: +		return "<invalid>"; +	default: +		WARN_ONCE(1, "unknown dynptr type %d\n", type); +		return "<unknown>"; +	} +} + +static const char *iter_type_str(const struct btf *btf, u32 btf_id) +{ +	if (!btf || btf_id == 0) +		return "<invalid>"; + +	/* we already validated that type is valid and has conforming name */ +	return btf_type_name(btf, btf_id) + sizeof(ITER_PREFIX) - 1; +} + +static const char *iter_state_str(enum bpf_iter_state state) +{ +	switch (state) { +	case BPF_ITER_STATE_ACTIVE: +		return "active"; +	case BPF_ITER_STATE_DRAINED: +		return "drained"; +	case BPF_ITER_STATE_INVALID: +		return "<invalid>"; +	
default: +		WARN_ONCE(1, "unknown iter state %d\n", state); +		return "<unknown>"; +	} +} +  static void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno)  {  	env->scratched_regs |= 1U << regno; @@ -751,11 +788,31 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)  		return BPF_DYNPTR_TYPE_LOCAL;  	case DYNPTR_TYPE_RINGBUF:  		return BPF_DYNPTR_TYPE_RINGBUF; +	case DYNPTR_TYPE_SKB: +		return BPF_DYNPTR_TYPE_SKB; +	case DYNPTR_TYPE_XDP: +		return BPF_DYNPTR_TYPE_XDP;  	default:  		return BPF_DYNPTR_TYPE_INVALID;  	}  } +static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type) +{ +	switch (type) { +	case BPF_DYNPTR_TYPE_LOCAL: +		return DYNPTR_TYPE_LOCAL; +	case BPF_DYNPTR_TYPE_RINGBUF: +		return DYNPTR_TYPE_RINGBUF; +	case BPF_DYNPTR_TYPE_SKB: +		return DYNPTR_TYPE_SKB; +	case BPF_DYNPTR_TYPE_XDP: +		return DYNPTR_TYPE_XDP; +	default: +		return 0; +	} +} +  static bool dynptr_type_refcounted(enum bpf_dynptr_type type)  {  	return type == BPF_DYNPTR_TYPE_RINGBUF; @@ -895,6 +952,14 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re  static void __mark_reg_unknown(const struct bpf_verifier_env *env,  			       struct bpf_reg_state *reg); +static void mark_reg_invalid(const struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ +	if (!env->allow_ptr_leaks) +		__mark_reg_not_init(env, reg); +	else +		__mark_reg_unknown(env, reg); +} +  static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,  				        struct bpf_func_state *state, int spi)  { @@ -934,12 +999,8 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,  		/* Dynptr slices are only PTR_TO_MEM_OR_NULL and PTR_TO_MEM */  		if (dreg->type != (PTR_TO_MEM | PTR_MAYBE_NULL) && dreg->type != PTR_TO_MEM)  			continue; -		if (dreg->dynptr_id == dynptr_id) { -			if (!env->allow_ptr_leaks) -				__mark_reg_not_init(env, dreg); -			else -				__mark_reg_unknown(env, dreg); -		} +		if 
(dreg->dynptr_id == dynptr_id) +			mark_reg_invalid(env, dreg);  	}));  	/* Do not release reference state, we are destroying dynptr on stack, @@ -955,39 +1016,49 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,  	return 0;  } -static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg, -				       int spi) +static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)  { +	int spi; +  	if (reg->type == CONST_PTR_TO_DYNPTR)  		return false; -	/* For -ERANGE (i.e. spi not falling into allocated stack slots), we -	 * will do check_mem_access to check and update stack bounds later, so -	 * return true for that case. +	spi = dynptr_get_spi(env, reg); + +	/* -ERANGE (i.e. spi not falling into allocated stack slots) isn't an +	 * error because this just means the stack state hasn't been updated yet. +	 * We will do check_mem_access to check and update stack bounds later.  	 */ -	if (spi < 0) -		return spi == -ERANGE; -	/* We allow overwriting existing unreferenced STACK_DYNPTR slots, see -	 * mark_stack_slots_dynptr which calls destroy_if_dynptr_stack_slot to -	 * ensure dynptr objects at the slots we are touching are completely -	 * destructed before we reinitialize them for a new one. For referenced -	 * ones, destroy_if_dynptr_stack_slot returns an error early instead of -	 * delaying it until the end where the user will get "Unreleased +	if (spi < 0 && spi != -ERANGE) +		return false; + +	/* We don't need to check if the stack slots are marked by previous +	 * dynptr initializations because we allow overwriting existing unreferenced +	 * STACK_DYNPTR slots, see mark_stack_slots_dynptr which calls +	 * destroy_if_dynptr_stack_slot to ensure dynptr objects at the slots we are +	 * touching are completely destructed before we reinitialize them for a new +	 * one. 
For referenced ones, destroy_if_dynptr_stack_slot returns an error early +	 * instead of delaying it until the end where the user will get "Unreleased  	 * reference" error.  	 */  	return true;  } -static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg, -				     int spi) +static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg)  {  	struct bpf_func_state *state = func(env, reg); -	int i; +	int i, spi; -	/* This already represents first slot of initialized bpf_dynptr */ +	/* This already represents first slot of initialized bpf_dynptr. +	 * +	 * CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to +	 * check_func_arg_reg_off's logic, so we don't need to check its +	 * offset and alignment. +	 */  	if (reg->type == CONST_PTR_TO_DYNPTR)  		return true; +	spi = dynptr_get_spi(env, reg);  	if (spi < 0)  		return false;  	if (!state->stack[spi].spilled_ptr.dynptr.first_slot) @@ -1024,6 +1095,157 @@ static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg  	}  } +static void __mark_reg_known_zero(struct bpf_reg_state *reg); + +static int mark_stack_slots_iter(struct bpf_verifier_env *env, +				 struct bpf_reg_state *reg, int insn_idx, +				 struct btf *btf, u32 btf_id, int nr_slots) +{ +	struct bpf_func_state *state = func(env, reg); +	int spi, i, j, id; + +	spi = iter_get_spi(env, reg, nr_slots); +	if (spi < 0) +		return spi; + +	id = acquire_reference_state(env, insn_idx); +	if (id < 0) +		return id; + +	for (i = 0; i < nr_slots; i++) { +		struct bpf_stack_state *slot = &state->stack[spi - i]; +		struct bpf_reg_state *st = &slot->spilled_ptr; + +		__mark_reg_known_zero(st); +		st->type = PTR_TO_STACK; /* we don't have dedicated reg type */ +		st->live |= REG_LIVE_WRITTEN; +		st->ref_obj_id = i == 0 ? 
id : 0; +		st->iter.btf = btf; +		st->iter.btf_id = btf_id; +		st->iter.state = BPF_ITER_STATE_ACTIVE; +		st->iter.depth = 0; + +		for (j = 0; j < BPF_REG_SIZE; j++) +			slot->slot_type[j] = STACK_ITER; + +		mark_stack_slot_scratched(env, spi - i); +	} + +	return 0; +} + +static int unmark_stack_slots_iter(struct bpf_verifier_env *env, +				   struct bpf_reg_state *reg, int nr_slots) +{ +	struct bpf_func_state *state = func(env, reg); +	int spi, i, j; + +	spi = iter_get_spi(env, reg, nr_slots); +	if (spi < 0) +		return spi; + +	for (i = 0; i < nr_slots; i++) { +		struct bpf_stack_state *slot = &state->stack[spi - i]; +		struct bpf_reg_state *st = &slot->spilled_ptr; + +		if (i == 0) +			WARN_ON_ONCE(release_reference(env, st->ref_obj_id)); + +		__mark_reg_not_init(env, st); + +		/* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */ +		st->live |= REG_LIVE_WRITTEN; + +		for (j = 0; j < BPF_REG_SIZE; j++) +			slot->slot_type[j] = STACK_INVALID; + +		mark_stack_slot_scratched(env, spi - i); +	} + +	return 0; +} + +static bool is_iter_reg_valid_uninit(struct bpf_verifier_env *env, +				     struct bpf_reg_state *reg, int nr_slots) +{ +	struct bpf_func_state *state = func(env, reg); +	int spi, i, j; + +	/* For -ERANGE (i.e. spi not falling into allocated stack slots), we +	 * will do check_mem_access to check and update stack bounds later, so +	 * return true for that case. 
+	 */ +	spi = iter_get_spi(env, reg, nr_slots); +	if (spi == -ERANGE) +		return true; +	if (spi < 0) +		return false; + +	for (i = 0; i < nr_slots; i++) { +		struct bpf_stack_state *slot = &state->stack[spi - i]; + +		for (j = 0; j < BPF_REG_SIZE; j++) +			if (slot->slot_type[j] == STACK_ITER) +				return false; +	} + +	return true; +} + +static bool is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg, +				   struct btf *btf, u32 btf_id, int nr_slots) +{ +	struct bpf_func_state *state = func(env, reg); +	int spi, i, j; + +	spi = iter_get_spi(env, reg, nr_slots); +	if (spi < 0) +		return false; + +	for (i = 0; i < nr_slots; i++) { +		struct bpf_stack_state *slot = &state->stack[spi - i]; +		struct bpf_reg_state *st = &slot->spilled_ptr; + +		/* only main (first) slot has ref_obj_id set */ +		if (i == 0 && !st->ref_obj_id) +			return false; +		if (i != 0 && st->ref_obj_id) +			return false; +		if (st->iter.btf != btf || st->iter.btf_id != btf_id) +			return false; + +		for (j = 0; j < BPF_REG_SIZE; j++) +			if (slot->slot_type[j] != STACK_ITER) +				return false; +	} + +	return true; +} + +/* Check if given stack slot is "special": + *   - spilled register state (STACK_SPILL); + *   - dynptr state (STACK_DYNPTR); + *   - iter state (STACK_ITER). + */ +static bool is_stack_slot_special(const struct bpf_stack_state *stack) +{ +	enum bpf_stack_slot_type type = stack->slot_type[BPF_REG_SIZE - 1]; + +	switch (type) { +	case STACK_SPILL: +	case STACK_DYNPTR: +	case STACK_ITER: +		return true; +	case STACK_INVALID: +	case STACK_MISC: +	case STACK_ZERO: +		return false; +	default: +		WARN_ONCE(1, "unknown stack slot type %d\n", type); +		return true; +	} +} +  /* The reg state of a pointer or a bounded scalar was saved when   * it was spilled to the stack.   
*/ @@ -1070,7 +1292,7 @@ static void print_verifier_state(struct bpf_verifier_env *env,  			verbose(env, "%s", reg_type_str(env, t));  			if (base_type(t) == PTR_TO_BTF_ID) -				verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id)); +				verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id));  			verbose(env, "(");  /*   * _a stands for append, was shortened to avoid multiline statements below. @@ -1143,26 +1365,62 @@ static void print_verifier_state(struct bpf_verifier_env *env,  		for (j = 0; j < BPF_REG_SIZE; j++) {  			if (state->stack[i].slot_type[j] != STACK_INVALID)  				valid = true; -			types_buf[j] = slot_type_char[ -					state->stack[i].slot_type[j]]; +			types_buf[j] = slot_type_char[state->stack[i].slot_type[j]];  		}  		types_buf[BPF_REG_SIZE] = 0;  		if (!valid)  			continue;  		if (!print_all && !stack_slot_scratched(env, i))  			continue; -		verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); -		print_liveness(env, state->stack[i].spilled_ptr.live); -		if (is_spilled_reg(&state->stack[i])) { +		switch (state->stack[i].slot_type[BPF_REG_SIZE - 1]) { +		case STACK_SPILL:  			reg = &state->stack[i].spilled_ptr;  			t = reg->type; + +			verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); +			print_liveness(env, reg->live);  			verbose(env, "=%s", t == SCALAR_VALUE ? 
"" : reg_type_str(env, t));  			if (t == SCALAR_VALUE && reg->precise)  				verbose(env, "P");  			if (t == SCALAR_VALUE && tnum_is_const(reg->var_off))  				verbose(env, "%lld", reg->var_off.value + reg->off); -		} else { +			break; +		case STACK_DYNPTR: +			i += BPF_DYNPTR_NR_SLOTS - 1; +			reg = &state->stack[i].spilled_ptr; + +			verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); +			print_liveness(env, reg->live); +			verbose(env, "=dynptr_%s", dynptr_type_str(reg->dynptr.type)); +			if (reg->ref_obj_id) +				verbose(env, "(ref_id=%d)", reg->ref_obj_id); +			break; +		case STACK_ITER: +			/* only main slot has ref_obj_id set; skip others */ +			reg = &state->stack[i].spilled_ptr; +			if (!reg->ref_obj_id) +				continue; + +			verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); +			print_liveness(env, reg->live); +			verbose(env, "=iter_%s(ref_id=%d,state=%s,depth=%u)", +				iter_type_str(reg->iter.btf, reg->iter.btf_id), +				reg->ref_obj_id, iter_state_str(reg->iter.state), +				reg->iter.depth); +			break; +		case STACK_MISC: +		case STACK_ZERO: +		default: +			reg = &state->stack[i].spilled_ptr; + +			for (j = 0; j < BPF_REG_SIZE; j++) +				types_buf[j] = slot_type_char[state->stack[i].slot_type[j]]; +			types_buf[BPF_REG_SIZE] = 0; + +			verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); +			print_liveness(env, reg->live);  			verbose(env, "=%s", types_buf); +			break;  		}  	}  	if (state->acquired_refs && state->refs[0].id) { @@ -1188,10 +1446,10 @@ static inline u32 vlog_alignment(u32 pos)  static void print_insn_state(struct bpf_verifier_env *env,  			     const struct bpf_func_state *state)  { -	if (env->prev_log_len && env->prev_log_len == env->log.len_used) { +	if (env->prev_log_pos && env->prev_log_pos == env->log.end_pos) {  		/* remove new line character */ -		bpf_vlog_reset(&env->log, env->prev_log_len - 1); -		verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_len), ' '); +		bpf_vlog_reset(&env->log, env->prev_log_pos - 1); +		verbose(env, "%*c;", 
vlog_alignment(env->prev_insn_print_pos), ' ');  	} else {  		verbose(env, "%d:", env->insn_idx);  	} @@ -1499,7 +1757,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,  	elem->insn_idx = insn_idx;  	elem->prev_insn_idx = prev_insn_idx;  	elem->next = env->head; -	elem->log_pos = env->log.len_used; +	elem->log_pos = env->log.end_pos;  	env->head = elem;  	env->stack_size++;  	err = copy_verifier_state(&elem->st, cur); @@ -1664,6 +1922,12 @@ static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg)  	       reg->type == PTR_TO_PACKET_END;  } +static bool reg_is_dynptr_slice_pkt(const struct bpf_reg_state *reg) +{ +	return base_type(reg->type) == PTR_TO_MEM && +		(reg->type & DYNPTR_TYPE_SKB || reg->type & DYNPTR_TYPE_XDP); +} +  /* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */  static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg,  				    enum bpf_reg_type which) @@ -1823,9 +2087,9 @@ static void __reg_bound_offset(struct bpf_reg_state *reg)  	struct tnum var64_off = tnum_intersect(reg->var_off,  					       tnum_range(reg->umin_value,  							  reg->umax_value)); -	struct tnum var32_off = tnum_intersect(tnum_subreg(reg->var_off), -						tnum_range(reg->u32_min_value, -							   reg->u32_max_value)); +	struct tnum var32_off = tnum_intersect(tnum_subreg(var64_off), +					       tnum_range(reg->u32_min_value, +							  reg->u32_max_value));  	reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off);  } @@ -2029,7 +2293,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,  	elem->insn_idx = insn_idx;  	elem->prev_insn_idx = prev_insn_idx;  	elem->next = env->head; -	elem->log_pos = env->log.len_used; +	elem->log_pos = env->log.end_pos;  	env->head = elem;  	env->stack_size++;  	if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) { @@ -2117,6 +2381,7 @@ struct bpf_kfunc_desc {  	u32 func_id;  	s32 imm;  	u16 offset; +	unsigned long addr;  };  struct 
bpf_kfunc_btf { @@ -2126,6 +2391,11 @@ struct bpf_kfunc_btf {  };  struct bpf_kfunc_desc_tab { +	/* Sorted by func_id (BTF ID) and offset (fd_array offset) during +	 * verification. JITs do lookups by bpf_insn, where func_id may not be +	 * available, therefore at the end of verification do_misc_fixups() +	 * sorts this by imm and offset. +	 */  	struct bpf_kfunc_desc descs[MAX_KFUNC_DESCS];  	u32 nr_descs;  }; @@ -2166,6 +2436,19 @@ find_kfunc_desc(const struct bpf_prog *prog, u32 func_id, u16 offset)  		       sizeof(tab->descs[0]), kfunc_desc_cmp_by_id_off);  } +int bpf_get_kfunc_addr(const struct bpf_prog *prog, u32 func_id, +		       u16 btf_fd_idx, u8 **func_addr) +{ +	const struct bpf_kfunc_desc *desc; + +	desc = find_kfunc_desc(prog, func_id, btf_fd_idx); +	if (!desc) +		return -EFAULT; + +	*func_addr = (u8 *)desc->addr; +	return 0; +} +  static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env,  					 s16 offset)  { @@ -2345,13 +2628,18 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)  			func_name);  		return -EINVAL;  	} +	specialize_kfunc(env, func_id, offset, &addr); -	call_imm = BPF_CALL_IMM(addr); -	/* Check whether or not the relative offset overflows desc->imm */ -	if ((unsigned long)(s32)call_imm != call_imm) { -		verbose(env, "address of kernel function %s is out of range\n", -			func_name); -		return -EINVAL; +	if (bpf_jit_supports_far_kfunc_call()) { +		call_imm = func_id; +	} else { +		call_imm = BPF_CALL_IMM(addr); +		/* Check whether the relative offset overflows desc->imm */ +		if ((unsigned long)(s32)call_imm != call_imm) { +			verbose(env, "address of kernel function %s is out of range\n", +				func_name); +			return -EINVAL; +		}  	}  	if (bpf_dev_bound_kfunc_id(func_id)) { @@ -2364,6 +2652,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)  	desc->func_id = func_id;  	desc->imm = call_imm;  	desc->offset = offset; +	desc->addr = addr;  	err = 
btf_distill_func_proto(&env->log, desc_btf,  				     func_proto, func_name,  				     &desc->func_model); @@ -2373,19 +2662,19 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)  	return err;  } -static int kfunc_desc_cmp_by_imm(const void *a, const void *b) +static int kfunc_desc_cmp_by_imm_off(const void *a, const void *b)  {  	const struct bpf_kfunc_desc *d0 = a;  	const struct bpf_kfunc_desc *d1 = b; -	if (d0->imm > d1->imm) -		return 1; -	else if (d0->imm < d1->imm) -		return -1; +	if (d0->imm != d1->imm) +		return d0->imm < d1->imm ? -1 : 1; +	if (d0->offset != d1->offset) +		return d0->offset < d1->offset ? -1 : 1;  	return 0;  } -static void sort_kfunc_descs_by_imm(struct bpf_prog *prog) +static void sort_kfunc_descs_by_imm_off(struct bpf_prog *prog)  {  	struct bpf_kfunc_desc_tab *tab; @@ -2394,7 +2683,7 @@ static void sort_kfunc_descs_by_imm(struct bpf_prog *prog)  		return;  	sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), -	     kfunc_desc_cmp_by_imm, NULL); +	     kfunc_desc_cmp_by_imm_off, NULL);  }  bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog) @@ -2408,13 +2697,14 @@ bpf_jit_find_kfunc_model(const struct bpf_prog *prog,  {  	const struct bpf_kfunc_desc desc = {  		.imm = insn->imm, +		.offset = insn->off,  	};  	const struct bpf_kfunc_desc *res;  	struct bpf_kfunc_desc_tab *tab;  	tab = prog->aux->kfunc_tab;  	res = bsearch(&desc, tab->descs, tab->nr_descs, -		      sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm); +		      sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm_off);  	return res ? 
&res->func_model : NULL;  } @@ -2475,8 +2765,8 @@ static int check_subprogs(struct bpf_verifier_env *env)  		u8 code = insn[i].code;  		if (code == (BPF_JMP | BPF_CALL) && -		    insn[i].imm == BPF_FUNC_tail_call && -		    insn[i].src_reg != BPF_PSEUDO_CALL) +		    insn[i].src_reg == 0 && +		    insn[i].imm == BPF_FUNC_tail_call)  			subprog[cur_subprog].has_tail_call = true;  		if (BPF_CLASS(code) == BPF_LD &&  		    (BPF_MODE(code) == BPF_ABS || BPF_MODE(code) == BPF_IND)) @@ -2587,6 +2877,25 @@ static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *  			     state->stack[spi - 1].spilled_ptr.parent, REG_LIVE_READ64);  } +static int mark_iter_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg, +			  int spi, int nr_slots) +{ +	struct bpf_func_state *state = func(env, reg); +	int err, i; + +	for (i = 0; i < nr_slots; i++) { +		struct bpf_reg_state *st = &state->stack[spi - i].spilled_ptr; + +		err = mark_reg_read(env, st, st->parent, REG_LIVE_READ64); +		if (err) +			return err; + +		mark_stack_slot_scratched(env, spi - i); +	} + +	return 0; +} +  /* This function is supposed to be used by the following 32-bit optimization   * code only. It returns TRUE if the source or destination register operates   * on 64-bit, otherwise return FALSE. @@ -2967,6 +3276,21 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,  			}  		} else if (opcode == BPF_EXIT) {  			return -ENOTSUPP; +		} else if (BPF_SRC(insn->code) == BPF_X) { +			if (!(*reg_mask & (dreg | sreg))) +				return 0; +			/* dreg <cond> sreg +			 * Both dreg and sreg need precision before +			 * this insn. If only sreg was marked precise +			 * before it would be equally necessary to +			 * propagate it to dreg. +			 */ +			*reg_mask |= (sreg | dreg); +			 /* else dreg <cond> K +			  * Only dreg still needs precision before +			  * this insn, so for the K-based conditional +			  * there is nothing new to be marked. 
+			  */  		}  	} else if (class == BPF_LD) {  		if (!(*reg_mask & dreg)) @@ -3568,8 +3892,8 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,  		/* regular write of data into stack destroys any spilled ptr */  		state->stack[spi].spilled_ptr.type = NOT_INIT; -		/* Mark slots as STACK_MISC if they belonged to spilled ptr. */ -		if (is_spilled_reg(&state->stack[spi])) +		/* Mark slots as STACK_MISC if they belonged to spilled ptr/dynptr/iter. */ +		if (is_stack_slot_special(&state->stack[spi]))  			for (i = 0; i < BPF_REG_SIZE; i++)  				scrub_spilled_slot(&state->stack[spi].slot_type[i]); @@ -3962,17 +4286,13 @@ static int check_stack_read(struct bpf_verifier_env *env,  	}  	/* Variable offset is prohibited for unprivileged mode for simplicity  	 * since it requires corresponding support in Spectre masking for stack -	 * ALU. See also retrieve_ptr_limit(). +	 * ALU. See also retrieve_ptr_limit(). The check in +	 * check_stack_access_for_ptr_arithmetic() called by +	 * adjust_ptr_min_max_vals() prevents users from creating stack pointers +	 * with variable offsets, therefore no check is required here. Further, +	 * just checking it here would be insufficient as speculative stack +	 * writes could still lead to unsafe speculative behaviour.  	 
*/ -	if (!env->bypass_spec_v1 && var_off) { -		char tn_buf[48]; - -		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); -		verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n", -				ptr_regno, tn_buf); -		return -EACCES; -	} -  	if (!var_off) {  		off += reg->var_off.value;  		err = check_stack_read_fixed_off(env, state, off, size, @@ -4178,8 +4498,8 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,  			       struct btf_field *kptr_field,  			       struct bpf_reg_state *reg, u32 regno)  { -	const char *targ_name = kernel_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id); -	int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED; +	const char *targ_name = btf_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id); +	int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU;  	const char *reg_name = "";  	/* Only unreferenced case accepts untrusted pointers */ @@ -4194,7 +4514,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,  		return -EINVAL;  	}  	/* We need to verify reg->type and reg->btf, before accessing reg->btf */ -	reg_name = kernel_type_name(reg->btf, reg->btf_id); +	reg_name = btf_type_name(reg->btf, reg->btf_id);  	/* For ref_ptr case, release function check should ensure we get one  	 * referenced PTR_TO_BTF_ID, and that its fixed offset is 0. For the @@ -4246,6 +4566,36 @@ bad_type:  	return -EINVAL;  } +/* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock() + * can dereference RCU protected pointers and result is PTR_TRUSTED. 
+ */ +static bool in_rcu_cs(struct bpf_verifier_env *env) +{ +	return env->cur_state->active_rcu_lock || !env->prog->aux->sleepable; +} + +/* Once GCC supports btf_type_tag the following mechanism will be replaced with tag check */ +BTF_SET_START(rcu_protected_types) +BTF_ID(struct, prog_test_ref_kfunc) +BTF_ID(struct, cgroup) +BTF_ID(struct, bpf_cpumask) +BTF_ID(struct, task_struct) +BTF_SET_END(rcu_protected_types) + +static bool rcu_protected_object(const struct btf *btf, u32 btf_id) +{ +	if (!btf_is_kernel(btf)) +		return false; +	return btf_id_set_contains(&rcu_protected_types, btf_id); +} + +static bool rcu_safe_kptr(const struct btf_field *field) +{ +	const struct btf_field_kptr *kptr = &field->kptr; + +	return field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id); +} +  static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,  				 int value_regno, int insn_idx,  				 struct btf_field *kptr_field) @@ -4280,7 +4630,10 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,  		 * value from map as PTR_TO_BTF_ID, with the correct type.  		 */  		mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf, -				kptr_field->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED); +				kptr_field->kptr.btf_id, +				rcu_safe_kptr(kptr_field) && in_rcu_cs(env) ? 
+				PTR_MAYBE_NULL | MEM_RCU : +				PTR_MAYBE_NULL | PTR_UNTRUSTED);  		/* For mark_ptr_or_null_reg */  		val_reg->id = ++env->id_gen;  	} else if (class == BPF_STX) { @@ -4600,6 +4953,11 @@ static bool is_rcu_reg(const struct bpf_reg_state *reg)  	return reg->type & MEM_RCU;  } +static void clear_trusted_flags(enum bpf_type_flag *flag) +{ +	*flag &= ~(BPF_REG_TRUSTED_MODIFIERS | MEM_RCU); +} +  static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,  				   const struct bpf_reg_state *reg,  				   int off, int size, bool strict) @@ -5003,23 +5361,110 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val)  	return 0;  } -#define BTF_TYPE_SAFE_NESTED(__type)  __PASTE(__type, __safe_fields) +#define BTF_TYPE_SAFE_RCU(__type)  __PASTE(__type, __safe_rcu) +#define BTF_TYPE_SAFE_RCU_OR_NULL(__type)  __PASTE(__type, __safe_rcu_or_null) +#define BTF_TYPE_SAFE_TRUSTED(__type)  __PASTE(__type, __safe_trusted) + +/* + * Allow list few fields as RCU trusted or full trusted. + * This logic doesn't allow mix tagging and will be removed once GCC supports + * btf_type_tag. 
+ */ -BTF_TYPE_SAFE_NESTED(struct task_struct) { +/* RCU trusted: these fields are trusted in RCU CS and never NULL */ +BTF_TYPE_SAFE_RCU(struct task_struct) {  	const cpumask_t *cpus_ptr; +	struct css_set __rcu *cgroups; +	struct task_struct __rcu *real_parent; +	struct task_struct *group_leader;  }; -static bool nested_ptr_is_trusted(struct bpf_verifier_env *env, -				  struct bpf_reg_state *reg, -				  int off) +BTF_TYPE_SAFE_RCU(struct cgroup) { +	/* cgrp->kn is always accessible as documented in kernel/cgroup/cgroup.c */ +	struct kernfs_node *kn; +}; + +BTF_TYPE_SAFE_RCU(struct css_set) { +	struct cgroup *dfl_cgrp; +}; + +/* RCU trusted: these fields are trusted in RCU CS and can be NULL */ +BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct) { +	struct file __rcu *exe_file; +}; + +/* skb->sk, req->sk are not RCU protected, but we mark them as such + * because bpf prog accessible sockets are SOCK_RCU_FREE. + */ +BTF_TYPE_SAFE_RCU_OR_NULL(struct sk_buff) { +	struct sock *sk; +}; + +BTF_TYPE_SAFE_RCU_OR_NULL(struct request_sock) { +	struct sock *sk; +}; + +/* full trusted: these fields are trusted even outside of RCU CS and never NULL */ +BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta) { +	struct seq_file *seq; +}; + +BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task) { +	struct bpf_iter_meta *meta; +	struct task_struct *task; +}; + +BTF_TYPE_SAFE_TRUSTED(struct linux_binprm) { +	struct file *file; +}; + +BTF_TYPE_SAFE_TRUSTED(struct file) { +	struct inode *f_inode; +}; + +BTF_TYPE_SAFE_TRUSTED(struct dentry) { +	/* no negative dentry-s in places where bpf can see it */ +	struct inode *d_inode; +}; + +BTF_TYPE_SAFE_TRUSTED(struct socket) { +	struct sock *sk; +}; + +static bool type_is_rcu(struct bpf_verifier_env *env, +			struct bpf_reg_state *reg, +			const char *field_name, u32 btf_id)  { -	/* If its parent is not trusted, it can't regain its trusted status. 
*/ -	if (!is_trusted_reg(reg)) -		return false; +	BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct task_struct)); +	BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct cgroup)); +	BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct css_set)); + +	return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu"); +} + +static bool type_is_rcu_or_null(struct bpf_verifier_env *env, +				struct bpf_reg_state *reg, +				const char *field_name, u32 btf_id) +{ +	BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct)); +	BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct sk_buff)); +	BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct request_sock)); + +	return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu_or_null"); +} -	BTF_TYPE_EMIT(BTF_TYPE_SAFE_NESTED(struct task_struct)); +static bool type_is_trusted(struct bpf_verifier_env *env, +			    struct bpf_reg_state *reg, +			    const char *field_name, u32 btf_id) +{ +	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta)); +	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task)); +	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct linux_binprm)); +	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct file)); +	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct dentry)); +	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct socket)); -	return btf_nested_type_is_trusted(&env->log, reg, off); +	return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted");  }  static int check_ptr_to_btf_access(struct bpf_verifier_env *env, @@ -5031,8 +5476,9 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,  	struct bpf_reg_state *reg = regs + regno;  	const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id);  	const char *tname = btf_name_by_offset(reg->btf, t->name_off); +	const char *field_name = NULL;  	enum bpf_type_flag flag = 0; -	u32 btf_id; +	u32 btf_id = 0;  	int ret;  	if (!env->allow_ptr_leaks) { @@ -5077,12 +5523,12 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,  		return 
-EACCES;  	} -	if (env->ops->btf_struct_access && !type_is_alloc(reg->type)) { +	if (env->ops->btf_struct_access && !type_is_alloc(reg->type) && atype == BPF_WRITE) {  		if (!btf_is_kernel(reg->btf)) {  			verbose(env, "verifier internal error: reg->btf must be kernel btf\n");  			return -EFAULT;  		} -		ret = env->ops->btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag); +		ret = env->ops->btf_struct_access(&env->log, reg, off, size);  	} else {  		/* Writes are permitted with default btf_struct_access for  		 * program allocated objects (which always have ref_obj_id > 0), @@ -5099,47 +5545,63 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,  			return -EFAULT;  		} -		ret = btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag); +		ret = btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag, &field_name);  	}  	if (ret < 0)  		return ret; -	/* If this is an untrusted pointer, all pointers formed by walking it -	 * also inherit the untrusted flag. -	 */ -	if (type_flag(reg->type) & PTR_UNTRUSTED) -		flag |= PTR_UNTRUSTED; +	if (ret != PTR_TO_BTF_ID) { +		/* just mark; */ -	/* By default any pointer obtained from walking a trusted pointer is no -	 * longer trusted, unless the field being accessed has explicitly been -	 * marked as inheriting its parent's state of trust. -	 * -	 * An RCU-protected pointer can also be deemed trusted if we are in an -	 * RCU read region. This case is handled below. -	 */ -	if (nested_ptr_is_trusted(env, reg, off)) -		flag |= PTR_TRUSTED; -	else -		flag &= ~PTR_TRUSTED; - -	if (flag & MEM_RCU) { -		/* Mark value register as MEM_RCU only if it is protected by -		 * bpf_rcu_read_lock() and the ptr reg is rcu or trusted. MEM_RCU -		 * itself can already indicate trustedness inside the rcu -		 * read lock region. Also mark rcu pointer as PTR_MAYBE_NULL since -		 * it could be null in some cases. 
+	} else if (type_flag(reg->type) & PTR_UNTRUSTED) { +		/* If this is an untrusted pointer, all pointers formed by walking it +		 * also inherit the untrusted flag.  		 */ -		if (!env->cur_state->active_rcu_lock || -		    !(is_trusted_reg(reg) || is_rcu_reg(reg))) -			flag &= ~MEM_RCU; -		else -			flag |= PTR_MAYBE_NULL; -	} else if (reg->type & MEM_RCU) { -		/* ptr (reg) is marked as MEM_RCU, but the struct field is not tagged -		 * with __rcu. Mark the flag as PTR_UNTRUSTED conservatively. +		flag = PTR_UNTRUSTED; + +	} else if (is_trusted_reg(reg) || is_rcu_reg(reg)) { +		/* By default any pointer obtained from walking a trusted pointer is no +		 * longer trusted, unless the field being accessed has explicitly been +		 * marked as inheriting its parent's state of trust (either full or RCU). +		 * For example: +		 * 'cgroups' pointer is untrusted if task->cgroups dereference +		 * happened in a sleepable program outside of bpf_rcu_read_lock() +		 * section. In a non-sleepable program it's trusted while in RCU CS (aka MEM_RCU). +		 * Note bpf_rcu_read_unlock() converts MEM_RCU pointers to PTR_UNTRUSTED. +		 * +		 * A regular RCU-protected pointer with __rcu tag can also be deemed +		 * trusted if we are in an RCU CS. Such pointer can be NULL.  		 
*/ -		flag |= PTR_UNTRUSTED; +		if (type_is_trusted(env, reg, field_name, btf_id)) { +			flag |= PTR_TRUSTED; +		} else if (in_rcu_cs(env) && !type_may_be_null(reg->type)) { +			if (type_is_rcu(env, reg, field_name, btf_id)) { +				/* ignore __rcu tag and mark it MEM_RCU */ +				flag |= MEM_RCU; +			} else if (flag & MEM_RCU || +				   type_is_rcu_or_null(env, reg, field_name, btf_id)) { +				/* __rcu tagged pointers can be NULL */ +				flag |= MEM_RCU | PTR_MAYBE_NULL; +			} else if (flag & (MEM_PERCPU | MEM_USER)) { +				/* keep as-is */ +			} else { +				/* walking unknown pointers yields old deprecated PTR_TO_BTF_ID */ +				clear_trusted_flags(&flag); +			} +		} else { +			/* +			 * If not in RCU CS or MEM_RCU pointer can be NULL then +			 * aggressively mark as untrusted otherwise such +			 * pointers will be plain PTR_TO_BTF_ID without flags +			 * and will be allowed to be passed into helpers for +			 * compat reasons. +			 */ +			flag = PTR_UNTRUSTED; +		} +	} else { +		/* Old compat. 
Deprecated */ +		clear_trusted_flags(&flag);  	}  	if (atype == BPF_READ && value_regno >= 0) @@ -5198,7 +5660,7 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,  	/* Simulate access to a PTR_TO_BTF_ID */  	memset(&map_reg, 0, sizeof(map_reg));  	mark_btf_ld_reg(env, &map_reg, 0, PTR_TO_BTF_ID, btf_vmlinux, *map->ops->map_btf_id, 0); -	ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag); +	ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag, NULL);  	if (ret < 0)  		return ret; @@ -5864,6 +6326,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,  				env,  				regno, reg->off, access_size,  				zero_size_allowed, ACCESS_HELPER, meta); +	case PTR_TO_BTF_ID: +		return check_ptr_to_btf_access(env, regs, regno, reg->off, +					       access_size, BPF_READ, -1);  	case PTR_TO_CTX:  		/* in case the function doesn't know how to access the context,  		 * (because we are in a program of type SYSCALL for example), we @@ -6211,11 +6676,11 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno,   * Helpers which do not mutate the bpf_dynptr set MEM_RDONLY in their argument   * type, and declare it as 'const struct bpf_dynptr *' in their prototype.   
*/ -int process_dynptr_func(struct bpf_verifier_env *env, int regno, -			enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) +static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn_idx, +			       enum bpf_arg_type arg_type)  {  	struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; -	int spi = 0; +	int err;  	/* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an  	 * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*): @@ -6224,15 +6689,6 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,  		verbose(env, "verifier internal error: misconfigured dynptr helper type flags\n");  		return -EFAULT;  	} -	/* CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to -	 * check_func_arg_reg_off's logic. We only need to check offset -	 * and its alignment for PTR_TO_STACK. -	 */ -	if (reg->type == PTR_TO_STACK) { -		spi = dynptr_get_spi(env, reg); -		if (spi < 0 && spi != -ERANGE) -			return spi; -	}  	/*  MEM_UNINIT - Points to memory that is an appropriate candidate for  	 *		 constructing a mutable bpf_dynptr object. @@ -6250,30 +6706,30 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,  	 *		 to.  	 */  	if (arg_type & MEM_UNINIT) { -		if (!is_dynptr_reg_valid_uninit(env, reg, spi)) { +		int i; + +		if (!is_dynptr_reg_valid_uninit(env, reg)) {  			verbose(env, "Dynptr has to be an uninitialized dynptr\n");  			return -EINVAL;  		} -		/* We only support one dynptr being uninitialized at the moment, -		 * which is sufficient for the helper functions we have right now. 
-		 */ -		if (meta->uninit_dynptr_regno) { -			verbose(env, "verifier internal error: multiple uninitialized dynptr args\n"); -			return -EFAULT; +		/* we write BPF_DW bits (8 bytes) at a time */ +		for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) { +			err = check_mem_access(env, insn_idx, regno, +					       i, BPF_DW, BPF_WRITE, -1, false); +			if (err) +				return err;  		} -		meta->uninit_dynptr_regno = regno; +		err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx);  	} else /* MEM_RDONLY and None case from above */ { -		int err; -  		/* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */  		if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) {  			verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n");  			return -EINVAL;  		} -		if (!is_dynptr_reg_valid_init(env, reg, spi)) { +		if (!is_dynptr_reg_valid_init(env, reg)) {  			verbose(env,  				"Expected an initialized dynptr as arg #%d\n",  				regno); @@ -6282,29 +6738,211 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,  		/* Fold modifiers (in this case, MEM_RDONLY) when checking expected type */  		if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) { -			const char *err_extra = ""; - -			switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { -			case DYNPTR_TYPE_LOCAL: -				err_extra = "local"; -				break; -			case DYNPTR_TYPE_RINGBUF: -				err_extra = "ringbuf"; -				break; -			default: -				err_extra = "<unknown>"; -				break; -			}  			verbose(env,  				"Expected a dynptr of type %s as arg #%d\n", -				err_extra, regno); +				dynptr_type_str(arg_to_dynptr_type(arg_type)), regno);  			return -EINVAL;  		}  		err = mark_dynptr_read(env, reg); +	} +	return err; +} + +static u32 iter_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi) +{ +	struct bpf_func_state *state = func(env, reg); + +	return state->stack[spi].spilled_ptr.ref_obj_id; +} + +static bool is_iter_kfunc(struct bpf_kfunc_call_arg_meta *meta) 
+{ +	return meta->kfunc_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY); +} + +static bool is_iter_new_kfunc(struct bpf_kfunc_call_arg_meta *meta) +{ +	return meta->kfunc_flags & KF_ITER_NEW; +} + +static bool is_iter_next_kfunc(struct bpf_kfunc_call_arg_meta *meta) +{ +	return meta->kfunc_flags & KF_ITER_NEXT; +} + +static bool is_iter_destroy_kfunc(struct bpf_kfunc_call_arg_meta *meta) +{ +	return meta->kfunc_flags & KF_ITER_DESTROY; +} + +static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg) +{ +	/* btf_check_iter_kfuncs() guarantees that first argument of any iter +	 * kfunc is iter state pointer +	 */ +	return arg == 0 && is_iter_kfunc(meta); +} + +static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx, +			    struct bpf_kfunc_call_arg_meta *meta) +{ +	struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; +	const struct btf_type *t; +	const struct btf_param *arg; +	int spi, err, i, nr_slots; +	u32 btf_id; + +	/* btf_check_iter_kfuncs() ensures we don't need to validate anything here */ +	arg = &btf_params(meta->func_proto)[0]; +	t = btf_type_skip_modifiers(meta->btf, arg->type, NULL);	/* PTR */ +	t = btf_type_skip_modifiers(meta->btf, t->type, &btf_id);	/* STRUCT */ +	nr_slots = t->size / BPF_REG_SIZE; + +	if (is_iter_new_kfunc(meta)) { +		/* bpf_iter_<type>_new() expects pointer to uninit iter state */ +		if (!is_iter_reg_valid_uninit(env, reg, nr_slots)) { +			verbose(env, "expected uninitialized iter_%s as arg #%d\n", +				iter_type_str(meta->btf, btf_id), regno); +			return -EINVAL; +		} + +		for (i = 0; i < nr_slots * 8; i += BPF_REG_SIZE) { +			err = check_mem_access(env, insn_idx, regno, +					       i, BPF_DW, BPF_WRITE, -1, false); +			if (err) +				return err; +		} + +		err = mark_stack_slots_iter(env, reg, insn_idx, meta->btf, btf_id, nr_slots); +		if (err) +			return err; +	} else { +		/* iter_next() or iter_destroy() expect initialized iter state*/ +		if 
(!is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots)) { +			verbose(env, "expected an initialized iter_%s as arg #%d\n", +				iter_type_str(meta->btf, btf_id), regno); +			return -EINVAL; +		} + +		spi = iter_get_spi(env, reg, nr_slots); +		if (spi < 0) +			return spi; + +		err = mark_iter_read(env, reg, spi, nr_slots);  		if (err)  			return err; + +		/* remember meta->iter info for process_iter_next_call() */ +		meta->iter.spi = spi; +		meta->iter.frameno = reg->frameno; +		meta->ref_obj_id = iter_ref_obj_id(env, reg, spi); + +		if (is_iter_destroy_kfunc(meta)) { +			err = unmark_stack_slots_iter(env, reg, nr_slots); +			if (err) +				return err; +		} +	} + +	return 0; +} + +/* process_iter_next_call() is called when verifier gets to iterator's next + * "method" (e.g., bpf_iter_num_next() for numbers iterator) call. We'll refer + * to it as just "iter_next()" in comments below. + * + * BPF verifier relies on a crucial contract for any iter_next() + * implementation: it should *eventually* return NULL, and once that happens + * it should keep returning NULL. That is, once iterator exhausts elements to + * iterate, it should never reset or spuriously return new elements. + * + * With the assumption of such contract, process_iter_next_call() simulates + * a fork in the verifier state to validate loop logic correctness and safety + * without having to simulate infinite amount of iterations. + * + * In current state, we first assume that iter_next() returned NULL and + * iterator state is set to DRAINED (BPF_ITER_STATE_DRAINED). In such + * conditions we should not form an infinite loop and should eventually reach + * exit. + * + * Besides that, we also fork current state and enqueue it for later + * verification. In a forked state we keep iterator state as ACTIVE + * (BPF_ITER_STATE_ACTIVE) and assume non-NULL return from iter_next(). 
We + * also bump iteration depth to prevent erroneous infinite loop detection + * later on (see iter_active_depths_differ() comment for details). In this + * state we assume that we'll eventually loop back to another iter_next() + * calls (it could be in exactly same location or in some other instruction, + * it doesn't matter, we don't make any unnecessary assumptions about this, + * everything revolves around iterator state in a stack slot, not which + * instruction is calling iter_next()). When that happens, we either will come + * to iter_next() with equivalent state and can conclude that next iteration + * will proceed in exactly the same way as we just verified, so it's safe to + * assume that loop converges. If not, we'll go on another iteration + * simulation with a different input state, until all possible starting states + * are validated or we reach maximum number of instructions limit. + * + * This way, we will either exhaustively discover all possible input states + * that iterator loop can start with and eventually will converge, or we'll + * effectively regress into bounded loop simulation logic and either reach + * maximum number of instructions if loop is not provably convergent, or there + * is some statically known limit on number of iterations (e.g., if there is + * an explicit `if n > 100 then break;` statement somewhere in the loop). + * + * One very subtle but very important aspect is that we *always* simulate NULL + * condition first (as the current state) before we simulate non-NULL case. + * This has to do with intricacies of scalar precision tracking. By simulating + * "exit condition" of iter_next() returning NULL first, we make sure all the + * relevant precision marks *that will be set **after** we exit iterator loop* + * are propagated backwards to common parent state of NULL and non-NULL + * branches. 
Thanks to that, state equivalence checks done later in forked + * state, when reaching iter_next() for ACTIVE iterator, can assume that + * precision marks are finalized and won't change. Because simulating another + * ACTIVE iterator iteration won't change them (because given same input + * states we'll end up with exactly same output states which we are currently + * comparing; and verification after the loop already propagated back what + * needs to be **additionally** tracked as precise). It's subtle, grok + * precision tracking for more intuitive understanding. + */ +static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, +				  struct bpf_kfunc_call_arg_meta *meta) +{ +	struct bpf_verifier_state *cur_st = env->cur_state, *queued_st; +	struct bpf_func_state *cur_fr = cur_st->frame[cur_st->curframe], *queued_fr; +	struct bpf_reg_state *cur_iter, *queued_iter; +	int iter_frameno = meta->iter.frameno; +	int iter_spi = meta->iter.spi; + +	BTF_TYPE_EMIT(struct bpf_iter); + +	cur_iter = &env->cur_state->frame[iter_frameno]->stack[iter_spi].spilled_ptr; + +	if (cur_iter->iter.state != BPF_ITER_STATE_ACTIVE && +	    cur_iter->iter.state != BPF_ITER_STATE_DRAINED) { +		verbose(env, "verifier internal error: unexpected iterator state %d (%s)\n", +			cur_iter->iter.state, iter_state_str(cur_iter->iter.state)); +		return -EFAULT; +	} + +	if (cur_iter->iter.state == BPF_ITER_STATE_ACTIVE) { +		/* branch out active iter state */ +		queued_st = push_stack(env, insn_idx + 1, insn_idx, false); +		if (!queued_st) +			return -ENOMEM; + +		queued_iter = &queued_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr; +		queued_iter->iter.state = BPF_ITER_STATE_ACTIVE; +		queued_iter->iter.depth++; + +		queued_fr = queued_st->frame[queued_st->curframe]; +		mark_ptr_not_null_reg(&queued_fr->regs[BPF_REG_0]);  	} + +	/* switch to DRAINED state, but keep the depth unchanged */ +	/* mark current iter state as drained and assume returned NULL */ +	
cur_iter->iter.state = BPF_ITER_STATE_DRAINED; +	__mark_reg_const_zero(&cur_fr->regs[BPF_REG_0]); +  	return 0;  } @@ -6402,6 +7040,7 @@ static const struct bpf_reg_types mem_types = {  		PTR_TO_MEM,  		PTR_TO_MEM | MEM_RINGBUF,  		PTR_TO_BUF, +		PTR_TO_BTF_ID | PTR_TRUSTED,  	},  }; @@ -6511,6 +7150,9 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,  	if (arg_type & PTR_MAYBE_NULL)  		type &= ~PTR_MAYBE_NULL; +	if (meta->func_id == BPF_FUNC_kptr_xchg && type & MEM_ALLOC) +		type &= ~MEM_ALLOC; +  	for (i = 0; i < ARRAY_SIZE(compatible->types); i++) {  		expected = compatible->types[i];  		if (expected == NOT_INIT) @@ -6527,7 +7169,27 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,  	return -EACCES;  found: -	if (reg->type == PTR_TO_BTF_ID || reg->type & PTR_TRUSTED) { +	if (base_type(reg->type) != PTR_TO_BTF_ID) +		return 0; + +	if (compatible == &mem_types) { +		if (!(arg_type & MEM_RDONLY)) { +			verbose(env, +				"%s() may write into memory pointed by R%d type=%s\n", +				func_id_name(meta->func_id), +				regno, reg_type_str(env, reg->type)); +			return -EACCES; +		} +		return 0; +	} + +	switch ((int)reg->type) { +	case PTR_TO_BTF_ID: +	case PTR_TO_BTF_ID | PTR_TRUSTED: +	case PTR_TO_BTF_ID | MEM_RCU: +	case PTR_TO_BTF_ID | PTR_MAYBE_NULL: +	case PTR_TO_BTF_ID | PTR_MAYBE_NULL | MEM_RCU: +	{  		/* For bpf_sk_release, it needs to match against first member  		 * 'struct sock_common', hence make an exception for it. This  		 * allows bpf_sk_release to work for multiple socket types. 
@@ -6535,6 +7197,12 @@ found:  		bool strict_type_match = arg_type_is_release(arg_type) &&  					 meta->func_id != BPF_FUNC_sk_release; +		if (type_may_be_null(reg->type) && +		    (!type_may_be_null(arg_type) || arg_type_is_release(arg_type))) { +			verbose(env, "Possibly NULL pointer passed to helper arg%d\n", regno); +			return -EACCES; +		} +  		if (!arg_btf_id) {  			if (!compatible->btf_id) {  				verbose(env, "verifier internal error: missing arg compatible BTF ID\n"); @@ -6558,18 +7226,29 @@ found:  						  btf_vmlinux, *arg_btf_id,  						  strict_type_match)) {  				verbose(env, "R%d is of type %s but %s is expected\n", -					regno, kernel_type_name(reg->btf, reg->btf_id), -					kernel_type_name(btf_vmlinux, *arg_btf_id)); +					regno, btf_type_name(reg->btf, reg->btf_id), +					btf_type_name(btf_vmlinux, *arg_btf_id));  				return -EACCES;  			}  		} -	} else if (type_is_alloc(reg->type)) { -		if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock) { +		break; +	} +	case PTR_TO_BTF_ID | MEM_ALLOC: +		if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock && +		    meta->func_id != BPF_FUNC_kptr_xchg) {  			verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n");  			return -EFAULT;  		} +		/* Handled by helper specific checks */ +		break; +	case PTR_TO_BTF_ID | MEM_PERCPU: +	case PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED: +		/* Handled by helper specific checks */ +		break; +	default: +		verbose(env, "verifier internal error: invalid PTR_TO_BTF_ID register for type match\n"); +		return -EFAULT;  	} -  	return 0;  } @@ -6619,7 +7298,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,  			verbose(env, "R%d must have zero offset when passed to release func\n",  				regno);  			verbose(env, "No graph node or root found at R%d type:%s off:%d\n", regno, -				kernel_type_name(reg->btf, reg->btf_id), reg->off); +				btf_type_name(reg->btf, reg->btf_id), reg->off);  			return 
-EINVAL;  		} @@ -6656,7 +7335,6 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,  	case PTR_TO_BTF_ID | MEM_ALLOC:  	case PTR_TO_BTF_ID | PTR_TRUSTED:  	case PTR_TO_BTF_ID | MEM_RCU: -	case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED:  	case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF:  		/* When referenced PTR_TO_BTF_ID is passed to release function,  		 * its fixed offset must be 0. In the other cases, fixed offset @@ -6671,6 +7349,28 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,  	}  } +static struct bpf_reg_state *get_dynptr_arg_reg(struct bpf_verifier_env *env, +						const struct bpf_func_proto *fn, +						struct bpf_reg_state *regs) +{ +	struct bpf_reg_state *state = NULL; +	int i; + +	for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) +		if (arg_type_is_dynptr(fn->arg_type[i])) { +			if (state) { +				verbose(env, "verifier internal error: multiple dynptr args\n"); +				return NULL; +			} +			state = ®s[BPF_REG_1 + i]; +		} + +	if (!state) +		verbose(env, "verifier internal error: no dynptr arg found\n"); + +	return state; +} +  static int dynptr_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)  {  	struct bpf_func_state *state = func(env, reg); @@ -6697,9 +7397,28 @@ static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state  	return state->stack[spi].spilled_ptr.ref_obj_id;  } +static enum bpf_dynptr_type dynptr_get_type(struct bpf_verifier_env *env, +					    struct bpf_reg_state *reg) +{ +	struct bpf_func_state *state = func(env, reg); +	int spi; + +	if (reg->type == CONST_PTR_TO_DYNPTR) +		return reg->dynptr.type; + +	spi = __get_spi(reg->off); +	if (spi < 0) { +		verbose(env, "verifier internal error: invalid spi when querying dynptr type\n"); +		return BPF_DYNPTR_TYPE_INVALID; +	} + +	return state->stack[spi].spilled_ptr.dynptr.type; +} +  static int check_func_arg(struct bpf_verifier_env *env, u32 arg,  			  struct bpf_call_arg_meta *meta, -			  const struct bpf_func_proto *fn) +			  const struct 
bpf_func_proto *fn, +			  int insn_idx)  {  	u32 regno = BPF_REG_1 + arg;  	struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; @@ -6912,7 +7631,7 @@ skip_type_check:  		err = check_mem_size_reg(env, reg, regno, true, meta);  		break;  	case ARG_PTR_TO_DYNPTR: -		err = process_dynptr_func(env, regno, arg_type, meta); +		err = process_dynptr_func(env, regno, insn_idx, arg_type);  		if (err)  			return err;  		break; @@ -7131,22 +7850,26 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,  		break;  	case BPF_MAP_TYPE_SK_STORAGE:  		if (func_id != BPF_FUNC_sk_storage_get && -		    func_id != BPF_FUNC_sk_storage_delete) +		    func_id != BPF_FUNC_sk_storage_delete && +		    func_id != BPF_FUNC_kptr_xchg)  			goto error;  		break;  	case BPF_MAP_TYPE_INODE_STORAGE:  		if (func_id != BPF_FUNC_inode_storage_get && -		    func_id != BPF_FUNC_inode_storage_delete) +		    func_id != BPF_FUNC_inode_storage_delete && +		    func_id != BPF_FUNC_kptr_xchg)  			goto error;  		break;  	case BPF_MAP_TYPE_TASK_STORAGE:  		if (func_id != BPF_FUNC_task_storage_get && -		    func_id != BPF_FUNC_task_storage_delete) +		    func_id != BPF_FUNC_task_storage_delete && +		    func_id != BPF_FUNC_kptr_xchg)  			goto error;  		break;  	case BPF_MAP_TYPE_CGRP_STORAGE:  		if (func_id != BPF_FUNC_cgrp_storage_get && -		    func_id != BPF_FUNC_cgrp_storage_delete) +		    func_id != BPF_FUNC_cgrp_storage_delete && +		    func_id != BPF_FUNC_kptr_xchg)  			goto error;  		break;  	case BPF_MAP_TYPE_BLOOM_FILTER: @@ -7360,6 +8083,9 @@ static int check_func_proto(const struct bpf_func_proto *fn, int func_id)  /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]   * are now invalid, so turn them into unknown SCALAR_VALUE. + * + * This also applies to dynptr slices belonging to skb and xdp dynptrs, + * since these slices point to packet data.   
*/  static void clear_all_pkt_pointers(struct bpf_verifier_env *env)  { @@ -7367,8 +8093,8 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)  	struct bpf_reg_state *reg;  	bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ -		if (reg_is_pkt_pointer_any(reg)) -			__mark_reg_unknown(env, reg); +		if (reg_is_pkt_pointer_any(reg) || reg_is_dynptr_slice_pkt(reg)) +			mark_reg_invalid(env, reg);  	}));  } @@ -7413,12 +8139,8 @@ static int release_reference(struct bpf_verifier_env *env,  		return err;  	bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ -		if (reg->ref_obj_id == ref_obj_id) { -			if (!env->allow_ptr_leaks) -				__mark_reg_not_init(env, reg); -			else -				__mark_reg_unknown(env, reg); -		} +		if (reg->ref_obj_id == ref_obj_id) +			mark_reg_invalid(env, reg);  	}));  	return 0; @@ -7431,7 +8153,7 @@ static void invalidate_non_owning_refs(struct bpf_verifier_env *env)  	bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({  		if (type_is_non_owning_ref(reg->type)) -			__mark_reg_unknown(env, reg); +			mark_reg_invalid(env, reg);  	}));  } @@ -7793,10 +8515,10 @@ static int set_rbtree_add_callback_state(struct bpf_verifier_env *env,  					 struct bpf_func_state *callee,  					 int insn_idx)  { -	/* void bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node, +	/* void bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,  	 *                     bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b));  	 * -	 * 'struct bpf_rb_node *node' arg to bpf_rbtree_add is the same PTR_TO_BTF_ID w/ offset +	 * 'struct bpf_rb_node *node' arg to bpf_rbtree_add_impl is the same PTR_TO_BTF_ID w/ offset  	 * that 'less' callback args will be receiving. 
However, 'node' arg was release_reference'd  	 * by this point, so look at 'root'  	 */ @@ -8202,7 +8924,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn  	meta.func_id = func_id;  	/* check args */  	for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { -		err = check_func_arg(env, i, &meta, fn); +		err = check_func_arg(env, i, &meta, fn, insn_idx);  		if (err)  			return err;  	} @@ -8227,30 +8949,6 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn  	regs = cur_regs(env); -	/* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot -	 * be reinitialized by any dynptr helper. Hence, mark_stack_slots_dynptr -	 * is safe to do directly. -	 */ -	if (meta.uninit_dynptr_regno) { -		if (regs[meta.uninit_dynptr_regno].type == CONST_PTR_TO_DYNPTR) { -			verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be initialized\n"); -			return -EFAULT; -		} -		/* we write BPF_DW bits (8 bytes) at a time */ -		for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) { -			err = check_mem_access(env, insn_idx, meta.uninit_dynptr_regno, -					       i, BPF_DW, BPF_WRITE, -1, false); -			if (err) -				return err; -		} - -		err = mark_stack_slots_dynptr(env, ®s[meta.uninit_dynptr_regno], -					      fn->arg_type[meta.uninit_dynptr_regno - BPF_REG_1], -					      insn_idx); -		if (err) -			return err; -	} -  	if (meta.release_regno) {  		err = -EINVAL;  		/* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot @@ -8335,43 +9033,62 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn  		}  		break;  	case BPF_FUNC_dynptr_data: -		for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { -			if (arg_type_is_dynptr(fn->arg_type[i])) { -				struct bpf_reg_state *reg = ®s[BPF_REG_1 + i]; -				int id, ref_obj_id; +	{ +		struct bpf_reg_state *reg; +		int id, ref_obj_id; -				if (meta.dynptr_id) { -					verbose(env, "verifier internal error: meta.dynptr_id already set\n"); -					return 
-EFAULT; -				} +		reg = get_dynptr_arg_reg(env, fn, regs); +		if (!reg) +			return -EFAULT; -				if (meta.ref_obj_id) { -					verbose(env, "verifier internal error: meta.ref_obj_id already set\n"); -					return -EFAULT; -				} -				id = dynptr_id(env, reg); -				if (id < 0) { -					verbose(env, "verifier internal error: failed to obtain dynptr id\n"); -					return id; -				} +		if (meta.dynptr_id) { +			verbose(env, "verifier internal error: meta.dynptr_id already set\n"); +			return -EFAULT; +		} +		if (meta.ref_obj_id) { +			verbose(env, "verifier internal error: meta.ref_obj_id already set\n"); +			return -EFAULT; +		} -				ref_obj_id = dynptr_ref_obj_id(env, reg); -				if (ref_obj_id < 0) { -					verbose(env, "verifier internal error: failed to obtain dynptr ref_obj_id\n"); -					return ref_obj_id; -				} +		id = dynptr_id(env, reg); +		if (id < 0) { +			verbose(env, "verifier internal error: failed to obtain dynptr id\n"); +			return id; +		} -				meta.dynptr_id = id; -				meta.ref_obj_id = ref_obj_id; -				break; -			} +		ref_obj_id = dynptr_ref_obj_id(env, reg); +		if (ref_obj_id < 0) { +			verbose(env, "verifier internal error: failed to obtain dynptr ref_obj_id\n"); +			return ref_obj_id;  		} -		if (i == MAX_BPF_FUNC_REG_ARGS) { -			verbose(env, "verifier internal error: no dynptr in bpf_dynptr_data()\n"); + +		meta.dynptr_id = id; +		meta.ref_obj_id = ref_obj_id; + +		break; +	} +	case BPF_FUNC_dynptr_write: +	{ +		enum bpf_dynptr_type dynptr_type; +		struct bpf_reg_state *reg; + +		reg = get_dynptr_arg_reg(env, fn, regs); +		if (!reg)  			return -EFAULT; -		} + +		dynptr_type = dynptr_get_type(env, reg); +		if (dynptr_type == BPF_DYNPTR_TYPE_INVALID) +			return -EFAULT; + +		if (dynptr_type == BPF_DYNPTR_TYPE_SKB) +			/* this will trigger clear_all_pkt_pointers(), which will +			 * invalidate all dynptr slices associated with the skb +			 */ +			changes_data = true; +  		break; +	}  	case BPF_FUNC_user_ringbuf_drain:  		err = __check_func_call(env, 
insn, insn_idx_p, meta.subprogno,  					set_user_ringbuf_callback_state); @@ -8484,6 +9201,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn  		if (func_id == BPF_FUNC_kptr_xchg) {  			ret_btf = meta.kptr_field->kptr.btf;  			ret_btf_id = meta.kptr_field->kptr.btf_id; +			if (!btf_is_kernel(ret_btf)) +				regs[BPF_REG_0].type |= MEM_ALLOC;  		} else {  			if (fn->ret_btf_id == BPF_PTR_POISON) {  				verbose(env, "verifier internal error:"); @@ -8600,36 +9319,6 @@ static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno,  	}  } -struct bpf_kfunc_call_arg_meta { -	/* In parameters */ -	struct btf *btf; -	u32 func_id; -	u32 kfunc_flags; -	const struct btf_type *func_proto; -	const char *func_name; -	/* Out parameters */ -	u32 ref_obj_id; -	u8 release_regno; -	bool r0_rdonly; -	u32 ret_btf_id; -	u64 r0_size; -	u32 subprogno; -	struct { -		u64 value; -		bool found; -	} arg_constant; -	struct { -		struct btf *btf; -		u32 btf_id; -	} arg_obj_drop; -	struct { -		struct btf_field *field; -	} arg_list_head; -	struct { -		struct btf_field *field; -	} arg_rbtree_root; -}; -  static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta)  {  	return meta->kfunc_flags & KF_ACQUIRE; @@ -8647,7 +9336,7 @@ static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta)  static bool is_kfunc_trusted_args(struct bpf_kfunc_call_arg_meta *meta)  { -	return meta->kfunc_flags & KF_TRUSTED_ARGS; +	return (meta->kfunc_flags & KF_TRUSTED_ARGS) || is_kfunc_release(meta);  }  static bool is_kfunc_sleepable(struct bpf_kfunc_call_arg_meta *meta) @@ -8665,11 +9354,6 @@ static bool is_kfunc_rcu(struct bpf_kfunc_call_arg_meta *meta)  	return meta->kfunc_flags & KF_RCU;  } -static bool is_kfunc_arg_kptr_get(struct bpf_kfunc_call_arg_meta *meta, int arg) -{ -	return arg == 0 && (meta->kfunc_flags & KF_KPTR_GET); -} -  static bool __kfunc_param_match_suffix(const struct btf *btf,  				       const struct btf_param *arg,  				       const 
char *suffix) @@ -8701,6 +9385,19 @@ static bool is_kfunc_arg_mem_size(const struct btf *btf,  	return __kfunc_param_match_suffix(btf, arg, "__sz");  } +static bool is_kfunc_arg_const_mem_size(const struct btf *btf, +					const struct btf_param *arg, +					const struct bpf_reg_state *reg) +{ +	const struct btf_type *t; + +	t = btf_type_skip_modifiers(btf, arg->type, NULL); +	if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE) +		return false; + +	return __kfunc_param_match_suffix(btf, arg, "__szk"); +} +  static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg)  {  	return __kfunc_param_match_suffix(btf, arg, "__k"); @@ -8716,6 +9413,16 @@ static bool is_kfunc_arg_alloc_obj(const struct btf *btf, const struct btf_param  	return __kfunc_param_match_suffix(btf, arg, "__alloc");  } +static bool is_kfunc_arg_uninit(const struct btf *btf, const struct btf_param *arg) +{ +	return __kfunc_param_match_suffix(btf, arg, "__uninit"); +} + +static bool is_kfunc_arg_refcounted_kptr(const struct btf *btf, const struct btf_param *arg) +{ +	return __kfunc_param_match_suffix(btf, arg, "__refcounted_kptr"); +} +  static bool is_kfunc_arg_scalar_with_name(const struct btf *btf,  					  const struct btf_param *arg,  					  const char *name) @@ -8855,14 +9562,15 @@ static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {  enum kfunc_ptr_arg_type {  	KF_ARG_PTR_TO_CTX, -	KF_ARG_PTR_TO_ALLOC_BTF_ID,  /* Allocated object */ -	KF_ARG_PTR_TO_KPTR,	     /* PTR_TO_KPTR but type specific */ +	KF_ARG_PTR_TO_ALLOC_BTF_ID,    /* Allocated object */ +	KF_ARG_PTR_TO_REFCOUNTED_KPTR, /* Refcounted local kptr */  	KF_ARG_PTR_TO_DYNPTR, +	KF_ARG_PTR_TO_ITER,  	KF_ARG_PTR_TO_LIST_HEAD,  	KF_ARG_PTR_TO_LIST_NODE, -	KF_ARG_PTR_TO_BTF_ID,	     /* Also covers reg2btf_ids conversions */ +	KF_ARG_PTR_TO_BTF_ID,	       /* Also covers reg2btf_ids conversions */  	KF_ARG_PTR_TO_MEM, -	KF_ARG_PTR_TO_MEM_SIZE,	     /* Size derived from next argument, skip it */ +	KF_ARG_PTR_TO_MEM_SIZE,	  
     /* Size derived from next argument, skip it */  	KF_ARG_PTR_TO_CALLBACK,  	KF_ARG_PTR_TO_RB_ROOT,  	KF_ARG_PTR_TO_RB_NODE, @@ -8871,8 +9579,9 @@ enum kfunc_ptr_arg_type {  enum special_kfunc_type {  	KF_bpf_obj_new_impl,  	KF_bpf_obj_drop_impl, -	KF_bpf_list_push_front, -	KF_bpf_list_push_back, +	KF_bpf_refcount_acquire_impl, +	KF_bpf_list_push_front_impl, +	KF_bpf_list_push_back_impl,  	KF_bpf_list_pop_front,  	KF_bpf_list_pop_back,  	KF_bpf_cast_to_kern_ctx, @@ -8880,29 +9589,39 @@ enum special_kfunc_type {  	KF_bpf_rcu_read_lock,  	KF_bpf_rcu_read_unlock,  	KF_bpf_rbtree_remove, -	KF_bpf_rbtree_add, +	KF_bpf_rbtree_add_impl,  	KF_bpf_rbtree_first, +	KF_bpf_dynptr_from_skb, +	KF_bpf_dynptr_from_xdp, +	KF_bpf_dynptr_slice, +	KF_bpf_dynptr_slice_rdwr,  };  BTF_SET_START(special_kfunc_set)  BTF_ID(func, bpf_obj_new_impl)  BTF_ID(func, bpf_obj_drop_impl) -BTF_ID(func, bpf_list_push_front) -BTF_ID(func, bpf_list_push_back) +BTF_ID(func, bpf_refcount_acquire_impl) +BTF_ID(func, bpf_list_push_front_impl) +BTF_ID(func, bpf_list_push_back_impl)  BTF_ID(func, bpf_list_pop_front)  BTF_ID(func, bpf_list_pop_back)  BTF_ID(func, bpf_cast_to_kern_ctx)  BTF_ID(func, bpf_rdonly_cast)  BTF_ID(func, bpf_rbtree_remove) -BTF_ID(func, bpf_rbtree_add) +BTF_ID(func, bpf_rbtree_add_impl)  BTF_ID(func, bpf_rbtree_first) +BTF_ID(func, bpf_dynptr_from_skb) +BTF_ID(func, bpf_dynptr_from_xdp) +BTF_ID(func, bpf_dynptr_slice) +BTF_ID(func, bpf_dynptr_slice_rdwr)  BTF_SET_END(special_kfunc_set)  BTF_ID_LIST(special_kfunc_list)  BTF_ID(func, bpf_obj_new_impl)  BTF_ID(func, bpf_obj_drop_impl) -BTF_ID(func, bpf_list_push_front) -BTF_ID(func, bpf_list_push_back) +BTF_ID(func, bpf_refcount_acquire_impl) +BTF_ID(func, bpf_list_push_front_impl) +BTF_ID(func, bpf_list_push_back_impl)  BTF_ID(func, bpf_list_pop_front)  BTF_ID(func, bpf_list_pop_back)  BTF_ID(func, bpf_cast_to_kern_ctx) @@ -8910,8 +9629,12 @@ BTF_ID(func, bpf_rdonly_cast)  BTF_ID(func, bpf_rcu_read_lock)  BTF_ID(func, 
bpf_rcu_read_unlock)  BTF_ID(func, bpf_rbtree_remove) -BTF_ID(func, bpf_rbtree_add) +BTF_ID(func, bpf_rbtree_add_impl)  BTF_ID(func, bpf_rbtree_first) +BTF_ID(func, bpf_dynptr_from_skb) +BTF_ID(func, bpf_dynptr_from_xdp) +BTF_ID(func, bpf_dynptr_slice) +BTF_ID(func, bpf_dynptr_slice_rdwr)  static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta)  { @@ -8949,24 +9672,15 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,  	if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno]))  		return KF_ARG_PTR_TO_ALLOC_BTF_ID; -	if (is_kfunc_arg_kptr_get(meta, argno)) { -		if (!btf_type_is_ptr(ref_t)) { -			verbose(env, "arg#0 BTF type must be a double pointer for kptr_get kfunc\n"); -			return -EINVAL; -		} -		ref_t = btf_type_by_id(meta->btf, ref_t->type); -		ref_tname = btf_name_by_offset(meta->btf, ref_t->name_off); -		if (!btf_type_is_struct(ref_t)) { -			verbose(env, "kernel function %s args#0 pointer type %s %s is not supported\n", -				meta->func_name, btf_type_str(ref_t), ref_tname); -			return -EINVAL; -		} -		return KF_ARG_PTR_TO_KPTR; -	} +	if (is_kfunc_arg_refcounted_kptr(meta->btf, &args[argno])) +		return KF_ARG_PTR_TO_REFCOUNTED_KPTR;  	if (is_kfunc_arg_dynptr(meta->btf, &args[argno]))  		return KF_ARG_PTR_TO_DYNPTR; +	if (is_kfunc_arg_iter(meta, argno)) +		return KF_ARG_PTR_TO_ITER; +  	if (is_kfunc_arg_list_head(meta->btf, &args[argno]))  		return KF_ARG_PTR_TO_LIST_HEAD; @@ -8991,7 +9705,10 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,  	if (is_kfunc_arg_callback(env, meta->btf, &args[argno]))  		return KF_ARG_PTR_TO_CALLBACK; -	if (argno + 1 < nargs && is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1])) + +	if (argno + 1 < nargs && +	    (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]) || +	     is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1])))  		arg_mem_size = true;  	/* This is the catch all argument type of register types supported by @@ -9071,40 +9788,6 @@ static int 
process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,  	return 0;  } -static int process_kf_arg_ptr_to_kptr(struct bpf_verifier_env *env, -				      struct bpf_reg_state *reg, -				      const struct btf_type *ref_t, -				      const char *ref_tname, -				      struct bpf_kfunc_call_arg_meta *meta, -				      int argno) -{ -	struct btf_field *kptr_field; - -	/* check_func_arg_reg_off allows var_off for -	 * PTR_TO_MAP_VALUE, but we need fixed offset to find -	 * off_desc. -	 */ -	if (!tnum_is_const(reg->var_off)) { -		verbose(env, "arg#0 must have constant offset\n"); -		return -EINVAL; -	} - -	kptr_field = btf_record_find(reg->map_ptr->record, reg->off + reg->var_off.value, BPF_KPTR); -	if (!kptr_field || kptr_field->type != BPF_KPTR_REF) { -		verbose(env, "arg#0 no referenced kptr at map value offset=%llu\n", -			reg->off + reg->var_off.value); -		return -EINVAL; -	} - -	if (!btf_struct_ids_match(&env->log, meta->btf, ref_t->type, 0, kptr_field->kptr.btf, -				  kptr_field->kptr.btf_id, true)) { -		verbose(env, "kernel function %s args#%d expected pointer to %s %s\n", -			meta->func_name, argno, btf_type_str(ref_t), ref_tname); -		return -EINVAL; -	} -	return 0; -} -  static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)  {  	struct bpf_verifier_state *state = env->cur_state; @@ -9211,7 +9894,6 @@ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_  		ptr = reg->map_ptr;  		break;  	case PTR_TO_BTF_ID | MEM_ALLOC: -	case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED:  		ptr = reg->btf;  		break;  	default: @@ -9232,27 +9914,28 @@ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_  static bool is_bpf_list_api_kfunc(u32 btf_id)  { -	return btf_id == special_kfunc_list[KF_bpf_list_push_front] || -	       btf_id == special_kfunc_list[KF_bpf_list_push_back] || +	return btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] || +	       btf_id == 
special_kfunc_list[KF_bpf_list_push_back_impl] ||  	       btf_id == special_kfunc_list[KF_bpf_list_pop_front] ||  	       btf_id == special_kfunc_list[KF_bpf_list_pop_back];  }  static bool is_bpf_rbtree_api_kfunc(u32 btf_id)  { -	return btf_id == special_kfunc_list[KF_bpf_rbtree_add] || +	return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] ||  	       btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||  	       btf_id == special_kfunc_list[KF_bpf_rbtree_first];  }  static bool is_bpf_graph_api_kfunc(u32 btf_id)  { -	return is_bpf_list_api_kfunc(btf_id) || is_bpf_rbtree_api_kfunc(btf_id); +	return is_bpf_list_api_kfunc(btf_id) || is_bpf_rbtree_api_kfunc(btf_id) || +	       btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl];  }  static bool is_callback_calling_kfunc(u32 btf_id)  { -	return btf_id == special_kfunc_list[KF_bpf_rbtree_add]; +	return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl];  }  static bool is_rbtree_lock_required_kfunc(u32 btf_id) @@ -9293,12 +9976,12 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env,  	switch (node_field_type) {  	case BPF_LIST_NODE: -		ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_front] || -		       kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_back]); +		ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] || +		       kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_back_impl]);  		break;  	case BPF_RB_NODE:  		ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove] || -		       kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add]); +		       kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl]);  		break;  	default:  		verbose(env, "verifier internal error: unexpected graph node argument type %s\n", @@ -9460,11 +10143,13 @@ static int process_kf_arg_ptr_to_rbtree_node(struct bpf_verifier_env *env,  						  &meta->arg_rbtree_root.field);  } -static int check_kfunc_args(struct bpf_verifier_env *env, struct 
bpf_kfunc_call_arg_meta *meta) +static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta, +			    int insn_idx)  {  	const char *func_name = meta->func_name, *ref_tname;  	const struct btf *btf = meta->btf;  	const struct btf_param *args; +	struct btf_record *rec;  	u32 i, nargs;  	int ret; @@ -9543,7 +10228,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_  			return -EINVAL;  		} -		if (is_kfunc_trusted_args(meta) && +		if ((is_kfunc_trusted_args(meta) || is_kfunc_rcu(meta)) &&  		    (register_is_null(reg) || type_may_be_null(reg->type))) {  			verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i);  			return -EACCES; @@ -9590,8 +10275,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_  			/* Trusted arguments have the same offset checks as release arguments */  			arg_type |= OBJ_RELEASE;  			break; -		case KF_ARG_PTR_TO_KPTR:  		case KF_ARG_PTR_TO_DYNPTR: +		case KF_ARG_PTR_TO_ITER:  		case KF_ARG_PTR_TO_LIST_HEAD:  		case KF_ARG_PTR_TO_LIST_NODE:  		case KF_ARG_PTR_TO_RB_ROOT: @@ -9599,6 +10284,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_  		case KF_ARG_PTR_TO_MEM:  		case KF_ARG_PTR_TO_MEM_SIZE:  		case KF_ARG_PTR_TO_CALLBACK: +		case KF_ARG_PTR_TO_REFCOUNTED_KPTR:  			/* Trusted by default */  			break;  		default: @@ -9641,23 +10327,46 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_  				meta->arg_obj_drop.btf_id = reg->btf_id;  			}  			break; -		case KF_ARG_PTR_TO_KPTR: -			if (reg->type != PTR_TO_MAP_VALUE) { -				verbose(env, "arg#0 expected pointer to map value\n"); -				return -EINVAL; -			} -			ret = process_kf_arg_ptr_to_kptr(env, reg, ref_t, ref_tname, meta, i); -			if (ret < 0) -				return ret; -			break;  		case KF_ARG_PTR_TO_DYNPTR: +		{ +			enum bpf_arg_type dynptr_arg_type = ARG_PTR_TO_DYNPTR; +  			if (reg->type != PTR_TO_STACK &&  			    
reg->type != CONST_PTR_TO_DYNPTR) {  				verbose(env, "arg#%d expected pointer to stack or dynptr_ptr\n", i);  				return -EINVAL;  			} -			ret = process_dynptr_func(env, regno, ARG_PTR_TO_DYNPTR | MEM_RDONLY, NULL); +			if (reg->type == CONST_PTR_TO_DYNPTR) +				dynptr_arg_type |= MEM_RDONLY; + +			if (is_kfunc_arg_uninit(btf, &args[i])) +				dynptr_arg_type |= MEM_UNINIT; + +			if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) +				dynptr_arg_type |= DYNPTR_TYPE_SKB; +			else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_xdp]) +				dynptr_arg_type |= DYNPTR_TYPE_XDP; + +			ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type); +			if (ret < 0) +				return ret; + +			if (!(dynptr_arg_type & MEM_UNINIT)) { +				int id = dynptr_id(env, reg); + +				if (id < 0) { +					verbose(env, "verifier internal error: failed to obtain dynptr id\n"); +					return id; +				} +				meta->initialized_dynptr.id = id; +				meta->initialized_dynptr.type = dynptr_get_type(env, reg); +			} + +			break; +		} +		case KF_ARG_PTR_TO_ITER: +			ret = process_iter_arg(env, regno, insn_idx, meta);  			if (ret < 0)  				return ret;  			break; @@ -9754,17 +10463,59 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_  				return ret;  			break;  		case KF_ARG_PTR_TO_MEM_SIZE: -			ret = check_kfunc_mem_size_reg(env, &regs[regno + 1], regno + 1); +		{ +			struct bpf_reg_state *size_reg = &regs[regno + 1]; +			const struct btf_param *size_arg = &args[i + 1]; + +			ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1);  			if (ret < 0) {  				verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1);  				return ret;  			} -			/* Skip next '__sz' argument */ + +			if (is_kfunc_arg_const_mem_size(meta->btf, size_arg, size_reg)) { +				if (meta->arg_constant.found) { +					verbose(env, "verifier internal error: only one constant argument permitted\n"); +					return -EFAULT; +				} +				if 
(!tnum_is_const(size_reg->var_off)) { +					verbose(env, "R%d must be a known constant\n", regno + 1); +					return -EINVAL; +				} +				meta->arg_constant.found = true; +				meta->arg_constant.value = size_reg->var_off.value; +			} + +			/* Skip next '__sz' or '__szk' argument */  			i++;  			break; +		}  		case KF_ARG_PTR_TO_CALLBACK:  			meta->subprogno = reg->subprogno;  			break; +		case KF_ARG_PTR_TO_REFCOUNTED_KPTR: +			if (!type_is_ptr_alloc_obj(reg->type) && !type_is_non_owning_ref(reg->type)) { +				verbose(env, "arg#%d is neither owning or non-owning ref\n", i); +				return -EINVAL; +			} + +			rec = reg_btf_record(reg); +			if (!rec) { +				verbose(env, "verifier internal error: Couldn't find btf_record\n"); +				return -EFAULT; +			} + +			if (rec->refcount_off < 0) { +				verbose(env, "arg#%d doesn't point to a type with bpf_refcount field\n", i); +				return -EINVAL; +			} +			if (rec->refcount_off >= 0) { +				verbose(env, "bpf_refcount_acquire calls are disabled for now\n"); +				return -EINVAL; +			} +			meta->arg_refcount_acquire.btf = reg->btf; +			meta->arg_refcount_acquire.btf_id = reg->btf_id; +			break;  		}  	} @@ -9777,24 +10528,21 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_  	return 0;  } -static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, -			    int *insn_idx_p) +static int fetch_kfunc_meta(struct bpf_verifier_env *env, +			    struct bpf_insn *insn, +			    struct bpf_kfunc_call_arg_meta *meta, +			    const char **kfunc_name)  { -	const struct btf_type *t, *func, *func_proto, *ptr_type; -	u32 i, nargs, func_id, ptr_type_id, release_ref_obj_id; -	struct bpf_reg_state *regs = cur_regs(env); -	const char *func_name, *ptr_type_name; -	bool sleepable, rcu_lock, rcu_unlock; -	struct bpf_kfunc_call_arg_meta meta; -	int err, insn_idx = *insn_idx_p; -	const struct btf_param *args; -	const struct btf_type *ret_t; +	const struct btf_type *func, *func_proto; +	u32 func_id, 
*kfunc_flags; +	const char *func_name;  	struct btf *desc_btf; -	u32 *kfunc_flags; -	/* skip for now, but return error when we find this in fixup_kfunc_call */ +	if (kfunc_name) +		*kfunc_name = NULL; +  	if (!insn->imm) -		return 0; +		return -EINVAL;  	desc_btf = find_kfunc_desc_btf(env, insn->off);  	if (IS_ERR(desc_btf)) @@ -9803,22 +10551,53 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,  	func_id = insn->imm;  	func = btf_type_by_id(desc_btf, func_id);  	func_name = btf_name_by_offset(desc_btf, func->name_off); +	if (kfunc_name) +		*kfunc_name = func_name;  	func_proto = btf_type_by_id(desc_btf, func->type);  	kfunc_flags = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), func_id);  	if (!kfunc_flags) { -		verbose(env, "calling kernel function %s is not allowed\n", -			func_name);  		return -EACCES;  	} -	/* Prepare kfunc call metadata */ -	memset(&meta, 0, sizeof(meta)); -	meta.btf = desc_btf; -	meta.func_id = func_id; -	meta.kfunc_flags = *kfunc_flags; -	meta.func_proto = func_proto; -	meta.func_name = func_name; +	memset(meta, 0, sizeof(*meta)); +	meta->btf = desc_btf; +	meta->func_id = func_id; +	meta->kfunc_flags = *kfunc_flags; +	meta->func_proto = func_proto; +	meta->func_name = func_name; + +	return 0; +} + +static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, +			    int *insn_idx_p) +{ +	const struct btf_type *t, *ptr_type; +	u32 i, nargs, ptr_type_id, release_ref_obj_id; +	struct bpf_reg_state *regs = cur_regs(env); +	const char *func_name, *ptr_type_name; +	bool sleepable, rcu_lock, rcu_unlock; +	struct bpf_kfunc_call_arg_meta meta; +	struct bpf_insn_aux_data *insn_aux; +	int err, insn_idx = *insn_idx_p; +	const struct btf_param *args; +	const struct btf_type *ret_t; +	struct btf *desc_btf; + +	/* skip for now, but return error when we find this in fixup_kfunc_call */ +	if (!insn->imm) +		return 0; + +	err = fetch_kfunc_meta(env, insn, &meta, &func_name); +	if (err 
== -EACCES && func_name) +		verbose(env, "calling kernel function %s is not allowed\n", func_name); +	if (err) +		return err; +	desc_btf = meta.btf; +	insn_aux = &env->insn_aux_data[insn_idx]; + +	insn_aux->is_iter_next = is_iter_next_kfunc(&meta);  	if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) {  		verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n"); @@ -9833,10 +10612,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,  	rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta);  	rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta); -	if ((rcu_lock || rcu_unlock) && !env->rcu_tag_supported) { -		verbose(env, "no vmlinux btf rcu tag support for kfunc %s\n", func_name); -		return -EACCES; -	}  	if (env->cur_state->active_rcu_lock) {  		struct bpf_func_state *state; @@ -9865,7 +10640,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,  	}  	/* Check the arguments */ -	err = check_kfunc_args(env, &meta); +	err = check_kfunc_args(env, &meta, insn_idx);  	if (err < 0)  		return err;  	/* In case of release function, we get register number of refcounted @@ -9875,36 +10650,37 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,  		err = release_reference(env, regs[meta.release_regno].ref_obj_id);  		if (err) {  			verbose(env, "kfunc %s#%d reference has not been acquired before\n", -				func_name, func_id); +				func_name, meta.func_id);  			return err;  		}  	} -	if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front] || -	    meta.func_id == special_kfunc_list[KF_bpf_list_push_back] || -	    meta.func_id == special_kfunc_list[KF_bpf_rbtree_add]) { +	if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front_impl] || +	    meta.func_id == special_kfunc_list[KF_bpf_list_push_back_impl] || +	    meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {  		release_ref_obj_id = regs[BPF_REG_2].ref_obj_id; +		insn_aux->insert_off = 
regs[BPF_REG_2].off;  		err = ref_convert_owning_non_owning(env, release_ref_obj_id);  		if (err) {  			verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n", -				func_name, func_id); +				func_name, meta.func_id);  			return err;  		}  		err = release_reference(env, release_ref_obj_id);  		if (err) {  			verbose(env, "kfunc %s#%d reference has not been acquired before\n", -				func_name, func_id); +				func_name, meta.func_id);  			return err;  		}  	} -	if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add]) { +	if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {  		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,  					set_rbtree_add_callback_state);  		if (err) {  			verbose(env, "kfunc %s#%d failed callback verification\n", -				func_name, func_id); +				func_name, meta.func_id);  			return err;  		}  	} @@ -9913,11 +10689,13 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,  		mark_reg_not_init(env, regs, caller_saved[i]);  	/* Check return type */ -	t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL); +	t = btf_type_skip_modifiers(desc_btf, meta.func_proto->type, NULL);  	if (is_kfunc_acquire(&meta) && !btf_type_is_struct_ptr(meta.btf, t)) {  		/* Only exception is bpf_obj_new_impl */ -		if (meta.btf != btf_vmlinux || meta.func_id != special_kfunc_list[KF_bpf_obj_new_impl]) { +		if (meta.btf != btf_vmlinux || +		    (meta.func_id != special_kfunc_list[KF_bpf_obj_new_impl] && +		     meta.func_id != special_kfunc_list[KF_bpf_refcount_acquire_impl])) {  			verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n");  			return -EINVAL;  		} @@ -9962,13 +10740,18 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,  				regs[BPF_REG_0].btf = ret_btf;  				regs[BPF_REG_0].btf_id = ret_btf_id; -				env->insn_aux_data[insn_idx].obj_new_size = ret_t->size; -				env->insn_aux_data[insn_idx].kptr_struct_meta = +				
insn_aux->obj_new_size = ret_t->size; +				insn_aux->kptr_struct_meta =  					btf_find_struct_meta(ret_btf, ret_btf_id); -			} else if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) { -				env->insn_aux_data[insn_idx].kptr_struct_meta = -					btf_find_struct_meta(meta.arg_obj_drop.btf, -							     meta.arg_obj_drop.btf_id); +			} else if (meta.func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) { +				mark_reg_known_zero(env, regs, BPF_REG_0); +				regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; +				regs[BPF_REG_0].btf = meta.arg_refcount_acquire.btf; +				regs[BPF_REG_0].btf_id = meta.arg_refcount_acquire.btf_id; + +				insn_aux->kptr_struct_meta = +					btf_find_struct_meta(meta.arg_refcount_acquire.btf, +							     meta.arg_refcount_acquire.btf_id);  			} else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] ||  				   meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) {  				struct btf_field *field = meta.arg_list_head.field; @@ -9996,6 +10779,42 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,  				regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED;  				regs[BPF_REG_0].btf = desc_btf;  				regs[BPF_REG_0].btf_id = meta.arg_constant.value; +			} else if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice] || +				   meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) { +				enum bpf_type_flag type_flag = get_dynptr_type_flag(meta.initialized_dynptr.type); + +				mark_reg_known_zero(env, regs, BPF_REG_0); + +				if (!meta.arg_constant.found) { +					verbose(env, "verifier internal error: bpf_dynptr_slice(_rdwr) no constant size\n"); +					return -EFAULT; +				} + +				regs[BPF_REG_0].mem_size = meta.arg_constant.value; + +				/* PTR_MAYBE_NULL will be added when is_kfunc_ret_null is checked */ +				regs[BPF_REG_0].type = PTR_TO_MEM | type_flag; + +				if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice]) { +					regs[BPF_REG_0].type |= MEM_RDONLY; +				} else 
{ +					/* this will set env->seen_direct_write to true */ +					if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE)) { +						verbose(env, "the prog does not allow writes to packet data\n"); +						return -EINVAL; +					} +				} + +				if (!meta.initialized_dynptr.id) { +					verbose(env, "verifier internal error: no dynptr id\n"); +					return -EFAULT; +				} +				regs[BPF_REG_0].dynptr_id = meta.initialized_dynptr.id; + +				/* we don't need to set BPF_REG_0's ref obj id +				 * because packet slices are not refcounted (see +				 * dynptr_type_refcounted) +				 */  			} else {  				verbose(env, "kernel function %s unhandled dynamic return type\n",  					meta.func_name); @@ -10003,6 +10822,14 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,  			}  		} else if (!__btf_type_is_struct(ptr_type)) {  			if (!meta.r0_size) { +				__u32 sz; + +				if (!IS_ERR(btf_resolve_size(desc_btf, ptr_type, &sz))) { +					meta.r0_size = sz; +					meta.r0_rdonly = true; +				} +			} +			if (!meta.r0_size) {  				ptr_type_name = btf_name_by_offset(desc_btf,  								   ptr_type->name_off);  				verbose(env, @@ -10048,15 +10875,20 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,  			ref_set_non_owning(env, &regs[BPF_REG_0]);  		} -		if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_remove]) -			invalidate_non_owning_refs(env); -  		if (reg_may_point_to_spin_lock(&regs[BPF_REG_0]) && !regs[BPF_REG_0].id)  			regs[BPF_REG_0].id = ++env->id_gen; -	} /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */ +	} else if (btf_type_is_void(t)) { +		if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) { +			if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) { +				insn_aux->kptr_struct_meta = +					btf_find_struct_meta(meta.arg_obj_drop.btf, +							     meta.arg_obj_drop.btf_id); +			} +		} +	} -	nargs = btf_type_vlen(func_proto); -	args = (const struct btf_param 
*)(func_proto + 1); +	nargs = btf_type_vlen(meta.func_proto); +	args = (const struct btf_param *)(meta.func_proto + 1);  	for (i = 0; i < nargs; i++) {  		u32 regno = i + 1; @@ -10068,6 +10900,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,  			mark_btf_func_reg_size(env, regno, t->size);  	} +	if (is_iter_next_kfunc(&meta)) { +		err = process_iter_next_call(env, insn_idx, &meta); +		if (err) +			return err; +	} +  	return 0;  } @@ -11601,12 +12439,17 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)  						insn->src_reg);  					return -EACCES;  				} else if (src_reg->type == SCALAR_VALUE) { +					bool is_src_reg_u32 = src_reg->umax_value <= U32_MAX; + +					if (is_src_reg_u32 && !src_reg->id) +						src_reg->id = ++env->id_gen;  					copy_register_state(dst_reg, src_reg); -					/* Make sure ID is cleared otherwise +					/* Make sure ID is cleared if src_reg is not in u32 range otherwise  					 * dst_reg min/max could be incorrectly  					 * propagated into src_reg by find_equal_scalars()  					 */ -					dst_reg->id = 0; +					if (!is_src_reg_u32) +						dst_reg->id = 0;  					dst_reg->live |= REG_LIVE_WRITTEN;  					dst_reg->subreg_def = env->insn_idx + 1;  				} else { @@ -11774,10 +12617,14 @@ static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode)  	case BPF_JEQ:  		if (tnum_is_const(subreg))  			return !!tnum_equals_const(subreg, val); +		else if (val < reg->u32_min_value || val > reg->u32_max_value) +			return 0;  		break;  	case BPF_JNE:  		if (tnum_is_const(subreg))  			return !tnum_equals_const(subreg, val); +		else if (val < reg->u32_min_value || val > reg->u32_max_value) +			return 1;  		break;  	case BPF_JSET:  		if ((~subreg.mask & subreg.value) & val) @@ -11847,10 +12694,14 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)  	case BPF_JEQ:  		if (tnum_is_const(reg->var_off))  			return !!tnum_equals_const(reg->var_off, val); +		else if 
(val < reg->umin_value || val > reg->umax_value) +			return 0;  		break;  	case BPF_JNE:  		if (tnum_is_const(reg->var_off))  			return !tnum_equals_const(reg->var_off, val); +		else if (val < reg->umin_value || val > reg->umax_value) +			return 1;  		break;  	case BPF_JSET:  		if ((~reg->var_off.mask & reg->var_off.value) & val) @@ -12471,6 +13322,18 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,  				       src_reg->var_off.value,  				       opcode,  				       is_jmp32); +	} else if (dst_reg->type == SCALAR_VALUE && +		   is_jmp32 && tnum_is_const(tnum_subreg(dst_reg->var_off))) { +		pred = is_branch_taken(src_reg, +				       tnum_subreg(dst_reg->var_off).value, +				       flip_opcode(opcode), +				       is_jmp32); +	} else if (dst_reg->type == SCALAR_VALUE && +		   !is_jmp32 && tnum_is_const(dst_reg->var_off)) { +		pred = is_branch_taken(src_reg, +				       dst_reg->var_off.value, +				       flip_opcode(opcode), +				       is_jmp32);  	} else if (reg_is_pkt_pointer_any(dst_reg) &&  		   reg_is_pkt_pointer_any(src_reg) &&  		   !is_jmp32) { @@ -12971,6 +13834,9 @@ static int check_return_code(struct bpf_verifier_env *env)  		}  		break; +	case BPF_PROG_TYPE_NETFILTER: +		range = tnum_range(NF_DROP, NF_ACCEPT); +		break;  	case BPF_PROG_TYPE_EXT:  		/* freplace program can return anything as its return value  		 * depends on the to-be-replaced kernel func or bpf program. 
@@ -13065,6 +13931,17 @@ static bool is_prune_point(struct bpf_verifier_env *env, int insn_idx)  	return env->insn_aux_data[insn_idx].prune_point;  } +static void mark_force_checkpoint(struct bpf_verifier_env *env, int idx) +{ +	env->insn_aux_data[idx].force_checkpoint = true; +} + +static bool is_force_checkpoint(struct bpf_verifier_env *env, int insn_idx) +{ +	return env->insn_aux_data[insn_idx].force_checkpoint; +} + +  enum {  	DONE_EXPLORING = 0,  	KEEP_EXPLORING = 1, @@ -13157,44 +14034,63 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns,   */  static int visit_insn(int t, struct bpf_verifier_env *env)  { -	struct bpf_insn *insns = env->prog->insnsi; +	struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t];  	int ret; -	if (bpf_pseudo_func(insns + t)) +	if (bpf_pseudo_func(insn))  		return visit_func_call_insn(t, insns, env, true);  	/* All non-branch instructions have a single fall-through edge. */ -	if (BPF_CLASS(insns[t].code) != BPF_JMP && -	    BPF_CLASS(insns[t].code) != BPF_JMP32) +	if (BPF_CLASS(insn->code) != BPF_JMP && +	    BPF_CLASS(insn->code) != BPF_JMP32)  		return push_insn(t, t + 1, FALLTHROUGH, env, false); -	switch (BPF_OP(insns[t].code)) { +	switch (BPF_OP(insn->code)) {  	case BPF_EXIT:  		return DONE_EXPLORING;  	case BPF_CALL: -		if (insns[t].imm == BPF_FUNC_timer_set_callback) +		if (insn->src_reg == 0 && insn->imm == BPF_FUNC_timer_set_callback)  			/* Mark this call insn as a prune point to trigger  			 * is_state_visited() check before call itself is  			 * processed by __check_func_call(). Otherwise new  			 * async state will be pushed for further exploration.  			 
*/  			mark_prune_point(env, t); -		return visit_func_call_insn(t, insns, env, -					    insns[t].src_reg == BPF_PSEUDO_CALL); +		if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { +			struct bpf_kfunc_call_arg_meta meta; + +			ret = fetch_kfunc_meta(env, insn, &meta, NULL); +			if (ret == 0 && is_iter_next_kfunc(&meta)) { +				mark_prune_point(env, t); +				/* Checking and saving state checkpoints at iter_next() call +				 * is crucial for fast convergence of open-coded iterator loop +				 * logic, so we need to force it. If we don't do that, +				 * is_state_visited() might skip saving a checkpoint, causing +				 * unnecessarily long sequence of not checkpointed +				 * instructions and jumps, leading to exhaustion of jump +				 * history buffer, and potentially other undesired outcomes. +				 * It is expected that with correct open-coded iterators +				 * convergence will happen quickly, so we don't run a risk of +				 * exhausting memory. +				 */ +				mark_force_checkpoint(env, t); +			} +		} +		return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL);  	case BPF_JA: -		if (BPF_SRC(insns[t].code) != BPF_K) +		if (BPF_SRC(insn->code) != BPF_K)  			return -EINVAL;  		/* unconditional jump with single edge */ -		ret = push_insn(t, t + insns[t].off + 1, FALLTHROUGH, env, +		ret = push_insn(t, t + insn->off + 1, FALLTHROUGH, env,  				true);  		if (ret)  			return ret; -		mark_prune_point(env, t + insns[t].off + 1); -		mark_jmp_point(env, t + insns[t].off + 1); +		mark_prune_point(env, t + insn->off + 1); +		mark_jmp_point(env, t + insn->off + 1);  		return ret; @@ -13206,7 +14102,7 @@ static int visit_insn(int t, struct bpf_verifier_env *env)  		if (ret)  			return ret; -		return push_insn(t, t + insns[t].off + 1, BRANCH, env, true); +		return push_insn(t, t + insn->off + 1, BRANCH, env, true);  	}  } @@ -13827,7 +14723,7 @@ static bool regs_exact(const struct bpf_reg_state *rold,  		       const struct bpf_reg_state *rcur,  		       struct 
bpf_id_pair *idmap)  { -	return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&  +	return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&  	       check_ids(rold->id, rcur->id, idmap) &&  	       check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);  } @@ -13882,13 +14778,17 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,  		       tnum_in(rold->var_off, rcur->var_off);  	case PTR_TO_MAP_KEY:  	case PTR_TO_MAP_VALUE: +	case PTR_TO_MEM: +	case PTR_TO_BUF: +	case PTR_TO_TP_BUFFER:  		/* If the new min/max/var_off satisfy the old ones and  		 * everything else matches, we are OK.  		 */  		return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 &&  		       range_within(rold, rcur) &&  		       tnum_in(rold->var_off, rcur->var_off) && -		       check_ids(rold->id, rcur->id, idmap); +		       check_ids(rold->id, rcur->id, idmap) && +		       check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);  	case PTR_TO_PACKET_META:  	case PTR_TO_PACKET:  		/* We must have at least as much range as the old ptr @@ -13930,6 +14830,8 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,  	 * didn't use them  	 */  	for (i = 0; i < old->allocated_stack; i++) { +		struct bpf_reg_state *old_reg, *cur_reg; +  		spi = i / BPF_REG_SIZE;  		if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) { @@ -13986,9 +14888,6 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,  				return false;  			break;  		case STACK_DYNPTR: -		{ -			const struct bpf_reg_state *old_reg, *cur_reg; -  			old_reg = &old->stack[spi].spilled_ptr;  			cur_reg = &cur->stack[spi].spilled_ptr;  			if (old_reg->dynptr.type != cur_reg->dynptr.type || @@ -13996,7 +14895,22 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,  			    !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))  				return false;  			break; -		} +		case STACK_ITER: +			
old_reg = &old->stack[spi].spilled_ptr; +			cur_reg = &cur->stack[spi].spilled_ptr; +			/* iter.depth is not compared between states as it +			 * doesn't matter for correctness and would otherwise +			 * prevent convergence; we maintain it only to prevent +			 * infinite loop check triggering, see +			 * iter_active_depths_differ() +			 */ +			if (old_reg->iter.btf != cur_reg->iter.btf || +			    old_reg->iter.btf_id != cur_reg->iter.btf_id || +			    old_reg->iter.state != cur_reg->iter.state || +			    /* ignore {old_reg,cur_reg}->iter.depth, see above */ +			    !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) +				return false; +			break;  		case STACK_MISC:  		case STACK_ZERO:  		case STACK_INVALID: @@ -14210,10 +15124,11 @@ static int propagate_precision(struct bpf_verifier_env *env,  		state_reg = state->regs;  		for (i = 0; i < BPF_REG_FP; i++, state_reg++) {  			if (state_reg->type != SCALAR_VALUE || -			    !state_reg->precise) +			    !state_reg->precise || +			    !(state_reg->live & REG_LIVE_READ))  				continue;  			if (env->log.level & BPF_LOG_LEVEL2) -				verbose(env, "frame %d: propagating r%d\n", i, fr); +				verbose(env, "frame %d: propagating r%d\n", fr, i);  			err = mark_chain_precision_frame(env, fr, i);  			if (err < 0)  				return err; @@ -14224,11 +15139,12 @@ static int propagate_precision(struct bpf_verifier_env *env,  				continue;  			state_reg = &state->stack[i].spilled_ptr;  			if (state_reg->type != SCALAR_VALUE || -			    !state_reg->precise) +			    !state_reg->precise || +			    !(state_reg->live & REG_LIVE_READ))  				continue;  			if (env->log.level & BPF_LOG_LEVEL2)  				verbose(env, "frame %d: propagating fp%d\n", -					(-i - 1) * BPF_REG_SIZE, fr); +					fr, (-i - 1) * BPF_REG_SIZE);  			err = mark_chain_precision_stack_frame(env, fr, i);  			if (err < 0)  				return err; @@ -14255,6 +15171,92 @@ static bool states_maybe_looping(struct bpf_verifier_state *old,  	return true;  } +static bool 
is_iter_next_insn(struct bpf_verifier_env *env, int insn_idx) +{ +	return env->insn_aux_data[insn_idx].is_iter_next; +} + +/* is_state_visited() handles iter_next() (see process_iter_next_call() for + * terminology) calls specially: as opposed to bounded BPF loops, it *expects* + * states to match, which otherwise would look like an infinite loop. So while + * iter_next() calls are taken care of, we still need to be careful and + * prevent erroneous and too eager declaration of "ininite loop", when + * iterators are involved. + * + * Here's a situation in pseudo-BPF assembly form: + * + *   0: again:                          ; set up iter_next() call args + *   1:   r1 = &it                      ; <CHECKPOINT HERE> + *   2:   call bpf_iter_num_next        ; this is iter_next() call + *   3:   if r0 == 0 goto done + *   4:   ... something useful here ... + *   5:   goto again                    ; another iteration + *   6: done: + *   7:   r1 = &it + *   8:   call bpf_iter_num_destroy     ; clean up iter state + *   9:   exit + * + * This is a typical loop. Let's assume that we have a prune point at 1:, + * before we get to `call bpf_iter_num_next` (e.g., because of that `goto + * again`, assuming other heuristics don't get in a way). + * + * When we first time come to 1:, let's say we have some state X. We proceed + * to 2:, fork states, enqueue ACTIVE, validate NULL case successfully, exit. + * Now we come back to validate that forked ACTIVE state. We proceed through + * 3-5, come to goto, jump to 1:. Let's assume our state didn't change, so we + * are converging. But the problem is that we don't know that yet, as this + * convergence has to happen at iter_next() call site only. So if nothing is + * done, at 1: verifier will use bounded loop logic and declare infinite + * looping (and would be *technically* correct, if not for iterator's + * "eventual sticky NULL" contract, see process_iter_next_call()). But we + * don't want that. 
So what we do in process_iter_next_call() when we go on + * another ACTIVE iteration, we bump slot->iter.depth, to mark that it's + * a different iteration. So when we suspect an infinite loop, we additionally + * check if any of the *ACTIVE* iterator states depths differ. If yes, we + * pretend we are not looping and wait for next iter_next() call. + * + * This only applies to ACTIVE state. In DRAINED state we don't expect to + * loop, because that would actually mean infinite loop, as DRAINED state is + * "sticky", and so we'll keep returning into the same instruction with the + * same state (at least in one of possible code paths). + * + * This approach allows to keep infinite loop heuristic even in the face of + * active iterator. E.g., C snippet below is and will be detected as + * inifintely looping: + * + *   struct bpf_iter_num it; + *   int *p, x; + * + *   bpf_iter_num_new(&it, 0, 10); + *   while ((p = bpf_iter_num_next(&t))) { + *       x = p; + *       while (x--) {} // <<-- infinite loop here + *   } + * + */ +static bool iter_active_depths_differ(struct bpf_verifier_state *old, struct bpf_verifier_state *cur) +{ +	struct bpf_reg_state *slot, *cur_slot; +	struct bpf_func_state *state; +	int i, fr; + +	for (fr = old->curframe; fr >= 0; fr--) { +		state = old->frame[fr]; +		for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { +			if (state->stack[i].slot_type[0] != STACK_ITER) +				continue; + +			slot = &state->stack[i].spilled_ptr; +			if (slot->iter.state != BPF_ITER_STATE_ACTIVE) +				continue; + +			cur_slot = &cur->frame[fr]->stack[i].spilled_ptr; +			if (cur_slot->iter.depth != slot->iter.depth) +				return true; +		} +	} +	return false; +}  static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)  { @@ -14262,7 +15264,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)  	struct bpf_verifier_state_list *sl, **pprev;  	struct bpf_verifier_state *cur = env->cur_state, *new;  	int i, j, err, 
states_cnt = 0; -	bool add_new_state = env->test_state_freq ? true : false; +	bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx); +	bool add_new_state = force_new_state;  	/* bpf progs typically have pruning point every 4 instructions  	 * http://vger.kernel.org/bpfconf2019.html#session-1 @@ -14302,8 +15305,46 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)  				 * Since the verifier still needs to catch infinite loops  				 * inside async callbacks.  				 */ -			} else if (states_maybe_looping(&sl->state, cur) && -				   states_equal(env, &sl->state, cur)) { +				goto skip_inf_loop_check; +			} +			/* BPF open-coded iterators loop detection is special. +			 * states_maybe_looping() logic is too simplistic in detecting +			 * states that *might* be equivalent, because it doesn't know +			 * about ID remapping, so don't even perform it. +			 * See process_iter_next_call() and iter_active_depths_differ() +			 * for overview of the logic. When current and one of parent +			 * states are detected as equivalent, it's a good thing: we prove +			 * convergence and can stop simulating further iterations. +			 * It's safe to assume that iterator loop will finish, taking into +			 * account iter_next() contract of eventually returning +			 * sticky NULL result. 
+			 */ +			if (is_iter_next_insn(env, insn_idx)) { +				if (states_equal(env, &sl->state, cur)) { +					struct bpf_func_state *cur_frame; +					struct bpf_reg_state *iter_state, *iter_reg; +					int spi; + +					cur_frame = cur->frame[cur->curframe]; +					/* btf_check_iter_kfuncs() enforces that +					 * iter state pointer is always the first arg +					 */ +					iter_reg = &cur_frame->regs[BPF_REG_1]; +					/* current state is valid due to states_equal(), +					 * so we can assume valid iter and reg state, +					 * no need for extra (re-)validations +					 */ +					spi = __get_spi(iter_reg->off + iter_reg->var_off.value); +					iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr; +					if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) +						goto hit; +				} +				goto skip_inf_loop_check; +			} +			/* attempt to detect infinite loop to avoid unnecessary doomed work */ +			if (states_maybe_looping(&sl->state, cur) && +			    states_equal(env, &sl->state, cur) && +			    !iter_active_depths_differ(&sl->state, cur)) {  				verbose_linfo(env, insn_idx, "; ");  				verbose(env, "infinite loop detected at insn %d\n", insn_idx);  				return -EINVAL; @@ -14320,12 +15361,15 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)  			 * This threshold shouldn't be too high either, since states  			 * at the end of the loop are likely to be useful in pruning.  			 */ -			if (env->jmps_processed - env->prev_jmps_processed < 20 && +skip_inf_loop_check: +			if (!force_new_state && +			    env->jmps_processed - env->prev_jmps_processed < 20 &&  			    env->insn_processed - env->prev_insn_processed < 100)  				add_new_state = false;  			goto miss;  		}  		if (states_equal(env, &sl->state, cur)) { +hit:  			sl->hit_cnt++;  			/* reached equivalent register/stack state,  			 * prune the search. 
@@ -14509,6 +15553,44 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev)  			       !reg_type_mismatch_ok(prev));  } +static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type, +			     bool allow_trust_missmatch) +{ +	enum bpf_reg_type *prev_type = &env->insn_aux_data[env->insn_idx].ptr_type; + +	if (*prev_type == NOT_INIT) { +		/* Saw a valid insn +		 * dst_reg = *(u32 *)(src_reg + off) +		 * save type to validate intersecting paths +		 */ +		*prev_type = type; +	} else if (reg_type_mismatch(type, *prev_type)) { +		/* Abuser program is trying to use the same insn +		 * dst_reg = *(u32*) (src_reg + off) +		 * with different pointer types: +		 * src_reg == ctx in one branch and +		 * src_reg == stack|map in some other branch. +		 * Reject it. +		 */ +		if (allow_trust_missmatch && +		    base_type(type) == PTR_TO_BTF_ID && +		    base_type(*prev_type) == PTR_TO_BTF_ID) { +			/* +			 * Have to support a use case when one path through +			 * the program yields TRUSTED pointer while another +			 * is UNTRUSTED. Fallback to UNTRUSTED to generate +			 * BPF_PROBE_MEM. 
+			 */ +			*prev_type = PTR_TO_BTF_ID | PTR_UNTRUSTED; +		} else { +			verbose(env, "same insn cannot be used with different pointers\n"); +			return -EINVAL; +		} +	} + +	return 0; +} +  static int do_check(struct bpf_verifier_env *env)  {  	bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); @@ -14594,11 +15676,11 @@ static int do_check(struct bpf_verifier_env *env)  				print_insn_state(env, state->frame[state->curframe]);  			verbose_linfo(env, env->insn_idx, "; "); -			env->prev_log_len = env->log.len_used; +			env->prev_log_pos = env->log.end_pos;  			verbose(env, "%d: ", env->insn_idx);  			print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); -			env->prev_insn_print_len = env->log.len_used - env->prev_log_len; -			env->prev_log_len = env->log.len_used; +			env->prev_insn_print_pos = env->log.end_pos - env->prev_log_pos; +			env->prev_log_pos = env->log.end_pos;  		}  		if (bpf_prog_is_offloaded(env->prog->aux)) { @@ -14618,7 +15700,7 @@ static int do_check(struct bpf_verifier_env *env)  				return err;  		} else if (class == BPF_LDX) { -			enum bpf_reg_type *prev_src_type, src_reg_type; +			enum bpf_reg_type src_reg_type;  			/* check for reserved fields is already done */ @@ -14642,29 +15724,11 @@ static int do_check(struct bpf_verifier_env *env)  			if (err)  				return err; -			prev_src_type = &env->insn_aux_data[env->insn_idx].ptr_type; - -			if (*prev_src_type == NOT_INIT) { -				/* saw a valid insn -				 * dst_reg = *(u32 *)(src_reg + off) -				 * save type to validate intersecting paths -				 */ -				*prev_src_type = src_reg_type; - -			} else if (reg_type_mismatch(src_reg_type, *prev_src_type)) { -				/* ABuser program is trying to use the same insn -				 * dst_reg = *(u32*) (src_reg + off) -				 * with different pointer types: -				 * src_reg == ctx in one branch and -				 * src_reg == stack|map in some other branch. -				 * Reject it. 
-				 */ -				verbose(env, "same insn cannot be used with different pointers\n"); -				return -EINVAL; -			} - +			err = save_aux_ptr_type(env, src_reg_type, true); +			if (err) +				return err;  		} else if (class == BPF_STX) { -			enum bpf_reg_type *prev_dst_type, dst_reg_type; +			enum bpf_reg_type dst_reg_type;  			if (BPF_MODE(insn->code) == BPF_ATOMIC) {  				err = check_atomic(env, env->insn_idx, insn); @@ -14697,16 +15761,12 @@ static int do_check(struct bpf_verifier_env *env)  			if (err)  				return err; -			prev_dst_type = &env->insn_aux_data[env->insn_idx].ptr_type; - -			if (*prev_dst_type == NOT_INIT) { -				*prev_dst_type = dst_reg_type; -			} else if (reg_type_mismatch(dst_reg_type, *prev_dst_type)) { -				verbose(env, "same insn cannot be used with different pointers\n"); -				return -EINVAL; -			} - +			err = save_aux_ptr_type(env, dst_reg_type, false); +			if (err) +				return err;  		} else if (class == BPF_ST) { +			enum bpf_reg_type dst_reg_type; +  			if (BPF_MODE(insn->code) != BPF_MEM ||  			    insn->src_reg != BPF_REG_0) {  				verbose(env, "BPF_ST uses reserved fields\n"); @@ -14717,12 +15777,7 @@ static int do_check(struct bpf_verifier_env *env)  			if (err)  				return err; -			if (is_ctx_reg(env, insn->dst_reg)) { -				verbose(env, "BPF_ST stores into R%d %s is not allowed\n", -					insn->dst_reg, -					reg_type_str(env, reg_state(env, insn->dst_reg)->type)); -				return -EACCES; -			} +			dst_reg_type = regs[insn->dst_reg].type;  			/* check that memory (dst_reg + off) is writeable */  			err = check_mem_access(env, env->insn_idx, insn->dst_reg, @@ -14731,6 +15786,9 @@ static int do_check(struct bpf_verifier_env *env)  			if (err)  				return err; +			err = save_aux_ptr_type(env, dst_reg_type, false); +			if (err) +				return err;  		} else if (class == BPF_JMP || class == BPF_JMP32) {  			u8 opcode = BPF_OP(insn->code); @@ -14765,6 +15823,8 @@ static int do_check(struct bpf_verifier_env *env)  					err = check_helper_call(env, insn, 
&env->insn_idx);  				if (err)  					return err; + +				mark_reg_scratched(env, BPF_REG_0);  			} else if (opcode == BPF_JA) {  				if (BPF_SRC(insn->code) != BPF_K ||  				    insn->imm != 0 || @@ -14939,8 +15999,8 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,  		goto err_put;  	} -	if (!btf_type_is_var(t)) { -		verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n", id); +	if (!btf_type_is_var(t) && !btf_type_is_func(t)) { +		verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR or KIND_FUNC\n", id);  		err = -EINVAL;  		goto err_put;  	} @@ -14953,6 +16013,14 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,  		err = -ENOENT;  		goto err_put;  	} +	insn[0].imm = (u32)addr; +	insn[1].imm = addr >> 32; + +	if (btf_type_is_func(t)) { +		aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY; +		aux->btf_var.mem_size = 0; +		goto check_btf; +	}  	datasec_id = find_btf_percpu_datasec(btf);  	if (datasec_id > 0) { @@ -14965,9 +16033,6 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,  		}  	} -	insn[0].imm = (u32)addr; -	insn[1].imm = addr >> 32; -  	type = t->type;  	t = btf_type_skip_modifiers(btf, type, NULL);  	if (percpu) { @@ -14995,7 +16060,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,  		aux->btf_var.btf = btf;  		aux->btf_var.btf_id = type;  	} - +check_btf:  	/* check whether we recorded this BTF (and maybe module) already */  	for (i = 0; i < env->used_btf_cnt; i++) {  		if (env->used_btfs[i].btf == btf) { @@ -15839,14 +16904,12 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)  	for (i = 0; i < insn_cnt; i++, insn++) {  		bpf_convert_ctx_access_t convert_ctx_access; -		bool ctx_access;  		if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) ||  		    insn->code == (BPF_LDX | BPF_MEM | BPF_H) ||  		    insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||  		    insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) {  			type = BPF_READ; -			ctx_access = true;  		} else if (insn->code == (BPF_STX | 
BPF_MEM | BPF_B) ||  			   insn->code == (BPF_STX | BPF_MEM | BPF_H) ||  			   insn->code == (BPF_STX | BPF_MEM | BPF_W) || @@ -15856,7 +16919,6 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)  			   insn->code == (BPF_ST | BPF_MEM | BPF_W) ||  			   insn->code == (BPF_ST | BPF_MEM | BPF_DW)) {  			type = BPF_WRITE; -			ctx_access = BPF_CLASS(insn->code) == BPF_STX;  		} else {  			continue;  		} @@ -15879,9 +16941,6 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)  			continue;  		} -		if (!ctx_access) -			continue; -  		switch ((int)env->insn_aux_data[i + delta].ptr_type) {  		case PTR_TO_CTX:  			if (!ops->convert_ctx_access) @@ -16272,11 +17331,62 @@ static int fixup_call_args(struct bpf_verifier_env *env)  	return err;  } +/* replace a generic kfunc with a specialized version if necessary */ +static void specialize_kfunc(struct bpf_verifier_env *env, +			     u32 func_id, u16 offset, unsigned long *addr) +{ +	struct bpf_prog *prog = env->prog; +	bool seen_direct_write; +	void *xdp_kfunc; +	bool is_rdonly; + +	if (bpf_dev_bound_kfunc_id(func_id)) { +		xdp_kfunc = bpf_dev_bound_resolve_kfunc(prog, func_id); +		if (xdp_kfunc) { +			*addr = (unsigned long)xdp_kfunc; +			return; +		} +		/* fallback to default kfunc when not supported by netdev */ +	} + +	if (offset) +		return; + +	if (func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) { +		seen_direct_write = env->seen_direct_write; +		is_rdonly = !may_access_direct_pkt_data(env, NULL, BPF_WRITE); + +		if (is_rdonly) +			*addr = (unsigned long)bpf_dynptr_from_skb_rdonly; + +		/* restore env->seen_direct_write to its original value, since +		 * may_access_direct_pkt_data mutates it +		 */ +		env->seen_direct_write = seen_direct_write; +	} +} + +static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux, +					    u16 struct_meta_reg, +					    u16 node_offset_reg, +					    struct bpf_insn *insn, +					    struct bpf_insn *insn_buf, +					    int *cnt) +{ +	
struct btf_struct_meta *kptr_struct_meta = insn_aux->kptr_struct_meta; +	struct bpf_insn addr[2] = { BPF_LD_IMM64(struct_meta_reg, (long)kptr_struct_meta) }; + +	insn_buf[0] = addr[0]; +	insn_buf[1] = addr[1]; +	insn_buf[2] = BPF_MOV64_IMM(node_offset_reg, insn_aux->insert_off); +	insn_buf[3] = *insn; +	*cnt = 4; +} +  static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,  			    struct bpf_insn *insn_buf, int insn_idx, int *cnt)  {  	const struct bpf_kfunc_desc *desc; -	void *xdp_kfunc;  	if (!insn->imm) {  		verbose(env, "invalid kernel function call not eliminated in verifier pass\n"); @@ -16285,18 +17395,9 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,  	*cnt = 0; -	if (bpf_dev_bound_kfunc_id(insn->imm)) { -		xdp_kfunc = bpf_dev_bound_resolve_kfunc(env->prog, insn->imm); -		if (xdp_kfunc) { -			insn->imm = BPF_CALL_IMM(xdp_kfunc); -			return 0; -		} - -		/* fallback to default kfunc when not supported by netdev */ -	} - -	/* insn->imm has the btf func_id. Replace it with -	 * an address (relative to __bpf_call_base). +	/* insn->imm has the btf func_id. Replace it with an offset relative to +	 * __bpf_call_base, unless the JIT needs to call functions that are +	 * further than 32 bits away (bpf_jit_supports_far_kfunc_call()).  	 
*/  	desc = find_kfunc_desc(env->prog, insn->imm, insn->off);  	if (!desc) { @@ -16305,7 +17406,8 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,  		return -EFAULT;  	} -	insn->imm = desc->imm; +	if (!bpf_jit_supports_far_kfunc_call()) +		insn->imm = BPF_CALL_IMM(desc->addr);  	if (insn->off)  		return 0;  	if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl]) { @@ -16318,7 +17420,8 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,  		insn_buf[2] = addr[1];  		insn_buf[3] = *insn;  		*cnt = 4; -	} else if (desc->func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) { +	} else if (desc->func_id == special_kfunc_list[KF_bpf_obj_drop_impl] || +		   desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {  		struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;  		struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) }; @@ -16326,6 +17429,20 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,  		insn_buf[1] = addr[1];  		insn_buf[2] = *insn;  		*cnt = 3; +	} else if (desc->func_id == special_kfunc_list[KF_bpf_list_push_back_impl] || +		   desc->func_id == special_kfunc_list[KF_bpf_list_push_front_impl] || +		   desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { +		int struct_meta_reg = BPF_REG_3; +		int node_offset_reg = BPF_REG_4; + +		/* rbtree_add has extra 'less' arg, so args-to-fixup are in diff regs */ +		if (desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { +			struct_meta_reg = BPF_REG_4; +			node_offset_reg = BPF_REG_5; +		} + +		__fixup_collection_insert_kfunc(&env->insn_aux_data[insn_idx], struct_meta_reg, +						node_offset_reg, insn, insn_buf, cnt);  	} else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||  		   desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {  		insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); @@ -16665,21 
+17782,21 @@ static int do_misc_fixups(struct bpf_verifier_env *env)  			BUILD_BUG_ON(!__same_type(ops->map_lookup_elem,  				     (void *(*)(struct bpf_map *map, void *key))NULL));  			BUILD_BUG_ON(!__same_type(ops->map_delete_elem, -				     (int (*)(struct bpf_map *map, void *key))NULL)); +				     (long (*)(struct bpf_map *map, void *key))NULL));  			BUILD_BUG_ON(!__same_type(ops->map_update_elem, -				     (int (*)(struct bpf_map *map, void *key, void *value, +				     (long (*)(struct bpf_map *map, void *key, void *value,  					      u64 flags))NULL));  			BUILD_BUG_ON(!__same_type(ops->map_push_elem, -				     (int (*)(struct bpf_map *map, void *value, +				     (long (*)(struct bpf_map *map, void *value,  					      u64 flags))NULL));  			BUILD_BUG_ON(!__same_type(ops->map_pop_elem, -				     (int (*)(struct bpf_map *map, void *value))NULL)); +				     (long (*)(struct bpf_map *map, void *value))NULL));  			BUILD_BUG_ON(!__same_type(ops->map_peek_elem, -				     (int (*)(struct bpf_map *map, void *value))NULL)); +				     (long (*)(struct bpf_map *map, void *value))NULL));  			BUILD_BUG_ON(!__same_type(ops->map_redirect, -				     (int (*)(struct bpf_map *map, u64 index, u64 flags))NULL)); +				     (long (*)(struct bpf_map *map, u64 index, u64 flags))NULL));  			BUILD_BUG_ON(!__same_type(ops->map_for_each_callback, -				     (int (*)(struct bpf_map *map, +				     (long (*)(struct bpf_map *map,  					      bpf_callback_t callback_fn,  					      void *callback_ctx,  					      u64 flags))NULL)); @@ -16859,7 +17976,7 @@ patch_call_imm:  		}  	} -	sort_kfunc_descs_by_imm(env->prog); +	sort_kfunc_descs_by_imm_off(env->prog);  	return 0;  } @@ -17287,6 +18404,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,  	const char *tname;  	struct btf *btf;  	long addr = 0; +	struct module *mod = NULL;  	if (!btf_id) {  		bpf_log(log, "Tracing programs must provide btf_id\n"); @@ -17460,8 +18578,17 @@ int bpf_check_attach_target(struct bpf_verifier_log 
*log,  			else  				addr = (long) tgt_prog->aux->func[subprog]->bpf_func;  		} else { -			addr = kallsyms_lookup_name(tname); +			if (btf_is_module(btf)) { +				mod = btf_try_get_module(btf); +				if (mod) +					addr = find_kallsyms_symbol_value(mod, tname); +				else +					addr = 0; +			} else { +				addr = kallsyms_lookup_name(tname); +			}  			if (!addr) { +				module_put(mod);  				bpf_log(log,  					"The address of function %s cannot be found\n",  					tname); @@ -17501,11 +18628,13 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,  				break;  			}  			if (ret) { +				module_put(mod);  				bpf_log(log, "%s is not sleepable\n", tname);  				return ret;  			}  		} else if (prog->expected_attach_type == BPF_MODIFY_RETURN) {  			if (tgt_prog) { +				module_put(mod);  				bpf_log(log, "can't modify return codes of BPF programs\n");  				return -EINVAL;  			} @@ -17514,6 +18643,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,  			    !check_attach_modify_return(addr, tname))  				ret = 0;  			if (ret) { +				module_put(mod);  				bpf_log(log, "%s() is not modifiable\n", tname);  				return ret;  			} @@ -17524,6 +18654,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,  	tgt_info->tgt_addr = addr;  	tgt_info->tgt_name = tname;  	tgt_info->tgt_type = t; +	tgt_info->tgt_mod = mod;  	return 0;  } @@ -17536,6 +18667,14 @@ BTF_ID(func, migrate_enable)  #if !defined CONFIG_PREEMPT_RCU && !defined CONFIG_TINY_RCU  BTF_ID(func, rcu_read_unlock_strict)  #endif +#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_TRACE_PREEMPT_TOGGLE) +BTF_ID(func, preempt_count_add) +BTF_ID(func, preempt_count_sub) +#endif +#ifdef CONFIG_PREEMPT_RCU +BTF_ID(func, __rcu_read_lock) +BTF_ID(func, __rcu_read_unlock) +#endif  BTF_SET_END(btf_id_deny)  static bool can_be_sleepable(struct bpf_prog *prog) @@ -17603,6 +18742,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)  	/* store info about the attachment target that will be used later */  	
prog->aux->attach_func_proto = tgt_info.tgt_type;  	prog->aux->attach_func_name = tgt_info.tgt_name; +	prog->aux->mod = tgt_info.tgt_mod;  	if (tgt_prog) {  		prog->aux->saved_dst_prog_type = tgt_prog->type; @@ -17647,12 +18787,12 @@ struct btf *bpf_get_btf_vmlinux(void)  	return btf_vmlinux;  } -int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr) +int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)  {  	u64 start_time = ktime_get_ns();  	struct bpf_verifier_env *env; -	struct bpf_verifier_log *log; -	int i, len, ret = -EINVAL; +	int i, len, ret = -EINVAL, err; +	u32 log_true_size;  	bool is_priv;  	/* no program is valid */ @@ -17665,7 +18805,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)  	env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);  	if (!env)  		return -ENOMEM; -	log = &env->log;  	len = (*prog)->len;  	env->insn_aux_data = @@ -17686,20 +18825,14 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)  	if (!is_priv)  		mutex_lock(&bpf_verifier_lock); -	if (attr->log_level || attr->log_buf || attr->log_size) { -		/* user requested verbose verifier output -		 * and supplied buffer to store the verification trace -		 */ -		log->level = attr->log_level; -		log->ubuf = (char __user *) (unsigned long) attr->log_buf; -		log->len_total = attr->log_size; - -		/* log attributes have to be sane */ -		if (!bpf_verifier_log_attr_valid(log)) { -			ret = -EINVAL; -			goto err_unlock; -		} -	} +	/* user could have requested verbose verifier output +	 * and supplied buffer to store the verification trace +	 */ +	ret = bpf_vlog_init(&env->log, attr->log_level, +			    (char __user *) (unsigned long) attr->log_buf, +			    attr->log_size); +	if (ret) +		goto err_unlock;  	mark_verifier_state_clean(env); @@ -17721,8 +18854,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)  	env->bypass_spec_v1 = 
bpf_bypass_spec_v1();  	env->bypass_spec_v4 = bpf_bypass_spec_v4();  	env->bpf_capable = bpf_capable(); -	env->rcu_tag_supported = btf_vmlinux && -		btf_find_by_name_kind(btf_vmlinux, "rcu", BTF_KIND_TYPE_TAG) > 0;  	if (is_priv)  		env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; @@ -17815,9 +18946,14 @@ skip_full_check:  	print_verification_stats(env);  	env->prog->aux->verified_insns = env->insn_processed; -	if (log->level && bpf_verifier_log_full(log)) -		ret = -ENOSPC; -	if (log->level && !log->ubuf) { +	/* preserve original error even if log finalization is successful */ +	err = bpf_vlog_finalize(&env->log, &log_true_size); +	if (err) +		ret = err; + +	if (uattr_size >= offsetofend(union bpf_attr, log_true_size) && +	    copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, log_true_size), +				  &log_true_size, sizeof(log_true_size))) {  		ret = -EFAULT;  		goto err_release_maps;  	} |