Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/Makefile               7
-rw-r--r--  kernel/bpf/arraymap.c            51
-rw-r--r--  kernel/bpf/btf.c                  3
-rw-r--r--  kernel/bpf/cgroup.c              82
-rw-r--r--  kernel/bpf/core.c                 5
-rw-r--r--  kernel/bpf/hashtab.c             31
-rw-r--r--  kernel/bpf/helpers.c             68
-rw-r--r--  kernel/bpf/local_storage.c      169
-rw-r--r--  kernel/bpf/map_in_map.c           3
-rw-r--r--  kernel/bpf/offload.c             18
-rw-r--r--  kernel/bpf/queue_stack_maps.c   288
-rw-r--r--  kernel/bpf/sockmap.c           2631
-rw-r--r--  kernel/bpf/stackmap.c             4
-rw-r--r--  kernel/bpf/syscall.c            148
-rw-r--r--  kernel/bpf/verifier.c           973
-rw-r--r--  kernel/bpf/xskmap.c               2
-rw-r--r--  kernel/umh.c                     16
17 files changed, 1498 insertions(+), 3001 deletions(-)
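
The diff below adds BPF queue and stack maps (kernel/bpf/queue_stack_maps.c) and wires up the new bpf_map_push_elem(), bpf_map_pop_elem() and bpf_map_peek_elem() helpers. As a rough, non-authoritative sketch of how a program could use them (the map-definition macros, section name and XDP context are libbpf-style assumptions for illustration, not code from this diff):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Queue map: key_size must be 0, only value_size and max_entries matter. */
struct {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, 64);
	__uint(value_size, sizeof(__u32));
} rx_queue SEC(".maps");

SEC("xdp")
int queue_demo(struct xdp_md *ctx)
{
	__u32 val = ctx->ingress_ifindex;

	/* BPF_EXIST makes room by overwriting the oldest element when the
	 * queue is full, mirroring queue_stack_map_push_elem() below.
	 */
	bpf_map_push_elem(&rx_queue, &val, BPF_EXIST);

	/* peek leaves the element in place, pop removes it */
	if (bpf_map_peek_elem(&rx_queue, &val) == 0)
		bpf_map_pop_elem(&rx_queue, &val);

	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";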
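
The series also splits cgroup local storage by storage type (kernel/bpf/local_storage.c, kernel/bpf/cgroup.c) and adds a per-cpu variant, BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, still reached through bpf_get_local_storage(). A minimal sketch of a per-cpu cgroup packet counter, again using assumed libbpf-style declarations rather than code from this diff:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* One per-cpu counter slot per (cgroup, attach type); the verifier only
 * allows this map type to be used via bpf_get_local_storage().
 */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE);
	__type(key, struct bpf_cgroup_storage_key);
	__type(value, __u64);
} pkt_cnt SEC(".maps");

SEC("cgroup_skb/egress")
int count_egress(struct __sk_buff *skb)
{
	__u64 *cnt = bpf_get_local_storage(&pkt_cnt, 0);

	/* per-cpu buffer, so a plain increment is race-free here */
	(*cnt)++;
	return 1;	/* allow the packet */
}

char _license[] SEC("license") = "GPL";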
| diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 0488b8258321..4c2fa3ac56f6 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -3,7 +3,7 @@ obj-y := core.o  obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o  obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o -obj-$(CONFIG_BPF_SYSCALL) += local_storage.o +obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o  obj-$(CONFIG_BPF_SYSCALL) += disasm.o  obj-$(CONFIG_BPF_SYSCALL) += btf.o  ifeq ($(CONFIG_NET),y) @@ -13,11 +13,6 @@ ifeq ($(CONFIG_XDP_SOCKETS),y)  obj-$(CONFIG_BPF_SYSCALL) += xskmap.o  endif  obj-$(CONFIG_BPF_SYSCALL) += offload.o -ifeq ($(CONFIG_STREAM_PARSER),y) -ifeq ($(CONFIG_INET),y) -obj-$(CONFIG_BPF_SYSCALL) += sockmap.o -endif -endif  endif  ifeq ($(CONFIG_PERF_EVENTS),y)  obj-$(CONFIG_BPF_SYSCALL) += stackmap.o diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 0c17aab3ce5f..24583da9ffd1 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -358,6 +358,29 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key,  	rcu_read_unlock();  } +static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key, +					   struct seq_file *m) +{ +	struct bpf_array *array = container_of(map, struct bpf_array, map); +	u32 index = *(u32 *)key; +	void __percpu *pptr; +	int cpu; + +	rcu_read_lock(); + +	seq_printf(m, "%u: {\n", *(u32 *)key); +	pptr = array->pptrs[index & array->index_mask]; +	for_each_possible_cpu(cpu) { +		seq_printf(m, "\tcpu%d: ", cpu); +		btf_type_seq_show(map->btf, map->btf_value_type_id, +				  per_cpu_ptr(pptr, cpu), m); +		seq_puts(m, "\n"); +	} +	seq_puts(m, "}\n"); + +	rcu_read_unlock(); +} +  static int array_map_check_btf(const struct bpf_map *map,  			       const struct btf_type *key_type,  			       const struct btf_type *value_type) @@ -398,6 +421,7 @@ const struct bpf_map_ops percpu_array_map_ops = {  	.map_lookup_elem = percpu_array_map_lookup_elem,  	.map_update_elem = array_map_update_elem,  	.map_delete_elem = array_map_delete_elem, +	.map_seq_show_elem = percpu_array_map_seq_show_elem,  	.map_check_btf = array_map_check_btf,  }; @@ -425,7 +449,7 @@ static void fd_array_map_free(struct bpf_map *map)  static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)  { -	return NULL; +	return ERR_PTR(-EOPNOTSUPP);  }  /* only called from syscall */ @@ -529,6 +553,29 @@ static void bpf_fd_array_map_clear(struct bpf_map *map)  		fd_array_map_delete_elem(map, &i);  } +static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key, +					 struct seq_file *m) +{ +	void **elem, *ptr; +	u32 prog_id; + +	rcu_read_lock(); + +	elem = array_map_lookup_elem(map, key); +	if (elem) { +		ptr = READ_ONCE(*elem); +		if (ptr) { +			seq_printf(m, "%u: ", *(u32 *)key); +			prog_id = prog_fd_array_sys_lookup_elem(ptr); +			btf_type_seq_show(map->btf, map->btf_value_type_id, +					  &prog_id, m); +			seq_puts(m, "\n"); +		} +	} + +	rcu_read_unlock(); +} +  const struct bpf_map_ops prog_array_map_ops = {  	.map_alloc_check = fd_array_map_alloc_check,  	.map_alloc = array_map_alloc, @@ -540,7 +587,7 @@ const struct bpf_map_ops prog_array_map_ops = {  	.map_fd_put_ptr = prog_fd_array_put_ptr,  	.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,  	.map_release_uref = bpf_fd_array_map_clear, -	.map_check_btf = map_check_no_btf, +	.map_seq_show_elem = prog_array_map_seq_show_elem,  };  static struct bpf_event_entry *bpf_event_entry_gen(struct file 
*perf_file, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 138f0302692e..378cef70341c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2114,6 +2114,9 @@ static int btf_parse_hdr(struct btf_verifier_env *env, void __user *btf_data,  	hdr = &btf->hdr; +	if (hdr->hdr_len != hdr_len) +		return -EINVAL; +  	btf_verifier_log_hdr(env, btf_data_size);  	if (hdr->magic != BTF_MAGIC) { diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 6a7d931bbc55..9425c2fb872f 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -25,6 +25,7 @@ EXPORT_SYMBOL(cgroup_bpf_enabled_key);   */  void cgroup_bpf_put(struct cgroup *cgrp)  { +	enum bpf_cgroup_storage_type stype;  	unsigned int type;  	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { @@ -34,8 +35,10 @@ void cgroup_bpf_put(struct cgroup *cgrp)  		list_for_each_entry_safe(pl, tmp, progs, node) {  			list_del(&pl->node);  			bpf_prog_put(pl->prog); -			bpf_cgroup_storage_unlink(pl->storage); -			bpf_cgroup_storage_free(pl->storage); +			for_each_cgroup_storage_type(stype) { +				bpf_cgroup_storage_unlink(pl->storage[stype]); +				bpf_cgroup_storage_free(pl->storage[stype]); +			}  			kfree(pl);  			static_branch_dec(&cgroup_bpf_enabled_key);  		} @@ -97,6 +100,7 @@ static int compute_effective_progs(struct cgroup *cgrp,  				   enum bpf_attach_type type,  				   struct bpf_prog_array __rcu **array)  { +	enum bpf_cgroup_storage_type stype;  	struct bpf_prog_array *progs;  	struct bpf_prog_list *pl;  	struct cgroup *p = cgrp; @@ -125,7 +129,9 @@ static int compute_effective_progs(struct cgroup *cgrp,  				continue;  			progs->items[cnt].prog = pl->prog; -			progs->items[cnt].cgroup_storage = pl->storage; +			for_each_cgroup_storage_type(stype) +				progs->items[cnt].cgroup_storage[stype] = +					pl->storage[stype];  			cnt++;  		}  	} while ((p = cgroup_parent(p))); @@ -232,7 +238,9 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,  {  	struct list_head *progs = &cgrp->bpf.progs[type];  	struct bpf_prog *old_prog = NULL; -	struct bpf_cgroup_storage *storage, *old_storage = NULL; +	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE], +		*old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL}; +	enum bpf_cgroup_storage_type stype;  	struct bpf_prog_list *pl;  	bool pl_was_allocated;  	int err; @@ -254,34 +262,44 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,  	if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)  		return -E2BIG; -	storage = bpf_cgroup_storage_alloc(prog); -	if (IS_ERR(storage)) -		return -ENOMEM; +	for_each_cgroup_storage_type(stype) { +		storage[stype] = bpf_cgroup_storage_alloc(prog, stype); +		if (IS_ERR(storage[stype])) { +			storage[stype] = NULL; +			for_each_cgroup_storage_type(stype) +				bpf_cgroup_storage_free(storage[stype]); +			return -ENOMEM; +		} +	}  	if (flags & BPF_F_ALLOW_MULTI) {  		list_for_each_entry(pl, progs, node) {  			if (pl->prog == prog) {  				/* disallow attaching the same prog twice */ -				bpf_cgroup_storage_free(storage); +				for_each_cgroup_storage_type(stype) +					bpf_cgroup_storage_free(storage[stype]);  				return -EINVAL;  			}  		}  		pl = kmalloc(sizeof(*pl), GFP_KERNEL);  		if (!pl) { -			bpf_cgroup_storage_free(storage); +			for_each_cgroup_storage_type(stype) +				bpf_cgroup_storage_free(storage[stype]);  			return -ENOMEM;  		}  		pl_was_allocated = true;  		pl->prog = prog; -		pl->storage = storage; +		for_each_cgroup_storage_type(stype) +			pl->storage[stype] = storage[stype];  		
list_add_tail(&pl->node, progs);  	} else {  		if (list_empty(progs)) {  			pl = kmalloc(sizeof(*pl), GFP_KERNEL);  			if (!pl) { -				bpf_cgroup_storage_free(storage); +				for_each_cgroup_storage_type(stype) +					bpf_cgroup_storage_free(storage[stype]);  				return -ENOMEM;  			}  			pl_was_allocated = true; @@ -289,12 +307,15 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,  		} else {  			pl = list_first_entry(progs, typeof(*pl), node);  			old_prog = pl->prog; -			old_storage = pl->storage; -			bpf_cgroup_storage_unlink(old_storage); +			for_each_cgroup_storage_type(stype) { +				old_storage[stype] = pl->storage[stype]; +				bpf_cgroup_storage_unlink(old_storage[stype]); +			}  			pl_was_allocated = false;  		}  		pl->prog = prog; -		pl->storage = storage; +		for_each_cgroup_storage_type(stype) +			pl->storage[stype] = storage[stype];  	}  	cgrp->bpf.flags[type] = flags; @@ -304,21 +325,27 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,  		goto cleanup;  	static_branch_inc(&cgroup_bpf_enabled_key); -	if (old_storage) -		bpf_cgroup_storage_free(old_storage); +	for_each_cgroup_storage_type(stype) { +		if (!old_storage[stype]) +			continue; +		bpf_cgroup_storage_free(old_storage[stype]); +	}  	if (old_prog) {  		bpf_prog_put(old_prog);  		static_branch_dec(&cgroup_bpf_enabled_key);  	} -	bpf_cgroup_storage_link(storage, cgrp, type); +	for_each_cgroup_storage_type(stype) +		bpf_cgroup_storage_link(storage[stype], cgrp, type);  	return 0;  cleanup:  	/* and cleanup the prog list */  	pl->prog = old_prog; -	bpf_cgroup_storage_free(pl->storage); -	pl->storage = old_storage; -	bpf_cgroup_storage_link(old_storage, cgrp, type); +	for_each_cgroup_storage_type(stype) { +		bpf_cgroup_storage_free(pl->storage[stype]); +		pl->storage[stype] = old_storage[stype]; +		bpf_cgroup_storage_link(old_storage[stype], cgrp, type); +	}  	if (pl_was_allocated) {  		list_del(&pl->node);  		kfree(pl); @@ -339,6 +366,7 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,  			enum bpf_attach_type type, u32 unused_flags)  {  	struct list_head *progs = &cgrp->bpf.progs[type]; +	enum bpf_cgroup_storage_type stype;  	u32 flags = cgrp->bpf.flags[type];  	struct bpf_prog *old_prog = NULL;  	struct bpf_prog_list *pl; @@ -385,8 +413,10 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,  	/* now can actually delete it from this cgroup list */  	list_del(&pl->node); -	bpf_cgroup_storage_unlink(pl->storage); -	bpf_cgroup_storage_free(pl->storage); +	for_each_cgroup_storage_type(stype) { +		bpf_cgroup_storage_unlink(pl->storage[stype]); +		bpf_cgroup_storage_free(pl->storage[stype]); +	}  	kfree(pl);  	if (list_empty(progs))  		/* last program was detached, reset flags to zero */ @@ -523,6 +553,7 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,  {  	unsigned int offset = skb->data - skb_network_header(skb);  	struct sock *save_sk; +	void *saved_data_end;  	struct cgroup *cgrp;  	int ret; @@ -536,8 +567,13 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,  	save_sk = skb->sk;  	skb->sk = sk;  	__skb_push(skb, offset); + +	/* compute pointers for the bpf prog */ +	bpf_compute_and_save_data_end(skb, &saved_data_end); +  	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,  				 bpf_prog_run_save_cb); +	bpf_restore_data_end(skb, saved_data_end);  	__skb_pull(skb, offset);  	skb->sk = save_sk;  	return ret == 1 ? 
0 : -EPERM; @@ -677,6 +713,8 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)  		return &bpf_get_current_uid_gid_proto;  	case BPF_FUNC_get_local_storage:  		return &bpf_get_local_storage_proto; +	case BPF_FUNC_get_current_cgroup_id: +		return &bpf_get_current_cgroup_id_proto;  	case BPF_FUNC_trace_printk:  		if (capable(CAP_SYS_ADMIN))  			return bpf_get_trace_printk_proto(); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 3f5bf1af0826..7c7eeea8cffc 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1783,6 +1783,9 @@ BPF_CALL_0(bpf_user_rnd_u32)  const struct bpf_func_proto bpf_map_lookup_elem_proto __weak;  const struct bpf_func_proto bpf_map_update_elem_proto __weak;  const struct bpf_func_proto bpf_map_delete_elem_proto __weak; +const struct bpf_func_proto bpf_map_push_elem_proto __weak; +const struct bpf_func_proto bpf_map_pop_elem_proto __weak; +const struct bpf_func_proto bpf_map_peek_elem_proto __weak;  const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;  const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; @@ -1792,8 +1795,6 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;  const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;  const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;  const struct bpf_func_proto bpf_get_current_comm_proto __weak; -const struct bpf_func_proto bpf_sock_map_update_proto __weak; -const struct bpf_func_proto bpf_sock_hash_update_proto __weak;  const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;  const struct bpf_func_proto bpf_get_local_storage_proto __weak; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 03cc59ee9c95..2c1790288138 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1285,6 +1285,35 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,  	return ret;  } +static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key, +					  struct seq_file *m) +{ +	struct htab_elem *l; +	void __percpu *pptr; +	int cpu; + +	rcu_read_lock(); + +	l = __htab_map_lookup_elem(map, key); +	if (!l) { +		rcu_read_unlock(); +		return; +	} + +	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); +	seq_puts(m, ": {\n"); +	pptr = htab_elem_get_ptr(l, map->key_size); +	for_each_possible_cpu(cpu) { +		seq_printf(m, "\tcpu%d: ", cpu); +		btf_type_seq_show(map->btf, map->btf_value_type_id, +				  per_cpu_ptr(pptr, cpu), m); +		seq_puts(m, "\n"); +	} +	seq_puts(m, "}\n"); + +	rcu_read_unlock(); +} +  const struct bpf_map_ops htab_percpu_map_ops = {  	.map_alloc_check = htab_map_alloc_check,  	.map_alloc = htab_map_alloc, @@ -1293,6 +1322,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {  	.map_lookup_elem = htab_percpu_map_lookup_elem,  	.map_update_elem = htab_percpu_map_update_elem,  	.map_delete_elem = htab_map_delete_elem, +	.map_seq_show_elem = htab_percpu_map_seq_show_elem,  };  const struct bpf_map_ops htab_lru_percpu_map_ops = { @@ -1303,6 +1333,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {  	.map_lookup_elem = htab_lru_percpu_map_lookup_elem,  	.map_update_elem = htab_lru_percpu_map_update_elem,  	.map_delete_elem = htab_lru_map_delete_elem, +	.map_seq_show_elem = htab_percpu_map_seq_show_elem,  };  static int fd_htab_map_alloc_check(union bpf_attr *attr) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1991466b8327..ab0d5e3f9892 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -76,6 +76,49 @@ const struct bpf_func_proto 
bpf_map_delete_elem_proto = {  	.arg2_type	= ARG_PTR_TO_MAP_KEY,  }; +BPF_CALL_3(bpf_map_push_elem, struct bpf_map *, map, void *, value, u64, flags) +{ +	return map->ops->map_push_elem(map, value, flags); +} + +const struct bpf_func_proto bpf_map_push_elem_proto = { +	.func		= bpf_map_push_elem, +	.gpl_only	= false, +	.pkt_access	= true, +	.ret_type	= RET_INTEGER, +	.arg1_type	= ARG_CONST_MAP_PTR, +	.arg2_type	= ARG_PTR_TO_MAP_VALUE, +	.arg3_type	= ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_map_pop_elem, struct bpf_map *, map, void *, value) +{ +	return map->ops->map_pop_elem(map, value); +} + +const struct bpf_func_proto bpf_map_pop_elem_proto = { +	.func		= bpf_map_pop_elem, +	.gpl_only	= false, +	.pkt_access	= true, +	.ret_type	= RET_INTEGER, +	.arg1_type	= ARG_CONST_MAP_PTR, +	.arg2_type	= ARG_PTR_TO_UNINIT_MAP_VALUE, +}; + +BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value) +{ +	return map->ops->map_peek_elem(map, value); +} + +const struct bpf_func_proto bpf_map_peek_elem_proto = { +	.func		= bpf_map_pop_elem, +	.gpl_only	= false, +	.pkt_access	= true, +	.ret_type	= RET_INTEGER, +	.arg1_type	= ARG_CONST_MAP_PTR, +	.arg2_type	= ARG_PTR_TO_UNINIT_MAP_VALUE, +}; +  const struct bpf_func_proto bpf_get_prandom_u32_proto = {  	.func		= bpf_user_rnd_u32,  	.gpl_only	= false, @@ -194,16 +237,28 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {  	.ret_type	= RET_INTEGER,  }; -DECLARE_PER_CPU(void*, bpf_cgroup_storage); +#ifdef CONFIG_CGROUP_BPF +DECLARE_PER_CPU(struct bpf_cgroup_storage*, +		bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);  BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)  { -	/* map and flags arguments are not used now, -	 * but provide an ability to extend the API -	 * for other types of local storages. -	 * verifier checks that their values are correct. +	/* flags argument is not used now, +	 * but provides an ability to extend the API. +	 * verifier checks that its value is correct.  	 
*/ -	return (unsigned long) this_cpu_read(bpf_cgroup_storage); +	enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); +	struct bpf_cgroup_storage *storage; +	void *ptr; + +	storage = this_cpu_read(bpf_cgroup_storage[stype]); + +	if (stype == BPF_CGROUP_STORAGE_SHARED) +		ptr = &READ_ONCE(storage->buf)->data[0]; +	else +		ptr = this_cpu_ptr(storage->percpu_buf); + +	return (unsigned long)ptr;  }  const struct bpf_func_proto bpf_get_local_storage_proto = { @@ -214,3 +269,4 @@ const struct bpf_func_proto bpf_get_local_storage_proto = {  	.arg2_type	= ARG_ANYTHING,  };  #endif +#endif diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 830d7f095748..c97a8f968638 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -7,7 +7,8 @@  #include <linux/rbtree.h>  #include <linux/slab.h> -DEFINE_PER_CPU(void*, bpf_cgroup_storage); +DEFINE_PER_CPU(struct bpf_cgroup_storage*, +	       bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);  #ifdef CONFIG_CGROUP_BPF @@ -151,6 +152,71 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,  	return 0;  } +int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key, +				   void *value) +{ +	struct bpf_cgroup_storage_map *map = map_to_storage(_map); +	struct bpf_cgroup_storage_key *key = _key; +	struct bpf_cgroup_storage *storage; +	int cpu, off = 0; +	u32 size; + +	rcu_read_lock(); +	storage = cgroup_storage_lookup(map, key, false); +	if (!storage) { +		rcu_read_unlock(); +		return -ENOENT; +	} + +	/* per_cpu areas are zero-filled and bpf programs can only +	 * access 'value_size' of them, so copying rounded areas +	 * will not leak any kernel data +	 */ +	size = round_up(_map->value_size, 8); +	for_each_possible_cpu(cpu) { +		bpf_long_memcpy(value + off, +				per_cpu_ptr(storage->percpu_buf, cpu), size); +		off += size; +	} +	rcu_read_unlock(); +	return 0; +} + +int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key, +				     void *value, u64 map_flags) +{ +	struct bpf_cgroup_storage_map *map = map_to_storage(_map); +	struct bpf_cgroup_storage_key *key = _key; +	struct bpf_cgroup_storage *storage; +	int cpu, off = 0; +	u32 size; + +	if (map_flags != BPF_ANY && map_flags != BPF_EXIST) +		return -EINVAL; + +	rcu_read_lock(); +	storage = cgroup_storage_lookup(map, key, false); +	if (!storage) { +		rcu_read_unlock(); +		return -ENOENT; +	} + +	/* the user space will provide round_up(value_size, 8) bytes that +	 * will be copied into per-cpu area. bpf programs can only access +	 * value_size of it. 
During lookup the same extra bytes will be +	 * returned or zeros which were zero-filled by percpu_alloc, +	 * so no kernel data leaks possible +	 */ +	size = round_up(_map->value_size, 8); +	for_each_possible_cpu(cpu) { +		bpf_long_memcpy(per_cpu_ptr(storage->percpu_buf, cpu), +				value + off, size); +		off += size; +	} +	rcu_read_unlock(); +	return 0; +} +  static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key,  				       void *_next_key)  { @@ -254,6 +320,7 @@ const struct bpf_map_ops cgroup_storage_map_ops = {  int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map)  { +	enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map);  	struct bpf_cgroup_storage_map *map = map_to_storage(_map);  	int ret = -EBUSY; @@ -261,11 +328,12 @@ int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map)  	if (map->prog && map->prog != prog)  		goto unlock; -	if (prog->aux->cgroup_storage && prog->aux->cgroup_storage != _map) +	if (prog->aux->cgroup_storage[stype] && +	    prog->aux->cgroup_storage[stype] != _map)  		goto unlock;  	map->prog = prog; -	prog->aux->cgroup_storage = _map; +	prog->aux->cgroup_storage[stype] = _map;  	ret = 0;  unlock:  	spin_unlock_bh(&map->lock); @@ -275,70 +343,117 @@ unlock:  void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map)  { +	enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map);  	struct bpf_cgroup_storage_map *map = map_to_storage(_map);  	spin_lock_bh(&map->lock);  	if (map->prog == prog) { -		WARN_ON(prog->aux->cgroup_storage != _map); +		WARN_ON(prog->aux->cgroup_storage[stype] != _map);  		map->prog = NULL; -		prog->aux->cgroup_storage = NULL; +		prog->aux->cgroup_storage[stype] = NULL;  	}  	spin_unlock_bh(&map->lock);  } -struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog) +static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages) +{ +	size_t size; + +	if (cgroup_storage_type(map) == BPF_CGROUP_STORAGE_SHARED) { +		size = sizeof(struct bpf_storage_buffer) + map->value_size; +		*pages = round_up(sizeof(struct bpf_cgroup_storage) + size, +				  PAGE_SIZE) >> PAGE_SHIFT; +	} else { +		size = map->value_size; +		*pages = round_up(round_up(size, 8) * num_possible_cpus(), +				  PAGE_SIZE) >> PAGE_SHIFT; +	} + +	return size; +} + +struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, +					enum bpf_cgroup_storage_type stype)  {  	struct bpf_cgroup_storage *storage;  	struct bpf_map *map; +	gfp_t flags; +	size_t size;  	u32 pages; -	map = prog->aux->cgroup_storage; +	map = prog->aux->cgroup_storage[stype];  	if (!map)  		return NULL; -	pages = round_up(sizeof(struct bpf_cgroup_storage) + -			 sizeof(struct bpf_storage_buffer) + -			 map->value_size, PAGE_SIZE) >> PAGE_SHIFT; +	size = bpf_cgroup_storage_calculate_size(map, &pages); +  	if (bpf_map_charge_memlock(map, pages))  		return ERR_PTR(-EPERM);  	storage = kmalloc_node(sizeof(struct bpf_cgroup_storage),  			       __GFP_ZERO | GFP_USER, map->numa_node); -	if (!storage) { -		bpf_map_uncharge_memlock(map, pages); -		return ERR_PTR(-ENOMEM); -	} +	if (!storage) +		goto enomem; -	storage->buf = kmalloc_node(sizeof(struct bpf_storage_buffer) + -				    map->value_size, __GFP_ZERO | GFP_USER, -				    map->numa_node); -	if (!storage->buf) { -		bpf_map_uncharge_memlock(map, pages); -		kfree(storage); -		return ERR_PTR(-ENOMEM); +	flags = __GFP_ZERO | GFP_USER; + +	if (stype == BPF_CGROUP_STORAGE_SHARED) { +		storage->buf = kmalloc_node(size, flags, 
map->numa_node); +		if (!storage->buf) +			goto enomem; +	} else { +		storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags); +		if (!storage->percpu_buf) +			goto enomem;  	}  	storage->map = (struct bpf_cgroup_storage_map *)map;  	return storage; + +enomem: +	bpf_map_uncharge_memlock(map, pages); +	kfree(storage); +	return ERR_PTR(-ENOMEM); +} + +static void free_shared_cgroup_storage_rcu(struct rcu_head *rcu) +{ +	struct bpf_cgroup_storage *storage = +		container_of(rcu, struct bpf_cgroup_storage, rcu); + +	kfree(storage->buf); +	kfree(storage); +} + +static void free_percpu_cgroup_storage_rcu(struct rcu_head *rcu) +{ +	struct bpf_cgroup_storage *storage = +		container_of(rcu, struct bpf_cgroup_storage, rcu); + +	free_percpu(storage->percpu_buf); +	kfree(storage);  }  void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage)  { -	u32 pages; +	enum bpf_cgroup_storage_type stype;  	struct bpf_map *map; +	u32 pages;  	if (!storage)  		return;  	map = &storage->map->map; -	pages = round_up(sizeof(struct bpf_cgroup_storage) + -			 sizeof(struct bpf_storage_buffer) + -			 map->value_size, PAGE_SIZE) >> PAGE_SHIFT; + +	bpf_cgroup_storage_calculate_size(map, &pages);  	bpf_map_uncharge_memlock(map, pages); -	kfree_rcu(storage->buf, rcu); -	kfree_rcu(storage, rcu); +	stype = cgroup_storage_type(map); +	if (stype == BPF_CGROUP_STORAGE_SHARED) +		call_rcu(&storage->rcu, free_shared_cgroup_storage_rcu); +	else +		call_rcu(&storage->rcu, free_percpu_cgroup_storage_rcu);  }  void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 3bfbf4464416..99d243e1ad6e 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -24,7 +24,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)  	 * in the verifier is not enough.  	 
*/  	if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY || -	    inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE) { +	    inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE || +	    inner_map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {  		fdput(f);  		return ERR_PTR(-ENOTSUPP);  	} diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 177a52436394..8e93c47f0779 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -172,6 +172,24 @@ int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env,  	return ret;  } +int bpf_prog_offload_finalize(struct bpf_verifier_env *env) +{ +	struct bpf_prog_offload *offload; +	int ret = -ENODEV; + +	down_read(&bpf_devs_lock); +	offload = env->prog->aux->offload; +	if (offload) { +		if (offload->dev_ops->finalize) +			ret = offload->dev_ops->finalize(env); +		else +			ret = 0; +	} +	up_read(&bpf_devs_lock); + +	return ret; +} +  static void __bpf_prog_offload_destroy(struct bpf_prog *prog)  {  	struct bpf_prog_offload *offload = prog->aux->offload; diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c new file mode 100644 index 000000000000..12a93fb37449 --- /dev/null +++ b/kernel/bpf/queue_stack_maps.c @@ -0,0 +1,288 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * queue_stack_maps.c: BPF queue and stack maps + * + * Copyright (c) 2018 Politecnico di Torino + */ +#include <linux/bpf.h> +#include <linux/list.h> +#include <linux/slab.h> +#include "percpu_freelist.h" + +#define QUEUE_STACK_CREATE_FLAG_MASK \ +	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + + +struct bpf_queue_stack { +	struct bpf_map map; +	raw_spinlock_t lock; +	u32 head, tail; +	u32 size; /* max_entries + 1 */ + +	char elements[0] __aligned(8); +}; + +static struct bpf_queue_stack *bpf_queue_stack(struct bpf_map *map) +{ +	return container_of(map, struct bpf_queue_stack, map); +} + +static bool queue_stack_map_is_empty(struct bpf_queue_stack *qs) +{ +	return qs->head == qs->tail; +} + +static bool queue_stack_map_is_full(struct bpf_queue_stack *qs) +{ +	u32 head = qs->head + 1; + +	if (unlikely(head >= qs->size)) +		head = 0; + +	return head == qs->tail; +} + +/* Called from syscall */ +static int queue_stack_map_alloc_check(union bpf_attr *attr) +{ +	/* check sanity of attributes */ +	if (attr->max_entries == 0 || attr->key_size != 0 || +	    attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK) +		return -EINVAL; + +	if (attr->value_size > KMALLOC_MAX_SIZE) +		/* if value_size is bigger, the user space won't be able to +		 * access the elements. 
+		 */ +		return -E2BIG; + +	return 0; +} + +static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) +{ +	int ret, numa_node = bpf_map_attr_numa_node(attr); +	struct bpf_queue_stack *qs; +	u32 size, value_size; +	u64 queue_size, cost; + +	size = attr->max_entries + 1; +	value_size = attr->value_size; + +	queue_size = sizeof(*qs) + (u64) value_size * size; + +	cost = queue_size; +	if (cost >= U32_MAX - PAGE_SIZE) +		return ERR_PTR(-E2BIG); + +	cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + +	ret = bpf_map_precharge_memlock(cost); +	if (ret < 0) +		return ERR_PTR(ret); + +	qs = bpf_map_area_alloc(queue_size, numa_node); +	if (!qs) +		return ERR_PTR(-ENOMEM); + +	memset(qs, 0, sizeof(*qs)); + +	bpf_map_init_from_attr(&qs->map, attr); + +	qs->map.pages = cost; +	qs->size = size; + +	raw_spin_lock_init(&qs->lock); + +	return &qs->map; +} + +/* Called when map->refcnt goes to zero, either from workqueue or from syscall */ +static void queue_stack_map_free(struct bpf_map *map) +{ +	struct bpf_queue_stack *qs = bpf_queue_stack(map); + +	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, +	 * so the programs (can be more than one that used this map) were +	 * disconnected from events. Wait for outstanding critical sections in +	 * these programs to complete +	 */ +	synchronize_rcu(); + +	bpf_map_area_free(qs); +} + +static int __queue_map_get(struct bpf_map *map, void *value, bool delete) +{ +	struct bpf_queue_stack *qs = bpf_queue_stack(map); +	unsigned long flags; +	int err = 0; +	void *ptr; + +	raw_spin_lock_irqsave(&qs->lock, flags); + +	if (queue_stack_map_is_empty(qs)) { +		err = -ENOENT; +		goto out; +	} + +	ptr = &qs->elements[qs->tail * qs->map.value_size]; +	memcpy(value, ptr, qs->map.value_size); + +	if (delete) { +		if (unlikely(++qs->tail >= qs->size)) +			qs->tail = 0; +	} + +out: +	raw_spin_unlock_irqrestore(&qs->lock, flags); +	return err; +} + + +static int __stack_map_get(struct bpf_map *map, void *value, bool delete) +{ +	struct bpf_queue_stack *qs = bpf_queue_stack(map); +	unsigned long flags; +	int err = 0; +	void *ptr; +	u32 index; + +	raw_spin_lock_irqsave(&qs->lock, flags); + +	if (queue_stack_map_is_empty(qs)) { +		err = -ENOENT; +		goto out; +	} + +	index = qs->head - 1; +	if (unlikely(index >= qs->size)) +		index = qs->size - 1; + +	ptr = &qs->elements[index * qs->map.value_size]; +	memcpy(value, ptr, qs->map.value_size); + +	if (delete) +		qs->head = index; + +out: +	raw_spin_unlock_irqrestore(&qs->lock, flags); +	return err; +} + +/* Called from syscall or from eBPF program */ +static int queue_map_peek_elem(struct bpf_map *map, void *value) +{ +	return __queue_map_get(map, value, false); +} + +/* Called from syscall or from eBPF program */ +static int stack_map_peek_elem(struct bpf_map *map, void *value) +{ +	return __stack_map_get(map, value, false); +} + +/* Called from syscall or from eBPF program */ +static int queue_map_pop_elem(struct bpf_map *map, void *value) +{ +	return __queue_map_get(map, value, true); +} + +/* Called from syscall or from eBPF program */ +static int stack_map_pop_elem(struct bpf_map *map, void *value) +{ +	return __stack_map_get(map, value, true); +} + +/* Called from syscall or from eBPF program */ +static int queue_stack_map_push_elem(struct bpf_map *map, void *value, +				     u64 flags) +{ +	struct bpf_queue_stack *qs = bpf_queue_stack(map); +	unsigned long irq_flags; +	int err = 0; +	void *dst; + +	/* BPF_EXIST is used to force making room for a new element in case the +	 * map is full +	 */ +	bool 
replace = (flags & BPF_EXIST); + +	/* Check supported flags for queue and stack maps */ +	if (flags & BPF_NOEXIST || flags > BPF_EXIST) +		return -EINVAL; + +	raw_spin_lock_irqsave(&qs->lock, irq_flags); + +	if (queue_stack_map_is_full(qs)) { +		if (!replace) { +			err = -E2BIG; +			goto out; +		} +		/* advance tail pointer to overwrite oldest element */ +		if (unlikely(++qs->tail >= qs->size)) +			qs->tail = 0; +	} + +	dst = &qs->elements[qs->head * qs->map.value_size]; +	memcpy(dst, value, qs->map.value_size); + +	if (unlikely(++qs->head >= qs->size)) +		qs->head = 0; + +out: +	raw_spin_unlock_irqrestore(&qs->lock, irq_flags); +	return err; +} + +/* Called from syscall or from eBPF program */ +static void *queue_stack_map_lookup_elem(struct bpf_map *map, void *key) +{ +	return NULL; +} + +/* Called from syscall or from eBPF program */ +static int queue_stack_map_update_elem(struct bpf_map *map, void *key, +				       void *value, u64 flags) +{ +	return -EINVAL; +} + +/* Called from syscall or from eBPF program */ +static int queue_stack_map_delete_elem(struct bpf_map *map, void *key) +{ +	return -EINVAL; +} + +/* Called from syscall */ +static int queue_stack_map_get_next_key(struct bpf_map *map, void *key, +					void *next_key) +{ +	return -EINVAL; +} + +const struct bpf_map_ops queue_map_ops = { +	.map_alloc_check = queue_stack_map_alloc_check, +	.map_alloc = queue_stack_map_alloc, +	.map_free = queue_stack_map_free, +	.map_lookup_elem = queue_stack_map_lookup_elem, +	.map_update_elem = queue_stack_map_update_elem, +	.map_delete_elem = queue_stack_map_delete_elem, +	.map_push_elem = queue_stack_map_push_elem, +	.map_pop_elem = queue_map_pop_elem, +	.map_peek_elem = queue_map_peek_elem, +	.map_get_next_key = queue_stack_map_get_next_key, +}; + +const struct bpf_map_ops stack_map_ops = { +	.map_alloc_check = queue_stack_map_alloc_check, +	.map_alloc = queue_stack_map_alloc, +	.map_free = queue_stack_map_free, +	.map_lookup_elem = queue_stack_map_lookup_elem, +	.map_update_elem = queue_stack_map_update_elem, +	.map_delete_elem = queue_stack_map_delete_elem, +	.map_push_elem = queue_stack_map_push_elem, +	.map_pop_elem = stack_map_pop_elem, +	.map_peek_elem = stack_map_peek_elem, +	.map_get_next_key = queue_stack_map_get_next_key, +}; diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c deleted file mode 100644 index 0a0f2ec75370..000000000000 --- a/kernel/bpf/sockmap.c +++ /dev/null @@ -1,2631 +0,0 @@ -/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ - -/* A BPF sock_map is used to store sock objects. This is primarly used - * for doing socket redirect with BPF helper routines. - * - * A sock map may have BPF programs attached to it, currently a program - * used to parse packets and a program to provide a verdict and redirect - * decision on the packet are supported. Any programs attached to a sock - * map are inherited by sock objects when they are added to the map. If - * no BPF programs are attached the sock object may only be used for sock - * redirect. 
- * - * A sock object may be in multiple maps, but can only inherit a single - * parse or verdict program. If adding a sock object to a map would result - * in having multiple parsing programs the update will return an EBUSY error. - * - * For reference this program is similar to devmap used in XDP context - * reviewing these together may be useful. For an example please review - * ./samples/bpf/sockmap/. - */ -#include <linux/bpf.h> -#include <net/sock.h> -#include <linux/filter.h> -#include <linux/errno.h> -#include <linux/file.h> -#include <linux/kernel.h> -#include <linux/net.h> -#include <linux/skbuff.h> -#include <linux/workqueue.h> -#include <linux/list.h> -#include <linux/mm.h> -#include <net/strparser.h> -#include <net/tcp.h> -#include <linux/ptr_ring.h> -#include <net/inet_common.h> -#include <linux/sched/signal.h> - -#define SOCK_CREATE_FLAG_MASK \ -	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) - -struct bpf_sock_progs { -	struct bpf_prog *bpf_tx_msg; -	struct bpf_prog *bpf_parse; -	struct bpf_prog *bpf_verdict; -}; - -struct bpf_stab { -	struct bpf_map map; -	struct sock **sock_map; -	struct bpf_sock_progs progs; -	raw_spinlock_t lock; -}; - -struct bucket { -	struct hlist_head head; -	raw_spinlock_t lock; -}; - -struct bpf_htab { -	struct bpf_map map; -	struct bucket *buckets; -	atomic_t count; -	u32 n_buckets; -	u32 elem_size; -	struct bpf_sock_progs progs; -	struct rcu_head rcu; -}; - -struct htab_elem { -	struct rcu_head rcu; -	struct hlist_node hash_node; -	u32 hash; -	struct sock *sk; -	char key[0]; -}; - -enum smap_psock_state { -	SMAP_TX_RUNNING, -}; - -struct smap_psock_map_entry { -	struct list_head list; -	struct bpf_map *map; -	struct sock **entry; -	struct htab_elem __rcu *hash_link; -}; - -struct smap_psock { -	struct rcu_head	rcu; -	refcount_t refcnt; - -	/* datapath variables */ -	struct sk_buff_head rxqueue; -	bool strp_enabled; - -	/* datapath error path cache across tx work invocations */ -	int save_rem; -	int save_off; -	struct sk_buff *save_skb; - -	/* datapath variables for tx_msg ULP */ -	struct sock *sk_redir; -	int apply_bytes; -	int cork_bytes; -	int sg_size; -	int eval; -	struct sk_msg_buff *cork; -	struct list_head ingress; - -	struct strparser strp; -	struct bpf_prog *bpf_tx_msg; -	struct bpf_prog *bpf_parse; -	struct bpf_prog *bpf_verdict; -	struct list_head maps; -	spinlock_t maps_lock; - -	/* Back reference used when sock callback trigger sockmap operations */ -	struct sock *sock; -	unsigned long state; - -	struct work_struct tx_work; -	struct work_struct gc_work; - -	struct proto *sk_proto; -	void (*save_unhash)(struct sock *sk); -	void (*save_close)(struct sock *sk, long timeout); -	void (*save_data_ready)(struct sock *sk); -	void (*save_write_space)(struct sock *sk); -}; - -static void smap_release_sock(struct smap_psock *psock, struct sock *sock); -static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, -			   int nonblock, int flags, int *addr_len); -static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); -static int bpf_tcp_sendpage(struct sock *sk, struct page *page, -			    int offset, size_t size, int flags); -static void bpf_tcp_unhash(struct sock *sk); -static void bpf_tcp_close(struct sock *sk, long timeout); - -static inline struct smap_psock *smap_psock_sk(const struct sock *sk) -{ -	return rcu_dereference_sk_user_data(sk); -} - -static bool bpf_tcp_stream_read(const struct sock *sk) -{ -	struct smap_psock *psock; -	bool empty = true; - -	rcu_read_lock(); -	psock = smap_psock_sk(sk); -	if 
(unlikely(!psock)) -		goto out; -	empty = list_empty(&psock->ingress); -out: -	rcu_read_unlock(); -	return !empty; -} - -enum { -	SOCKMAP_IPV4, -	SOCKMAP_IPV6, -	SOCKMAP_NUM_PROTS, -}; - -enum { -	SOCKMAP_BASE, -	SOCKMAP_TX, -	SOCKMAP_NUM_CONFIGS, -}; - -static struct proto *saved_tcpv6_prot __read_mostly; -static DEFINE_SPINLOCK(tcpv6_prot_lock); -static struct proto bpf_tcp_prots[SOCKMAP_NUM_PROTS][SOCKMAP_NUM_CONFIGS]; -static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS], -			 struct proto *base) -{ -	prot[SOCKMAP_BASE]			= *base; -	prot[SOCKMAP_BASE].unhash		= bpf_tcp_unhash; -	prot[SOCKMAP_BASE].close		= bpf_tcp_close; -	prot[SOCKMAP_BASE].recvmsg		= bpf_tcp_recvmsg; -	prot[SOCKMAP_BASE].stream_memory_read	= bpf_tcp_stream_read; - -	prot[SOCKMAP_TX]			= prot[SOCKMAP_BASE]; -	prot[SOCKMAP_TX].sendmsg		= bpf_tcp_sendmsg; -	prot[SOCKMAP_TX].sendpage		= bpf_tcp_sendpage; -} - -static void update_sk_prot(struct sock *sk, struct smap_psock *psock) -{ -	int family = sk->sk_family == AF_INET6 ? SOCKMAP_IPV6 : SOCKMAP_IPV4; -	int conf = psock->bpf_tx_msg ? SOCKMAP_TX : SOCKMAP_BASE; - -	sk->sk_prot = &bpf_tcp_prots[family][conf]; -} - -static int bpf_tcp_init(struct sock *sk) -{ -	struct smap_psock *psock; - -	rcu_read_lock(); -	psock = smap_psock_sk(sk); -	if (unlikely(!psock)) { -		rcu_read_unlock(); -		return -EINVAL; -	} - -	if (unlikely(psock->sk_proto)) { -		rcu_read_unlock(); -		return -EBUSY; -	} - -	psock->save_unhash = sk->sk_prot->unhash; -	psock->save_close = sk->sk_prot->close; -	psock->sk_proto = sk->sk_prot; - -	/* Build IPv6 sockmap whenever the address of tcpv6_prot changes */ -	if (sk->sk_family == AF_INET6 && -	    unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) { -		spin_lock_bh(&tcpv6_prot_lock); -		if (likely(sk->sk_prot != saved_tcpv6_prot)) { -			build_protos(bpf_tcp_prots[SOCKMAP_IPV6], sk->sk_prot); -			smp_store_release(&saved_tcpv6_prot, sk->sk_prot); -		} -		spin_unlock_bh(&tcpv6_prot_lock); -	} -	update_sk_prot(sk, psock); -	rcu_read_unlock(); -	return 0; -} - -static void smap_release_sock(struct smap_psock *psock, struct sock *sock); -static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge); - -static void bpf_tcp_release(struct sock *sk) -{ -	struct smap_psock *psock; - -	rcu_read_lock(); -	psock = smap_psock_sk(sk); -	if (unlikely(!psock)) -		goto out; - -	if (psock->cork) { -		free_start_sg(psock->sock, psock->cork, true); -		kfree(psock->cork); -		psock->cork = NULL; -	} - -	if (psock->sk_proto) { -		sk->sk_prot = psock->sk_proto; -		psock->sk_proto = NULL; -	} -out: -	rcu_read_unlock(); -} - -static struct htab_elem *lookup_elem_raw(struct hlist_head *head, -					 u32 hash, void *key, u32 key_size) -{ -	struct htab_elem *l; - -	hlist_for_each_entry_rcu(l, head, hash_node) { -		if (l->hash == hash && !memcmp(&l->key, key, key_size)) -			return l; -	} - -	return NULL; -} - -static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) -{ -	return &htab->buckets[hash & (htab->n_buckets - 1)]; -} - -static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) -{ -	return &__select_bucket(htab, hash)->head; -} - -static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) -{ -	atomic_dec(&htab->count); -	kfree_rcu(l, rcu); -} - -static struct smap_psock_map_entry *psock_map_pop(struct sock *sk, -						  struct smap_psock *psock) -{ -	struct smap_psock_map_entry *e; - -	spin_lock_bh(&psock->maps_lock); -	e = list_first_entry_or_null(&psock->maps, -				     struct 
smap_psock_map_entry, -				     list); -	if (e) -		list_del(&e->list); -	spin_unlock_bh(&psock->maps_lock); -	return e; -} - -static void bpf_tcp_remove(struct sock *sk, struct smap_psock *psock) -{ -	struct smap_psock_map_entry *e; -	struct sk_msg_buff *md, *mtmp; -	struct sock *osk; - -	if (psock->cork) { -		free_start_sg(psock->sock, psock->cork, true); -		kfree(psock->cork); -		psock->cork = NULL; -	} - -	list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { -		list_del(&md->list); -		free_start_sg(psock->sock, md, true); -		kfree(md); -	} - -	e = psock_map_pop(sk, psock); -	while (e) { -		if (e->entry) { -			struct bpf_stab *stab = container_of(e->map, struct bpf_stab, map); - -			raw_spin_lock_bh(&stab->lock); -			osk = *e->entry; -			if (osk == sk) { -				*e->entry = NULL; -				smap_release_sock(psock, sk); -			} -			raw_spin_unlock_bh(&stab->lock); -		} else { -			struct htab_elem *link = rcu_dereference(e->hash_link); -			struct bpf_htab *htab = container_of(e->map, struct bpf_htab, map); -			struct hlist_head *head; -			struct htab_elem *l; -			struct bucket *b; - -			b = __select_bucket(htab, link->hash); -			head = &b->head; -			raw_spin_lock_bh(&b->lock); -			l = lookup_elem_raw(head, -					    link->hash, link->key, -					    htab->map.key_size); -			/* If another thread deleted this object skip deletion. -			 * The refcnt on psock may or may not be zero. -			 */ -			if (l && l == link) { -				hlist_del_rcu(&link->hash_node); -				smap_release_sock(psock, link->sk); -				free_htab_elem(htab, link); -			} -			raw_spin_unlock_bh(&b->lock); -		} -		kfree(e); -		e = psock_map_pop(sk, psock); -	} -} - -static void bpf_tcp_unhash(struct sock *sk) -{ -	void (*unhash_fun)(struct sock *sk); -	struct smap_psock *psock; - -	rcu_read_lock(); -	psock = smap_psock_sk(sk); -	if (unlikely(!psock)) { -		rcu_read_unlock(); -		if (sk->sk_prot->unhash) -			sk->sk_prot->unhash(sk); -		return; -	} -	unhash_fun = psock->save_unhash; -	bpf_tcp_remove(sk, psock); -	rcu_read_unlock(); -	unhash_fun(sk); -} - -static void bpf_tcp_close(struct sock *sk, long timeout) -{ -	void (*close_fun)(struct sock *sk, long timeout); -	struct smap_psock *psock; - -	lock_sock(sk); -	rcu_read_lock(); -	psock = smap_psock_sk(sk); -	if (unlikely(!psock)) { -		rcu_read_unlock(); -		release_sock(sk); -		return sk->sk_prot->close(sk, timeout); -	} -	close_fun = psock->save_close; -	bpf_tcp_remove(sk, psock); -	rcu_read_unlock(); -	release_sock(sk); -	close_fun(sk, timeout); -} - -enum __sk_action { -	__SK_DROP = 0, -	__SK_PASS, -	__SK_REDIRECT, -	__SK_NONE, -}; - -static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = { -	.name		= "bpf_tcp", -	.uid		= TCP_ULP_BPF, -	.user_visible	= false, -	.owner		= NULL, -	.init		= bpf_tcp_init, -	.release	= bpf_tcp_release, -}; - -static int memcopy_from_iter(struct sock *sk, -			     struct sk_msg_buff *md, -			     struct iov_iter *from, int bytes) -{ -	struct scatterlist *sg = md->sg_data; -	int i = md->sg_curr, rc = -ENOSPC; - -	do { -		int copy; -		char *to; - -		if (md->sg_copybreak >= sg[i].length) { -			md->sg_copybreak = 0; - -			if (++i == MAX_SKB_FRAGS) -				i = 0; - -			if (i == md->sg_end) -				break; -		} - -		copy = sg[i].length - md->sg_copybreak; -		to = sg_virt(&sg[i]) + md->sg_copybreak; -		md->sg_copybreak += copy; - -		if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) -			rc = copy_from_iter_nocache(to, copy, from); -		else -			rc = copy_from_iter(to, copy, from); - -		if (rc != copy) { -			rc = -EFAULT; -			goto out; -		} - -		bytes -= copy; -		if (!bytes) -			
break; - -		md->sg_copybreak = 0; -		if (++i == MAX_SKB_FRAGS) -			i = 0; -	} while (i != md->sg_end); -out: -	md->sg_curr = i; -	return rc; -} - -static int bpf_tcp_push(struct sock *sk, int apply_bytes, -			struct sk_msg_buff *md, -			int flags, bool uncharge) -{ -	bool apply = apply_bytes; -	struct scatterlist *sg; -	int offset, ret = 0; -	struct page *p; -	size_t size; - -	while (1) { -		sg = md->sg_data + md->sg_start; -		size = (apply && apply_bytes < sg->length) ? -			apply_bytes : sg->length; -		offset = sg->offset; - -		tcp_rate_check_app_limited(sk); -		p = sg_page(sg); -retry: -		ret = do_tcp_sendpages(sk, p, offset, size, flags); -		if (ret != size) { -			if (ret > 0) { -				if (apply) -					apply_bytes -= ret; - -				sg->offset += ret; -				sg->length -= ret; -				size -= ret; -				offset += ret; -				if (uncharge) -					sk_mem_uncharge(sk, ret); -				goto retry; -			} - -			return ret; -		} - -		if (apply) -			apply_bytes -= ret; -		sg->offset += ret; -		sg->length -= ret; -		if (uncharge) -			sk_mem_uncharge(sk, ret); - -		if (!sg->length) { -			put_page(p); -			md->sg_start++; -			if (md->sg_start == MAX_SKB_FRAGS) -				md->sg_start = 0; -			sg_init_table(sg, 1); - -			if (md->sg_start == md->sg_end) -				break; -		} - -		if (apply && !apply_bytes) -			break; -	} -	return 0; -} - -static inline void bpf_compute_data_pointers_sg(struct sk_msg_buff *md) -{ -	struct scatterlist *sg = md->sg_data + md->sg_start; - -	if (md->sg_copy[md->sg_start]) { -		md->data = md->data_end = 0; -	} else { -		md->data = sg_virt(sg); -		md->data_end = md->data + sg->length; -	} -} - -static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) -{ -	struct scatterlist *sg = md->sg_data; -	int i = md->sg_start; - -	do { -		int uncharge = (bytes < sg[i].length) ? bytes : sg[i].length; - -		sk_mem_uncharge(sk, uncharge); -		bytes -= uncharge; -		if (!bytes) -			break; -		i++; -		if (i == MAX_SKB_FRAGS) -			i = 0; -	} while (i != md->sg_end); -} - -static void free_bytes_sg(struct sock *sk, int bytes, -			  struct sk_msg_buff *md, bool charge) -{ -	struct scatterlist *sg = md->sg_data; -	int i = md->sg_start, free; - -	while (bytes && sg[i].length) { -		free = sg[i].length; -		if (bytes < free) { -			sg[i].length -= bytes; -			sg[i].offset += bytes; -			if (charge) -				sk_mem_uncharge(sk, bytes); -			break; -		} - -		if (charge) -			sk_mem_uncharge(sk, sg[i].length); -		put_page(sg_page(&sg[i])); -		bytes -= sg[i].length; -		sg[i].length = 0; -		sg[i].page_link = 0; -		sg[i].offset = 0; -		i++; - -		if (i == MAX_SKB_FRAGS) -			i = 0; -	} -	md->sg_start = i; -} - -static int free_sg(struct sock *sk, int start, -		   struct sk_msg_buff *md, bool charge) -{ -	struct scatterlist *sg = md->sg_data; -	int i = start, free = 0; - -	while (sg[i].length) { -		free += sg[i].length; -		if (charge) -			sk_mem_uncharge(sk, sg[i].length); -		if (!md->skb) -			put_page(sg_page(&sg[i])); -		sg[i].length = 0; -		sg[i].page_link = 0; -		sg[i].offset = 0; -		i++; - -		if (i == MAX_SKB_FRAGS) -			i = 0; -	} -	if (md->skb) -		consume_skb(md->skb); - -	return free; -} - -static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge) -{ -	int free = free_sg(sk, md->sg_start, md, charge); - -	md->sg_start = md->sg_end; -	return free; -} - -static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md) -{ -	return free_sg(sk, md->sg_curr, md, true); -} - -static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md) -{ -	return ((_rc == SK_PASS) ? -	       (md->sk_redir ? 
__SK_REDIRECT : __SK_PASS) : -	       __SK_DROP); -} - -static unsigned int smap_do_tx_msg(struct sock *sk, -				   struct smap_psock *psock, -				   struct sk_msg_buff *md) -{ -	struct bpf_prog *prog; -	unsigned int rc, _rc; - -	preempt_disable(); -	rcu_read_lock(); - -	/* If the policy was removed mid-send then default to 'accept' */ -	prog = READ_ONCE(psock->bpf_tx_msg); -	if (unlikely(!prog)) { -		_rc = SK_PASS; -		goto verdict; -	} - -	bpf_compute_data_pointers_sg(md); -	md->sk = sk; -	rc = (*prog->bpf_func)(md, prog->insnsi); -	psock->apply_bytes = md->apply_bytes; - -	/* Moving return codes from UAPI namespace into internal namespace */ -	_rc = bpf_map_msg_verdict(rc, md); - -	/* The psock has a refcount on the sock but not on the map and because -	 * we need to drop rcu read lock here its possible the map could be -	 * removed between here and when we need it to execute the sock -	 * redirect. So do the map lookup now for future use. -	 */ -	if (_rc == __SK_REDIRECT) { -		if (psock->sk_redir) -			sock_put(psock->sk_redir); -		psock->sk_redir = do_msg_redirect_map(md); -		if (!psock->sk_redir) { -			_rc = __SK_DROP; -			goto verdict; -		} -		sock_hold(psock->sk_redir); -	} -verdict: -	rcu_read_unlock(); -	preempt_enable(); - -	return _rc; -} - -static int bpf_tcp_ingress(struct sock *sk, int apply_bytes, -			   struct smap_psock *psock, -			   struct sk_msg_buff *md, int flags) -{ -	bool apply = apply_bytes; -	size_t size, copied = 0; -	struct sk_msg_buff *r; -	int err = 0, i; - -	r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_KERNEL); -	if (unlikely(!r)) -		return -ENOMEM; - -	lock_sock(sk); -	r->sg_start = md->sg_start; -	i = md->sg_start; - -	do { -		size = (apply && apply_bytes < md->sg_data[i].length) ? -			apply_bytes : md->sg_data[i].length; - -		if (!sk_wmem_schedule(sk, size)) { -			if (!copied) -				err = -ENOMEM; -			break; -		} - -		sk_mem_charge(sk, size); -		r->sg_data[i] = md->sg_data[i]; -		r->sg_data[i].length = size; -		md->sg_data[i].length -= size; -		md->sg_data[i].offset += size; -		copied += size; - -		if (md->sg_data[i].length) { -			get_page(sg_page(&r->sg_data[i])); -			r->sg_end = (i + 1) == MAX_SKB_FRAGS ? 
0 : i + 1; -		} else { -			i++; -			if (i == MAX_SKB_FRAGS) -				i = 0; -			r->sg_end = i; -		} - -		if (apply) { -			apply_bytes -= size; -			if (!apply_bytes) -				break; -		} -	} while (i != md->sg_end); - -	md->sg_start = i; - -	if (!err) { -		list_add_tail(&r->list, &psock->ingress); -		sk->sk_data_ready(sk); -	} else { -		free_start_sg(sk, r, true); -		kfree(r); -	} - -	release_sock(sk); -	return err; -} - -static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, -				       struct sk_msg_buff *md, -				       int flags) -{ -	bool ingress = !!(md->flags & BPF_F_INGRESS); -	struct smap_psock *psock; -	int err = 0; - -	rcu_read_lock(); -	psock = smap_psock_sk(sk); -	if (unlikely(!psock)) -		goto out_rcu; - -	if (!refcount_inc_not_zero(&psock->refcnt)) -		goto out_rcu; - -	rcu_read_unlock(); - -	if (ingress) { -		err = bpf_tcp_ingress(sk, send, psock, md, flags); -	} else { -		lock_sock(sk); -		err = bpf_tcp_push(sk, send, md, flags, false); -		release_sock(sk); -	} -	smap_release_sock(psock, sk); -	return err; -out_rcu: -	rcu_read_unlock(); -	return 0; -} - -static inline void bpf_md_init(struct smap_psock *psock) -{ -	if (!psock->apply_bytes) { -		psock->eval =  __SK_NONE; -		if (psock->sk_redir) { -			sock_put(psock->sk_redir); -			psock->sk_redir = NULL; -		} -	} -} - -static void apply_bytes_dec(struct smap_psock *psock, int i) -{ -	if (psock->apply_bytes) { -		if (psock->apply_bytes < i) -			psock->apply_bytes = 0; -		else -			psock->apply_bytes -= i; -	} -} - -static int bpf_exec_tx_verdict(struct smap_psock *psock, -			       struct sk_msg_buff *m, -			       struct sock *sk, -			       int *copied, int flags) -{ -	bool cork = false, enospc = (m->sg_start == m->sg_end); -	struct sock *redir; -	int err = 0; -	int send; - -more_data: -	if (psock->eval == __SK_NONE) -		psock->eval = smap_do_tx_msg(sk, psock, m); - -	if (m->cork_bytes && -	    m->cork_bytes > psock->sg_size && !enospc) { -		psock->cork_bytes = m->cork_bytes - psock->sg_size; -		if (!psock->cork) { -			psock->cork = kcalloc(1, -					sizeof(struct sk_msg_buff), -					GFP_ATOMIC | __GFP_NOWARN); - -			if (!psock->cork) { -				err = -ENOMEM; -				goto out_err; -			} -		} -		memcpy(psock->cork, m, sizeof(*m)); -		goto out_err; -	} - -	send = psock->sg_size; -	if (psock->apply_bytes && psock->apply_bytes < send) -		send = psock->apply_bytes; - -	switch (psock->eval) { -	case __SK_PASS: -		err = bpf_tcp_push(sk, send, m, flags, true); -		if (unlikely(err)) { -			*copied -= free_start_sg(sk, m, true); -			break; -		} - -		apply_bytes_dec(psock, send); -		psock->sg_size -= send; -		break; -	case __SK_REDIRECT: -		redir = psock->sk_redir; -		apply_bytes_dec(psock, send); - -		if (psock->cork) { -			cork = true; -			psock->cork = NULL; -		} - -		return_mem_sg(sk, send, m); -		release_sock(sk); - -		err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags); -		lock_sock(sk); - -		if (unlikely(err < 0)) { -			int free = free_start_sg(sk, m, false); - -			psock->sg_size = 0; -			if (!cork) -				*copied -= free; -		} else { -			psock->sg_size -= send; -		} - -		if (cork) { -			free_start_sg(sk, m, true); -			psock->sg_size = 0; -			kfree(m); -			m = NULL; -			err = 0; -		} -		break; -	case __SK_DROP: -	default: -		free_bytes_sg(sk, send, m, true); -		apply_bytes_dec(psock, send); -		*copied -= send; -		psock->sg_size -= send; -		err = -EACCES; -		break; -	} - -	if (likely(!err)) { -		bpf_md_init(psock); -		if (m && -		    m->sg_data[m->sg_start].page_link && -		    m->sg_data[m->sg_start].length) -			goto more_data; -	} - 
-out_err: -	return err; -} - -static int bpf_wait_data(struct sock *sk, -			 struct smap_psock *psk, int flags, -			 long timeo, int *err) -{ -	int rc; - -	DEFINE_WAIT_FUNC(wait, woken_wake_function); - -	add_wait_queue(sk_sleep(sk), &wait); -	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); -	rc = sk_wait_event(sk, &timeo, -			   !list_empty(&psk->ingress) || -			   !skb_queue_empty(&sk->sk_receive_queue), -			   &wait); -	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); -	remove_wait_queue(sk_sleep(sk), &wait); - -	return rc; -} - -static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, -			   int nonblock, int flags, int *addr_len) -{ -	struct iov_iter *iter = &msg->msg_iter; -	struct smap_psock *psock; -	int copied = 0; - -	if (unlikely(flags & MSG_ERRQUEUE)) -		return inet_recv_error(sk, msg, len, addr_len); -	if (!skb_queue_empty(&sk->sk_receive_queue)) -		return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); - -	rcu_read_lock(); -	psock = smap_psock_sk(sk); -	if (unlikely(!psock)) -		goto out; - -	if (unlikely(!refcount_inc_not_zero(&psock->refcnt))) -		goto out; -	rcu_read_unlock(); - -	lock_sock(sk); -bytes_ready: -	while (copied != len) { -		struct scatterlist *sg; -		struct sk_msg_buff *md; -		int i; - -		md = list_first_entry_or_null(&psock->ingress, -					      struct sk_msg_buff, list); -		if (unlikely(!md)) -			break; -		i = md->sg_start; -		do { -			struct page *page; -			int n, copy; - -			sg = &md->sg_data[i]; -			copy = sg->length; -			page = sg_page(sg); - -			if (copied + copy > len) -				copy = len - copied; - -			n = copy_page_to_iter(page, sg->offset, copy, iter); -			if (n != copy) { -				md->sg_start = i; -				release_sock(sk); -				smap_release_sock(psock, sk); -				return -EFAULT; -			} - -			copied += copy; -			sg->offset += copy; -			sg->length -= copy; -			sk_mem_uncharge(sk, copy); - -			if (!sg->length) { -				i++; -				if (i == MAX_SKB_FRAGS) -					i = 0; -				if (!md->skb) -					put_page(page); -			} -			if (copied == len) -				break; -		} while (i != md->sg_end); -		md->sg_start = i; - -		if (!sg->length && md->sg_start == md->sg_end) { -			list_del(&md->list); -			if (md->skb) -				consume_skb(md->skb); -			kfree(md); -		} -	} - -	if (!copied) { -		long timeo; -		int data; -		int err = 0; - -		timeo = sock_rcvtimeo(sk, nonblock); -		data = bpf_wait_data(sk, psock, flags, timeo, &err); - -		if (data) { -			if (!skb_queue_empty(&sk->sk_receive_queue)) { -				release_sock(sk); -				smap_release_sock(psock, sk); -				copied = tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); -				return copied; -			} -			goto bytes_ready; -		} - -		if (err) -			copied = err; -	} - -	release_sock(sk); -	smap_release_sock(psock, sk); -	return copied; -out: -	rcu_read_unlock(); -	return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); -} - - -static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) -{ -	int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS; -	struct sk_msg_buff md = {0}; -	unsigned int sg_copy = 0; -	struct smap_psock *psock; -	int copied = 0, err = 0; -	struct scatterlist *sg; -	long timeo; - -	/* Its possible a sock event or user removed the psock _but_ the ops -	 * have not been reprogrammed yet so we get here. In this case fallback -	 * to tcp_sendmsg. Note this only works because we _only_ ever allow -	 * a single ULP there is no hierarchy here. 
-	 */ -	rcu_read_lock(); -	psock = smap_psock_sk(sk); -	if (unlikely(!psock)) { -		rcu_read_unlock(); -		return tcp_sendmsg(sk, msg, size); -	} - -	/* Increment the psock refcnt to ensure its not released while sending a -	 * message. Required because sk lookup and bpf programs are used in -	 * separate rcu critical sections. Its OK if we lose the map entry -	 * but we can't lose the sock reference. -	 */ -	if (!refcount_inc_not_zero(&psock->refcnt)) { -		rcu_read_unlock(); -		return tcp_sendmsg(sk, msg, size); -	} - -	sg = md.sg_data; -	sg_init_marker(sg, MAX_SKB_FRAGS); -	rcu_read_unlock(); - -	lock_sock(sk); -	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); - -	while (msg_data_left(msg)) { -		struct sk_msg_buff *m = NULL; -		bool enospc = false; -		int copy; - -		if (sk->sk_err) { -			err = -sk->sk_err; -			goto out_err; -		} - -		copy = msg_data_left(msg); -		if (!sk_stream_memory_free(sk)) -			goto wait_for_sndbuf; - -		m = psock->cork_bytes ? psock->cork : &md; -		m->sg_curr = m->sg_copybreak ? m->sg_curr : m->sg_end; -		err = sk_alloc_sg(sk, copy, m->sg_data, -				  m->sg_start, &m->sg_end, &sg_copy, -				  m->sg_end - 1); -		if (err) { -			if (err != -ENOSPC) -				goto wait_for_memory; -			enospc = true; -			copy = sg_copy; -		} - -		err = memcopy_from_iter(sk, m, &msg->msg_iter, copy); -		if (err < 0) { -			free_curr_sg(sk, m); -			goto out_err; -		} - -		psock->sg_size += copy; -		copied += copy; -		sg_copy = 0; - -		/* When bytes are being corked skip running BPF program and -		 * applying verdict unless there is no more buffer space. In -		 * the ENOSPC case simply run BPF prorgram with currently -		 * accumulated data. We don't have much choice at this point -		 * we could try extending the page frags or chaining complex -		 * frags but even in these cases _eventually_ we will hit an -		 * OOM scenario. More complex recovery schemes may be -		 * implemented in the future, but BPF programs must handle -		 * the case where apply_cork requests are not honored. The -		 * canonical method to verify this is to check data length. -		 */ -		if (psock->cork_bytes) { -			if (copy > psock->cork_bytes) -				psock->cork_bytes = 0; -			else -				psock->cork_bytes -= copy; - -			if (psock->cork_bytes && !enospc) -				goto out_cork; - -			/* All cork bytes accounted for re-run filter */ -			psock->eval = __SK_NONE; -			psock->cork_bytes = 0; -		} - -		err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags); -		if (unlikely(err < 0)) -			goto out_err; -		continue; -wait_for_sndbuf: -		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); -wait_for_memory: -		err = sk_stream_wait_memory(sk, &timeo); -		if (err) { -			if (m && m != psock->cork) -				free_start_sg(sk, m, true); -			goto out_err; -		} -	} -out_err: -	if (err < 0) -		err = sk_stream_error(sk, msg->msg_flags, err); -out_cork: -	release_sock(sk); -	smap_release_sock(psock, sk); -	return copied ? 
copied : err; -} - -static int bpf_tcp_sendpage(struct sock *sk, struct page *page, -			    int offset, size_t size, int flags) -{ -	struct sk_msg_buff md = {0}, *m = NULL; -	int err = 0, copied = 0; -	struct smap_psock *psock; -	struct scatterlist *sg; -	bool enospc = false; - -	rcu_read_lock(); -	psock = smap_psock_sk(sk); -	if (unlikely(!psock)) -		goto accept; - -	if (!refcount_inc_not_zero(&psock->refcnt)) -		goto accept; -	rcu_read_unlock(); - -	lock_sock(sk); - -	if (psock->cork_bytes) { -		m = psock->cork; -		sg = &m->sg_data[m->sg_end]; -	} else { -		m = &md; -		sg = m->sg_data; -		sg_init_marker(sg, MAX_SKB_FRAGS); -	} - -	/* Catch case where ring is full and sendpage is stalled. */ -	if (unlikely(m->sg_end == m->sg_start && -	    m->sg_data[m->sg_end].length)) -		goto out_err; - -	psock->sg_size += size; -	sg_set_page(sg, page, size, offset); -	get_page(page); -	m->sg_copy[m->sg_end] = true; -	sk_mem_charge(sk, size); -	m->sg_end++; -	copied = size; - -	if (m->sg_end == MAX_SKB_FRAGS) -		m->sg_end = 0; - -	if (m->sg_end == m->sg_start) -		enospc = true; - -	if (psock->cork_bytes) { -		if (size > psock->cork_bytes) -			psock->cork_bytes = 0; -		else -			psock->cork_bytes -= size; - -		if (psock->cork_bytes && !enospc) -			goto out_err; - -		/* All cork bytes accounted for re-run filter */ -		psock->eval = __SK_NONE; -		psock->cork_bytes = 0; -	} - -	err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags); -out_err: -	release_sock(sk); -	smap_release_sock(psock, sk); -	return copied ? copied : err; -accept: -	rcu_read_unlock(); -	return tcp_sendpage(sk, page, offset, size, flags); -} - -static void bpf_tcp_msg_add(struct smap_psock *psock, -			    struct sock *sk, -			    struct bpf_prog *tx_msg) -{ -	struct bpf_prog *orig_tx_msg; - -	orig_tx_msg = xchg(&psock->bpf_tx_msg, tx_msg); -	if (orig_tx_msg) -		bpf_prog_put(orig_tx_msg); -} - -static int bpf_tcp_ulp_register(void) -{ -	build_protos(bpf_tcp_prots[SOCKMAP_IPV4], &tcp_prot); -	/* Once BPF TX ULP is registered it is never unregistered. It -	 * will be in the ULP list for the lifetime of the system. Doing -	 * duplicate registers is not a problem. -	 */ -	return tcp_register_ulp(&bpf_tcp_ulp_ops); -} - -static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) -{ -	struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict); -	int rc; - -	if (unlikely(!prog)) -		return __SK_DROP; - -	skb_orphan(skb); -	/* We need to ensure that BPF metadata for maps is also cleared -	 * when we orphan the skb so that we don't have the possibility -	 * to reference a stale map. -	 */ -	TCP_SKB_CB(skb)->bpf.sk_redir = NULL; -	skb->sk = psock->sock; -	bpf_compute_data_end_sk_skb(skb); -	preempt_disable(); -	rc = (*prog->bpf_func)(skb, prog->insnsi); -	preempt_enable(); -	skb->sk = NULL; - -	/* Moving return codes from UAPI namespace into internal namespace */ -	return rc == SK_PASS ? -		(TCP_SKB_CB(skb)->bpf.sk_redir ? 
__SK_REDIRECT : __SK_PASS) : -		__SK_DROP; -} - -static int smap_do_ingress(struct smap_psock *psock, struct sk_buff *skb) -{ -	struct sock *sk = psock->sock; -	int copied = 0, num_sg; -	struct sk_msg_buff *r; - -	r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_ATOMIC); -	if (unlikely(!r)) -		return -EAGAIN; - -	if (!sk_rmem_schedule(sk, skb, skb->len)) { -		kfree(r); -		return -EAGAIN; -	} - -	sg_init_table(r->sg_data, MAX_SKB_FRAGS); -	num_sg = skb_to_sgvec(skb, r->sg_data, 0, skb->len); -	if (unlikely(num_sg < 0)) { -		kfree(r); -		return num_sg; -	} -	sk_mem_charge(sk, skb->len); -	copied = skb->len; -	r->sg_start = 0; -	r->sg_end = num_sg == MAX_SKB_FRAGS ? 0 : num_sg; -	r->skb = skb; -	list_add_tail(&r->list, &psock->ingress); -	sk->sk_data_ready(sk); -	return copied; -} - -static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb) -{ -	struct smap_psock *peer; -	struct sock *sk; -	__u32 in; -	int rc; - -	rc = smap_verdict_func(psock, skb); -	switch (rc) { -	case __SK_REDIRECT: -		sk = do_sk_redirect_map(skb); -		if (!sk) { -			kfree_skb(skb); -			break; -		} - -		peer = smap_psock_sk(sk); -		in = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS; - -		if (unlikely(!peer || sock_flag(sk, SOCK_DEAD) || -			     !test_bit(SMAP_TX_RUNNING, &peer->state))) { -			kfree_skb(skb); -			break; -		} - -		if (!in && sock_writeable(sk)) { -			skb_set_owner_w(skb, sk); -			skb_queue_tail(&peer->rxqueue, skb); -			schedule_work(&peer->tx_work); -			break; -		} else if (in && -			   atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) { -			skb_queue_tail(&peer->rxqueue, skb); -			schedule_work(&peer->tx_work); -			break; -		} -	/* Fall through and free skb otherwise */ -	case __SK_DROP: -	default: -		kfree_skb(skb); -	} -} - -static void smap_report_sk_error(struct smap_psock *psock, int err) -{ -	struct sock *sk = psock->sock; - -	sk->sk_err = err; -	sk->sk_error_report(sk); -} - -static void smap_read_sock_strparser(struct strparser *strp, -				     struct sk_buff *skb) -{ -	struct smap_psock *psock; - -	rcu_read_lock(); -	psock = container_of(strp, struct smap_psock, strp); -	smap_do_verdict(psock, skb); -	rcu_read_unlock(); -} - -/* Called with lock held on socket */ -static void smap_data_ready(struct sock *sk) -{ -	struct smap_psock *psock; - -	rcu_read_lock(); -	psock = smap_psock_sk(sk); -	if (likely(psock)) { -		write_lock_bh(&sk->sk_callback_lock); -		strp_data_ready(&psock->strp); -		write_unlock_bh(&sk->sk_callback_lock); -	} -	rcu_read_unlock(); -} - -static void smap_tx_work(struct work_struct *w) -{ -	struct smap_psock *psock; -	struct sk_buff *skb; -	int rem, off, n; - -	psock = container_of(w, struct smap_psock, tx_work); - -	/* lock sock to avoid losing sk_socket at some point during loop */ -	lock_sock(psock->sock); -	if (psock->save_skb) { -		skb = psock->save_skb; -		rem = psock->save_rem; -		off = psock->save_off; -		psock->save_skb = NULL; -		goto start; -	} - -	while ((skb = skb_dequeue(&psock->rxqueue))) { -		__u32 flags; - -		rem = skb->len; -		off = 0; -start: -		flags = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS; -		do { -			if (likely(psock->sock->sk_socket)) { -				if (flags) -					n = smap_do_ingress(psock, skb); -				else -					n = skb_send_sock_locked(psock->sock, -								 skb, off, rem); -			} else { -				n = -EINVAL; -			} - -			if (n <= 0) { -				if (n == -EAGAIN) { -					/* Retry when space is available */ -					psock->save_skb = skb; -					psock->save_rem = rem; -					psock->save_off = off; -					goto out; -				} -				/* Hard errors break 
pipe and stop xmit */ -				smap_report_sk_error(psock, n ? -n : EPIPE); -				clear_bit(SMAP_TX_RUNNING, &psock->state); -				kfree_skb(skb); -				goto out; -			} -			rem -= n; -			off += n; -		} while (rem); - -		if (!flags) -			kfree_skb(skb); -	} -out: -	release_sock(psock->sock); -} - -static void smap_write_space(struct sock *sk) -{ -	struct smap_psock *psock; -	void (*write_space)(struct sock *sk); - -	rcu_read_lock(); -	psock = smap_psock_sk(sk); -	if (likely(psock && test_bit(SMAP_TX_RUNNING, &psock->state))) -		schedule_work(&psock->tx_work); -	write_space = psock->save_write_space; -	rcu_read_unlock(); -	write_space(sk); -} - -static void smap_stop_sock(struct smap_psock *psock, struct sock *sk) -{ -	if (!psock->strp_enabled) -		return; -	sk->sk_data_ready = psock->save_data_ready; -	sk->sk_write_space = psock->save_write_space; -	psock->save_data_ready = NULL; -	psock->save_write_space = NULL; -	strp_stop(&psock->strp); -	psock->strp_enabled = false; -} - -static void smap_destroy_psock(struct rcu_head *rcu) -{ -	struct smap_psock *psock = container_of(rcu, -						  struct smap_psock, rcu); - -	/* Now that a grace period has passed there is no longer -	 * any reference to this sock in the sockmap so we can -	 * destroy the psock, strparser, and bpf programs. But, -	 * because we use workqueue sync operations we can not -	 * do it in rcu context -	 */ -	schedule_work(&psock->gc_work); -} - -static bool psock_is_smap_sk(struct sock *sk) -{ -	return inet_csk(sk)->icsk_ulp_ops == &bpf_tcp_ulp_ops; -} - -static void smap_release_sock(struct smap_psock *psock, struct sock *sock) -{ -	if (refcount_dec_and_test(&psock->refcnt)) { -		if (psock_is_smap_sk(sock)) -			tcp_cleanup_ulp(sock); -		write_lock_bh(&sock->sk_callback_lock); -		smap_stop_sock(psock, sock); -		write_unlock_bh(&sock->sk_callback_lock); -		clear_bit(SMAP_TX_RUNNING, &psock->state); -		rcu_assign_sk_user_data(sock, NULL); -		call_rcu_sched(&psock->rcu, smap_destroy_psock); -	} -} - -static int smap_parse_func_strparser(struct strparser *strp, -				       struct sk_buff *skb) -{ -	struct smap_psock *psock; -	struct bpf_prog *prog; -	int rc; - -	rcu_read_lock(); -	psock = container_of(strp, struct smap_psock, strp); -	prog = READ_ONCE(psock->bpf_parse); - -	if (unlikely(!prog)) { -		rcu_read_unlock(); -		return skb->len; -	} - -	/* Attach socket for bpf program to use if needed we can do this -	 * because strparser clones the skb before handing it to a upper -	 * layer, meaning skb_orphan has been called. We NULL sk on the -	 * way out to ensure we don't trigger a BUG_ON in skb/sk operations -	 * later and because we are not charging the memory of this skb to -	 * any socket yet. 
-	 */ -	skb->sk = psock->sock; -	bpf_compute_data_end_sk_skb(skb); -	rc = (*prog->bpf_func)(skb, prog->insnsi); -	skb->sk = NULL; -	rcu_read_unlock(); -	return rc; -} - -static int smap_read_sock_done(struct strparser *strp, int err) -{ -	return err; -} - -static int smap_init_sock(struct smap_psock *psock, -			  struct sock *sk) -{ -	static const struct strp_callbacks cb = { -		.rcv_msg = smap_read_sock_strparser, -		.parse_msg = smap_parse_func_strparser, -		.read_sock_done = smap_read_sock_done, -	}; - -	return strp_init(&psock->strp, sk, &cb); -} - -static void smap_init_progs(struct smap_psock *psock, -			    struct bpf_prog *verdict, -			    struct bpf_prog *parse) -{ -	struct bpf_prog *orig_parse, *orig_verdict; - -	orig_parse = xchg(&psock->bpf_parse, parse); -	orig_verdict = xchg(&psock->bpf_verdict, verdict); - -	if (orig_verdict) -		bpf_prog_put(orig_verdict); -	if (orig_parse) -		bpf_prog_put(orig_parse); -} - -static void smap_start_sock(struct smap_psock *psock, struct sock *sk) -{ -	if (sk->sk_data_ready == smap_data_ready) -		return; -	psock->save_data_ready = sk->sk_data_ready; -	psock->save_write_space = sk->sk_write_space; -	sk->sk_data_ready = smap_data_ready; -	sk->sk_write_space = smap_write_space; -	psock->strp_enabled = true; -} - -static void sock_map_remove_complete(struct bpf_stab *stab) -{ -	bpf_map_area_free(stab->sock_map); -	kfree(stab); -} - -static void smap_gc_work(struct work_struct *w) -{ -	struct smap_psock_map_entry *e, *tmp; -	struct sk_msg_buff *md, *mtmp; -	struct smap_psock *psock; - -	psock = container_of(w, struct smap_psock, gc_work); - -	/* no callback lock needed because we already detached sockmap ops */ -	if (psock->strp_enabled) -		strp_done(&psock->strp); - -	cancel_work_sync(&psock->tx_work); -	__skb_queue_purge(&psock->rxqueue); - -	/* At this point all strparser and xmit work must be complete */ -	if (psock->bpf_parse) -		bpf_prog_put(psock->bpf_parse); -	if (psock->bpf_verdict) -		bpf_prog_put(psock->bpf_verdict); -	if (psock->bpf_tx_msg) -		bpf_prog_put(psock->bpf_tx_msg); - -	if (psock->cork) { -		free_start_sg(psock->sock, psock->cork, true); -		kfree(psock->cork); -	} - -	list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { -		list_del(&md->list); -		free_start_sg(psock->sock, md, true); -		kfree(md); -	} - -	list_for_each_entry_safe(e, tmp, &psock->maps, list) { -		list_del(&e->list); -		kfree(e); -	} - -	if (psock->sk_redir) -		sock_put(psock->sk_redir); - -	sock_put(psock->sock); -	kfree(psock); -} - -static struct smap_psock *smap_init_psock(struct sock *sock, int node) -{ -	struct smap_psock *psock; - -	psock = kzalloc_node(sizeof(struct smap_psock), -			     GFP_ATOMIC | __GFP_NOWARN, -			     node); -	if (!psock) -		return ERR_PTR(-ENOMEM); - -	psock->eval =  __SK_NONE; -	psock->sock = sock; -	skb_queue_head_init(&psock->rxqueue); -	INIT_WORK(&psock->tx_work, smap_tx_work); -	INIT_WORK(&psock->gc_work, smap_gc_work); -	INIT_LIST_HEAD(&psock->maps); -	INIT_LIST_HEAD(&psock->ingress); -	refcount_set(&psock->refcnt, 1); -	spin_lock_init(&psock->maps_lock); - -	rcu_assign_sk_user_data(sock, psock); -	sock_hold(sock); -	return psock; -} - -static struct bpf_map *sock_map_alloc(union bpf_attr *attr) -{ -	struct bpf_stab *stab; -	u64 cost; -	int err; - -	if (!capable(CAP_NET_ADMIN)) -		return ERR_PTR(-EPERM); - -	/* check sanity of attributes */ -	if (attr->max_entries == 0 || attr->key_size != 4 || -	    attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) -		return ERR_PTR(-EINVAL); - -	err = 
bpf_tcp_ulp_register(); -	if (err && err != -EEXIST) -		return ERR_PTR(err); - -	stab = kzalloc(sizeof(*stab), GFP_USER); -	if (!stab) -		return ERR_PTR(-ENOMEM); - -	bpf_map_init_from_attr(&stab->map, attr); -	raw_spin_lock_init(&stab->lock); - -	/* make sure page count doesn't overflow */ -	cost = (u64) stab->map.max_entries * sizeof(struct sock *); -	err = -EINVAL; -	if (cost >= U32_MAX - PAGE_SIZE) -		goto free_stab; - -	stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - -	/* if map size is larger than memlock limit, reject it early */ -	err = bpf_map_precharge_memlock(stab->map.pages); -	if (err) -		goto free_stab; - -	err = -ENOMEM; -	stab->sock_map = bpf_map_area_alloc(stab->map.max_entries * -					    sizeof(struct sock *), -					    stab->map.numa_node); -	if (!stab->sock_map) -		goto free_stab; - -	return &stab->map; -free_stab: -	kfree(stab); -	return ERR_PTR(err); -} - -static void smap_list_map_remove(struct smap_psock *psock, -				 struct sock **entry) -{ -	struct smap_psock_map_entry *e, *tmp; - -	spin_lock_bh(&psock->maps_lock); -	list_for_each_entry_safe(e, tmp, &psock->maps, list) { -		if (e->entry == entry) { -			list_del(&e->list); -			kfree(e); -		} -	} -	spin_unlock_bh(&psock->maps_lock); -} - -static void smap_list_hash_remove(struct smap_psock *psock, -				  struct htab_elem *hash_link) -{ -	struct smap_psock_map_entry *e, *tmp; - -	spin_lock_bh(&psock->maps_lock); -	list_for_each_entry_safe(e, tmp, &psock->maps, list) { -		struct htab_elem *c = rcu_dereference(e->hash_link); - -		if (c == hash_link) { -			list_del(&e->list); -			kfree(e); -		} -	} -	spin_unlock_bh(&psock->maps_lock); -} - -static void sock_map_free(struct bpf_map *map) -{ -	struct bpf_stab *stab = container_of(map, struct bpf_stab, map); -	int i; - -	synchronize_rcu(); - -	/* At this point no update, lookup or delete operations can happen. -	 * However, be aware we can still get a socket state event updates, -	 * and data ready callabacks that reference the psock from sk_user_data -	 * Also psock worker threads are still in-flight. So smap_release_sock -	 * will only free the psock after cancel_sync on the worker threads -	 * and a grace period expire to ensure psock is really safe to remove. -	 */ -	rcu_read_lock(); -	raw_spin_lock_bh(&stab->lock); -	for (i = 0; i < stab->map.max_entries; i++) { -		struct smap_psock *psock; -		struct sock *sock; - -		sock = stab->sock_map[i]; -		if (!sock) -			continue; -		stab->sock_map[i] = NULL; -		psock = smap_psock_sk(sock); -		/* This check handles a racing sock event that can get the -		 * sk_callback_lock before this case but after xchg happens -		 * causing the refcnt to hit zero and sock user data (psock) -		 * to be null and queued for garbage collection. -		 */ -		if (likely(psock)) { -			smap_list_map_remove(psock, &stab->sock_map[i]); -			smap_release_sock(psock, sock); -		} -	} -	raw_spin_unlock_bh(&stab->lock); -	rcu_read_unlock(); - -	sock_map_remove_complete(stab); -} - -static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next_key) -{ -	struct bpf_stab *stab = container_of(map, struct bpf_stab, map); -	u32 i = key ? 
*(u32 *)key : U32_MAX; -	u32 *next = (u32 *)next_key; - -	if (i >= stab->map.max_entries) { -		*next = 0; -		return 0; -	} - -	if (i == stab->map.max_entries - 1) -		return -ENOENT; - -	*next = i + 1; -	return 0; -} - -struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key) -{ -	struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - -	if (key >= map->max_entries) -		return NULL; - -	return READ_ONCE(stab->sock_map[key]); -} - -static int sock_map_delete_elem(struct bpf_map *map, void *key) -{ -	struct bpf_stab *stab = container_of(map, struct bpf_stab, map); -	struct smap_psock *psock; -	int k = *(u32 *)key; -	struct sock *sock; - -	if (k >= map->max_entries) -		return -EINVAL; - -	raw_spin_lock_bh(&stab->lock); -	sock = stab->sock_map[k]; -	stab->sock_map[k] = NULL; -	raw_spin_unlock_bh(&stab->lock); -	if (!sock) -		return -EINVAL; - -	psock = smap_psock_sk(sock); -	if (!psock) -		return 0; -	if (psock->bpf_parse) { -		write_lock_bh(&sock->sk_callback_lock); -		smap_stop_sock(psock, sock); -		write_unlock_bh(&sock->sk_callback_lock); -	} -	smap_list_map_remove(psock, &stab->sock_map[k]); -	smap_release_sock(psock, sock); -	return 0; -} - -/* Locking notes: Concurrent updates, deletes, and lookups are allowed and are - * done inside rcu critical sections. This ensures on updates that the psock - * will not be released via smap_release_sock() until concurrent updates/deletes - * complete. All operations operate on sock_map using cmpxchg and xchg - * operations to ensure we do not get stale references. Any reads into the - * map must be done with READ_ONCE() because of this. - * - * A psock is destroyed via call_rcu and after any worker threads are cancelled - * and syncd so we are certain all references from the update/lookup/delete - * operations as well as references in the data path are no longer in use. - * - * Psocks may exist in multiple maps, but only a single set of parse/verdict - * programs may be inherited from the maps it belongs to. A reference count - * is kept with the total number of references to the psock from all maps. The - * psock will not be released until this reaches zero. The psock and sock - * user data data use the sk_callback_lock to protect critical data structures - * from concurrent access. This allows us to avoid two updates from modifying - * the user data in sock and the lock is required anyways for modifying - * callbacks, we simply increase its scope slightly. - * - * Rules to follow, - *  - psock must always be read inside RCU critical section - *  - sk_user_data must only be modified inside sk_callback_lock and read - *    inside RCU critical section. - *  - psock->maps list must only be read & modified inside sk_callback_lock - *  - sock_map must use READ_ONCE and (cmp)xchg operations - *  - BPF verdict/parse programs must use READ_ONCE and xchg operations - */ - -static int __sock_map_ctx_update_elem(struct bpf_map *map, -				      struct bpf_sock_progs *progs, -				      struct sock *sock, -				      void *key) -{ -	struct bpf_prog *verdict, *parse, *tx_msg; -	struct smap_psock *psock; -	bool new = false; -	int err = 0; - -	/* 1. If sock map has BPF programs those will be inherited by the -	 * sock being added. If the sock is already attached to BPF programs -	 * this results in an error. 
-	 */ -	verdict = READ_ONCE(progs->bpf_verdict); -	parse = READ_ONCE(progs->bpf_parse); -	tx_msg = READ_ONCE(progs->bpf_tx_msg); - -	if (parse && verdict) { -		/* bpf prog refcnt may be zero if a concurrent attach operation -		 * removes the program after the above READ_ONCE() but before -		 * we increment the refcnt. If this is the case abort with an -		 * error. -		 */ -		verdict = bpf_prog_inc_not_zero(verdict); -		if (IS_ERR(verdict)) -			return PTR_ERR(verdict); - -		parse = bpf_prog_inc_not_zero(parse); -		if (IS_ERR(parse)) { -			bpf_prog_put(verdict); -			return PTR_ERR(parse); -		} -	} - -	if (tx_msg) { -		tx_msg = bpf_prog_inc_not_zero(tx_msg); -		if (IS_ERR(tx_msg)) { -			if (parse && verdict) { -				bpf_prog_put(parse); -				bpf_prog_put(verdict); -			} -			return PTR_ERR(tx_msg); -		} -	} - -	psock = smap_psock_sk(sock); - -	/* 2. Do not allow inheriting programs if psock exists and has -	 * already inherited programs. This would create confusion on -	 * which parser/verdict program is running. If no psock exists -	 * create one. Inside sk_callback_lock to ensure concurrent create -	 * doesn't update user data. -	 */ -	if (psock) { -		if (!psock_is_smap_sk(sock)) { -			err = -EBUSY; -			goto out_progs; -		} -		if (READ_ONCE(psock->bpf_parse) && parse) { -			err = -EBUSY; -			goto out_progs; -		} -		if (READ_ONCE(psock->bpf_tx_msg) && tx_msg) { -			err = -EBUSY; -			goto out_progs; -		} -		if (!refcount_inc_not_zero(&psock->refcnt)) { -			err = -EAGAIN; -			goto out_progs; -		} -	} else { -		psock = smap_init_psock(sock, map->numa_node); -		if (IS_ERR(psock)) { -			err = PTR_ERR(psock); -			goto out_progs; -		} - -		set_bit(SMAP_TX_RUNNING, &psock->state); -		new = true; -	} - -	/* 3. At this point we have a reference to a valid psock that is -	 * running. Attach any BPF programs needed. -	 */ -	if (tx_msg) -		bpf_tcp_msg_add(psock, sock, tx_msg); -	if (new) { -		err = tcp_set_ulp_id(sock, TCP_ULP_BPF); -		if (err) -			goto out_free; -	} - -	if (parse && verdict && !psock->strp_enabled) { -		err = smap_init_sock(psock, sock); -		if (err) -			goto out_free; -		smap_init_progs(psock, verdict, parse); -		write_lock_bh(&sock->sk_callback_lock); -		smap_start_sock(psock, sock); -		write_unlock_bh(&sock->sk_callback_lock); -	} - -	return err; -out_free: -	smap_release_sock(psock, sock); -out_progs: -	if (parse && verdict) { -		bpf_prog_put(parse); -		bpf_prog_put(verdict); -	} -	if (tx_msg) -		bpf_prog_put(tx_msg); -	return err; -} - -static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, -				    struct bpf_map *map, -				    void *key, u64 flags) -{ -	struct bpf_stab *stab = container_of(map, struct bpf_stab, map); -	struct bpf_sock_progs *progs = &stab->progs; -	struct sock *osock, *sock = skops->sk; -	struct smap_psock_map_entry *e; -	struct smap_psock *psock; -	u32 i = *(u32 *)key; -	int err; - -	if (unlikely(flags > BPF_EXIST)) -		return -EINVAL; -	if (unlikely(i >= stab->map.max_entries)) -		return -E2BIG; - -	e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); -	if (!e) -		return -ENOMEM; - -	err = __sock_map_ctx_update_elem(map, progs, sock, key); -	if (err) -		goto out; - -	/* psock guaranteed to be present. 
*/ -	psock = smap_psock_sk(sock); -	raw_spin_lock_bh(&stab->lock); -	osock = stab->sock_map[i]; -	if (osock && flags == BPF_NOEXIST) { -		err = -EEXIST; -		goto out_unlock; -	} -	if (!osock && flags == BPF_EXIST) { -		err = -ENOENT; -		goto out_unlock; -	} - -	e->entry = &stab->sock_map[i]; -	e->map = map; -	spin_lock_bh(&psock->maps_lock); -	list_add_tail(&e->list, &psock->maps); -	spin_unlock_bh(&psock->maps_lock); - -	stab->sock_map[i] = sock; -	if (osock) { -		psock = smap_psock_sk(osock); -		smap_list_map_remove(psock, &stab->sock_map[i]); -		smap_release_sock(psock, osock); -	} -	raw_spin_unlock_bh(&stab->lock); -	return 0; -out_unlock: -	smap_release_sock(psock, sock); -	raw_spin_unlock_bh(&stab->lock); -out: -	kfree(e); -	return err; -} - -int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) -{ -	struct bpf_sock_progs *progs; -	struct bpf_prog *orig; - -	if (map->map_type == BPF_MAP_TYPE_SOCKMAP) { -		struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - -		progs = &stab->progs; -	} else if (map->map_type == BPF_MAP_TYPE_SOCKHASH) { -		struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - -		progs = &htab->progs; -	} else { -		return -EINVAL; -	} - -	switch (type) { -	case BPF_SK_MSG_VERDICT: -		orig = xchg(&progs->bpf_tx_msg, prog); -		break; -	case BPF_SK_SKB_STREAM_PARSER: -		orig = xchg(&progs->bpf_parse, prog); -		break; -	case BPF_SK_SKB_STREAM_VERDICT: -		orig = xchg(&progs->bpf_verdict, prog); -		break; -	default: -		return -EOPNOTSUPP; -	} - -	if (orig) -		bpf_prog_put(orig); - -	return 0; -} - -int sockmap_get_from_fd(const union bpf_attr *attr, int type, -			struct bpf_prog *prog) -{ -	int ufd = attr->target_fd; -	struct bpf_map *map; -	struct fd f; -	int err; - -	f = fdget(ufd); -	map = __bpf_map_get(f); -	if (IS_ERR(map)) -		return PTR_ERR(map); - -	err = sock_map_prog(map, prog, attr->attach_type); -	fdput(f); -	return err; -} - -static void *sock_map_lookup(struct bpf_map *map, void *key) -{ -	return NULL; -} - -static int sock_map_update_elem(struct bpf_map *map, -				void *key, void *value, u64 flags) -{ -	struct bpf_sock_ops_kern skops; -	u32 fd = *(u32 *)value; -	struct socket *socket; -	int err; - -	socket = sockfd_lookup(fd, &err); -	if (!socket) -		return err; - -	skops.sk = socket->sk; -	if (!skops.sk) { -		fput(socket->file); -		return -EINVAL; -	} - -	/* ULPs are currently supported only for TCP sockets in ESTABLISHED -	 * state. 
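sock_map_prog() above accepts the BPF_SK_MSG_VERDICT, BPF_SK_SKB_STREAM_PARSER and BPF_SK_SKB_STREAM_VERDICT attach types, and sock_map_update_elem() only admits established TCP sockets. A hedged user-space sketch of that flow, assuming libbpf's bpf_create_map(), bpf_prog_attach() and bpf_map_update_elem() wrappers are available (error handling trimmed):

#include <bpf/bpf.h>

/* prog_fd_parser/prog_fd_verdict: fds of already-loaded SK_SKB programs;
 * sock_fd: a TCP socket that has completed connect()/accept(), i.e. is
 * in TCP_ESTABLISHED as required by sock_map_update_elem().
 */
int add_sock_to_map(int prog_fd_parser, int prog_fd_verdict, int sock_fd)
{
	__u32 idx = 0, val = sock_fd;
	int map_fd, err;

	map_fd = bpf_create_map(BPF_MAP_TYPE_SOCKMAP, sizeof(__u32),
				sizeof(__u32), 1, 0);
	if (map_fd < 0)
		return map_fd;

	/* These land in sock_map_prog() via sockmap_get_from_fd(). */
	err = bpf_prog_attach(prog_fd_parser, map_fd,
			      BPF_SK_SKB_STREAM_PARSER, 0);
	if (!err)
		err = bpf_prog_attach(prog_fd_verdict, map_fd,
				      BPF_SK_SKB_STREAM_VERDICT, 0);
	if (err)
		return err;

	/* The value is the socket fd; the kernel resolves it and builds
	 * the psock/strparser state in __sock_map_ctx_update_elem().
	 */
	return bpf_map_update_elem(map_fd, &idx, &val, BPF_ANY);
}
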
-	 */ -	if (skops.sk->sk_type != SOCK_STREAM || -	    skops.sk->sk_protocol != IPPROTO_TCP || -	    skops.sk->sk_state != TCP_ESTABLISHED) { -		fput(socket->file); -		return -EOPNOTSUPP; -	} - -	lock_sock(skops.sk); -	preempt_disable(); -	rcu_read_lock(); -	err = sock_map_ctx_update_elem(&skops, map, key, flags); -	rcu_read_unlock(); -	preempt_enable(); -	release_sock(skops.sk); -	fput(socket->file); -	return err; -} - -static void sock_map_release(struct bpf_map *map) -{ -	struct bpf_sock_progs *progs; -	struct bpf_prog *orig; - -	if (map->map_type == BPF_MAP_TYPE_SOCKMAP) { -		struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - -		progs = &stab->progs; -	} else { -		struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - -		progs = &htab->progs; -	} - -	orig = xchg(&progs->bpf_parse, NULL); -	if (orig) -		bpf_prog_put(orig); -	orig = xchg(&progs->bpf_verdict, NULL); -	if (orig) -		bpf_prog_put(orig); - -	orig = xchg(&progs->bpf_tx_msg, NULL); -	if (orig) -		bpf_prog_put(orig); -} - -static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) -{ -	struct bpf_htab *htab; -	int i, err; -	u64 cost; - -	if (!capable(CAP_NET_ADMIN)) -		return ERR_PTR(-EPERM); - -	/* check sanity of attributes */ -	if (attr->max_entries == 0 || -	    attr->key_size == 0 || -	    attr->value_size != 4 || -	    attr->map_flags & ~SOCK_CREATE_FLAG_MASK) -		return ERR_PTR(-EINVAL); - -	if (attr->key_size > MAX_BPF_STACK) -		/* eBPF programs initialize keys on stack, so they cannot be -		 * larger than max stack size -		 */ -		return ERR_PTR(-E2BIG); - -	err = bpf_tcp_ulp_register(); -	if (err && err != -EEXIST) -		return ERR_PTR(err); - -	htab = kzalloc(sizeof(*htab), GFP_USER); -	if (!htab) -		return ERR_PTR(-ENOMEM); - -	bpf_map_init_from_attr(&htab->map, attr); - -	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); -	htab->elem_size = sizeof(struct htab_elem) + -			  round_up(htab->map.key_size, 8); -	err = -EINVAL; -	if (htab->n_buckets == 0 || -	    htab->n_buckets > U32_MAX / sizeof(struct bucket)) -		goto free_htab; - -	cost = (u64) htab->n_buckets * sizeof(struct bucket) + -	       (u64) htab->elem_size * htab->map.max_entries; - -	if (cost >= U32_MAX - PAGE_SIZE) -		goto free_htab; - -	htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; -	err = bpf_map_precharge_memlock(htab->map.pages); -	if (err) -		goto free_htab; - -	err = -ENOMEM; -	htab->buckets = bpf_map_area_alloc( -				htab->n_buckets * sizeof(struct bucket), -				htab->map.numa_node); -	if (!htab->buckets) -		goto free_htab; - -	for (i = 0; i < htab->n_buckets; i++) { -		INIT_HLIST_HEAD(&htab->buckets[i].head); -		raw_spin_lock_init(&htab->buckets[i].lock); -	} - -	return &htab->map; -free_htab: -	kfree(htab); -	return ERR_PTR(err); -} - -static void __bpf_htab_free(struct rcu_head *rcu) -{ -	struct bpf_htab *htab; - -	htab = container_of(rcu, struct bpf_htab, rcu); -	bpf_map_area_free(htab->buckets); -	kfree(htab); -} - -static void sock_hash_free(struct bpf_map *map) -{ -	struct bpf_htab *htab = container_of(map, struct bpf_htab, map); -	int i; - -	synchronize_rcu(); - -	/* At this point no update, lookup or delete operations can happen. -	 * However, be aware we can still get a socket state event updates, -	 * and data ready callabacks that reference the psock from sk_user_data -	 * Also psock worker threads are still in-flight. So smap_release_sock -	 * will only free the psock after cancel_sync on the worker threads -	 * and a grace period expire to ensure psock is really safe to remove. 
-	 */ -	rcu_read_lock(); -	for (i = 0; i < htab->n_buckets; i++) { -		struct bucket *b = __select_bucket(htab, i); -		struct hlist_head *head; -		struct hlist_node *n; -		struct htab_elem *l; - -		raw_spin_lock_bh(&b->lock); -		head = &b->head; -		hlist_for_each_entry_safe(l, n, head, hash_node) { -			struct sock *sock = l->sk; -			struct smap_psock *psock; - -			hlist_del_rcu(&l->hash_node); -			psock = smap_psock_sk(sock); -			/* This check handles a racing sock event that can get -			 * the sk_callback_lock before this case but after xchg -			 * causing the refcnt to hit zero and sock user data -			 * (psock) to be null and queued for garbage collection. -			 */ -			if (likely(psock)) { -				smap_list_hash_remove(psock, l); -				smap_release_sock(psock, sock); -			} -			free_htab_elem(htab, l); -		} -		raw_spin_unlock_bh(&b->lock); -	} -	rcu_read_unlock(); -	call_rcu(&htab->rcu, __bpf_htab_free); -} - -static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab, -					      void *key, u32 key_size, u32 hash, -					      struct sock *sk, -					      struct htab_elem *old_elem) -{ -	struct htab_elem *l_new; - -	if (atomic_inc_return(&htab->count) > htab->map.max_entries) { -		if (!old_elem) { -			atomic_dec(&htab->count); -			return ERR_PTR(-E2BIG); -		} -	} -	l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, -			     htab->map.numa_node); -	if (!l_new) { -		atomic_dec(&htab->count); -		return ERR_PTR(-ENOMEM); -	} - -	memcpy(l_new->key, key, key_size); -	l_new->sk = sk; -	l_new->hash = hash; -	return l_new; -} - -static inline u32 htab_map_hash(const void *key, u32 key_len) -{ -	return jhash(key, key_len, 0); -} - -static int sock_hash_get_next_key(struct bpf_map *map, -				  void *key, void *next_key) -{ -	struct bpf_htab *htab = container_of(map, struct bpf_htab, map); -	struct htab_elem *l, *next_l; -	struct hlist_head *h; -	u32 hash, key_size; -	int i = 0; - -	WARN_ON_ONCE(!rcu_read_lock_held()); - -	key_size = map->key_size; -	if (!key) -		goto find_first_elem; -	hash = htab_map_hash(key, key_size); -	h = select_bucket(htab, hash); - -	l = lookup_elem_raw(h, hash, key, key_size); -	if (!l) -		goto find_first_elem; -	next_l = hlist_entry_safe( -		     rcu_dereference_raw(hlist_next_rcu(&l->hash_node)), -		     struct htab_elem, hash_node); -	if (next_l) { -		memcpy(next_key, next_l->key, key_size); -		return 0; -	} - -	/* no more elements in this hash list, go to the next bucket */ -	i = hash & (htab->n_buckets - 1); -	i++; - -find_first_elem: -	/* iterate over buckets */ -	for (; i < htab->n_buckets; i++) { -		h = select_bucket(htab, i); - -		/* pick first element in the bucket */ -		next_l = hlist_entry_safe( -				rcu_dereference_raw(hlist_first_rcu(h)), -				struct htab_elem, hash_node); -		if (next_l) { -			/* if it's not empty, just return it */ -			memcpy(next_key, next_l->key, key_size); -			return 0; -		} -	} - -	/* iterated over all buckets and all elements */ -	return -ENOENT; -} - -static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, -				     struct bpf_map *map, -				     void *key, u64 map_flags) -{ -	struct bpf_htab *htab = container_of(map, struct bpf_htab, map); -	struct bpf_sock_progs *progs = &htab->progs; -	struct htab_elem *l_new = NULL, *l_old; -	struct smap_psock_map_entry *e = NULL; -	struct hlist_head *head; -	struct smap_psock *psock; -	u32 key_size, hash; -	struct sock *sock; -	struct bucket *b; -	int err; - -	sock = skops->sk; - -	if (sock->sk_type != SOCK_STREAM || -	    sock->sk_protocol != IPPROTO_TCP) -		return 
-EOPNOTSUPP; - -	if (unlikely(map_flags > BPF_EXIST)) -		return -EINVAL; - -	e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); -	if (!e) -		return -ENOMEM; - -	WARN_ON_ONCE(!rcu_read_lock_held()); -	key_size = map->key_size; -	hash = htab_map_hash(key, key_size); -	b = __select_bucket(htab, hash); -	head = &b->head; - -	err = __sock_map_ctx_update_elem(map, progs, sock, key); -	if (err) -		goto err; - -	/* psock is valid here because otherwise above *ctx_update_elem would -	 * have thrown an error. It is safe to skip error check. -	 */ -	psock = smap_psock_sk(sock); -	raw_spin_lock_bh(&b->lock); -	l_old = lookup_elem_raw(head, hash, key, key_size); -	if (l_old && map_flags == BPF_NOEXIST) { -		err = -EEXIST; -		goto bucket_err; -	} -	if (!l_old && map_flags == BPF_EXIST) { -		err = -ENOENT; -		goto bucket_err; -	} - -	l_new = alloc_sock_hash_elem(htab, key, key_size, hash, sock, l_old); -	if (IS_ERR(l_new)) { -		err = PTR_ERR(l_new); -		goto bucket_err; -	} - -	rcu_assign_pointer(e->hash_link, l_new); -	e->map = map; -	spin_lock_bh(&psock->maps_lock); -	list_add_tail(&e->list, &psock->maps); -	spin_unlock_bh(&psock->maps_lock); - -	/* add new element to the head of the list, so that -	 * concurrent search will find it before old elem -	 */ -	hlist_add_head_rcu(&l_new->hash_node, head); -	if (l_old) { -		psock = smap_psock_sk(l_old->sk); - -		hlist_del_rcu(&l_old->hash_node); -		smap_list_hash_remove(psock, l_old); -		smap_release_sock(psock, l_old->sk); -		free_htab_elem(htab, l_old); -	} -	raw_spin_unlock_bh(&b->lock); -	return 0; -bucket_err: -	smap_release_sock(psock, sock); -	raw_spin_unlock_bh(&b->lock); -err: -	kfree(e); -	return err; -} - -static int sock_hash_update_elem(struct bpf_map *map, -				void *key, void *value, u64 flags) -{ -	struct bpf_sock_ops_kern skops; -	u32 fd = *(u32 *)value; -	struct socket *socket; -	int err; - -	socket = sockfd_lookup(fd, &err); -	if (!socket) -		return err; - -	skops.sk = socket->sk; -	if (!skops.sk) { -		fput(socket->file); -		return -EINVAL; -	} - -	/* ULPs are currently supported only for TCP sockets in ESTABLISHED -	 * state. -	 */ -	if (skops.sk->sk_type != SOCK_STREAM || -	    skops.sk->sk_protocol != IPPROTO_TCP || -	    skops.sk->sk_state != TCP_ESTABLISHED) { -		fput(socket->file); -		return -EOPNOTSUPP; -	} - -	lock_sock(skops.sk); -	preempt_disable(); -	rcu_read_lock(); -	err = sock_hash_ctx_update_elem(&skops, map, key, flags); -	rcu_read_unlock(); -	preempt_enable(); -	release_sock(skops.sk); -	fput(socket->file); -	return err; -} - -static int sock_hash_delete_elem(struct bpf_map *map, void *key) -{ -	struct bpf_htab *htab = container_of(map, struct bpf_htab, map); -	struct hlist_head *head; -	struct bucket *b; -	struct htab_elem *l; -	u32 hash, key_size; -	int ret = -ENOENT; - -	key_size = map->key_size; -	hash = htab_map_hash(key, key_size); -	b = __select_bucket(htab, hash); -	head = &b->head; - -	raw_spin_lock_bh(&b->lock); -	l = lookup_elem_raw(head, hash, key, key_size); -	if (l) { -		struct sock *sock = l->sk; -		struct smap_psock *psock; - -		hlist_del_rcu(&l->hash_node); -		psock = smap_psock_sk(sock); -		/* This check handles a racing sock event that can get the -		 * sk_callback_lock before this case but after xchg happens -		 * causing the refcnt to hit zero and sock user data (psock) -		 * to be null and queued for garbage collection. 
-		 */ -		if (likely(psock)) { -			smap_list_hash_remove(psock, l); -			smap_release_sock(psock, sock); -		} -		free_htab_elem(htab, l); -		ret = 0; -	} -	raw_spin_unlock_bh(&b->lock); -	return ret; -} - -struct sock  *__sock_hash_lookup_elem(struct bpf_map *map, void *key) -{ -	struct bpf_htab *htab = container_of(map, struct bpf_htab, map); -	struct hlist_head *head; -	struct htab_elem *l; -	u32 key_size, hash; -	struct bucket *b; -	struct sock *sk; - -	key_size = map->key_size; -	hash = htab_map_hash(key, key_size); -	b = __select_bucket(htab, hash); -	head = &b->head; - -	l = lookup_elem_raw(head, hash, key, key_size); -	sk = l ? l->sk : NULL; -	return sk; -} - -const struct bpf_map_ops sock_map_ops = { -	.map_alloc = sock_map_alloc, -	.map_free = sock_map_free, -	.map_lookup_elem = sock_map_lookup, -	.map_get_next_key = sock_map_get_next_key, -	.map_update_elem = sock_map_update_elem, -	.map_delete_elem = sock_map_delete_elem, -	.map_release_uref = sock_map_release, -	.map_check_btf = map_check_no_btf, -}; - -const struct bpf_map_ops sock_hash_ops = { -	.map_alloc = sock_hash_alloc, -	.map_free = sock_hash_free, -	.map_lookup_elem = sock_map_lookup, -	.map_get_next_key = sock_hash_get_next_key, -	.map_update_elem = sock_hash_update_elem, -	.map_delete_elem = sock_hash_delete_elem, -	.map_release_uref = sock_map_release, -	.map_check_btf = map_check_no_btf, -}; - -static bool bpf_is_valid_sock_op(struct bpf_sock_ops_kern *ops) -{ -	return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB || -	       ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB; -} -BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, -	   struct bpf_map *, map, void *, key, u64, flags) -{ -	WARN_ON_ONCE(!rcu_read_lock_held()); - -	/* ULPs are currently supported only for TCP sockets in ESTABLISHED -	 * state. This checks that the sock ops triggering the update is -	 * one indicating we are (or will be soon) in an ESTABLISHED state. 
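From the BPF side, bpf_sock_map_update() (and the bpf_sock_hash_update() variant below) is typically called from a sockops program on exactly the two callbacks that bpf_is_valid_sock_op() accepts. A rough sketch, assuming the classic struct bpf_map_def SEC("maps") convention and libbpf-style headers:

#include <linux/bpf.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") sock_map = {
	.type		= BPF_MAP_TYPE_SOCKMAP,
	.key_size	= sizeof(__u32),
	.value_size	= sizeof(__u32),
	.max_entries	= 1,
};

SEC("sockops")
int sockmap_sockops(struct bpf_sock_ops *skops)
{
	__u32 key = 0;

	switch (skops->op) {
	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
		/* Only these two ops pass bpf_is_valid_sock_op(), so the
		 * socket is (or is about to be) ESTABLISHED when it is
		 * inserted into the map.
		 */
		bpf_sock_map_update(skops, &sock_map, &key, BPF_ANY);
		break;
	default:
		break;
	}
	return 0;
}

char _license[] SEC("license") = "GPL";
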
-	 */ -	if (!bpf_is_valid_sock_op(bpf_sock)) -		return -EOPNOTSUPP; -	return sock_map_ctx_update_elem(bpf_sock, map, key, flags); -} - -const struct bpf_func_proto bpf_sock_map_update_proto = { -	.func		= bpf_sock_map_update, -	.gpl_only	= false, -	.pkt_access	= true, -	.ret_type	= RET_INTEGER, -	.arg1_type	= ARG_PTR_TO_CTX, -	.arg2_type	= ARG_CONST_MAP_PTR, -	.arg3_type	= ARG_PTR_TO_MAP_KEY, -	.arg4_type	= ARG_ANYTHING, -}; - -BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, bpf_sock, -	   struct bpf_map *, map, void *, key, u64, flags) -{ -	WARN_ON_ONCE(!rcu_read_lock_held()); - -	if (!bpf_is_valid_sock_op(bpf_sock)) -		return -EOPNOTSUPP; -	return sock_hash_ctx_update_elem(bpf_sock, map, key, flags); -} - -const struct bpf_func_proto bpf_sock_hash_update_proto = { -	.func		= bpf_sock_hash_update, -	.gpl_only	= false, -	.pkt_access	= true, -	.ret_type	= RET_INTEGER, -	.arg1_type	= ARG_PTR_TO_CTX, -	.arg2_type	= ARG_CONST_MAP_PTR, -	.arg3_type	= ARG_PTR_TO_MAP_KEY, -	.arg4_type	= ARG_ANYTHING, -}; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 8061a439ef18..90daf285de03 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -505,7 +505,7 @@ const struct bpf_func_proto bpf_get_stack_proto = {  /* Called from eBPF program */  static void *stack_map_lookup_elem(struct bpf_map *map, void *key)  { -	return NULL; +	return ERR_PTR(-EOPNOTSUPP);  }  /* Called from syscall */ @@ -600,7 +600,7 @@ static void stack_map_free(struct bpf_map *map)  	put_callchain_buffers();  } -const struct bpf_map_ops stack_map_ops = { +const struct bpf_map_ops stack_trace_map_ops = {  	.map_alloc = stack_map_alloc,  	.map_free = stack_map_free,  	.map_get_next_key = stack_map_get_next_key, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8339d81cba1d..ccb93277aae2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -30,7 +30,6 @@  #include <linux/cred.h>  #include <linux/timekeeping.h>  #include <linux/ctype.h> -#include <linux/btf.h>  #include <linux/nospec.h>  #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ @@ -652,6 +651,17 @@ int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)  	return -ENOTSUPP;  } +static void *__bpf_copy_key(void __user *ukey, u64 key_size) +{ +	if (key_size) +		return memdup_user(ukey, key_size); + +	if (ukey) +		return ERR_PTR(-EINVAL); + +	return NULL; +} +  /* last field in 'union bpf_attr' used by this command */  #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value @@ -679,7 +689,7 @@ static int map_lookup_elem(union bpf_attr *attr)  		goto err_put;  	} -	key = memdup_user(ukey, map->key_size); +	key = __bpf_copy_key(ukey, map->key_size);  	if (IS_ERR(key)) {  		err = PTR_ERR(key);  		goto err_put; @@ -687,7 +697,8 @@ static int map_lookup_elem(union bpf_attr *attr)  	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||  	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || -	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) +	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || +	    map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)  		value_size = round_up(map->value_size, 8) * num_possible_cpus();  	else if (IS_FD_MAP(map))  		value_size = sizeof(u32); @@ -706,6 +717,8 @@ static int map_lookup_elem(union bpf_attr *attr)  		err = bpf_percpu_hash_copy(map, key, value);  	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {  		err = bpf_percpu_array_copy(map, key, value); +	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { +		err = bpf_percpu_cgroup_storage_copy(map, 
key, value);  	} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {  		err = bpf_stackmap_copy(map, key, value);  	} else if (IS_FD_ARRAY(map)) { @@ -714,13 +727,21 @@ static int map_lookup_elem(union bpf_attr *attr)  		err = bpf_fd_htab_map_lookup_elem(map, key, value);  	} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {  		err = bpf_fd_reuseport_array_lookup_elem(map, key, value); +	} else if (map->map_type == BPF_MAP_TYPE_QUEUE || +		   map->map_type == BPF_MAP_TYPE_STACK) { +		err = map->ops->map_peek_elem(map, value);  	} else {  		rcu_read_lock();  		ptr = map->ops->map_lookup_elem(map, key); -		if (ptr) +		if (IS_ERR(ptr)) { +			err = PTR_ERR(ptr); +		} else if (!ptr) { +			err = -ENOENT; +		} else { +			err = 0;  			memcpy(value, ptr, value_size); +		}  		rcu_read_unlock(); -		err = ptr ? 0 : -ENOENT;  	}  	if (err) @@ -741,6 +762,17 @@ err_put:  	return err;  } +static void maybe_wait_bpf_programs(struct bpf_map *map) +{ +	/* Wait for any running BPF programs to complete so that +	 * userspace, when we return to it, knows that all programs +	 * that could be running use the new map value. +	 */ +	if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS || +	    map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) +		synchronize_rcu(); +} +  #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags  static int map_update_elem(union bpf_attr *attr) @@ -767,7 +799,7 @@ static int map_update_elem(union bpf_attr *attr)  		goto err_put;  	} -	key = memdup_user(ukey, map->key_size); +	key = __bpf_copy_key(ukey, map->key_size);  	if (IS_ERR(key)) {  		err = PTR_ERR(key);  		goto err_put; @@ -775,7 +807,8 @@ static int map_update_elem(union bpf_attr *attr)  	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||  	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || -	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) +	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || +	    map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)  		value_size = round_up(map->value_size, 8) * num_possible_cpus();  	else  		value_size = map->value_size; @@ -810,6 +843,9 @@ static int map_update_elem(union bpf_attr *attr)  		err = bpf_percpu_hash_update(map, key, value, attr->flags);  	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {  		err = bpf_percpu_array_update(map, key, value, attr->flags); +	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { +		err = bpf_percpu_cgroup_storage_update(map, key, value, +						       attr->flags);  	} else if (IS_FD_ARRAY(map)) {  		rcu_read_lock();  		err = bpf_fd_array_map_update_elem(map, f.file, key, value, @@ -824,6 +860,9 @@ static int map_update_elem(union bpf_attr *attr)  		/* rcu_read_lock() is not needed */  		err = bpf_fd_reuseport_array_update_elem(map, key, value,  							 attr->flags); +	} else if (map->map_type == BPF_MAP_TYPE_QUEUE || +		   map->map_type == BPF_MAP_TYPE_STACK) { +		err = map->ops->map_push_elem(map, value, attr->flags);  	} else {  		rcu_read_lock();  		err = map->ops->map_update_elem(map, key, value, attr->flags); @@ -831,6 +870,7 @@ static int map_update_elem(union bpf_attr *attr)  	}  	__this_cpu_dec(bpf_prog_active);  	preempt_enable(); +	maybe_wait_bpf_programs(map);  out:  free_value:  	kfree(value); @@ -865,7 +905,7 @@ static int map_delete_elem(union bpf_attr *attr)  		goto err_put;  	} -	key = memdup_user(ukey, map->key_size); +	key = __bpf_copy_key(ukey, map->key_size);  	if (IS_ERR(key)) {  		err = PTR_ERR(key);  		goto err_put; @@ -883,6 +923,7 @@ static int map_delete_elem(union bpf_attr *attr)  	rcu_read_unlock();  	
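The new BPF_MAP_TYPE_QUEUE/BPF_MAP_TYPE_STACK handling in these hunks maps the ordinary map commands onto push/peek/pop: BPF_MAP_UPDATE_ELEM pushes, BPF_MAP_LOOKUP_ELEM peeks, and the BPF_MAP_LOOKUP_AND_DELETE_ELEM command added further below pops. A user-space sketch, assuming libbpf's thin syscall wrappers, including a bpf_map_lookup_and_delete_elem() wrapper added alongside this kernel change:

#include <bpf/bpf.h>

int queue_roundtrip(void)
{
	__u32 val = 42, out = 0;
	int fd, err;

	/* Queue/stack maps have no key, so key_size must be 0. */
	fd = bpf_create_map(BPF_MAP_TYPE_QUEUE, 0, sizeof(__u32), 8, 0);
	if (fd < 0)
		return fd;

	/* NULL key: __bpf_copy_key() accepts it only when key_size is 0. */
	err = bpf_map_update_elem(fd, NULL, &val, BPF_ANY);	/* push */
	if (err)
		return err;

	err = bpf_map_lookup_elem(fd, NULL, &out);		/* peek */
	if (err)
		return err;

	/* Pop: routed to map->ops->map_pop_elem() by the new
	 * map_lookup_and_delete_elem() handler.
	 */
	return bpf_map_lookup_and_delete_elem(fd, NULL, &out);
}
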
__this_cpu_dec(bpf_prog_active);  	preempt_enable(); +	maybe_wait_bpf_programs(map);  out:  	kfree(key);  err_put: @@ -917,7 +958,7 @@ static int map_get_next_key(union bpf_attr *attr)  	}  	if (ukey) { -		key = memdup_user(ukey, map->key_size); +		key = __bpf_copy_key(ukey, map->key_size);  		if (IS_ERR(key)) {  			err = PTR_ERR(key);  			goto err_put; @@ -958,6 +999,69 @@ err_put:  	return err;  } +#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value + +static int map_lookup_and_delete_elem(union bpf_attr *attr) +{ +	void __user *ukey = u64_to_user_ptr(attr->key); +	void __user *uvalue = u64_to_user_ptr(attr->value); +	int ufd = attr->map_fd; +	struct bpf_map *map; +	void *key, *value; +	u32 value_size; +	struct fd f; +	int err; + +	if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM)) +		return -EINVAL; + +	f = fdget(ufd); +	map = __bpf_map_get(f); +	if (IS_ERR(map)) +		return PTR_ERR(map); + +	if (!(f.file->f_mode & FMODE_CAN_WRITE)) { +		err = -EPERM; +		goto err_put; +	} + +	key = __bpf_copy_key(ukey, map->key_size); +	if (IS_ERR(key)) { +		err = PTR_ERR(key); +		goto err_put; +	} + +	value_size = map->value_size; + +	err = -ENOMEM; +	value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); +	if (!value) +		goto free_key; + +	if (map->map_type == BPF_MAP_TYPE_QUEUE || +	    map->map_type == BPF_MAP_TYPE_STACK) { +		err = map->ops->map_pop_elem(map, value); +	} else { +		err = -ENOTSUPP; +	} + +	if (err) +		goto free_value; + +	if (copy_to_user(uvalue, value, value_size) != 0) +		goto free_value; + +	err = 0; + +free_value: +	kfree(value); +free_key: +	kfree(key); +err_put: +	fdput(f); +	return err; +} +  static const struct bpf_prog_ops * const bpf_prog_types[] = {  #define BPF_PROG_TYPE(_id, _name) \  	[_id] = & _name ## _prog_ops, @@ -989,10 +1093,15 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)  /* drop refcnt on maps used by eBPF program and free auxilary data */  static void free_used_maps(struct bpf_prog_aux *aux)  { +	enum bpf_cgroup_storage_type stype;  	int i; -	if (aux->cgroup_storage) -		bpf_cgroup_storage_release(aux->prog, aux->cgroup_storage); +	for_each_cgroup_storage_type(stype) { +		if (!aux->cgroup_storage[stype]) +			continue; +		bpf_cgroup_storage_release(aux->prog, +					   aux->cgroup_storage[stype]); +	}  	for (i = 0; i < aux->used_map_cnt; i++)  		bpf_map_put(aux->used_maps[i]); @@ -1616,6 +1725,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)  	case BPF_LIRC_MODE2:  		ptype = BPF_PROG_TYPE_LIRC_MODE2;  		break; +	case BPF_FLOW_DISSECTOR: +		ptype = BPF_PROG_TYPE_FLOW_DISSECTOR; +		break;  	default:  		return -EINVAL;  	} @@ -1632,11 +1744,14 @@ static int bpf_prog_attach(const union bpf_attr *attr)  	switch (ptype) {  	case BPF_PROG_TYPE_SK_SKB:  	case BPF_PROG_TYPE_SK_MSG: -		ret = sockmap_get_from_fd(attr, ptype, prog); +		ret = sock_map_get_from_fd(attr, prog);  		break;  	case BPF_PROG_TYPE_LIRC_MODE2:  		ret = lirc_prog_attach(attr, prog);  		break; +	case BPF_PROG_TYPE_FLOW_DISSECTOR: +		ret = skb_flow_dissector_bpf_prog_attach(attr, prog); +		break;  	default:  		ret = cgroup_bpf_prog_attach(attr, ptype, prog);  	} @@ -1683,12 +1798,14 @@ static int bpf_prog_detach(const union bpf_attr *attr)  		ptype = BPF_PROG_TYPE_CGROUP_DEVICE;  		break;  	case BPF_SK_MSG_VERDICT: -		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, NULL); +		return sock_map_get_from_fd(attr, NULL);  	case BPF_SK_SKB_STREAM_PARSER:  	case BPF_SK_SKB_STREAM_VERDICT: -		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL); +		return 
sock_map_get_from_fd(attr, NULL);  	case BPF_LIRC_MODE2:  		return lirc_prog_detach(attr); +	case BPF_FLOW_DISSECTOR: +		return skb_flow_dissector_bpf_prog_detach(attr);  	default:  		return -EINVAL;  	} @@ -2418,6 +2535,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz  	case BPF_TASK_FD_QUERY:  		err = bpf_task_fd_query(&attr, uattr);  		break; +	case BPF_MAP_LOOKUP_AND_DELETE_ELEM: +		err = map_lookup_and_delete_elem(&attr); +		break;  	default:  		err = -EINVAL;  		break; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 465952a8e465..98fa0be35370 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1,5 +1,6 @@  /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com   * Copyright (c) 2016 Facebook + * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io   *   * This program is free software; you can redistribute it and/or   * modify it under the terms of version 2 of the GNU General Public @@ -80,8 +81,8 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {   * (like pointer plus pointer becomes SCALAR_VALUE type)   *   * When verifier sees load or store instructions the type of base register - * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, PTR_TO_STACK. These are three pointer - * types recognized by check_mem_access() function. + * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, PTR_TO_STACK, PTR_TO_SOCKET. These are + * four pointer types recognized by check_mem_access() function.   *   * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value'   * and the range of [ptr, ptr + map's value_size) is accessible. @@ -140,6 +141,24 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {   *   * After the call R0 is set to return type of the function and registers R1-R5   * are set to NOT_INIT to indicate that they are no longer readable. + * + * The following reference types represent a potential reference to a kernel + * resource which, after first being allocated, must be checked and freed by + * the BPF program: + * - PTR_TO_SOCKET_OR_NULL, PTR_TO_SOCKET + * + * When the verifier sees a helper call return a reference type, it allocates a + * pointer id for the reference and stores it in the current function state. + * Similar to the way that PTR_TO_MAP_VALUE_OR_NULL is converted into + * PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL becomes PTR_TO_SOCKET when the type + * passes through a NULL-check conditional. For the branch wherein the state is + * changed to CONST_IMM, the verifier releases the reference. + * + * For each helper function that allocates a reference, such as + * bpf_sk_lookup_tcp(), there is a corresponding release function, such as + * bpf_sk_release(). When a reference type passes into the release function, + * the verifier also releases the reference. If any unchecked or unreleased + * reference remains at the end of the program, the verifier rejects it.   
*/  /* verifier_state + insn_idx are pushed to stack when branch is encountered */ @@ -189,6 +208,7 @@ struct bpf_call_arg_meta {  	int access_size;  	s64 msize_smax_value;  	u64 msize_umax_value; +	int ptr_id;  };  static DEFINE_MUTEX(bpf_verifier_lock); @@ -249,6 +269,46 @@ static bool type_is_pkt_pointer(enum bpf_reg_type type)  	       type == PTR_TO_PACKET_META;  } +static bool reg_type_may_be_null(enum bpf_reg_type type) +{ +	return type == PTR_TO_MAP_VALUE_OR_NULL || +	       type == PTR_TO_SOCKET_OR_NULL; +} + +static bool type_is_refcounted(enum bpf_reg_type type) +{ +	return type == PTR_TO_SOCKET; +} + +static bool type_is_refcounted_or_null(enum bpf_reg_type type) +{ +	return type == PTR_TO_SOCKET || type == PTR_TO_SOCKET_OR_NULL; +} + +static bool reg_is_refcounted(const struct bpf_reg_state *reg) +{ +	return type_is_refcounted(reg->type); +} + +static bool reg_is_refcounted_or_null(const struct bpf_reg_state *reg) +{ +	return type_is_refcounted_or_null(reg->type); +} + +static bool arg_type_is_refcounted(enum bpf_arg_type type) +{ +	return type == ARG_PTR_TO_SOCKET; +} + +/* Determine whether the function releases some resources allocated by another + * function call. The first reference type argument will be assumed to be + * released by release_reference(). + */ +static bool is_release_function(enum bpf_func_id func_id) +{ +	return func_id == BPF_FUNC_sk_release; +} +  /* string representation of 'enum bpf_reg_type' */  static const char * const reg_type_str[] = {  	[NOT_INIT]		= "?", @@ -261,6 +321,16 @@ static const char * const reg_type_str[] = {  	[PTR_TO_PACKET]		= "pkt",  	[PTR_TO_PACKET_META]	= "pkt_meta",  	[PTR_TO_PACKET_END]	= "pkt_end", +	[PTR_TO_FLOW_KEYS]	= "flow_keys", +	[PTR_TO_SOCKET]		= "sock", +	[PTR_TO_SOCKET_OR_NULL] = "sock_or_null", +}; + +static char slot_type_char[] = { +	[STACK_INVALID]	= '?', +	[STACK_SPILL]	= 'r', +	[STACK_MISC]	= 'm', +	[STACK_ZERO]	= '0',  };  static void print_liveness(struct bpf_verifier_env *env, @@ -349,72 +419,179 @@ static void print_verifier_state(struct bpf_verifier_env *env,  		}  	}  	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { -		if (state->stack[i].slot_type[0] == STACK_SPILL) { -			verbose(env, " fp%d", -				(-i - 1) * BPF_REG_SIZE); -			print_liveness(env, state->stack[i].spilled_ptr.live); +		char types_buf[BPF_REG_SIZE + 1]; +		bool valid = false; +		int j; + +		for (j = 0; j < BPF_REG_SIZE; j++) { +			if (state->stack[i].slot_type[j] != STACK_INVALID) +				valid = true; +			types_buf[j] = slot_type_char[ +					state->stack[i].slot_type[j]]; +		} +		types_buf[BPF_REG_SIZE] = 0; +		if (!valid) +			continue; +		verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); +		print_liveness(env, state->stack[i].spilled_ptr.live); +		if (state->stack[i].slot_type[0] == STACK_SPILL)  			verbose(env, "=%s",  				reg_type_str[state->stack[i].spilled_ptr.type]); -		} -		if (state->stack[i].slot_type[0] == STACK_ZERO) -			verbose(env, " fp%d=0", (-i - 1) * BPF_REG_SIZE); +		else +			verbose(env, "=%s", types_buf); +	} +	if (state->acquired_refs && state->refs[0].id) { +		verbose(env, " refs=%d", state->refs[0].id); +		for (i = 1; i < state->acquired_refs; i++) +			if (state->refs[i].id) +				verbose(env, ",%d", state->refs[i].id);  	}  	verbose(env, "\n");  } -static int copy_stack_state(struct bpf_func_state *dst, -			    const struct bpf_func_state *src) -{ -	if (!src->stack) -		return 0; -	if (WARN_ON_ONCE(dst->allocated_stack < src->allocated_stack)) { -		/* internal bug, make state invalid to reject the program */ 
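The reference-tracking scheme documented above, implemented by acquire_reference_state() and __release_reference_state() further below, is what lets the verifier accept programs along the lines of the following sketch and reject ones that leak the socket reference. The sketch assumes the tc classifier program type plus the bpf_sk_lookup_tcp()/bpf_sk_release() helpers introduced together with this verifier support, and libbpf-style headers:

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include "bpf_helpers.h"

SEC("classifier")
int sk_ref_example(struct __sk_buff *skb)
{
	struct bpf_sock_tuple tuple = {};
	struct bpf_sock *sk;

	/* Returns PTR_TO_SOCKET_OR_NULL; the verifier acquires a
	 * reference id for the returned value at this call site.
	 */
	sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4), 0, 0);
	if (sk)
		/* After the NULL check the register is PTR_TO_SOCKET;
		 * any path that reaches program exit without calling
		 * bpf_sk_release() leaves an unreleased reference and
		 * the verifier rejects the program.
		 */
		bpf_sk_release(sk);

	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";
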
-		memset(dst, 0, sizeof(*dst)); -		return -EFAULT; -	} -	memcpy(dst->stack, src->stack, -	       sizeof(*src->stack) * (src->allocated_stack / BPF_REG_SIZE)); -	return 0; -} +#define COPY_STATE_FN(NAME, COUNT, FIELD, SIZE)				\ +static int copy_##NAME##_state(struct bpf_func_state *dst,		\ +			       const struct bpf_func_state *src)	\ +{									\ +	if (!src->FIELD)						\ +		return 0;						\ +	if (WARN_ON_ONCE(dst->COUNT < src->COUNT)) {			\ +		/* internal bug, make state invalid to reject the program */ \ +		memset(dst, 0, sizeof(*dst));				\ +		return -EFAULT;						\ +	}								\ +	memcpy(dst->FIELD, src->FIELD,					\ +	       sizeof(*src->FIELD) * (src->COUNT / SIZE));		\ +	return 0;							\ +} +/* copy_reference_state() */ +COPY_STATE_FN(reference, acquired_refs, refs, 1) +/* copy_stack_state() */ +COPY_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE) +#undef COPY_STATE_FN + +#define REALLOC_STATE_FN(NAME, COUNT, FIELD, SIZE)			\ +static int realloc_##NAME##_state(struct bpf_func_state *state, int size, \ +				  bool copy_old)			\ +{									\ +	u32 old_size = state->COUNT;					\ +	struct bpf_##NAME##_state *new_##FIELD;				\ +	int slot = size / SIZE;						\ +									\ +	if (size <= old_size || !size) {				\ +		if (copy_old)						\ +			return 0;					\ +		state->COUNT = slot * SIZE;				\ +		if (!size && old_size) {				\ +			kfree(state->FIELD);				\ +			state->FIELD = NULL;				\ +		}							\ +		return 0;						\ +	}								\ +	new_##FIELD = kmalloc_array(slot, sizeof(struct bpf_##NAME##_state), \ +				    GFP_KERNEL);			\ +	if (!new_##FIELD)						\ +		return -ENOMEM;						\ +	if (copy_old) {							\ +		if (state->FIELD)					\ +			memcpy(new_##FIELD, state->FIELD,		\ +			       sizeof(*new_##FIELD) * (old_size / SIZE)); \ +		memset(new_##FIELD + old_size / SIZE, 0,		\ +		       sizeof(*new_##FIELD) * (size - old_size) / SIZE); \ +	}								\ +	state->COUNT = slot * SIZE;					\ +	kfree(state->FIELD);						\ +	state->FIELD = new_##FIELD;					\ +	return 0;							\ +} +/* realloc_reference_state() */ +REALLOC_STATE_FN(reference, acquired_refs, refs, 1) +/* realloc_stack_state() */ +REALLOC_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE) +#undef REALLOC_STATE_FN  /* do_check() starts with zero-sized stack in struct bpf_verifier_state to   * make it consume minimal amount of memory. check_stack_write() access from   * the program calls into realloc_func_state() to grow the stack size.   * Note there is a non-zero 'parent' pointer inside bpf_verifier_state - * which this function copies over. It points to previous bpf_verifier_state - * which is never reallocated + * which realloc_stack_state() copies over. It points to previous + * bpf_verifier_state which is never reallocated.   */ -static int realloc_func_state(struct bpf_func_state *state, int size, -			      bool copy_old) +static int realloc_func_state(struct bpf_func_state *state, int stack_size, +			      int refs_size, bool copy_old)  { -	u32 old_size = state->allocated_stack; -	struct bpf_stack_state *new_stack; -	int slot = size / BPF_REG_SIZE; +	int err = realloc_reference_state(state, refs_size, copy_old); +	if (err) +		return err; +	return realloc_stack_state(state, stack_size, copy_old); +} + +/* Acquire a pointer id from the env and update the state->refs to include + * this new pointer reference. + * On success, returns a valid pointer id to associate with the register + * On failure, returns a negative errno. 
+ */ +static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx) +{ +	struct bpf_func_state *state = cur_func(env); +	int new_ofs = state->acquired_refs; +	int id, err; -	if (size <= old_size || !size) { -		if (copy_old) +	err = realloc_reference_state(state, state->acquired_refs + 1, true); +	if (err) +		return err; +	id = ++env->id_gen; +	state->refs[new_ofs].id = id; +	state->refs[new_ofs].insn_idx = insn_idx; + +	return id; +} + +/* release function corresponding to acquire_reference_state(). Idempotent. */ +static int __release_reference_state(struct bpf_func_state *state, int ptr_id) +{ +	int i, last_idx; + +	if (!ptr_id) +		return -EFAULT; + +	last_idx = state->acquired_refs - 1; +	for (i = 0; i < state->acquired_refs; i++) { +		if (state->refs[i].id == ptr_id) { +			if (last_idx && i != last_idx) +				memcpy(&state->refs[i], &state->refs[last_idx], +				       sizeof(*state->refs)); +			memset(&state->refs[last_idx], 0, sizeof(*state->refs)); +			state->acquired_refs--;  			return 0; -		state->allocated_stack = slot * BPF_REG_SIZE; -		if (!size && old_size) { -			kfree(state->stack); -			state->stack = NULL;  		} -		return 0;  	} -	new_stack = kmalloc_array(slot, sizeof(struct bpf_stack_state), -				  GFP_KERNEL); -	if (!new_stack) -		return -ENOMEM; -	if (copy_old) { -		if (state->stack) -			memcpy(new_stack, state->stack, -			       sizeof(*new_stack) * (old_size / BPF_REG_SIZE)); -		memset(new_stack + old_size / BPF_REG_SIZE, 0, -		       sizeof(*new_stack) * (size - old_size) / BPF_REG_SIZE); -	} -	state->allocated_stack = slot * BPF_REG_SIZE; -	kfree(state->stack); -	state->stack = new_stack; +	return -EFAULT; +} + +/* variation on the above for cases where we expect that there must be an + * outstanding reference for the specified ptr_id. 
+ */ +static int release_reference_state(struct bpf_verifier_env *env, int ptr_id) +{ +	struct bpf_func_state *state = cur_func(env); +	int err; + +	err = __release_reference_state(state, ptr_id); +	if (WARN_ON_ONCE(err != 0)) +		verbose(env, "verifier internal error: can't release reference\n"); +	return err; +} + +static int transfer_reference_state(struct bpf_func_state *dst, +				    struct bpf_func_state *src) +{ +	int err = realloc_reference_state(dst, src->acquired_refs, false); +	if (err) +		return err; +	err = copy_reference_state(dst, src); +	if (err) +		return err;  	return 0;  } @@ -422,6 +599,7 @@ static void free_func_state(struct bpf_func_state *state)  {  	if (!state)  		return; +	kfree(state->refs);  	kfree(state->stack);  	kfree(state);  } @@ -447,10 +625,14 @@ static int copy_func_state(struct bpf_func_state *dst,  {  	int err; -	err = realloc_func_state(dst, src->allocated_stack, false); +	err = realloc_func_state(dst, src->allocated_stack, src->acquired_refs, +				 false); +	if (err) +		return err; +	memcpy(dst, src, offsetof(struct bpf_func_state, acquired_refs)); +	err = copy_reference_state(dst, src);  	if (err)  		return err; -	memcpy(dst, src, offsetof(struct bpf_func_state, allocated_stack));  	return copy_stack_state(dst, src);  } @@ -466,7 +648,6 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,  		dst_state->frame[i] = NULL;  	}  	dst_state->curframe = src->curframe; -	dst_state->parent = src->parent;  	for (i = 0; i <= src->curframe; i++) {  		dst = dst_state->frame[i];  		if (!dst) { @@ -553,7 +734,9 @@ static void __mark_reg_not_init(struct bpf_reg_state *reg);   */  static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm)  { -	reg->id = 0; +	/* Clear id, off, and union(map_ptr, range) */ +	memset(((u8 *)reg) + sizeof(reg->type), 0, +	       offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type));  	reg->var_off = tnum_const(imm);  	reg->smin_value = (s64)imm;  	reg->smax_value = (s64)imm; @@ -572,7 +755,6 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg)  static void __mark_reg_const_zero(struct bpf_reg_state *reg)  {  	__mark_reg_known(reg, 0); -	reg->off = 0;  	reg->type = SCALAR_VALUE;  } @@ -683,9 +865,12 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg)  /* Mark a register as having a completely unknown (scalar) value. 
*/  static void __mark_reg_unknown(struct bpf_reg_state *reg)  { +	/* +	 * Clear type, id, off, and union(map_ptr, range) and +	 * padding between 'type' and union +	 */ +	memset(reg, 0, offsetof(struct bpf_reg_state, var_off));  	reg->type = SCALAR_VALUE; -	reg->id = 0; -	reg->off = 0;  	reg->var_off = tnum_unknown;  	reg->frameno = 0;  	__mark_reg_unbounded(reg); @@ -732,6 +917,7 @@ static void init_reg_state(struct bpf_verifier_env *env,  	for (i = 0; i < MAX_BPF_REG; i++) {  		mark_reg_not_init(env, regs, i);  		regs[i].live = REG_LIVE_NONE; +		regs[i].parent = NULL;  	}  	/* frame pointer */ @@ -823,10 +1009,6 @@ static int check_subprogs(struct bpf_verifier_env *env)  			verbose(env, "function calls to other bpf functions are allowed for root only\n");  			return -EPERM;  		} -		if (bpf_prog_is_dev_bound(env->prog->aux)) { -			verbose(env, "function calls in offloaded programs are not supported yet\n"); -			return -EINVAL; -		}  		ret = add_subprog(env, i + insn[i].imm + 1);  		if (ret < 0)  			return ret; @@ -876,74 +1058,21 @@ next:  	return 0;  } -static -struct bpf_verifier_state *skip_callee(struct bpf_verifier_env *env, -				       const struct bpf_verifier_state *state, -				       struct bpf_verifier_state *parent, -				       u32 regno) -{ -	struct bpf_verifier_state *tmp = NULL; - -	/* 'parent' could be a state of caller and -	 * 'state' could be a state of callee. In such case -	 * parent->curframe < state->curframe -	 * and it's ok for r1 - r5 registers -	 * -	 * 'parent' could be a callee's state after it bpf_exit-ed. -	 * In such case parent->curframe > state->curframe -	 * and it's ok for r0 only -	 */ -	if (parent->curframe == state->curframe || -	    (parent->curframe < state->curframe && -	     regno >= BPF_REG_1 && regno <= BPF_REG_5) || -	    (parent->curframe > state->curframe && -	       regno == BPF_REG_0)) -		return parent; - -	if (parent->curframe > state->curframe && -	    regno >= BPF_REG_6) { -		/* for callee saved regs we have to skip the whole chain -		 * of states that belong to callee and mark as LIVE_READ -		 * the registers before the call -		 */ -		tmp = parent; -		while (tmp && tmp->curframe != state->curframe) { -			tmp = tmp->parent; -		} -		if (!tmp) -			goto bug; -		parent = tmp; -	} else { -		goto bug; -	} -	return parent; -bug: -	verbose(env, "verifier bug regno %d tmp %p\n", regno, tmp); -	verbose(env, "regno %d parent frame %d current frame %d\n", -		regno, parent->curframe, state->curframe); -	return NULL; -} - +/* Parentage chain of this register (or stack slot) should take care of all + * issues like callee-saved registers, stack slot allocation time, etc. + */  static int mark_reg_read(struct bpf_verifier_env *env, -			 const struct bpf_verifier_state *state, -			 struct bpf_verifier_state *parent, -			 u32 regno) +			 const struct bpf_reg_state *state, +			 struct bpf_reg_state *parent)  {  	bool writes = parent == state->parent; /* Observe write marks */ -	if (regno == BPF_REG_FP) -		/* We don't need to worry about FP liveness because it's read-only */ -		return 0; -  	while (parent) {  		/* if read wasn't screened by an earlier write ... */ -		if (writes && state->frame[state->curframe]->regs[regno].live & REG_LIVE_WRITTEN) +		if (writes && state->live & REG_LIVE_WRITTEN)  			break; -		parent = skip_callee(env, state, parent, regno); -		if (!parent) -			return -EFAULT;  		/* ... 
then we depend on parent's value */ -		parent->frame[parent->curframe]->regs[regno].live |= REG_LIVE_READ; +		parent->live |= REG_LIVE_READ;  		state = parent;  		parent = state->parent;  		writes = true; @@ -969,7 +1098,10 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,  			verbose(env, "R%d !read_ok\n", regno);  			return -EACCES;  		} -		return mark_reg_read(env, vstate, vstate->parent, regno); +		/* We don't need to worry about FP liveness because it's read-only */ +		if (regno != BPF_REG_FP) +			return mark_reg_read(env, ®s[regno], +					     regs[regno].parent);  	} else {  		/* check whether register used as dest operand can be written to */  		if (regno == BPF_REG_FP) { @@ -993,7 +1125,10 @@ static bool is_spillable_regtype(enum bpf_reg_type type)  	case PTR_TO_PACKET:  	case PTR_TO_PACKET_META:  	case PTR_TO_PACKET_END: +	case PTR_TO_FLOW_KEYS:  	case CONST_PTR_TO_MAP: +	case PTR_TO_SOCKET: +	case PTR_TO_SOCKET_OR_NULL:  		return true;  	default:  		return false; @@ -1018,7 +1153,7 @@ static int check_stack_write(struct bpf_verifier_env *env,  	enum bpf_reg_type type;  	err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE), -				 true); +				 state->acquired_refs, true);  	if (err)  		return err;  	/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, @@ -1080,8 +1215,8 @@ static int check_stack_write(struct bpf_verifier_env *env,  	} else {  		u8 type = STACK_MISC; -		/* regular write of data into stack */ -		state->stack[spi].spilled_ptr = (struct bpf_reg_state) {}; +		/* regular write of data into stack destroys any spilled ptr */ +		state->stack[spi].spilled_ptr.type = NOT_INIT;  		/* only mark the slot as written if all 8 bytes were written  		 * otherwise read propagation may incorrectly stop too soon @@ -1106,61 +1241,6 @@ static int check_stack_write(struct bpf_verifier_env *env,  	return 0;  } -/* registers of every function are unique and mark_reg_read() propagates - * the liveness in the following cases: - * - from callee into caller for R1 - R5 that were used as arguments - * - from caller into callee for R0 that used as result of the call - * - from caller to the same caller skipping states of the callee for R6 - R9, - *   since R6 - R9 are callee saved by implicit function prologue and - *   caller's R6 != callee's R6, so when we propagate liveness up to - *   parent states we need to skip callee states for R6 - R9. - * - * stack slot marking is different, since stacks of caller and callee are - * accessible in both (since caller can pass a pointer to caller's stack to - * callee which can pass it to another function), hence mark_stack_slot_read() - * has to propagate the stack liveness to all parent states at given frame number. - * Consider code: - * f1() { - *   ptr = fp - 8; - *   *ptr = ctx; - *   call f2 { - *      .. = *ptr; - *   } - *   .. = *ptr; - * } - * First *ptr is reading from f1's stack and mark_stack_slot_read() has - * to mark liveness at the f1's frame and not f2's frame. 
- * Second *ptr is also reading from f1's stack and mark_stack_slot_read() has - * to propagate liveness to f2 states at f1's frame level and further into - * f1 states at f1's frame level until write into that stack slot - */ -static void mark_stack_slot_read(struct bpf_verifier_env *env, -				 const struct bpf_verifier_state *state, -				 struct bpf_verifier_state *parent, -				 int slot, int frameno) -{ -	bool writes = parent == state->parent; /* Observe write marks */ - -	while (parent) { -		if (parent->frame[frameno]->allocated_stack <= slot * BPF_REG_SIZE) -			/* since LIVE_WRITTEN mark is only done for full 8-byte -			 * write the read marks are conservative and parent -			 * state may not even have the stack allocated. In such case -			 * end the propagation, since the loop reached beginning -			 * of the function -			 */ -			break; -		/* if read wasn't screened by an earlier write ... */ -		if (writes && state->frame[frameno]->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN) -			break; -		/* ... then we depend on parent's value */ -		parent->frame[frameno]->stack[slot].spilled_ptr.live |= REG_LIVE_READ; -		state = parent; -		parent = state->parent; -		writes = true; -	} -} -  static int check_stack_read(struct bpf_verifier_env *env,  			    struct bpf_func_state *reg_state /* func where register points to */,  			    int off, int size, int value_regno) @@ -1198,8 +1278,8 @@ static int check_stack_read(struct bpf_verifier_env *env,  			 */  			state->regs[value_regno].live |= REG_LIVE_WRITTEN;  		} -		mark_stack_slot_read(env, vstate, vstate->parent, spi, -				     reg_state->frameno); +		mark_reg_read(env, ®_state->stack[spi].spilled_ptr, +			      reg_state->stack[spi].spilled_ptr.parent);  		return 0;  	} else {  		int zeros = 0; @@ -1215,8 +1295,8 @@ static int check_stack_read(struct bpf_verifier_env *env,  				off, i, size);  			return -EACCES;  		} -		mark_stack_slot_read(env, vstate, vstate->parent, spi, -				     reg_state->frameno); +		mark_reg_read(env, ®_state->stack[spi].spilled_ptr, +			      reg_state->stack[spi].spilled_ptr.parent);  		if (value_regno >= 0) {  			if (zeros == size) {  				/* any size read into register is zero extended, @@ -1321,6 +1401,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,  	case BPF_PROG_TYPE_LWT_XMIT:  	case BPF_PROG_TYPE_SK_SKB:  	case BPF_PROG_TYPE_SK_MSG: +	case BPF_PROG_TYPE_FLOW_DISSECTOR:  		if (meta)  			return meta->pkt_access; @@ -1404,6 +1485,40 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,  	return -EACCES;  } +static int check_flow_keys_access(struct bpf_verifier_env *env, int off, +				  int size) +{ +	if (size < 0 || off < 0 || +	    (u64)off + size > sizeof(struct bpf_flow_keys)) { +		verbose(env, "invalid access to flow keys off=%d size=%d\n", +			off, size); +		return -EACCES; +	} +	return 0; +} + +static int check_sock_access(struct bpf_verifier_env *env, u32 regno, int off, +			     int size, enum bpf_access_type t) +{ +	struct bpf_reg_state *regs = cur_regs(env); +	struct bpf_reg_state *reg = ®s[regno]; +	struct bpf_insn_access_aux info; + +	if (reg->smin_value < 0) { +		verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", +			regno); +		return -EACCES; +	} + +	if (!bpf_sock_is_valid_access(off, size, t, &info)) { +		verbose(env, "invalid bpf_sock access off=%d size=%d\n", +			off, size); +		return -EACCES; +	} + +	return 0; +} +  static bool __is_pointer_value(bool allow_ptr_leaks,  			       const struct 
bpf_reg_state *reg)  { @@ -1413,25 +1528,39 @@ static bool __is_pointer_value(bool allow_ptr_leaks,  	return reg->type != SCALAR_VALUE;  } +static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno) +{ +	return cur_regs(env) + regno; +} +  static bool is_pointer_value(struct bpf_verifier_env *env, int regno)  { -	return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno); +	return __is_pointer_value(env->allow_ptr_leaks, reg_state(env, regno));  }  static bool is_ctx_reg(struct bpf_verifier_env *env, int regno)  { -	const struct bpf_reg_state *reg = cur_regs(env) + regno; +	const struct bpf_reg_state *reg = reg_state(env, regno); -	return reg->type == PTR_TO_CTX; +	return reg->type == PTR_TO_CTX || +	       reg->type == PTR_TO_SOCKET;  }  static bool is_pkt_reg(struct bpf_verifier_env *env, int regno)  { -	const struct bpf_reg_state *reg = cur_regs(env) + regno; +	const struct bpf_reg_state *reg = reg_state(env, regno);  	return type_is_pkt_pointer(reg->type);  } +static bool is_flow_key_reg(struct bpf_verifier_env *env, int regno) +{ +	const struct bpf_reg_state *reg = reg_state(env, regno); + +	/* Separate to is_ctx_reg() since we still want to allow BPF_ST here. */ +	return reg->type == PTR_TO_FLOW_KEYS; +} +  static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,  				   const struct bpf_reg_state *reg,  				   int off, int size, bool strict) @@ -1505,6 +1634,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,  		 * right in front, treat it the very same way.  		 */  		return check_pkt_ptr_alignment(env, reg, off, size, strict); +	case PTR_TO_FLOW_KEYS: +		pointer_desc = "flow keys "; +		break;  	case PTR_TO_MAP_VALUE:  		pointer_desc = "value ";  		break; @@ -1519,6 +1651,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,  		 */  		strict = true;  		break; +	case PTR_TO_SOCKET: +		pointer_desc = "sock "; +		break;  	default:  		break;  	} @@ -1727,9 +1862,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn  			else  				mark_reg_known_zero(env, regs,  						    value_regno); -			regs[value_regno].id = 0; -			regs[value_regno].off = 0; -			regs[value_regno].range = 0;  			regs[value_regno].type = reg_type;  		} @@ -1778,6 +1910,25 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn  		err = check_packet_access(env, regno, off, size, false);  		if (!err && t == BPF_READ && value_regno >= 0)  			mark_reg_unknown(env, regs, value_regno); +	} else if (reg->type == PTR_TO_FLOW_KEYS) { +		if (t == BPF_WRITE && value_regno >= 0 && +		    is_pointer_value(env, value_regno)) { +			verbose(env, "R%d leaks addr into flow keys\n", +				value_regno); +			return -EACCES; +		} + +		err = check_flow_keys_access(env, off, size); +		if (!err && t == BPF_READ && value_regno >= 0) +			mark_reg_unknown(env, regs, value_regno); +	} else if (reg->type == PTR_TO_SOCKET) { +		if (t == BPF_WRITE) { +			verbose(env, "cannot write into socket\n"); +			return -EACCES; +		} +		err = check_sock_access(env, regno, off, size, t); +		if (!err && value_regno >= 0) +			mark_reg_unknown(env, regs, value_regno);  	} else {  		verbose(env, "R%d invalid mem access '%s'\n", regno,  			reg_type_str[reg->type]); @@ -1818,10 +1969,11 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins  	}  	if (is_ctx_reg(env, insn->dst_reg) || -	    is_pkt_reg(env, insn->dst_reg)) { +	    is_pkt_reg(env, insn->dst_reg) || +	    is_flow_key_reg(env, insn->dst_reg)) {  		
verbose(env, "BPF_XADD stores into R%d %s is not allowed\n", -			insn->dst_reg, is_ctx_reg(env, insn->dst_reg) ? -			"context" : "packet"); +			insn->dst_reg, +			reg_type_str[reg_state(env, insn->dst_reg)->type]);  		return -EACCES;  	} @@ -1846,7 +1998,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,  				int access_size, bool zero_size_allowed,  				struct bpf_call_arg_meta *meta)  { -	struct bpf_reg_state *reg = cur_regs(env) + regno; +	struct bpf_reg_state *reg = reg_state(env, regno);  	struct bpf_func_state *state = func(env, reg);  	int off, i, slot, spi; @@ -1908,8 +2060,8 @@ mark:  		/* reading any byte out of 8-byte 'spill_slot' will cause  		 * the whole slot to be marked as 'read'  		 */ -		mark_stack_slot_read(env, env->cur_state, env->cur_state->parent, -				     spi, state->frameno); +		mark_reg_read(env, &state->stack[spi].spilled_ptr, +			      state->stack[spi].spilled_ptr.parent);  	}  	return update_stack_depth(env, state, off);  } @@ -1978,7 +2130,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,  	}  	if (arg_type == ARG_PTR_TO_MAP_KEY || -	    arg_type == ARG_PTR_TO_MAP_VALUE) { +	    arg_type == ARG_PTR_TO_MAP_VALUE || +	    arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) {  		expected_type = PTR_TO_STACK;  		if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE &&  		    type != expected_type) @@ -1999,6 +2152,16 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,  		err = check_ctx_reg(env, reg, regno);  		if (err < 0)  			return err; +	} else if (arg_type == ARG_PTR_TO_SOCKET) { +		expected_type = PTR_TO_SOCKET; +		if (type != expected_type) +			goto err_type; +		if (meta->ptr_id || !reg->id) { +			verbose(env, "verifier internal error: mismatched references meta=%d, reg=%d\n", +				meta->ptr_id, reg->id); +			return -EFAULT; +		} +		meta->ptr_id = reg->id;  	} else if (arg_type_is_mem_ptr(arg_type)) {  		expected_type = PTR_TO_STACK;  		/* One exception here. 
In case function allows for NULL to be @@ -2038,7 +2201,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,  		err = check_helper_mem_access(env, regno,  					      meta->map_ptr->key_size, false,  					      NULL); -	} else if (arg_type == ARG_PTR_TO_MAP_VALUE) { +	} else if (arg_type == ARG_PTR_TO_MAP_VALUE || +		   arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) {  		/* bpf_map_xxx(..., map_ptr, ..., value) call:  		 * check [value, value + map->value_size) validity  		 */ @@ -2047,9 +2211,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,  			verbose(env, "invalid map_ptr to access map->value\n");  			return -EACCES;  		} +		meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE);  		err = check_helper_mem_access(env, regno,  					      meta->map_ptr->value_size, false, -					      NULL); +					      meta);  	} else if (arg_type_is_mem_size(arg_type)) {  		bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); @@ -2129,6 +2294,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,  			goto error;  		break;  	case BPF_MAP_TYPE_CGROUP_STORAGE: +	case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:  		if (func_id != BPF_FUNC_get_local_storage)  			goto error;  		break; @@ -2171,6 +2337,13 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,  		if (func_id != BPF_FUNC_sk_select_reuseport)  			goto error;  		break; +	case BPF_MAP_TYPE_QUEUE: +	case BPF_MAP_TYPE_STACK: +		if (func_id != BPF_FUNC_map_peek_elem && +		    func_id != BPF_FUNC_map_pop_elem && +		    func_id != BPF_FUNC_map_push_elem) +			goto error; +		break;  	default:  		break;  	} @@ -2219,13 +2392,21 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,  			goto error;  		break;  	case BPF_FUNC_get_local_storage: -		if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) +		if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && +		    map->map_type != BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)  			goto error;  		break;  	case BPF_FUNC_sk_select_reuseport:  		if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY)  			goto error;  		break; +	case BPF_FUNC_map_peek_elem: +	case BPF_FUNC_map_pop_elem: +	case BPF_FUNC_map_push_elem: +		if (map->map_type != BPF_MAP_TYPE_QUEUE && +		    map->map_type != BPF_MAP_TYPE_STACK) +			goto error; +		break;  	default:  		break;  	} @@ -2286,10 +2467,32 @@ static bool check_arg_pair_ok(const struct bpf_func_proto *fn)  	return true;  } +static bool check_refcount_ok(const struct bpf_func_proto *fn) +{ +	int count = 0; + +	if (arg_type_is_refcounted(fn->arg1_type)) +		count++; +	if (arg_type_is_refcounted(fn->arg2_type)) +		count++; +	if (arg_type_is_refcounted(fn->arg3_type)) +		count++; +	if (arg_type_is_refcounted(fn->arg4_type)) +		count++; +	if (arg_type_is_refcounted(fn->arg5_type)) +		count++; + +	/* We only support one arg being unreferenced at the moment, +	 * which is sufficient for the helper functions we have right now. +	 */ +	return count <= 1; +} +  static int check_func_proto(const struct bpf_func_proto *fn)  {  	return check_raw_mode_ok(fn) && -	       check_arg_pair_ok(fn) ? 0 : -EINVAL; +	       check_arg_pair_ok(fn) && +	       check_refcount_ok(fn) ? 
0 : -EINVAL;  }  /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] @@ -2305,10 +2508,9 @@ static void __clear_all_pkt_pointers(struct bpf_verifier_env *env,  		if (reg_is_pkt_pointer_any(®s[i]))  			mark_reg_unknown(env, regs, i); -	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { -		if (state->stack[i].slot_type[0] != STACK_SPILL) +	bpf_for_each_spilled_reg(i, state, reg) { +		if (!reg)  			continue; -		reg = &state->stack[i].spilled_ptr;  		if (reg_is_pkt_pointer_any(reg))  			__mark_reg_unknown(reg);  	} @@ -2323,12 +2525,45 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)  		__clear_all_pkt_pointers(env, vstate->frame[i]);  } +static void release_reg_references(struct bpf_verifier_env *env, +				   struct bpf_func_state *state, int id) +{ +	struct bpf_reg_state *regs = state->regs, *reg; +	int i; + +	for (i = 0; i < MAX_BPF_REG; i++) +		if (regs[i].id == id) +			mark_reg_unknown(env, regs, i); + +	bpf_for_each_spilled_reg(i, state, reg) { +		if (!reg) +			continue; +		if (reg_is_refcounted(reg) && reg->id == id) +			__mark_reg_unknown(reg); +	} +} + +/* The pointer with the specified id has released its reference to kernel + * resources. Identify all copies of the same pointer and clear the reference. + */ +static int release_reference(struct bpf_verifier_env *env, +			     struct bpf_call_arg_meta *meta) +{ +	struct bpf_verifier_state *vstate = env->cur_state; +	int i; + +	for (i = 0; i <= vstate->curframe; i++) +		release_reg_references(env, vstate->frame[i], meta->ptr_id); + +	return release_reference_state(env, meta->ptr_id); +} +  static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,  			   int *insn_idx)  {  	struct bpf_verifier_state *state = env->cur_state;  	struct bpf_func_state *caller, *callee; -	int i, subprog, target_insn; +	int i, err, subprog, target_insn;  	if (state->curframe + 1 >= MAX_CALL_FRAMES) {  		verbose(env, "the call stack of %d frames is too deep\n", @@ -2366,11 +2601,18 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,  			state->curframe + 1 /* frameno within this callchain */,  			subprog /* subprog number within this prog */); -	/* copy r1 - r5 args that callee can access */ +	/* Transfer references to the callee */ +	err = transfer_reference_state(callee, caller); +	if (err) +		return err; + +	/* copy r1 - r5 args that callee can access.  
The copy includes parent +	 * pointers, which connects us up to the liveness chain +	 */  	for (i = BPF_REG_1; i <= BPF_REG_5; i++)  		callee->regs[i] = caller->regs[i]; -	/* after the call regsiters r0 - r5 were scratched */ +	/* after the call registers r0 - r5 were scratched */  	for (i = 0; i < CALLER_SAVED_REGS; i++) {  		mark_reg_not_init(env, caller->regs, caller_saved[i]);  		check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); @@ -2396,6 +2638,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)  	struct bpf_verifier_state *state = env->cur_state;  	struct bpf_func_state *caller, *callee;  	struct bpf_reg_state *r0; +	int err;  	callee = state->frame[state->curframe];  	r0 = &callee->regs[BPF_REG_0]; @@ -2415,6 +2658,11 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)  	/* return to the caller whatever r0 had in the callee */  	caller->regs[BPF_REG_0] = *r0; +	/* Transfer references to the caller */ +	err = transfer_reference_state(caller, callee); +	if (err) +		return err; +  	*insn_idx = callee->callsite + 1;  	if (env->log.level) {  		verbose(env, "returning from callee:\n"); @@ -2454,7 +2702,10 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,  	if (func_id != BPF_FUNC_tail_call &&  	    func_id != BPF_FUNC_map_lookup_elem &&  	    func_id != BPF_FUNC_map_update_elem && -	    func_id != BPF_FUNC_map_delete_elem) +	    func_id != BPF_FUNC_map_delete_elem && +	    func_id != BPF_FUNC_map_push_elem && +	    func_id != BPF_FUNC_map_pop_elem && +	    func_id != BPF_FUNC_map_peek_elem)  		return 0;  	if (meta->map_ptr == NULL) { @@ -2471,6 +2722,18 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,  	return 0;  } +static int check_reference_leak(struct bpf_verifier_env *env) +{ +	struct bpf_func_state *state = cur_func(env); +	int i; + +	for (i = 0; i < state->acquired_refs; i++) { +		verbose(env, "Unreleased reference id=%d alloc_insn=%d\n", +			state->refs[i].id, state->refs[i].insn_idx); +	} +	return state->acquired_refs ? 
-EINVAL : 0; +} +  static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)  {  	const struct bpf_func_proto *fn = NULL; @@ -2549,6 +2812,18 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn  			return err;  	} +	if (func_id == BPF_FUNC_tail_call) { +		err = check_reference_leak(env); +		if (err) { +			verbose(env, "tail_call would lead to reference leak\n"); +			return err; +		} +	} else if (is_release_function(func_id)) { +		err = release_reference(env, &meta); +		if (err) +			return err; +	} +  	regs = cur_regs(env);  	/* check that flags argument in get_local_storage(map, flags) is 0, @@ -2580,7 +2855,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn  			regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;  		/* There is no offset yet applied, variable or fixed */  		mark_reg_known_zero(env, regs, BPF_REG_0); -		regs[BPF_REG_0].off = 0;  		/* remember map_ptr, so that check_map_access()  		 * can check 'value_size' boundary of memory access  		 * to map element returned from bpf_map_lookup_elem() @@ -2592,6 +2866,13 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn  		}  		regs[BPF_REG_0].map_ptr = meta.map_ptr;  		regs[BPF_REG_0].id = ++env->id_gen; +	} else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { +		int id = acquire_reference_state(env, insn_idx); +		if (id < 0) +			return id; +		mark_reg_known_zero(env, regs, BPF_REG_0); +		regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; +		regs[BPF_REG_0].id = id;  	} else {  		verbose(env, "unknown return type %d of func %s#%d\n",  			fn->ret_type, func_id_name(func_id), func_id); @@ -2722,20 +3003,20 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,  		return -EACCES;  	} -	if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { -		verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", -			dst); -		return -EACCES; -	} -	if (ptr_reg->type == CONST_PTR_TO_MAP) { -		verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", -			dst); +	switch (ptr_reg->type) { +	case PTR_TO_MAP_VALUE_OR_NULL: +		verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n", +			dst, reg_type_str[ptr_reg->type]);  		return -EACCES; -	} -	if (ptr_reg->type == PTR_TO_PACKET_END) { -		verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", -			dst); +	case CONST_PTR_TO_MAP: +	case PTR_TO_PACKET_END: +	case PTR_TO_SOCKET: +	case PTR_TO_SOCKET_OR_NULL: +		verbose(env, "R%d pointer arithmetic on %s prohibited\n", +			dst, reg_type_str[ptr_reg->type]);  		return -EACCES; +	default: +		break;  	}  	/* In case of 'scalar += pointer', dst_reg inherits pointer type and id. 
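
The verifier hunks above introduce acquired-reference tracking: a helper returning RET_PTR_TO_SOCKET_OR_NULL allocates a reference id via acquire_reference_state(), BPF_FUNC_sk_release (currently the only release function) drops it again through release_reference(), and check_reference_leak() rejects tail calls (and, further down, BPF_LD_[ABS|IND] and program exit) while a reference is still outstanding. A minimal sketch of the program-side pattern this enforces is shown below; only bpf_sk_release() is named in this diff, so the lookup helper, header paths, section name and tuple layout are assumptions for illustration.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>	/* assumed libbpf-style helper declarations */

SEC("classifier")
int sk_ref_example(struct __sk_buff *skb)
{
	struct bpf_sock_tuple tuple = {};	/* zeroed tuple, sketch only */
	struct bpf_sock *sk;

	/* acquire: R0 becomes PTR_TO_SOCKET_OR_NULL with a fresh reference id */
	sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4), -1, 0);
	if (!sk)
		return 0;	/* NULL branch: the pending reference id is dropped */

	/* sk is now PTR_TO_SOCKET: loads are allowed, while stores and pointer
	 * arithmetic on it are rejected by the hunks above
	 */
	bpf_sk_release(sk);	/* every path must release, or verification fails */
	return 0;
}

char _license[] SEC("license") = "GPL";
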
@@ -3455,10 +3736,9 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,  	for (j = 0; j <= vstate->curframe; j++) {  		state = vstate->frame[j]; -		for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { -			if (state->stack[i].slot_type[0] != STACK_SPILL) +		bpf_for_each_spilled_reg(i, state, reg) { +			if (!reg)  				continue; -			reg = &state->stack[i].spilled_ptr;  			if (reg->type == type && reg->id == dst_reg->id)  				reg->range = max(reg->range, new_range);  		} @@ -3664,12 +3944,11 @@ static void reg_combine_min_max(struct bpf_reg_state *true_src,  	}  } -static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, -			 bool is_null) +static void mark_ptr_or_null_reg(struct bpf_func_state *state, +				 struct bpf_reg_state *reg, u32 id, +				 bool is_null)  { -	struct bpf_reg_state *reg = ®s[regno]; - -	if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) { +	if (reg_type_may_be_null(reg->type) && reg->id == id) {  		/* Old offset (both fixed and variable parts) should  		 * have been known-zero, because we don't allow pointer  		 * arithmetic on pointers that might be NULL. @@ -3682,40 +3961,49 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,  		}  		if (is_null) {  			reg->type = SCALAR_VALUE; -		} else if (reg->map_ptr->inner_map_meta) { -			reg->type = CONST_PTR_TO_MAP; -			reg->map_ptr = reg->map_ptr->inner_map_meta; -		} else { -			reg->type = PTR_TO_MAP_VALUE; +		} else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) { +			if (reg->map_ptr->inner_map_meta) { +				reg->type = CONST_PTR_TO_MAP; +				reg->map_ptr = reg->map_ptr->inner_map_meta; +			} else { +				reg->type = PTR_TO_MAP_VALUE; +			} +		} else if (reg->type == PTR_TO_SOCKET_OR_NULL) { +			reg->type = PTR_TO_SOCKET; +		} +		if (is_null || !reg_is_refcounted(reg)) { +			/* We don't need id from this point onwards anymore, +			 * thus we should better reset it, so that state +			 * pruning has chances to take effect. +			 */ +			reg->id = 0;  		} -		/* We don't need id from this point onwards anymore, thus we -		 * should better reset it, so that state pruning has chances -		 * to take effect. -		 */ -		reg->id = 0;  	}  }  /* The logic is similar to find_good_pkt_pointers(), both could eventually   * be folded together at some point.   
*/ -static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno, -			  bool is_null) +static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, +				  bool is_null)  {  	struct bpf_func_state *state = vstate->frame[vstate->curframe]; -	struct bpf_reg_state *regs = state->regs; +	struct bpf_reg_state *reg, *regs = state->regs;  	u32 id = regs[regno].id;  	int i, j; +	if (reg_is_refcounted_or_null(®s[regno]) && is_null) +		__release_reference_state(state, id); +  	for (i = 0; i < MAX_BPF_REG; i++) -		mark_map_reg(regs, i, id, is_null); +		mark_ptr_or_null_reg(state, ®s[i], id, is_null);  	for (j = 0; j <= vstate->curframe; j++) {  		state = vstate->frame[j]; -		for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { -			if (state->stack[i].slot_type[0] != STACK_SPILL) +		bpf_for_each_spilled_reg(i, state, reg) { +			if (!reg)  				continue; -			mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null); +			mark_ptr_or_null_reg(state, reg, id, is_null);  		}  	}  } @@ -3917,12 +4205,14 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,  	/* detect if R == 0 where R is returned from bpf_map_lookup_elem() */  	if (BPF_SRC(insn->code) == BPF_K &&  	    insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && -	    dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { -		/* Mark all identical map registers in each branch as either +	    reg_type_may_be_null(dst_reg->type)) { +		/* Mark all identical registers in each branch as either  		 * safe or unknown depending R == 0 or R != 0 conditional.  		 */ -		mark_map_regs(this_branch, insn->dst_reg, opcode == BPF_JNE); -		mark_map_regs(other_branch, insn->dst_reg, opcode == BPF_JEQ); +		mark_ptr_or_null_regs(this_branch, insn->dst_reg, +				      opcode == BPF_JNE); +		mark_ptr_or_null_regs(other_branch, insn->dst_reg, +				      opcode == BPF_JEQ);  	} else if (!try_match_pkt_pointers(insn, dst_reg, ®s[insn->src_reg],  					   this_branch, other_branch) &&  		   is_pointer_value(env, insn->dst_reg)) { @@ -4045,6 +4335,16 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)  	if (err)  		return err; +	/* Disallow usage of BPF_LD_[ABS|IND] with reference tracking, as +	 * gen_ld_abs() may terminate the program at runtime, leading to +	 * reference leak. 
+	 */ +	err = check_reference_leak(env); +	if (err) { +		verbose(env, "BPF_LD_[ABS|IND] cannot be mixed with socket references\n"); +		return err; +	} +  	if (regs[BPF_REG_6].type != PTR_TO_CTX) {  		verbose(env,  			"at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); @@ -4378,7 +4678,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,  		/* explored state didn't use this */  		return true; -	equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, frameno)) == 0; +	equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, parent)) == 0;  	if (rold->type == PTR_TO_STACK)  		/* two stack pointers are equal only if they're pointing to @@ -4459,6 +4759,9 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,  	case PTR_TO_CTX:  	case CONST_PTR_TO_MAP:  	case PTR_TO_PACKET_END: +	case PTR_TO_FLOW_KEYS: +	case PTR_TO_SOCKET: +	case PTR_TO_SOCKET_OR_NULL:  		/* Only valid matches are exact, which memcmp() above  		 * would have accepted  		 */ @@ -4534,6 +4837,14 @@ static bool stacksafe(struct bpf_func_state *old,  	return true;  } +static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur) +{ +	if (old->acquired_refs != cur->acquired_refs) +		return false; +	return !memcmp(old->refs, cur->refs, +		       sizeof(*old->refs) * old->acquired_refs); +} +  /* compare two verifier states   *   * all states stored in state_list are known to be valid, since @@ -4579,6 +4890,9 @@ static bool func_states_equal(struct bpf_func_state *old,  	if (!stacksafe(old, cur, idmap))  		goto out_free; + +	if (!refsafe(old, cur)) +		goto out_free;  	ret = true;  out_free:  	kfree(idmap); @@ -4611,7 +4925,7 @@ static bool states_equal(struct bpf_verifier_env *env,   * equivalent state (jump target or such) we didn't arrive by the straight-line   * code, so read marks in the state must propagate to the parent regardless   * of the state's write marks. That's what 'parent == state->parent' comparison - * in mark_reg_read() and mark_stack_slot_read() is for. + * in mark_reg_read() is for.   
*/  static int propagate_liveness(struct bpf_verifier_env *env,  			      const struct bpf_verifier_state *vstate, @@ -4632,7 +4946,8 @@ static int propagate_liveness(struct bpf_verifier_env *env,  		if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ)  			continue;  		if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) { -			err = mark_reg_read(env, vstate, vparent, i); +			err = mark_reg_read(env, &vstate->frame[vstate->curframe]->regs[i], +					    &vparent->frame[vstate->curframe]->regs[i]);  			if (err)  				return err;  		} @@ -4647,7 +4962,8 @@ static int propagate_liveness(struct bpf_verifier_env *env,  			if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ)  				continue;  			if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) -				mark_stack_slot_read(env, vstate, vparent, i, frame); +				mark_reg_read(env, &state->stack[i].spilled_ptr, +					      &parent->stack[i].spilled_ptr);  		}  	}  	return err; @@ -4657,7 +4973,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)  {  	struct bpf_verifier_state_list *new_sl;  	struct bpf_verifier_state_list *sl; -	struct bpf_verifier_state *cur = env->cur_state; +	struct bpf_verifier_state *cur = env->cur_state, *new;  	int i, j, err;  	sl = env->explored_states[insn_idx]; @@ -4699,16 +5015,18 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)  		return -ENOMEM;  	/* add new state to the head of linked list */ -	err = copy_verifier_state(&new_sl->state, cur); +	new = &new_sl->state; +	err = copy_verifier_state(new, cur);  	if (err) { -		free_verifier_state(&new_sl->state, false); +		free_verifier_state(new, false);  		kfree(new_sl);  		return err;  	}  	new_sl->next = env->explored_states[insn_idx];  	env->explored_states[insn_idx] = new_sl;  	/* connect new state to parentage chain */ -	cur->parent = &new_sl->state; +	for (i = 0; i < BPF_REG_FP; i++) +		cur_regs(env)[i].parent = &new->frame[new->curframe]->regs[i];  	/* clear write marks in current state: the writes we did are not writes  	 * our child did, so they don't screen off its reads from us.  	 * (There are no read marks in current state, because reads always mark @@ -4721,13 +5039,48 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)  	/* all stack frames are accessible from callee, clear them all */  	for (j = 0; j <= cur->curframe; j++) {  		struct bpf_func_state *frame = cur->frame[j]; +		struct bpf_func_state *newframe = new->frame[j]; -		for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) +		for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) {  			frame->stack[i].spilled_ptr.live = REG_LIVE_NONE; +			frame->stack[i].spilled_ptr.parent = +						&newframe->stack[i].spilled_ptr; +		}  	}  	return 0;  } +/* Return true if it's OK to have the same insn return a different type. */ +static bool reg_type_mismatch_ok(enum bpf_reg_type type) +{ +	switch (type) { +	case PTR_TO_CTX: +	case PTR_TO_SOCKET: +	case PTR_TO_SOCKET_OR_NULL: +		return false; +	default: +		return true; +	} +} + +/* If an instruction was previously used with particular pointer types, then we + * need to be careful to avoid cases such as the below, where it may be ok + * for one branch accessing the pointer, but not ok for the other branch: + * + * R1 = sock_ptr + * goto X; + * ... + * R1 = some_other_valid_ptr; + * goto X; + * ... 
+ * R2 = *(u32 *)(R1 + 0); + */ +static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev) +{ +	return src != prev && (!reg_type_mismatch_ok(src) || +			       !reg_type_mismatch_ok(prev)); +} +  static int do_check(struct bpf_verifier_env *env)  {  	struct bpf_verifier_state *state; @@ -4742,7 +5095,6 @@ static int do_check(struct bpf_verifier_env *env)  	if (!state)  		return -ENOMEM;  	state->curframe = 0; -	state->parent = NULL;  	state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);  	if (!state->frame[0]) {  		kfree(state); @@ -4822,6 +5174,7 @@ static int do_check(struct bpf_verifier_env *env)  		regs = cur_regs(env);  		env->insn_aux_data[insn_idx].seen = true; +  		if (class == BPF_ALU || class == BPF_ALU64) {  			err = check_alu_op(env, insn);  			if (err) @@ -4861,9 +5214,7 @@ static int do_check(struct bpf_verifier_env *env)  				 */  				*prev_src_type = src_reg_type; -			} else if (src_reg_type != *prev_src_type && -				   (src_reg_type == PTR_TO_CTX || -				    *prev_src_type == PTR_TO_CTX)) { +			} else if (reg_type_mismatch(src_reg_type, *prev_src_type)) {  				/* ABuser program is trying to use the same insn  				 * dst_reg = *(u32*) (src_reg + off)  				 * with different pointer types: @@ -4908,9 +5259,7 @@ static int do_check(struct bpf_verifier_env *env)  			if (*prev_dst_type == NOT_INIT) {  				*prev_dst_type = dst_reg_type; -			} else if (dst_reg_type != *prev_dst_type && -				   (dst_reg_type == PTR_TO_CTX || -				    *prev_dst_type == PTR_TO_CTX)) { +			} else if (reg_type_mismatch(dst_reg_type, *prev_dst_type)) {  				verbose(env, "same insn cannot be used with different pointers\n");  				return -EINVAL;  			} @@ -4927,8 +5276,9 @@ static int do_check(struct bpf_verifier_env *env)  				return err;  			if (is_ctx_reg(env, insn->dst_reg)) { -				verbose(env, "BPF_ST stores into R%d context is not allowed\n", -					insn->dst_reg); +				verbose(env, "BPF_ST stores into R%d %s is not allowed\n", +					insn->dst_reg, +					reg_type_str[reg_state(env, insn->dst_reg)->type]);  				return -EACCES;  			} @@ -4990,6 +5340,10 @@ static int do_check(struct bpf_verifier_env *env)  					continue;  				} +				err = check_reference_leak(env); +				if (err) +					return err; +  				/* eBPF calling convetion is such that R0 is used  				 * to return the value from eBPF program.  				 
* Make sure that it's readable at this time @@ -5103,6 +5457,12 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,  	return 0;  } +static bool bpf_map_is_cgroup_storage(struct bpf_map *map) +{ +	return (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE || +		map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE); +} +  /* look for pseudo eBPF instructions that access map FDs and   * replace them with actual map pointers   */ @@ -5193,10 +5553,9 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)  			}  			env->used_maps[env->used_map_cnt++] = map; -			if (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE && +			if (bpf_map_is_cgroup_storage(map) &&  			    bpf_cgroup_storage_assign(env->prog, map)) { -				verbose(env, -					"only one cgroup storage is allowed\n"); +				verbose(env, "only one cgroup storage of each type is allowed\n");  				fdput(f);  				return -EBUSY;  			} @@ -5225,11 +5584,15 @@ next_insn:  /* drop refcnt of maps used by the rejected program */  static void release_maps(struct bpf_verifier_env *env)  { +	enum bpf_cgroup_storage_type stype;  	int i; -	if (env->prog->aux->cgroup_storage) +	for_each_cgroup_storage_type(stype) { +		if (!env->prog->aux->cgroup_storage[stype]) +			continue;  		bpf_cgroup_storage_release(env->prog, -					   env->prog->aux->cgroup_storage); +			env->prog->aux->cgroup_storage[stype]); +	}  	for (i = 0; i < env->used_map_cnt; i++)  		bpf_map_put(env->used_maps[i]); @@ -5327,8 +5690,10 @@ static void sanitize_dead_code(struct bpf_verifier_env *env)  	}  } -/* convert load instructions that access fields of 'struct __sk_buff' - * into sequence of instructions that access fields of 'struct sk_buff' +/* convert load instructions that access fields of a context type into a + * sequence of instructions that access fields of the underlying structure: + *     struct __sk_buff    -> struct sk_buff + *     struct bpf_sock_ops -> struct sock   */  static int convert_ctx_accesses(struct bpf_verifier_env *env)  { @@ -5357,12 +5722,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)  		}  	} -	if (!ops->convert_ctx_access || bpf_prog_is_dev_bound(env->prog->aux)) +	if (bpf_prog_is_dev_bound(env->prog->aux))  		return 0;  	insn = env->prog->insnsi + delta;  	for (i = 0; i < insn_cnt; i++, insn++) { +		bpf_convert_ctx_access_t convert_ctx_access; +  		if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) ||  		    insn->code == (BPF_LDX | BPF_MEM | BPF_H) ||  		    insn->code == (BPF_LDX | BPF_MEM | BPF_W) || @@ -5404,8 +5771,18 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)  			continue;  		} -		if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX) +		switch (env->insn_aux_data[i + delta].ptr_type) { +		case PTR_TO_CTX: +			if (!ops->convert_ctx_access) +				continue; +			convert_ctx_access = ops->convert_ctx_access; +			break; +		case PTR_TO_SOCKET: +			convert_ctx_access = bpf_sock_convert_ctx_access; +			break; +		default:  			continue; +		}  		ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size;  		size = BPF_LDST_BYTES(insn); @@ -5437,8 +5814,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)  		}  		target_size = 0; -		cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog, -					      &target_size); +		cnt = convert_ctx_access(type, insn, insn_buf, env->prog, +					 &target_size);  		if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) ||  		    (ctx_field_size && !target_size)) {  			verbose(env, "bpf verifier is misconfigured\n"); @@ -5629,10 +6006,10 @@ static int 
fixup_call_args(struct bpf_verifier_env *env)  	struct bpf_insn *insn = prog->insnsi;  	int i, depth;  #endif -	int err; +	int err = 0; -	err = 0; -	if (env->prog->jit_requested) { +	if (env->prog->jit_requested && +	    !bpf_prog_is_dev_bound(env->prog->aux)) {  		err = jit_subprogs(env);  		if (err == 0)  			return 0; @@ -5801,7 +6178,10 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)  		if (prog->jit_requested && BITS_PER_LONG == 64 &&  		    (insn->imm == BPF_FUNC_map_lookup_elem ||  		     insn->imm == BPF_FUNC_map_update_elem || -		     insn->imm == BPF_FUNC_map_delete_elem)) { +		     insn->imm == BPF_FUNC_map_delete_elem || +		     insn->imm == BPF_FUNC_map_push_elem   || +		     insn->imm == BPF_FUNC_map_pop_elem    || +		     insn->imm == BPF_FUNC_map_peek_elem)) {  			aux = &env->insn_aux_data[i + delta];  			if (bpf_map_ptr_poisoned(aux))  				goto patch_call_imm; @@ -5834,6 +6214,14 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)  			BUILD_BUG_ON(!__same_type(ops->map_update_elem,  				     (int (*)(struct bpf_map *map, void *key, void *value,  					      u64 flags))NULL)); +			BUILD_BUG_ON(!__same_type(ops->map_push_elem, +				     (int (*)(struct bpf_map *map, void *value, +					      u64 flags))NULL)); +			BUILD_BUG_ON(!__same_type(ops->map_pop_elem, +				     (int (*)(struct bpf_map *map, void *value))NULL)); +			BUILD_BUG_ON(!__same_type(ops->map_peek_elem, +				     (int (*)(struct bpf_map *map, void *value))NULL)); +  			switch (insn->imm) {  			case BPF_FUNC_map_lookup_elem:  				insn->imm = BPF_CAST_CALL(ops->map_lookup_elem) - @@ -5847,6 +6235,18 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)  				insn->imm = BPF_CAST_CALL(ops->map_delete_elem) -  					    __bpf_call_base;  				continue; +			case BPF_FUNC_map_push_elem: +				insn->imm = BPF_CAST_CALL(ops->map_push_elem) - +					    __bpf_call_base; +				continue; +			case BPF_FUNC_map_pop_elem: +				insn->imm = BPF_CAST_CALL(ops->map_pop_elem) - +					    __bpf_call_base; +				continue; +			case BPF_FUNC_map_peek_elem: +				insn->imm = BPF_CAST_CALL(ops->map_peek_elem) - +					    __bpf_call_base; +				continue;  			}  			goto patch_call_imm; @@ -5970,6 +6370,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)  		env->cur_state = NULL;  	} +	if (ret == 0 && bpf_prog_is_dev_bound(env->prog->aux)) +		ret = bpf_prog_offload_finalize(env); +  skip_full_check:  	while (!pop_stack(env, NULL, NULL));  	free_states(env); diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 47147c9e184d..686d244e798d 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -154,7 +154,7 @@ void __xsk_map_flush(struct bpf_map *map)  static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)  { -	return NULL; +	return ERR_PTR(-EOPNOTSUPP);  }  static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, diff --git a/kernel/umh.c b/kernel/umh.c index c449858946af..0baa672e023c 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -405,11 +405,19 @@ struct subprocess_info *call_usermodehelper_setup_file(struct file *file,  		void (*cleanup)(struct subprocess_info *info), void *data)  {  	struct subprocess_info *sub_info; +	struct umh_info *info = data; +	const char *cmdline = (info->cmdline) ? 
info->cmdline : "usermodehelper";  	sub_info = kzalloc(sizeof(struct subprocess_info), GFP_KERNEL);  	if (!sub_info)  		return NULL; +	sub_info->argv = argv_split(GFP_KERNEL, cmdline, NULL); +	if (!sub_info->argv) { +		kfree(sub_info); +		return NULL; +	} +  	INIT_WORK(&sub_info->work, call_usermodehelper_exec_work);  	sub_info->path = "none";  	sub_info->file = file; @@ -458,10 +466,11 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)  	return 0;  } -static void umh_save_pid(struct subprocess_info *info) +static void umh_clean_and_save_pid(struct subprocess_info *info)  {  	struct umh_info *umh_info = info->data; +	argv_free(info->argv);  	umh_info->pid = info->pid;  } @@ -471,6 +480,9 @@ static void umh_save_pid(struct subprocess_info *info)   * @len: length of the blob   * @info: information about usermode process (shouldn't be NULL)   * + * If info->cmdline is set it will be used as command line for the + * user process, else "usermodehelper" is used. + *   * Returns either negative error or zero which indicates success   * in executing a blob of bytes as a usermode process. In such   * case 'struct umh_info *info' is populated with two pipes @@ -500,7 +512,7 @@ int fork_usermode_blob(void *data, size_t len, struct umh_info *info)  	err = -ENOMEM;  	sub_info = call_usermodehelper_setup_file(file, umh_pipe_setup, -						  umh_save_pid, info); +						  umh_clean_and_save_pid, info);  	if (!sub_info)  		goto out; |
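
The umh.c change above makes call_usermodehelper_setup_file() build argv from info->cmdline (falling back to "usermodehelper") and free it again in umh_clean_and_save_pid() when the helper's pid is saved. A caller-side sketch of how this is meant to be used follows; the blob symbols and names are invented for illustration, only fork_usermode_blob() and the cmdline field are taken from this diff.

#include <linux/umh.h>

/* hypothetical ELF blob linked into the module or kernel image */
extern char example_umh_start[], example_umh_end[];

static struct umh_info example_umh_info = {
	.cmdline = "example_umh",	/* becomes the helper's command line/name */
};

static int example_load_umh(void)
{
	int err;

	err = fork_usermode_blob(example_umh_start,
				 example_umh_end - example_umh_start,
				 &example_umh_info);
	if (err)
		return err;
	/* on success the info struct carries the helper's pid and the two
	 * pipes described in the fork_usermode_blob() comment above
	 */
	return 0;
}
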