Diffstat (limited to 'kernel/bpf')

-rw-r--r--  kernel/bpf/Makefile   |  1
-rw-r--r--  kernel/bpf/arraymap.c |  2
-rw-r--r--  kernel/bpf/core.c     |  2
-rw-r--r--  kernel/bpf/devmap.c   | 16
-rw-r--r--  kernel/bpf/hashtab.c  |  4
-rw-r--r--  kernel/bpf/inode.c    |  1
-rw-r--r--  kernel/bpf/sockmap.c  | 57
-rw-r--r--  kernel/bpf/syscall.c  |  6
-rw-r--r--  kernel/bpf/verifier.c | 77

9 files changed, 122 insertions, 44 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 897daa005b23..af3ab6164ff5 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
 obj-y := core.o
 
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 98c0f00c3f5e..e2636737b69b 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -98,7 +98,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	array_size += (u64) attr->max_entries * elem_size * num_possible_cpus();
 
 	if (array_size >= U32_MAX - PAGE_SIZE ||
-	    elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) {
+	    bpf_array_alloc_percpu(array)) {
 		bpf_map_area_free(array);
 		return ERR_PTR(-ENOMEM);
 	}
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 917cc04a0a94..7b62df86be1d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1022,7 +1022,7 @@ select_insn:
 		struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
 		struct bpf_array *array = container_of(map, struct bpf_array, map);
 		struct bpf_prog *prog;
-		u64 index = BPF_R3;
+		u32 index = BPF_R3;
 
 		if (unlikely(index >= array->map.max_entries))
 			goto out;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 959c9a07f318..e745d6a88224 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -69,14 +69,17 @@ static LIST_HEAD(dev_map_list);
 
 static u64 dev_map_bitmap_size(const union bpf_attr *attr)
 {
-	return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
+	return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
 }
 
 static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 {
 	struct bpf_dtab *dtab;
+	int err = -EINVAL;
 	u64 cost;
-	int err;
+
+	if (!capable(CAP_NET_ADMIN))
+		return ERR_PTR(-EPERM);
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
@@ -108,9 +111,12 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	if (err)
 		goto free_dtab;
 
+	err = -ENOMEM;
+
 	/* A per cpu bitfield with a bit per possible net device */
-	dtab->flush_needed = __alloc_percpu(dev_map_bitmap_size(attr),
-					    __alignof__(unsigned long));
+	dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
+						__alignof__(unsigned long),
+						GFP_KERNEL | __GFP_NOWARN);
 	if (!dtab->flush_needed)
 		goto free_dtab;
 
@@ -128,7 +134,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 free_dtab:
 	free_percpu(dtab->flush_needed);
 	kfree(dtab);
-	return ERR_PTR(-ENOMEM);
+	return ERR_PTR(err);
 }
 
 static void dev_map_free(struct bpf_map *map)
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 431126f31ea3..6533f08d1238 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -317,10 +317,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		 */
 		goto free_htab;
 
-	if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE)
-		/* make sure the size for pcpu_alloc() is reasonable */
-		goto free_htab;
-
 	htab->elem_size = sizeof(struct htab_elem) +
 			  round_up(htab->map.key_size, 8);
 	if (percpu)
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index e833ed914358..be1dde967208 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -363,6 +363,7 @@ out:
 	putname(pname);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(bpf_obj_get_user);
 
 static void bpf_evict_inode(struct inode *inode)
 {
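The core.c hunk above narrows the interpreter's tail-call index from u64 to u32, so only the low 32 bits of R3 take part in the bounds check against max_entries. A runnable userspace model of just that comparison, to make the difference concrete (illustrative only, not kernel code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t r3 = 0x100000001ULL;	/* upper bits set, low 32 bits == 1 */
	uint32_t max_entries = 4;

	/* reading R3 as u64: the bounds check rejects the tail call */
	printf("u64 index: %s\n",
	       r3 >= max_entries ? "out of range" : "in range");

	/* reading R3 as u32: only the low 32 bits are compared */
	printf("u32 index: %s\n",
	       (uint32_t)r3 >= max_entries ? "out of range" : "in range");
	return 0;
}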
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 6424ce0e4969..dbd7b322a86b 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -39,6 +39,7 @@
 #include <linux/workqueue.h>
 #include <linux/list.h>
 #include <net/strparser.h>
+#include <net/tcp.h>
 
 struct bpf_stab {
 	struct bpf_map map;
@@ -92,21 +93,45 @@ static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
 	return rcu_dereference_sk_user_data(sk);
 }
 
+/* compute the linear packet data range [data, data_end) for skb when
+ * sk_skb type programs are in use.
+ */
+static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb)
+{
+	TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb);
+}
+
+enum __sk_action {
+	__SK_DROP = 0,
+	__SK_PASS,
+	__SK_REDIRECT,
+};
+
 static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
 {
 	struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict);
 	int rc;
 
 	if (unlikely(!prog))
-		return SK_DROP;
+		return __SK_DROP;
 
 	skb_orphan(skb);
+	/* We need to ensure that BPF metadata for maps is also cleared
+	 * when we orphan the skb so that we don't have the possibility
+	 * to reference a stale map.
+	 */
+	TCP_SKB_CB(skb)->bpf.map = NULL;
 	skb->sk = psock->sock;
-	bpf_compute_data_end(skb);
+	bpf_compute_data_end_sk_skb(skb);
+	preempt_disable();
 	rc = (*prog->bpf_func)(skb, prog->insnsi);
+	preempt_enable();
 	skb->sk = NULL;
 
-	return rc;
+	/* Moving return codes from UAPI namespace into internal namespace */
+	return rc == SK_PASS ?
+		(TCP_SKB_CB(skb)->bpf.map ? __SK_REDIRECT : __SK_PASS) :
+		__SK_DROP;
 }
 
 static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
@@ -114,17 +139,10 @@ static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
 	struct sock *sk;
 	int rc;
 
-	/* Because we use per cpu values to feed input from sock redirect
-	 * in BPF program to do_sk_redirect_map() call we need to ensure we
-	 * are not preempted. RCU read lock is not sufficient in this case
-	 * with CONFIG_PREEMPT_RCU enabled so we must be explicit here.
-	 */
-	preempt_disable();
 	rc = smap_verdict_func(psock, skb);
 	switch (rc) {
-	case SK_REDIRECT:
-		sk = do_sk_redirect_map();
-		preempt_enable();
+	case __SK_REDIRECT:
+		sk = do_sk_redirect_map(skb);
 		if (likely(sk)) {
 			struct smap_psock *peer = smap_psock_sk(sk);
 
@@ -139,10 +157,8 @@ static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
 			}
 		}
 	/* Fall through and free skb otherwise */
-	case SK_DROP:
+	case __SK_DROP:
 	default:
-		if (rc != SK_REDIRECT)
-			preempt_enable();
 		kfree_skb(skb);
 	}
 }
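The verdict path above now keeps the UAPI return codes (SK_DROP/SK_PASS) separate from the internal __SK_* actions, deriving __SK_REDIRECT from the map pointer that a redirect helper records in the skb's control block. A runnable userspace model of that folding (the struct and names are stand-ins, not the kernel's types):

#include <stdio.h>
#include <stddef.h>

enum { SK_DROP = 0, SK_PASS = 1 };		/* UAPI verdicts */
enum sk_action { __SK_DROP = 0, __SK_PASS, __SK_REDIRECT };

struct cb { void *map; };	/* stand-in for TCP_SKB_CB(skb)->bpf */

static enum sk_action fold(int rc, const struct cb *cb)
{
	/* same mapping as the return statement in smap_verdict_func() */
	return rc == SK_PASS ?
	       (cb->map ? __SK_REDIRECT : __SK_PASS) : __SK_DROP;
}

int main(void)
{
	struct cb redirected = { .map = (void *)0x1 }, plain = { .map = NULL };

	printf("%d %d %d\n",
	       fold(SK_PASS, &redirected),	/* 2 == __SK_REDIRECT */
	       fold(SK_PASS, &plain),		/* 1 == __SK_PASS */
	       fold(SK_DROP, &redirected));	/* 0 == __SK_DROP */
	return 0;
}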
@@ -369,7 +385,7 @@ static int smap_parse_func_strparser(struct strparser *strp,
 	 * any socket yet.
 	 */
 	skb->sk = psock->sock;
-	bpf_compute_data_end(skb);
+	bpf_compute_data_end_sk_skb(skb);
 	rc = (*prog->bpf_func)(skb, prog->insnsi);
 	skb->sk = NULL;
 	rcu_read_unlock();
@@ -487,6 +503,9 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 	int err = -EINVAL;
 	u64 cost;
 
+	if (!capable(CAP_NET_ADMIN))
+		return ERR_PTR(-EPERM);
+
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
 	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
@@ -840,6 +859,12 @@ static int sock_map_update_elem(struct bpf_map *map,
 		return -EINVAL;
 	}
 
+	if (skops.sk->sk_type != SOCK_STREAM ||
+	    skops.sk->sk_protocol != IPPROTO_TCP) {
+		fput(socket->file);
+		return -EOPNOTSUPP;
+	}
+
 	err = sock_map_ctx_update_elem(&skops, map, key, flags);
 	fput(socket->file);
 	return err;
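With the two sockmap changes above, creating a sockmap now requires CAP_NET_ADMIN and inserting anything other than a TCP stream socket fails with EOPNOTSUPP. A minimal userspace sketch exercising both, assuming kernel headers that define BPF_MAP_TYPE_SOCKMAP; error handling is trimmed:

#include <linux/bpf.h>
#include <sys/syscall.h>
#include <sys/socket.h>
#include <unistd.h>
#include <string.h>
#include <stdint.h>
#include <stdio.h>

static long sys_bpf(int cmd, union bpf_attr *attr)
{
	return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
}

int main(void)
{
	union bpf_attr attr;
	uint32_t key = 0, value;
	int map_fd, udp_fd;

	/* creating the map itself now requires CAP_NET_ADMIN (else EPERM) */
	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_SOCKMAP;
	attr.key_size = sizeof(uint32_t);
	attr.value_size = sizeof(uint32_t);
	attr.max_entries = 2;
	map_fd = sys_bpf(BPF_MAP_CREATE, &attr);
	if (map_fd < 0) {
		perror("BPF_MAP_CREATE");
		return 1;
	}

	/* a UDP socket is not SOCK_STREAM/IPPROTO_TCP: update must fail */
	udp_fd = socket(AF_INET, SOCK_DGRAM, 0);
	value = udp_fd;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (uint64_t)(unsigned long)&key;
	attr.value = (uint64_t)(unsigned long)&value;
	if (sys_bpf(BPF_MAP_UPDATE_ELEM, &attr) < 0)
		perror("BPF_MAP_UPDATE_ELEM");	/* expected: EOPNOTSUPP */

	close(udp_fd);
	close(map_fd);
	return 0;
}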
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cb17e1cd1d43..25d074920a00 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -186,15 +186,17 @@ static int bpf_map_alloc_id(struct bpf_map *map)
 
 static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
 {
+	unsigned long flags;
+
 	if (do_idr_lock)
-		spin_lock_bh(&map_idr_lock);
+		spin_lock_irqsave(&map_idr_lock, flags);
 	else
 		__acquire(&map_idr_lock);
 
 	idr_remove(&map_idr, map->id);
 
 	if (do_idr_lock)
-		spin_unlock_bh(&map_idr_lock);
+		spin_unlock_irqrestore(&map_idr_lock, flags);
 	else
 		__release(&map_idr_lock);
 }
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 799b2451ef2d..c48ca2a34b5e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -653,6 +653,10 @@ static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno)
 {
 	struct bpf_verifier_state *parent = state->parent;
 
+	if (regno == BPF_REG_FP)
+		/* We don't need to worry about FP liveness because it's read-only */
+		return;
+
 	while (parent) {
 		/* if read wasn't screened by an earlier write ... */
 		if (state->regs[regno].live & REG_LIVE_WRITTEN)
@@ -1112,7 +1116,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno,
 		/* ctx accesses must be at a fixed offset, so that we can
 		 * determine what type of data were returned.
 		 */
-		if (!tnum_is_const(reg->var_off)) {
+		if (reg->off) {
+			verbose("dereference of modified ctx ptr R%d off=%d+%d, ctx+const is allowed, ctx+const+const is not\n",
+				regno, reg->off, off - reg->off);
+			return -EACCES;
+		}
+		if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
 			char tn_buf[48];
 
 			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
@@ -1120,7 +1129,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno,
 				tn_buf, off, size);
 			return -EACCES;
 		}
-		off += reg->var_off.value;
 		err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
 		if (!err && t == BPF_READ && value_regno >= 0) {
 			/* ctx access returns either a scalar, or a
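The check_mem_access() hunks above tighten context access: the load offset must be a plain constant relative to the ctx register itself, and a pointer already moved off ctx can no longer be dereferenced. A hedged BPF-C sketch of the rule (SEC() and struct __sk_buff follow the usual libbpf conventions, assumed here):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("socket")
int ctx_fixed_offset(struct __sk_buff *skb)
{
	/* ctx + constant offset: the verifier can type-check this load */
	__u32 len = skb->len;

	/* A modified ctx pointer, e.g.
	 *
	 *	struct __sk_buff *m = skb + 1;
	 *	return m->len;
	 *
	 * would now be rejected with "dereference of modified ctx ptr",
	 * since a ctx+const+const access cannot be type-checked.
	 */
	return len != 0;
}

char _license[] SEC("license") = "GPL";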
@@ -2345,6 +2353,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 				 * copy register state to dest reg
 				 */
 				regs[insn->dst_reg] = regs[insn->src_reg];
+				regs[insn->dst_reg].live |= REG_LIVE_WRITTEN;
 			} else {
 				/* R1 = (u32) R2 */
 				if (is_pointer_value(env, insn->src_reg)) {
@@ -2421,12 +2430,15 @@
 }
 
 static void find_good_pkt_pointers(struct bpf_verifier_state *state,
-				   struct bpf_reg_state *dst_reg)
+				   struct bpf_reg_state *dst_reg,
+				   bool range_right_open)
 {
 	struct bpf_reg_state *regs = state->regs, *reg;
+	u16 new_range;
 	int i;
 
-	if (dst_reg->off < 0)
+	if (dst_reg->off < 0 ||
+	    (dst_reg->off == 0 && range_right_open))
 		/* This doesn't give us any range */
 		return;
 
@@ -2437,9 +2449,13 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 		 */
 		return;
 
-	/* LLVM can generate four kind of checks:
+	new_range = dst_reg->off;
+	if (range_right_open)
+		new_range--;
+
+	/* Examples for register markings:
 	 *
-	 * Type 1/2:
+	 * pkt_data in dst register:
 	 *
 	 *   r2 = r3;
 	 *   r2 += 8;
@@ -2456,7 +2472,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 	 *     r2=pkt(id=n,off=8,r=0)
 	 *     r3=pkt(id=n,off=0,r=0)
 	 *
-	 * Type 3/4:
+	 * pkt_data in src register:
 	 *
 	 *   r2 = r3;
 	 *   r2 += 8;
@@ -2474,7 +2490,9 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 	 *     r3=pkt(id=n,off=0,r=0)
 	 *
 	 * Find register r3 and mark its range as r3=pkt(id=n,off=0,r=8)
-	 * so that range of bytes [r3, r3 + 8) is safe to access.
+	 * or r3=pkt(id=n,off=0,r=8-1), so that range of bytes [r3, r3 + 8)
+	 * and [r3, r3 + 8-1) respectively is safe to access depending on
+	 * the check.
 	 */
 
 	/* If our ids match, then we must have the same max_value.  And we
@@ -2485,14 +2503,14 @@
 	for (i = 0; i < MAX_BPF_REG; i++)
 		if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
 			/* keep the maximum range already checked */
-			regs[i].range = max_t(u16, regs[i].range, dst_reg->off);
+			regs[i].range = max(regs[i].range, new_range);
 
 	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
 		if (state->stack_slot_type[i] != STACK_SPILL)
 			continue;
 		reg = &state->spilled_regs[i / BPF_REG_SIZE];
 		if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id)
-			reg->range = max_t(u16, reg->range, dst_reg->off);
+			reg->range = max(reg->range, new_range);
 	}
 }
 
@@ -2856,19 +2874,43 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
 		   dst_reg->type == PTR_TO_PACKET &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
-		find_good_pkt_pointers(this_branch, dst_reg);
+		/* pkt_data' > pkt_end */
+		find_good_pkt_pointers(this_branch, dst_reg, false);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
+		   dst_reg->type == PTR_TO_PACKET_END &&
+		   regs[insn->src_reg].type == PTR_TO_PACKET) {
+		/* pkt_end > pkt_data' */
+		find_good_pkt_pointers(other_branch, &regs[insn->src_reg], true);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
+		   dst_reg->type == PTR_TO_PACKET &&
+		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
+		/* pkt_data' < pkt_end */
+		find_good_pkt_pointers(other_branch, dst_reg, true);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
+		   dst_reg->type == PTR_TO_PACKET_END &&
+		   regs[insn->src_reg].type == PTR_TO_PACKET) {
+		/* pkt_end < pkt_data' */
+		find_good_pkt_pointers(this_branch, &regs[insn->src_reg], false);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
 		   dst_reg->type == PTR_TO_PACKET &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
-		find_good_pkt_pointers(other_branch, dst_reg);
+		/* pkt_data' >= pkt_end */
+		find_good_pkt_pointers(this_branch, dst_reg, true);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
 		   dst_reg->type == PTR_TO_PACKET_END &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET) {
-		find_good_pkt_pointers(other_branch, &regs[insn->src_reg]);
+		/* pkt_end >= pkt_data' */
+		find_good_pkt_pointers(other_branch, &regs[insn->src_reg], false);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
+		   dst_reg->type == PTR_TO_PACKET &&
+		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
+		/* pkt_data' <= pkt_end */
+		find_good_pkt_pointers(other_branch, dst_reg, false);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
 		   dst_reg->type == PTR_TO_PACKET_END &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET) {
-		find_good_pkt_pointers(this_branch, &regs[insn->src_reg]);
+		/* pkt_end <= pkt_data' */
+		find_good_pkt_pointers(this_branch, &regs[insn->src_reg], true);
 	} else if (is_pointer_value(env, insn->dst_reg)) {
 		verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
 		return -EACCES;
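check_cond_jmp_op() now recognizes the packet-bounds test with the operands in either order and with strict as well as non-strict comparisons, passing range_right_open to find_good_pkt_pointers() for the half-open cases. A hedged BPF-C sketch of the canonical pattern these branches verify (XDP names per the usual conventions, assumed here):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("xdp")
int pkt_bounds(struct xdp_md *ctx)
{
	void *data = (void *)(long)ctx->data;
	void *data_end = (void *)(long)ctx->data_end;

	/* "pkt_data' > pkt_end": on the fall-through branch the verifier
	 * marks [data, data + 8) as safe. With the hunk above, comparison
	 * forms such as data_end < data + 8 or data + 8 <= data_end are
	 * recognized as well.
	 */
	if (data + 8 > data_end)
		return XDP_DROP;

	return ((unsigned char *)data)[7] ? XDP_PASS : XDP_DROP;
}

char _license[] SEC("license") = "GPL";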
@@ -4205,7 +4247,12 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 		}
 
 		if (insn->imm == BPF_FUNC_redirect_map) {
-			u64 addr = (unsigned long)prog;
+			/* Note, we cannot use prog directly as imm as subsequent
+			 * rewrites would still change the prog pointer. The only
+			 * stable address we can use is aux, which also works with
+			 * prog clones during blinding.
+			 */
+			u64 addr = (unsigned long)prog->aux;
 			struct bpf_insn r4_ld[] = {
 				BPF_LD_IMM64(BPF_REG_4, addr),
 				*insn,
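The fixup above patches the bpf_redirect_map() call to load a stable 64-bit address (prog->aux) into R4 via BPF_LD_IMM64, the only eBPF instruction that carries a full 64-bit immediate: it spans two instruction slots, with the value split across the two imm fields. A runnable userspace sketch of that encoding (instruction layout as in linux/bpf.h; the address value is made up):

#include <stdint.h>
#include <stdio.h>

struct bpf_insn {			/* layout as in linux/bpf.h */
	uint8_t	code;
	uint8_t	dst_reg:4, src_reg:4;
	int16_t	off;
	int32_t	imm;
};

int main(void)
{
	uint64_t addr = 0xffff880012345678ULL;	/* made-up pointer value */
	struct bpf_insn ld[2] = {
		/* 0x18 == BPF_LD | BPF_DW | BPF_IMM, first of two slots */
		{ .code = 0x18, .dst_reg = 4, .imm = (int32_t)(uint32_t)addr },
		/* pseudo second slot carries the upper 32 bits */
		{ .code = 0, .imm = (int32_t)(addr >> 32) },
	};

	printf("lo=%#x hi=%#x\n",
	       (uint32_t)ld[0].imm, (uint32_t)ld[1].imm);
	return 0;
}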