diff options
Diffstat (limited to 'kernel/bpf')
| -rw-r--r-- | kernel/bpf/arraymap.c | 61 | ||||
| -rw-r--r-- | kernel/bpf/core.c | 26 | ||||
| -rw-r--r-- | kernel/bpf/hashtab.c | 2 | ||||
| -rw-r--r-- | kernel/bpf/inode.c | 40 | ||||
| -rw-r--r-- | kernel/bpf/offload.c | 15 | ||||
| -rw-r--r-- | kernel/bpf/sockmap.c | 11 | ||||
| -rw-r--r-- | kernel/bpf/syscall.c | 2 | ||||
| -rw-r--r-- | kernel/bpf/verifier.c | 388 | 
8 files changed, 398 insertions, 147 deletions
| diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 7c25426d3cf5..ab94d304a634 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -53,9 +53,10 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)  {  	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;  	int numa_node = bpf_map_attr_numa_node(attr); +	u32 elem_size, index_mask, max_entries; +	bool unpriv = !capable(CAP_SYS_ADMIN);  	struct bpf_array *array; -	u64 array_size; -	u32 elem_size; +	u64 array_size, mask64;  	/* check sanity of attributes */  	if (attr->max_entries == 0 || attr->key_size != 4 || @@ -72,11 +73,32 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)  	elem_size = round_up(attr->value_size, 8); +	max_entries = attr->max_entries; + +	/* On 32 bit archs roundup_pow_of_two() with max_entries that has +	 * upper most bit set in u32 space is undefined behavior due to +	 * resulting 1U << 32, so do it manually here in u64 space. +	 */ +	mask64 = fls_long(max_entries - 1); +	mask64 = 1ULL << mask64; +	mask64 -= 1; + +	index_mask = mask64; +	if (unpriv) { +		/* round up array size to nearest power of 2, +		 * since cpu will speculate within index_mask limits +		 */ +		max_entries = index_mask + 1; +		/* Check for overflows. 
*/ +		if (max_entries < attr->max_entries) +			return ERR_PTR(-E2BIG); +	} +  	array_size = sizeof(*array);  	if (percpu) -		array_size += (u64) attr->max_entries * sizeof(void *); +		array_size += (u64) max_entries * sizeof(void *);  	else -		array_size += (u64) attr->max_entries * elem_size; +		array_size += (u64) max_entries * elem_size;  	/* make sure there is no u32 overflow later in round_up() */  	if (array_size >= U32_MAX - PAGE_SIZE) @@ -86,6 +108,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)  	array = bpf_map_area_alloc(array_size, numa_node);  	if (!array)  		return ERR_PTR(-ENOMEM); +	array->index_mask = index_mask; +	array->map.unpriv_array = unpriv;  	/* copy mandatory map attributes */  	array->map.map_type = attr->map_type; @@ -121,12 +145,13 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)  	if (unlikely(index >= array->map.max_entries))  		return NULL; -	return array->value + array->elem_size * index; +	return array->value + array->elem_size * (index & array->index_mask);  }  /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */  static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)  { +	struct bpf_array *array = container_of(map, struct bpf_array, map);  	struct bpf_insn *insn = insn_buf;  	u32 elem_size = round_up(map->value_size, 8);  	const int ret = BPF_REG_0; @@ -135,7 +160,12 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)  	*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));  	*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); -	*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); +	if (map->unpriv_array) { +		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4); +		*insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); +	} else { +		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); +	}  	if (is_power_of_2(elem_size)) {  		*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); 
@@ -157,7 +187,7 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)  	if (unlikely(index >= array->map.max_entries))  		return NULL; -	return this_cpu_ptr(array->pptrs[index]); +	return this_cpu_ptr(array->pptrs[index & array->index_mask]);  }  int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) @@ -177,7 +207,7 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)  	 */  	size = round_up(map->value_size, 8);  	rcu_read_lock(); -	pptr = array->pptrs[index]; +	pptr = array->pptrs[index & array->index_mask];  	for_each_possible_cpu(cpu) {  		bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);  		off += size; @@ -225,10 +255,11 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,  		return -EEXIST;  	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) -		memcpy(this_cpu_ptr(array->pptrs[index]), +		memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),  		       value, map->value_size);  	else -		memcpy(array->value + array->elem_size * index, +		memcpy(array->value + +		       array->elem_size * (index & array->index_mask),  		       value, map->value_size);  	return 0;  } @@ -262,7 +293,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,  	 */  	size = round_up(map->value_size, 8);  	rcu_read_lock(); -	pptr = array->pptrs[index]; +	pptr = array->pptrs[index & array->index_mask];  	for_each_possible_cpu(cpu) {  		bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);  		off += size; @@ -613,6 +644,7 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)  static u32 array_of_map_gen_lookup(struct bpf_map *map,  				   struct bpf_insn *insn_buf)  { +	struct bpf_array *array = container_of(map, struct bpf_array, map);  	u32 elem_size = round_up(map->value_size, 8);  	struct bpf_insn *insn = insn_buf;  	const int ret = BPF_REG_0; @@ -621,7 +653,12 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map,  	*insn++ 
= BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));  	*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); -	*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); +	if (map->unpriv_array) { +		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6); +		*insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); +	} else { +		*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); +	}  	if (is_power_of_2(elem_size))  		*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));  	else diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b9f8686a84cf..7949e8b8f94e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -767,6 +767,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)  }  EXPORT_SYMBOL_GPL(__bpf_call_base); +#ifndef CONFIG_BPF_JIT_ALWAYS_ON  /**   *	__bpf_prog_run - run eBPF program on a given context   *	@ctx: is the data we are operating on @@ -955,7 +956,7 @@ select_insn:  		DST = tmp;  		CONT;  	ALU_MOD_X: -		if (unlikely(SRC == 0)) +		if (unlikely((u32)SRC == 0))  			return 0;  		tmp = (u32) DST;  		DST = do_div(tmp, (u32) SRC); @@ -974,7 +975,7 @@ select_insn:  		DST = div64_u64(DST, SRC);  		CONT;  	ALU_DIV_X: -		if (unlikely(SRC == 0)) +		if (unlikely((u32)SRC == 0))  			return 0;  		tmp = (u32) DST;  		do_div(tmp, (u32) SRC); @@ -1317,6 +1318,14 @@ EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)  EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)  }; +#else +static unsigned int __bpf_prog_ret0(const void *ctx, +				    const struct bpf_insn *insn) +{ +	return 0; +} +#endif +  bool bpf_prog_array_compatible(struct bpf_array *array,  			       const struct bpf_prog *fp)  { @@ -1364,9 +1373,13 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)   */  struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)  { +#ifndef CONFIG_BPF_JIT_ALWAYS_ON  	u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);  	fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; +#else +	fp->bpf_func = 
__bpf_prog_ret0; +#endif  	/* eBPF JITs can rewrite the program in case constant  	 * blinding is active. However, in case of error during @@ -1376,6 +1389,12 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)  	 */  	if (!bpf_prog_is_dev_bound(fp->aux)) {  		fp = bpf_int_jit_compile(fp); +#ifdef CONFIG_BPF_JIT_ALWAYS_ON +		if (!fp->jited) { +			*err = -ENOTSUPP; +			return fp; +		} +#endif  	} else {  		*err = bpf_prog_offload_compile(fp);  		if (*err) @@ -1447,7 +1466,8 @@ int bpf_prog_array_length(struct bpf_prog_array __rcu *progs)  	rcu_read_lock();  	prog = rcu_dereference(progs)->progs;  	for (; *prog; prog++) -		cnt++; +		if (*prog != &dummy_bpf_prog.prog) +			cnt++;  	rcu_read_unlock();  	return cnt;  } diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index e469e05c8e83..3905d4bc5b80 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -114,6 +114,7 @@ static void htab_free_elems(struct bpf_htab *htab)  		pptr = htab_elem_get_ptr(get_htab_elem(htab, i),  					 htab->map.key_size);  		free_percpu(pptr); +		cond_resched();  	}  free_elems:  	bpf_map_area_free(htab->elems); @@ -159,6 +160,7 @@ static int prealloc_init(struct bpf_htab *htab)  			goto free_elems;  		htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,  				  pptr); +		cond_resched();  	}  skip_percpu_elems: diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 01aaef1a77c5..5bb5e49ef4c3 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -368,7 +368,45 @@ out:  	putname(pname);  	return ret;  } -EXPORT_SYMBOL_GPL(bpf_obj_get_user); + +static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type) +{ +	struct bpf_prog *prog; +	int ret = inode_permission(inode, MAY_READ | MAY_WRITE); +	if (ret) +		return ERR_PTR(ret); + +	if (inode->i_op == &bpf_map_iops) +		return ERR_PTR(-EINVAL); +	if (inode->i_op != &bpf_prog_iops) +		return ERR_PTR(-EACCES); + +	prog = inode->i_private; + +	ret = security_bpf_prog(prog); 
+	if (ret < 0) +		return ERR_PTR(ret); + +	if (!bpf_prog_get_ok(prog, &type, false)) +		return ERR_PTR(-EINVAL); + +	return bpf_prog_inc(prog); +} + +struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type) +{ +	struct bpf_prog *prog; +	struct path path; +	int ret = kern_path(name, LOOKUP_FOLLOW, &path); +	if (ret) +		return ERR_PTR(ret); +	prog = __get_prog_inode(d_backing_inode(path.dentry), type); +	if (!IS_ERR(prog)) +		touch_atime(&path); +	path_put(&path); +	return prog; +} +EXPORT_SYMBOL(bpf_prog_get_type_path);  static void bpf_evict_inode(struct inode *inode)  { diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 68ec884440b7..8455b89d1bbf 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -1,3 +1,18 @@ +/* + * Copyright (C) 2017 Netronome Systems, Inc. + * + * This software is licensed under the GNU General License Version 2, + * June 1991 as shown in the file COPYING in the top-level directory of this + * source tree. + * + * THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" + * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, + * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE + * OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME + * THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 
+ */ +  #include <linux/bpf.h>  #include <linux/bpf_verifier.h>  #include <linux/bug.h> diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 5ee2e41893d9..1712d319c2d8 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -591,8 +591,15 @@ static void sock_map_free(struct bpf_map *map)  		write_lock_bh(&sock->sk_callback_lock);  		psock = smap_psock_sk(sock); -		smap_list_remove(psock, &stab->sock_map[i]); -		smap_release_sock(psock, sock); +		/* This check handles a racing sock event that can get the +		 * sk_callback_lock before this case but after xchg happens +		 * causing the refcnt to hit zero and sock user data (psock) +		 * to be null and queued for garbage collection. +		 */ +		if (likely(psock)) { +			smap_list_remove(psock, &stab->sock_map[i]); +			smap_release_sock(psock, sock); +		}  		write_unlock_bh(&sock->sk_callback_lock);  	}  	rcu_read_unlock(); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2c4cfeaa8d5e..5cb783fc8224 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1057,7 +1057,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)  }  EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); -static bool bpf_prog_get_ok(struct bpf_prog *prog, +bool bpf_prog_get_ok(struct bpf_prog *prog,  			    enum bpf_prog_type *attach_type, bool attach_drv)  {  	/* not an attachment, just a refcount inc, always allow */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d4593571c404..13551e623501 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -978,6 +978,13 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)  	return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno);  } +static bool is_ctx_reg(struct bpf_verifier_env *env, int regno) +{ +	const struct bpf_reg_state *reg = cur_regs(env) + regno; + +	return reg->type == PTR_TO_CTX; +} +  static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,  				   const struct bpf_reg_state *reg,  				
   int off, int size, bool strict) @@ -1059,6 +1066,11 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,  		break;  	case PTR_TO_STACK:  		pointer_desc = "stack "; +		/* The stack spill tracking logic in check_stack_write() +		 * and check_stack_read() relies on stack accesses being +		 * aligned. +		 */ +		strict = true;  		break;  	default:  		break; @@ -1067,6 +1079,29 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,  					   strict);  } +/* truncate register to smaller size (in bytes) + * must be called with size < BPF_REG_SIZE + */ +static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) +{ +	u64 mask; + +	/* clear high bits in bit representation */ +	reg->var_off = tnum_cast(reg->var_off, size); + +	/* fix arithmetic bounds */ +	mask = ((u64)1 << (size * 8)) - 1; +	if ((reg->umin_value & ~mask) == (reg->umax_value & ~mask)) { +		reg->umin_value &= mask; +		reg->umax_value &= mask; +	} else { +		reg->umin_value = 0; +		reg->umax_value = mask; +	} +	reg->smin_value = reg->umin_value; +	reg->smax_value = reg->umax_value; +} +  /* check whether memory at (regno + off) is accessible for t = (read | write)   * if t==write, value_regno is a register which value is stored into memory   * if t==read, value_regno is a register which will receive the value from memory @@ -1200,9 +1235,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn  	if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ &&  	    regs[value_regno].type == SCALAR_VALUE) {  		/* b/h/w load zero-extends, mark upper bits as known 0 */ -		regs[value_regno].var_off = -			tnum_cast(regs[value_regno].var_off, size); -		__update_reg_bounds(&regs[value_regno]); +		coerce_reg_to_size(&regs[value_regno], size);  	}  	return err;  } @@ -1232,6 +1265,12 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins  		return -EACCES;  	} +	if (is_ctx_reg(env, insn->dst_reg)) { +		verbose(env, "BPF_XADD stores 
into R%d context is not allowed\n", +			insn->dst_reg); +		return -EACCES; +	} +  	/* check whether atomic_add can read the memory */  	err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,  			       BPF_SIZE(insn->code), BPF_READ, -1); @@ -1282,6 +1321,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,  		tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off);  		verbose(env, "invalid variable stack read R%d var_off=%s\n",  			regno, tn_buf); +		return -EACCES;  	}  	off = regs[regno].off + regs[regno].var_off.value;  	if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || @@ -1674,7 +1714,13 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)  		return -EINVAL;  	} +	/* With LD_ABS/IND some JITs save/restore skb from r1. */  	changes_data = bpf_helper_changes_pkt_data(fn->func); +	if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) { +		verbose(env, "kernel subsystem misconfigured func %s#%d: r1 != ctx\n", +			func_id_name(func_id), func_id); +		return -EINVAL; +	}  	memset(&meta, 0, sizeof(meta));  	meta.pkt_access = fn->pkt_access; @@ -1696,6 +1742,13 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)  	err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta);  	if (err)  		return err; +	if (func_id == BPF_FUNC_tail_call) { +		if (meta.map_ptr == NULL) { +			verbose(env, "verifier bug\n"); +			return -EINVAL; +		} +		env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr; +	}  	err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta);  	if (err)  		return err; @@ -1766,14 +1819,6 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)  	return 0;  } -static void coerce_reg_to_32(struct bpf_reg_state *reg) -{ -	/* clear high 32 bits */ -	reg->var_off = tnum_cast(reg->var_off, 4); -	/* Update bounds */ -	__update_reg_bounds(reg); -} -  static bool signed_add_overflows(s64 a, s64 b)  {  	/* Do the add in u64, where overflow is 
well-defined */ @@ -1794,6 +1839,41 @@ static bool signed_sub_overflows(s64 a, s64 b)  	return res > a;  } +static bool check_reg_sane_offset(struct bpf_verifier_env *env, +				  const struct bpf_reg_state *reg, +				  enum bpf_reg_type type) +{ +	bool known = tnum_is_const(reg->var_off); +	s64 val = reg->var_off.value; +	s64 smin = reg->smin_value; + +	if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) { +		verbose(env, "math between %s pointer and %lld is not allowed\n", +			reg_type_str[type], val); +		return false; +	} + +	if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) { +		verbose(env, "%s pointer offset %d is not allowed\n", +			reg_type_str[type], reg->off); +		return false; +	} + +	if (smin == S64_MIN) { +		verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n", +			reg_type_str[type]); +		return false; +	} + +	if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) { +		verbose(env, "value %lld makes %s pointer be out of bounds\n", +			smin, reg_type_str[type]); +		return false; +	} + +	return true; +} +  /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off.   * Caller should also handle BPF_MOV case separately.   
* If we return -EACCES, caller may want to try again treating pointer as a @@ -1815,44 +1895,36 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,  	dst_reg = &regs[dst]; -	if (WARN_ON_ONCE(known && (smin_val != smax_val))) { -		print_verifier_state(env, env->cur_state); -		verbose(env, -			"verifier internal error: known but bad sbounds\n"); -		return -EINVAL; -	} -	if (WARN_ON_ONCE(known && (umin_val != umax_val))) { -		print_verifier_state(env, env->cur_state); -		verbose(env, -			"verifier internal error: known but bad ubounds\n"); -		return -EINVAL; +	if ((known && (smin_val != smax_val || umin_val != umax_val)) || +	    smin_val > smax_val || umin_val > umax_val) { +		/* Taint dst register if offset had invalid bounds derived from +		 * e.g. dead branches. +		 */ +		__mark_reg_unknown(dst_reg); +		return 0;  	}  	if (BPF_CLASS(insn->code) != BPF_ALU64) {  		/* 32-bit ALU ops on pointers produce (meaningless) scalars */ -		if (!env->allow_ptr_leaks) -			verbose(env, -				"R%d 32-bit pointer arithmetic prohibited\n", -				dst); +		verbose(env, +			"R%d 32-bit pointer arithmetic prohibited\n", +			dst);  		return -EACCES;  	}  	if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { -		if (!env->allow_ptr_leaks) -			verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", -				dst); +		verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", +			dst);  		return -EACCES;  	}  	if (ptr_reg->type == CONST_PTR_TO_MAP) { -		if (!env->allow_ptr_leaks) -			verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", -				dst); +		verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", +			dst);  		return -EACCES;  	}  	if (ptr_reg->type == PTR_TO_PACKET_END) { -		if (!env->allow_ptr_leaks) -			verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", -				dst); +		verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END 
prohibited\n", +			dst);  		return -EACCES;  	} @@ -1862,6 +1934,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,  	dst_reg->type = ptr_reg->type;  	dst_reg->id = ptr_reg->id; +	if (!check_reg_sane_offset(env, off_reg, ptr_reg->type) || +	    !check_reg_sane_offset(env, ptr_reg, ptr_reg->type)) +		return -EINVAL; +  	switch (opcode) {  	case BPF_ADD:  		/* We can take a fixed offset as long as it doesn't overflow @@ -1915,9 +1991,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,  	case BPF_SUB:  		if (dst_reg == off_reg) {  			/* scalar -= pointer.  Creates an unknown scalar */ -			if (!env->allow_ptr_leaks) -				verbose(env, "R%d tried to subtract pointer from scalar\n", -					dst); +			verbose(env, "R%d tried to subtract pointer from scalar\n", +				dst);  			return -EACCES;  		}  		/* We don't allow subtraction from FP, because (according to @@ -1925,9 +2000,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,  		 * be able to deal with it.  		 */  		if (ptr_reg->type == PTR_TO_STACK) { -			if (!env->allow_ptr_leaks) -				verbose(env, "R%d subtraction from stack pointer prohibited\n", -					dst); +			verbose(env, "R%d subtraction from stack pointer prohibited\n", +				dst);  			return -EACCES;  		}  		if (known && (ptr_reg->off - smin_val == @@ -1976,28 +2050,30 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,  	case BPF_AND:  	case BPF_OR:  	case BPF_XOR: -		/* bitwise ops on pointers are troublesome, prohibit for now. -		 * (However, in principle we could allow some cases, e.g. -		 * ptr &= ~3 which would reduce min_value by 3.) -		 */ -		if (!env->allow_ptr_leaks) -			verbose(env, "R%d bitwise operator %s on pointer prohibited\n", -				dst, bpf_alu_string[opcode >> 4]); +		/* bitwise ops on pointers are troublesome, prohibit. 
*/ +		verbose(env, "R%d bitwise operator %s on pointer prohibited\n", +			dst, bpf_alu_string[opcode >> 4]);  		return -EACCES;  	default:  		/* other operators (e.g. MUL,LSH) produce non-pointer results */ -		if (!env->allow_ptr_leaks) -			verbose(env, "R%d pointer arithmetic with %s operator prohibited\n", -				dst, bpf_alu_string[opcode >> 4]); +		verbose(env, "R%d pointer arithmetic with %s operator prohibited\n", +			dst, bpf_alu_string[opcode >> 4]);  		return -EACCES;  	} +	if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type)) +		return -EINVAL; +  	__update_reg_bounds(dst_reg);  	__reg_deduce_bounds(dst_reg);  	__reg_bound_offset(dst_reg);  	return 0;  } +/* WARNING: This function does calculations on 64-bit values, but the actual + * execution may occur on 32-bit values. Therefore, things like bitshifts + * need extra checks in the 32-bit case. + */  static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,  				      struct bpf_insn *insn,  				      struct bpf_reg_state *dst_reg, @@ -2008,12 +2084,8 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,  	bool src_known, dst_known;  	s64 smin_val, smax_val;  	u64 umin_val, umax_val; +	u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; -	if (BPF_CLASS(insn->code) != BPF_ALU64) { -		/* 32-bit ALU ops are (32,32)->64 */ -		coerce_reg_to_32(dst_reg); -		coerce_reg_to_32(&src_reg); -	}  	smin_val = src_reg.smin_value;  	smax_val = src_reg.smax_value;  	umin_val = src_reg.umin_value; @@ -2021,6 +2093,21 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,  	src_known = tnum_is_const(src_reg.var_off);  	dst_known = tnum_is_const(dst_reg->var_off); +	if ((src_known && (smin_val != smax_val || umin_val != umax_val)) || +	    smin_val > smax_val || umin_val > umax_val) { +		/* Taint dst register if offset had invalid bounds derived from +		 * e.g. dead branches. 
+		 */ +		__mark_reg_unknown(dst_reg); +		return 0; +	} + +	if (!src_known && +	    opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) { +		__mark_reg_unknown(dst_reg); +		return 0; +	} +  	switch (opcode) {  	case BPF_ADD:  		if (signed_add_overflows(dst_reg->smin_value, smin_val) || @@ -2149,9 +2236,9 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,  		__update_reg_bounds(dst_reg);  		break;  	case BPF_LSH: -		if (umax_val > 63) { -			/* Shifts greater than 63 are undefined.  This includes -			 * shifts by a negative number. +		if (umax_val >= insn_bitness) { +			/* Shifts greater than 31 or 63 are undefined. +			 * This includes shifts by a negative number.  			 */  			mark_reg_unknown(env, regs, insn->dst_reg);  			break; @@ -2177,27 +2264,29 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,  		__update_reg_bounds(dst_reg);  		break;  	case BPF_RSH: -		if (umax_val > 63) { -			/* Shifts greater than 63 are undefined.  This includes -			 * shifts by a negative number. +		if (umax_val >= insn_bitness) { +			/* Shifts greater than 31 or 63 are undefined. +			 * This includes shifts by a negative number.  			 */  			mark_reg_unknown(env, regs, insn->dst_reg);  			break;  		} -		/* BPF_RSH is an unsigned shift, so make the appropriate casts */ -		if (dst_reg->smin_value < 0) { -			if (umin_val) { -				/* Sign bit will be cleared */ -				dst_reg->smin_value = 0; -			} else { -				/* Lost sign bit information */ -				dst_reg->smin_value = S64_MIN; -				dst_reg->smax_value = S64_MAX; -			} -		} else { -			dst_reg->smin_value = -				(u64)(dst_reg->smin_value) >> umax_val; -		} +		/* BPF_RSH is an unsigned shift.  
If the value in dst_reg might +		 * be negative, then either: +		 * 1) src_reg might be zero, so the sign bit of the result is +		 *    unknown, so we lose our signed bounds +		 * 2) it's known negative, thus the unsigned bounds capture the +		 *    signed bounds +		 * 3) the signed bounds cross zero, so they tell us nothing +		 *    about the result +		 * If the value in dst_reg is known nonnegative, then again the +		 * unsigned bounds capture the signed bounds. +		 * Thus, in all cases it suffices to blow away our signed bounds +		 * and rely on inferring new ones from the unsigned bounds and +		 * var_off of the result. +		 */ +		dst_reg->smin_value = S64_MIN; +		dst_reg->smax_value = S64_MAX;  		if (src_known)  			dst_reg->var_off = tnum_rshift(dst_reg->var_off,  						       umin_val); @@ -2213,6 +2302,12 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,  		break;  	} +	if (BPF_CLASS(insn->code) != BPF_ALU64) { +		/* 32-bit ALU ops are (32,32)->32 */ +		coerce_reg_to_size(dst_reg, 4); +		coerce_reg_to_size(&src_reg, 4); +	} +  	__reg_deduce_bounds(dst_reg);  	__reg_bound_offset(dst_reg);  	return 0; @@ -2227,7 +2322,6 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,  	struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg;  	struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};  	u8 opcode = BPF_OP(insn->code); -	int rc;  	dst_reg = &regs[insn->dst_reg];  	src_reg = NULL; @@ -2238,43 +2332,29 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,  		if (src_reg->type != SCALAR_VALUE) {  			if (dst_reg->type != SCALAR_VALUE) {  				/* Combining two pointers by any ALU op yields -				 * an arbitrary scalar. +				 * an arbitrary scalar. 
Disallow all math except +				 * pointer subtraction  				 */ -				if (!env->allow_ptr_leaks) { -					verbose(env, "R%d pointer %s pointer prohibited\n", -						insn->dst_reg, -						bpf_alu_string[opcode >> 4]); -					return -EACCES; +				if (opcode == BPF_SUB){ +					mark_reg_unknown(env, regs, insn->dst_reg); +					return 0;  				} -				mark_reg_unknown(env, regs, insn->dst_reg); -				return 0; +				verbose(env, "R%d pointer %s pointer prohibited\n", +					insn->dst_reg, +					bpf_alu_string[opcode >> 4]); +				return -EACCES;  			} else {  				/* scalar += pointer  				 * This is legal, but we have to reverse our  				 * src/dest handling in computing the range  				 */ -				rc = adjust_ptr_min_max_vals(env, insn, -							     src_reg, dst_reg); -				if (rc == -EACCES && env->allow_ptr_leaks) { -					/* scalar += unknown scalar */ -					__mark_reg_unknown(&off_reg); -					return adjust_scalar_min_max_vals( -							env, insn, -							dst_reg, off_reg); -				} -				return rc; +				return adjust_ptr_min_max_vals(env, insn, +							       src_reg, dst_reg);  			}  		} else if (ptr_reg) {  			/* pointer += scalar */ -			rc = adjust_ptr_min_max_vals(env, insn, -						     dst_reg, src_reg); -			if (rc == -EACCES && env->allow_ptr_leaks) { -				/* unknown scalar += scalar */ -				__mark_reg_unknown(dst_reg); -				return adjust_scalar_min_max_vals( -						env, insn, dst_reg, *src_reg); -			} -			return rc; +			return adjust_ptr_min_max_vals(env, insn, +						       dst_reg, src_reg);  		}  	} else {  		/* Pretend the src is a reg with a known value, since we only @@ -2283,17 +2363,9 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,  		off_reg.type = SCALAR_VALUE;  		__mark_reg_known(&off_reg, insn->imm);  		src_reg = &off_reg; -		if (ptr_reg) { /* pointer += K */ -			rc = adjust_ptr_min_max_vals(env, insn, -						     ptr_reg, src_reg); -			if (rc == -EACCES && env->allow_ptr_leaks) { -				/* unknown scalar += K */ -				__mark_reg_unknown(dst_reg); -		
		return adjust_scalar_min_max_vals( -						env, insn, dst_reg, off_reg); -			} -			return rc; -		} +		if (ptr_reg) /* pointer += K */ +			return adjust_ptr_min_max_vals(env, insn, +						       ptr_reg, src_reg);  	}  	/* Got here implies adding two SCALAR_VALUEs */ @@ -2390,17 +2462,20 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)  					return -EACCES;  				}  				mark_reg_unknown(env, regs, insn->dst_reg); -				/* high 32 bits are known zero. */ -				regs[insn->dst_reg].var_off = tnum_cast( -						regs[insn->dst_reg].var_off, 4); -				__update_reg_bounds(&regs[insn->dst_reg]); +				coerce_reg_to_size(&regs[insn->dst_reg], 4);  			}  		} else {  			/* case: R = imm  			 * remember the value we stored into this reg  			 */  			regs[insn->dst_reg].type = SCALAR_VALUE; -			__mark_reg_known(regs + insn->dst_reg, insn->imm); +			if (BPF_CLASS(insn->code) == BPF_ALU64) { +				__mark_reg_known(regs + insn->dst_reg, +						 insn->imm); +			} else { +				__mark_reg_known(regs + insn->dst_reg, +						 (u32)insn->imm); +			}  		}  	} else if (opcode > BPF_END) { @@ -2436,6 +2511,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)  			return -EINVAL;  		} +		if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) { +			verbose(env, "BPF_ARSH not supported for 32 bit ALU\n"); +			return -EINVAL; +		} +  		if ((opcode == BPF_LSH || opcode == BPF_RSH ||  		     opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) {  			int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32; @@ -3431,15 +3511,14 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,  			return range_within(rold, rcur) &&  			       tnum_in(rold->var_off, rcur->var_off);  		} else { -			/* if we knew anything about the old value, we're not -			 * equal, because we can't know anything about the -			 * scalar value of the pointer in the new value. +			/* We're trying to use a pointer in place of a scalar. 
+			 * Even if the scalar was unbounded, this could lead to +			 * pointer leaks because scalars are allowed to leak +			 * while pointers are not. We could make this safe in +			 * special cases if root is calling us, but it's +			 * probably not worth the hassle.  			 */ -			return rold->umin_value == 0 && -			       rold->umax_value == U64_MAX && -			       rold->smin_value == S64_MIN && -			       rold->smax_value == S64_MAX && -			       tnum_is_unknown(rold->var_off); +			return false;  		}  	case PTR_TO_MAP_VALUE:  		/* If the new min/max/var_off satisfy the old ones and @@ -3932,6 +4011,12 @@ static int do_check(struct bpf_verifier_env *env)  			if (err)  				return err; +			if (is_ctx_reg(env, insn->dst_reg)) { +				verbose(env, "BPF_ST stores into R%d context is not allowed\n", +					insn->dst_reg); +				return -EACCES; +			} +  			/* check that memory (dst_reg + off) is writeable */  			err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,  					       BPF_SIZE(insn->code), BPF_WRITE, @@ -4384,6 +4469,24 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)  	int i, cnt, delta = 0;  	for (i = 0; i < insn_cnt; i++, insn++) { +		if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) || +		    insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { +			/* due to JIT bugs clear upper 32-bits of src register +			 * before div/mod operation +			 */ +			insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg); +			insn_buf[1] = *insn; +			cnt = 2; +			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); +			if (!new_prog) +				return -ENOMEM; + +			delta    += cnt - 1; +			env->prog = prog = new_prog; +			insn      = new_prog->insnsi + i + delta; +			continue; +		} +  		if (insn->code != (BPF_JMP | BPF_CALL))  			continue; @@ -4407,6 +4510,35 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)  			 */  			insn->imm = 0;  			insn->code = BPF_JMP | BPF_TAIL_CALL; + +			/* instead of changing every JIT dealing with tail_call +			 * emit two extra 
insns: +			 * if (index >= max_entries) goto out; +			 * index &= array->index_mask; +			 * to avoid out-of-bounds cpu speculation +			 */ +			map_ptr = env->insn_aux_data[i + delta].map_ptr; +			if (map_ptr == BPF_MAP_PTR_POISON) { +				verbose(env, "tail_call abusing map_ptr\n"); +				return -EINVAL; +			} +			if (!map_ptr->unpriv_array) +				continue; +			insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3, +						  map_ptr->max_entries, 2); +			insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3, +						    container_of(map_ptr, +								 struct bpf_array, +								 map)->index_mask); +			insn_buf[2] = *insn; +			cnt = 3; +			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); +			if (!new_prog) +				return -ENOMEM; + +			delta    += cnt - 1; +			env->prog = prog = new_prog; +			insn      = new_prog->insnsi + i + delta;  			continue;  		} |