Diffstat (limited to 'kernel/bpf/verifier.c')
-rw-r--r--	kernel/bpf/verifier.c	2659
1 file changed, 2153 insertions(+), 506 deletions(-)
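A recurring theme in the diff below is new verifier state: active_rcu_lock and the MEM_RCU type flag make pointers loaded from RCU-protected fields usable only inside a bpf_rcu_read_lock() region. As a minimal BPF-C sketch of the pattern this validates (the kfunc declarations, the attach point, and the assumption that the field is recognized as RCU-protected in kernel BTF are illustrative, not taken from this patch):

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

/* kfuncs assumed to be resolvable against kernel BTF at load time */
extern void bpf_rcu_read_lock(void) __ksym;
extern void bpf_rcu_read_unlock(void) __ksym;

char LICENSE[] SEC("license") = "GPL";

SEC("fentry/do_exit")
int on_do_exit(void *ctx)
{
	struct task_struct *task = bpf_get_current_task_btf();
	struct task_struct *parent;

	bpf_rcu_read_lock();	/* verifier sets cur_state->active_rcu_lock */
	/* Loading an __rcu-tagged field yields a pointer marked
	 * MEM_RCU | PTR_MAYBE_NULL, so the NULL check is mandatory.
	 */
	parent = task->real_parent;
	if (parent)
		bpf_printk("pid %d exits, parent %d", task->pid, parent->pid);
	bpf_rcu_read_unlock();	/* clears active_rcu_lock */
	return 0;
}

Outside such a region, or without the surrounding lock, the check_ptr_to_btf_access() changes below drop MEM_RCU or conservatively fall back to PTR_UNTRUSTED.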
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 264b3dc714cc..a5255a0dcbb6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -262,7 +262,7 @@ struct bpf_call_arg_meta {  	struct btf *ret_btf;  	u32 ret_btf_id;  	u32 subprogno; -	struct bpf_map_value_off_desc *kptr_off_desc; +	struct btf_field *kptr_field;  	u8 uninit_dynptr_regno;  }; @@ -451,17 +451,29 @@ static bool reg_type_not_null(enum bpf_reg_type type)  		type == PTR_TO_SOCK_COMMON;  } -static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) +static bool type_is_ptr_alloc_obj(u32 type)  { -	return reg->type == PTR_TO_MAP_VALUE && -		map_value_has_spin_lock(reg->map_ptr); +	return base_type(type) == PTR_TO_BTF_ID && type_flag(type) & MEM_ALLOC;  } -static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) +static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg)  { -	type = base_type(type); -	return type == PTR_TO_SOCKET || type == PTR_TO_TCP_SOCK || -		type == PTR_TO_MEM || type == PTR_TO_BTF_ID; +	struct btf_record *rec = NULL; +	struct btf_struct_meta *meta; + +	if (reg->type == PTR_TO_MAP_VALUE) { +		rec = reg->map_ptr->record; +	} else if (type_is_ptr_alloc_obj(reg->type)) { +		meta = btf_find_struct_meta(reg->btf, reg->btf_id); +		if (meta) +			rec = meta->record; +	} +	return rec; +} + +static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) +{ +	return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK);  }  static bool type_is_rdonly_mem(u32 type) @@ -511,6 +523,23 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id)  	return func_id == BPF_FUNC_dynptr_data;  } +static bool is_callback_calling_function(enum bpf_func_id func_id) +{ +	return func_id == BPF_FUNC_for_each_map_elem || +	       func_id == BPF_FUNC_timer_set_callback || +	       func_id == BPF_FUNC_find_vma || +	       func_id == BPF_FUNC_loop || +	       func_id == BPF_FUNC_user_ringbuf_drain; +} + +static bool is_storage_get_function(enum bpf_func_id func_id) +{ +	return func_id == BPF_FUNC_sk_storage_get || +	       func_id == BPF_FUNC_inode_storage_get || +	       func_id == BPF_FUNC_task_storage_get || +	       func_id == BPF_FUNC_cgrp_storage_get; +} +  static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id,  					const struct bpf_map *map)  { @@ -541,7 +570,7 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn)  static const char *reg_type_str(struct bpf_verifier_env *env,  				enum bpf_reg_type type)  { -	char postfix[16] = {0}, prefix[32] = {0}; +	char postfix[16] = {0}, prefix[64] = {0};  	static const char * const str[] = {  		[NOT_INIT]		= "?",  		[SCALAR_VALUE]		= "scalar", @@ -563,7 +592,7 @@ static const char *reg_type_str(struct bpf_verifier_env *env,  		[PTR_TO_BUF]		= "buf",  		[PTR_TO_FUNC]		= "func",  		[PTR_TO_MAP_KEY]	= "map_key", -		[PTR_TO_DYNPTR]		= "dynptr_ptr", +		[CONST_PTR_TO_DYNPTR]	= "dynptr_ptr",  	};  	if (type & PTR_MAYBE_NULL) { @@ -573,16 +602,15 @@ static const char *reg_type_str(struct bpf_verifier_env *env,  			strncpy(postfix, "_or_null", 16);  	} -	if (type & MEM_RDONLY) -		strncpy(prefix, "rdonly_", 32); -	if (type & MEM_ALLOC) -		strncpy(prefix, "alloc_", 32); -	if (type & MEM_USER) -		strncpy(prefix, "user_", 32); -	if (type & MEM_PERCPU) -		strncpy(prefix, "percpu_", 32); -	if (type & PTR_UNTRUSTED) -		strncpy(prefix, "untrusted_", 32); +	snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s%s", +		 type & MEM_RDONLY ? "rdonly_" : "", +		 type & MEM_RINGBUF ? "ringbuf_" : "", +		 type & MEM_USER ? 
"user_" : "", +		 type & MEM_PERCPU ? "percpu_" : "", +		 type & MEM_RCU ? "rcu_" : "", +		 type & PTR_UNTRUSTED ? "untrusted_" : "", +		 type & PTR_TRUSTED ? "trusted_" : "" +	);  	snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s",  		 prefix, str[base_type(type)], postfix); @@ -697,6 +725,28 @@ static bool dynptr_type_refcounted(enum bpf_dynptr_type type)  	return type == BPF_DYNPTR_TYPE_RINGBUF;  } +static void __mark_dynptr_reg(struct bpf_reg_state *reg, +			      enum bpf_dynptr_type type, +			      bool first_slot); + +static void __mark_reg_not_init(const struct bpf_verifier_env *env, +				struct bpf_reg_state *reg); + +static void mark_dynptr_stack_regs(struct bpf_reg_state *sreg1, +				   struct bpf_reg_state *sreg2, +				   enum bpf_dynptr_type type) +{ +	__mark_dynptr_reg(sreg1, type, true); +	__mark_dynptr_reg(sreg2, type, false); +} + +static void mark_dynptr_cb_reg(struct bpf_reg_state *reg, +			       enum bpf_dynptr_type type) +{ +	__mark_dynptr_reg(reg, type, true); +} + +  static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg,  				   enum bpf_arg_type arg_type, int insn_idx)  { @@ -718,9 +768,8 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_  	if (type == BPF_DYNPTR_TYPE_INVALID)  		return -EINVAL; -	state->stack[spi].spilled_ptr.dynptr.first_slot = true; -	state->stack[spi].spilled_ptr.dynptr.type = type; -	state->stack[spi - 1].spilled_ptr.dynptr.type = type; +	mark_dynptr_stack_regs(&state->stack[spi].spilled_ptr, +			       &state->stack[spi - 1].spilled_ptr, type);  	if (dynptr_type_refcounted(type)) {  		/* The id is used to track proper releasing */ @@ -728,8 +777,8 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_  		if (id < 0)  			return id; -		state->stack[spi].spilled_ptr.id = id; -		state->stack[spi - 1].spilled_ptr.id = id; +		state->stack[spi].spilled_ptr.ref_obj_id = id; +		state->stack[spi - 1].spilled_ptr.ref_obj_id = id;  	}  	return 0; @@ -751,25 +800,23 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re  	}  	/* Invalidate any slices associated with this dynptr */ -	if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) { -		release_reference(env, state->stack[spi].spilled_ptr.id); -		state->stack[spi].spilled_ptr.id = 0; -		state->stack[spi - 1].spilled_ptr.id = 0; -	} - -	state->stack[spi].spilled_ptr.dynptr.first_slot = false; -	state->stack[spi].spilled_ptr.dynptr.type = 0; -	state->stack[spi - 1].spilled_ptr.dynptr.type = 0; +	if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) +		WARN_ON_ONCE(release_reference(env, state->stack[spi].spilled_ptr.ref_obj_id)); +	__mark_reg_not_init(env, &state->stack[spi].spilled_ptr); +	__mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);  	return 0;  }  static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)  {  	struct bpf_func_state *state = func(env, reg); -	int spi = get_spi(reg->off); -	int i; +	int spi, i; +	if (reg->type == CONST_PTR_TO_DYNPTR) +		return false; + +	spi = get_spi(reg->off);  	if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS))  		return true; @@ -782,13 +829,17 @@ static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_  	return true;  } -bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, -			      struct bpf_reg_state *reg) +static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg)  {  	
struct bpf_func_state *state = func(env, reg); -	int spi = get_spi(reg->off); +	int spi;  	int i; +	/* This already represents first slot of initialized bpf_dynptr */ +	if (reg->type == CONST_PTR_TO_DYNPTR) +		return true; + +	spi = get_spi(reg->off);  	if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) ||  	    !state->stack[spi].spilled_ptr.dynptr.first_slot)  		return false; @@ -802,21 +853,24 @@ bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env,  	return true;  } -bool is_dynptr_type_expected(struct bpf_verifier_env *env, -			     struct bpf_reg_state *reg, -			     enum bpf_arg_type arg_type) +static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg_state *reg, +				    enum bpf_arg_type arg_type)  {  	struct bpf_func_state *state = func(env, reg);  	enum bpf_dynptr_type dynptr_type; -	int spi = get_spi(reg->off); +	int spi;  	/* ARG_PTR_TO_DYNPTR takes any type of dynptr */  	if (arg_type == ARG_PTR_TO_DYNPTR)  		return true;  	dynptr_type = arg_to_dynptr_type(arg_type); - -	return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type; +	if (reg->type == CONST_PTR_TO_DYNPTR) { +		return reg->dynptr.type == dynptr_type; +	} else { +		spi = get_spi(reg->off); +		return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type; +	}  }  /* The reg state of a pointer or a bounded scalar was saved when @@ -875,7 +929,7 @@ static void print_verifier_state(struct bpf_verifier_env *env,  			if (reg->id)  				verbose_a("id=%d", reg->id); -			if (reg_type_may_be_refcounted_or_null(t) && reg->ref_obj_id) +			if (reg->ref_obj_id)  				verbose_a("ref_obj_id=%d", reg->ref_obj_id);  			if (t != SCALAR_VALUE)  				verbose_a("off=%d", reg->off); @@ -1008,9 +1062,9 @@ static void *copy_array(void *dst, const void *src, size_t n, size_t size, gfp_t  	if (unlikely(check_mul_overflow(n, size, &bytes)))  		return NULL; -	if (ksize(dst) < bytes) { +	if (ksize(dst) < ksize(src)) {  		kfree(dst); -		dst = kmalloc_track_caller(bytes, flags); +		dst = kmalloc_track_caller(kmalloc_size_roundup(bytes), flags);  		if (!dst)  			return NULL;  	} @@ -1027,12 +1081,14 @@ out:   */  static void *realloc_array(void *arr, size_t old_n, size_t new_n, size_t size)  { +	size_t alloc_size;  	void *new_arr;  	if (!new_n || old_n == new_n)  		goto out; -	new_arr = krealloc_array(arr, new_n, size, GFP_KERNEL); +	alloc_size = kmalloc_size_roundup(size_mul(new_n, size)); +	new_arr = krealloc(arr, alloc_size, GFP_KERNEL);  	if (!new_arr) {  		kfree(arr);  		return NULL; @@ -1204,8 +1260,10 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,  		dst_state->frame[i] = NULL;  	}  	dst_state->speculative = src->speculative; +	dst_state->active_rcu_lock = src->active_rcu_lock;  	dst_state->curframe = src->curframe; -	dst_state->active_spin_lock = src->active_spin_lock; +	dst_state->active_lock.ptr = src->active_lock.ptr; +	dst_state->active_lock.id = src->active_lock.id;  	dst_state->branches = src->branches;  	dst_state->parent = src->parent;  	dst_state->first_insn_idx = src->first_insn_idx; @@ -1324,9 +1382,6 @@ static const int caller_saved[CALLER_SAVED_REGS] = {  	BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5  }; -static void __mark_reg_not_init(const struct bpf_verifier_env *env, -				struct bpf_reg_state *reg); -  /* This helper doesn't clear reg->id */  static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm)  { @@ -1389,6 +1444,19 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env,  	__mark_reg_known_zero(regs + regno);  } 
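The dynptr helpers in this region (mark_dynptr_stack_regs(), mark_dynptr_cb_reg(), and __mark_dynptr_reg() just below) distinguish two register flavors for a bpf_dynptr: one constructed in two 8-byte stack slots, and the CONST_PTR_TO_DYNPTR argument that a callback-calling helper passes in. A hedged BPF-C sketch of both flavors (the user-ringbuf map and sample layout are assumptions of this sketch, not taken from the patch):

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_USER_RINGBUF);
	__uint(max_entries, 4096);
} user_rb SEC(".maps");

char LICENSE[] SEC("license") = "GPL";

static long drain_cb(struct bpf_dynptr *dynptr, void *ctx)
{
	/* Here the argument is CONST_PTR_TO_DYNPTR: an initialized dynptr
	 * that the callback may read through but never re-initialize or
	 * release (see mark_dynptr_cb_reg()).
	 */
	__u64 sample;

	if (bpf_dynptr_read(&sample, sizeof(sample), dynptr, 0, 0))
		return 1;	/* stop draining on a short sample */
	return 0;
}

SEC("syscall")
int drain(void *ctx)
{
	struct bpf_dynptr dptr;	/* lives in stack slots: STACK_DYNPTR */
	__u64 scratch = 0;

	/* Stack flavor: the helper initializes dptr, and the verifier marks
	 * both 8-byte slots via mark_stack_slots_dynptr().
	 */
	bpf_dynptr_from_mem(&scratch, sizeof(scratch), 0, &dptr);

	/* Callback flavor: each drained sample invokes drain_cb(). */
	bpf_user_ringbuf_drain(&user_rb, drain_cb, NULL, 0);
	return 0;
}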
+static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type, +			      bool first_slot) +{ +	/* reg->type has no meaning for STACK_DYNPTR, but when we set reg for +	 * callback arguments, it does need to be CONST_PTR_TO_DYNPTR, so simply +	 * set it unconditionally as it is ignored for STACK_DYNPTR anyway. +	 */ +	__mark_reg_known_zero(reg); +	reg->type = CONST_PTR_TO_DYNPTR; +	reg->dynptr.type = type; +	reg->dynptr.first_slot = first_slot; +} +  static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)  {  	if (base_type(reg->type) == PTR_TO_MAP_VALUE) { @@ -1400,7 +1468,7 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)  			/* transfer reg's id which is unique for every map_lookup_elem  			 * as UID of the inner map.  			 */ -			if (map_value_has_timer(map->inner_map_meta)) +			if (btf_record_has_field(map->inner_map_meta->record, BPF_TIMER))  				reg->map_uid = reg->id;  		} else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {  			reg->type = PTR_TO_XDP_SOCK; @@ -1689,7 +1757,7 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env,  	reg->type = SCALAR_VALUE;  	reg->var_off = tnum_unknown;  	reg->frameno = 0; -	reg->precise = env->subprog_cnt > 1 || !env->bpf_capable; +	reg->precise = !env->bpf_capable;  	__mark_reg_unbounded(reg);  } @@ -2498,15 +2566,30 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,  	return 0;  } +static void mark_jmp_point(struct bpf_verifier_env *env, int idx) +{ +	env->insn_aux_data[idx].jmp_point = true; +} + +static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx) +{ +	return env->insn_aux_data[insn_idx].jmp_point; +} +  /* for any branch, call, exit record the history of jmps in the given state */  static int push_jmp_history(struct bpf_verifier_env *env,  			    struct bpf_verifier_state *cur)  {  	u32 cnt = cur->jmp_history_cnt;  	struct bpf_idx_pair *p; +	size_t alloc_size; + +	if (!is_jmp_point(env, env->insn_idx)) +		return 0;  	cnt++; -	p = krealloc(cur->jmp_history, cnt * sizeof(*p), GFP_USER); +	alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p))); +	p = krealloc(cur->jmp_history, alloc_size, GFP_USER);  	if (!p)  		return -ENOMEM;  	p[cnt - 1].idx = env->insn_idx; @@ -2658,6 +2741,11 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,  		if (opcode == BPF_CALL) {  			if (insn->src_reg == BPF_PSEUDO_CALL)  				return -ENOTSUPP; +			/* BPF helpers that invoke callback subprogs are +			 * equivalent to BPF_PSEUDO_CALL above +			 */ +			if (insn->src_reg == 0 && is_callback_calling_function(insn->imm)) +				return -ENOTSUPP;  			/* regular helper call sets R0 */  			*reg_mask &= ~1;  			if (*reg_mask & 0x3f) { @@ -2747,8 +2835,11 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env,  	/* big hammer: mark all scalars precise in this path.  	 * pop_stack may still get !precise scalars. +	 * We also skip current state and go straight to first parent state, +	 * because precision markings in current non-checkpointed state are +	 * not needed. See why in the comment in __mark_chain_precision below.  	 
*/ -	for (; st; st = st->parent) +	for (st = st->parent; st; st = st->parent) {  		for (i = 0; i <= st->curframe; i++) {  			func = st->frame[i];  			for (j = 0; j < BPF_REG_FP; j++) { @@ -2766,9 +2857,122 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env,  				reg->precise = true;  			}  		} +	} +} + +static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_verifier_state *st) +{ +	struct bpf_func_state *func; +	struct bpf_reg_state *reg; +	int i, j; + +	for (i = 0; i <= st->curframe; i++) { +		func = st->frame[i]; +		for (j = 0; j < BPF_REG_FP; j++) { +			reg = &func->regs[j]; +			if (reg->type != SCALAR_VALUE) +				continue; +			reg->precise = false; +		} +		for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) { +			if (!is_spilled_reg(&func->stack[j])) +				continue; +			reg = &func->stack[j].spilled_ptr; +			if (reg->type != SCALAR_VALUE) +				continue; +			reg->precise = false; +		} +	}  } -static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, +/* + * __mark_chain_precision() backtracks BPF program instruction sequence and + * chain of verifier states making sure that register *regno* (if regno >= 0) + * and/or stack slot *spi* (if spi >= 0) are marked as precisely tracked + * SCALARS, as well as any other registers and slots that contribute to + * a tracked state of given registers/stack slots, depending on specific BPF + * assembly instructions (see backtrack_insn() for exact instruction handling + * logic). This backtracking relies on recorded jmp_history and is able to + * traverse entire chain of parent states. This process ends only when all the + * necessary registers/slots and their transitive dependencies are marked as + * precise. + * + * One important and subtle aspect is that precise marks *do not matter* in + * the currently verified state (current state). It is important to understand + * why this is the case. + * + * First, note that current state is the state that is not yet "checkpointed", + * i.e., it is not yet put into env->explored_states, and it has no children + * states as well. It's ephemeral, and can end up either a) being discarded if + * compatible explored state is found at some point or BPF_EXIT instruction is + * reached or b) checkpointed and put into env->explored_states, branching out + * into one or more children states. + * + * In the former case, precise markings in current state are completely + * ignored by state comparison code (see regsafe() for details). Only + * checkpointed ("old") state precise markings are important, and if old + * state's register/slot is precise, regsafe() assumes current state's + * register/slot as precise and checks value ranges exactly and precisely. If + * states turn out to be compatible, current state's necessary precise + * markings and any required parent states' precise markings are enforced + * after the fact with propagate_precision() logic. But it's + * important to realize that in this case, even after marking current state + * registers/slots as precise, we immediately discard current state. So what + * actually matters is any of the precise markings propagated into current + * state's parent states, which are always checkpointed (due to b) case above). + * As such, for scenario a) it doesn't matter if current state has precise + * markings set or not. + * + * Now, for the scenario b), checkpointing and forking into child(ren) + * state(s). Note that before current state gets to checkpointing step, any + * processed instruction always assumes precise SCALAR register/slot + * knowledge: if precise value or range is useful to prune jump branch, BPF + * verifier takes this opportunity enthusiastically. Similarly, when + * register's value is used to calculate offset or memory address, exact + * knowledge of SCALAR range is assumed, checked, and enforced. So, similar to + * what we mentioned above about state comparison ignoring precise markings, + * BPF verifier ignores and also assumes precise + * markings *at will* during instruction verification process. But as verifier + * assumes precision, it also propagates any precision dependencies across + * parent states, which are not yet finalized, so can be further restricted + * based on new knowledge gained from restrictions enforced by their children + * states. This is so that once those parent states are finalized, i.e., when + * they have no more active children state, state comparison logic in + * is_state_visited() would enforce strict and precise SCALAR ranges, if + * required for correctness. + * + * To build a bit more intuition, note also that once a state is checkpointed, + * the path we took to get to that state is not important. This is a crucial + * property for state pruning. When state is checkpointed and finalized at + * some instruction index, it can be correctly and safely used to "short + * circuit" any *compatible* state that reaches exactly the same instruction + * index. I.e., if we jumped to that instruction from a completely different + * code path than original finalized state was derived from, it doesn't + * matter, current state can be discarded because from that instruction + * forward having a compatible state will ensure we will safely reach the + * exit. States describe preconditions for further exploration, but completely + * forget the history of how we got here. + * + * This also means that even if we needed precise SCALAR range to get to + * finalized state, but from that point forward *that same* SCALAR register is + * never used in a precise context (i.e., its precise value is not needed for + * correctness), it's correct and safe to mark such register as "imprecise" + * (i.e., precise marking set to false). This is what we rely on when we do + * not set precise marking in current state. If no child state requires + * precision for any given SCALAR register, it's safe to dictate that it can + * be imprecise. If any child state does require this register to be precise, + * we'll mark it precise retroactively during precise markings + * propagation from child state to parent states. + * + * Skipping precise marking setting in current state is a mild version of + * relying on the above observation. But we can utilize this property even + * more aggressively by proactively forgetting any precise marking in the + * current state (which we inherited from the parent state), right before we + * checkpoint it and branch off into new child state. This is done by + * mark_all_scalars_imprecise() to hopefully get more permissive and generic + * finalized states which help in short circuiting more future states. 
+ */ +static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int regno,  				  int spi)  {  	struct bpf_verifier_state *st = env->cur_state; @@ -2785,18 +2989,18 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,  	if (!env->bpf_capable)  		return 0; -	func = st->frame[st->curframe]; +	/* Do sanity checks against current state of register and/or stack +	 * slot, but don't set precise flag in current state, as precision +	 * tracking in the current state is unnecessary. +	 */ +	func = st->frame[frame];  	if (regno >= 0) {  		reg = &func->regs[regno];  		if (reg->type != SCALAR_VALUE) {  			WARN_ONCE(1, "backtracing misuse");  			return -EFAULT;  		} -		if (!reg->precise) -			new_marks = true; -		else -			reg_mask = 0; -		reg->precise = true; +		new_marks = true;  	}  	while (spi >= 0) { @@ -2809,11 +3013,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,  			stack_mask = 0;  			break;  		} -		if (!reg->precise) -			new_marks = true; -		else -			stack_mask = 0; -		reg->precise = true; +		new_marks = true;  		break;  	} @@ -2821,12 +3021,42 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,  		return 0;  	if (!reg_mask && !stack_mask)  		return 0; +  	for (;;) {  		DECLARE_BITMAP(mask, 64);  		u32 history = st->jmp_history_cnt;  		if (env->log.level & BPF_LOG_LEVEL2)  			verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx); + +		if (last_idx < 0) { +			/* we are at the entry into subprog, which +			 * is expected for global funcs, but only if +			 * requested precise registers are R1-R5 +			 * (which are global func's input arguments) +			 */ +			if (st->curframe == 0 && +			    st->frame[0]->subprogno > 0 && +			    st->frame[0]->callsite == BPF_MAIN_FUNC && +			    stack_mask == 0 && (reg_mask & ~0x3e) == 0) { +				bitmap_from_u64(mask, reg_mask); +				for_each_set_bit(i, mask, 32) { +					reg = &st->frame[0]->regs[i]; +					if (reg->type != SCALAR_VALUE) { +						reg_mask &= ~(1u << i); +						continue; +					} +					reg->precise = true; +				} +				return 0; +			} + +			verbose(env, "BUG backtracing func entry subprog %d reg_mask %x stack_mask %llx\n", +				st->frame[0]->subprogno, reg_mask, stack_mask); +			WARN_ONCE(1, "verifier backtracking bug"); +			return -EFAULT; +		} +  		for (i = last_idx;;) {  			if (skip_first) {  				err = 0; @@ -2866,7 +3096,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,  			break;  		new_marks = false; -		func = st->frame[st->curframe]; +		func = st->frame[frame];  		bitmap_from_u64(mask, reg_mask);  		for_each_set_bit(i, mask, 32) {  			reg = &func->regs[i]; @@ -2932,12 +3162,17 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,  int mark_chain_precision(struct bpf_verifier_env *env, int regno)  { -	return __mark_chain_precision(env, regno, -1); +	return __mark_chain_precision(env, env->cur_state->curframe, regno, -1);  } -static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi) +static int mark_chain_precision_frame(struct bpf_verifier_env *env, int frame, int regno)  { -	return __mark_chain_precision(env, -1, spi); +	return __mark_chain_precision(env, frame, regno, -1); +} + +static int mark_chain_precision_stack_frame(struct bpf_verifier_env *env, int frame, int spi) +{ +	return __mark_chain_precision(env, frame, -1, spi);  }  static bool is_spillable_regtype(enum bpf_reg_type type) @@ -3186,14 +3421,17 @@ static int check_stack_write_var_off(struct 
bpf_verifier_env *env,  		stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];  		mark_stack_slot_scratched(env, spi); -		if (!env->allow_ptr_leaks -				&& *stype != NOT_INIT -				&& *stype != SCALAR_VALUE) { -			/* Reject the write if there's are spilled pointers in -			 * range. If we didn't reject here, the ptr status -			 * would be erased below (even though not all slots are -			 * actually overwritten), possibly opening the door to -			 * leaks. +		if (!env->allow_ptr_leaks && *stype != STACK_MISC && *stype != STACK_ZERO) { +			/* Reject the write if range we may write to has not +			 * been initialized beforehand. If we didn't reject +			 * here, the ptr status would be erased below (even +			 * though not all slots are actually overwritten), +			 * possibly opening the door to leaks. +			 * +			 * We do however catch STACK_INVALID case below, and +			 * only allow reading possibly uninitialized memory +			 * later for CAP_PERFMON, as the write may not happen to +			 * that slot.  			 */  			verbose(env, "spilled ptr in range of var-offset stack write; insn %d, ptr off: %d",  				insn_idx, i); @@ -3683,15 +3921,15 @@ int check_ptr_off_reg(struct bpf_verifier_env *env,  }  static int map_kptr_match_type(struct bpf_verifier_env *env, -			       struct bpf_map_value_off_desc *off_desc, +			       struct btf_field *kptr_field,  			       struct bpf_reg_state *reg, u32 regno)  { -	const char *targ_name = kernel_type_name(off_desc->kptr.btf, off_desc->kptr.btf_id); -	int perm_flags = PTR_MAYBE_NULL; +	const char *targ_name = kernel_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id); +	int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED;  	const char *reg_name = "";  	/* Only unreferenced case accepts untrusted pointers */ -	if (off_desc->type == BPF_KPTR_UNREF) +	if (kptr_field->type == BPF_KPTR_UNREF)  		perm_flags |= PTR_UNTRUSTED;  	if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags)) @@ -3738,15 +3976,15 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,  	 * strict mode to true for type match.  	 
*/  	if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, -				  off_desc->kptr.btf, off_desc->kptr.btf_id, -				  off_desc->type == BPF_KPTR_REF)) +				  kptr_field->kptr.btf, kptr_field->kptr.btf_id, +				  kptr_field->type == BPF_KPTR_REF))  		goto bad_type;  	return 0;  bad_type:  	verbose(env, "invalid kptr access, R%d type=%s%s ", regno,  		reg_type_str(env, reg->type), reg_name);  	verbose(env, "expected=%s%s", reg_type_str(env, PTR_TO_BTF_ID), targ_name); -	if (off_desc->type == BPF_KPTR_UNREF) +	if (kptr_field->type == BPF_KPTR_UNREF)  		verbose(env, " or %s%s\n", reg_type_str(env, PTR_TO_BTF_ID | PTR_UNTRUSTED),  			targ_name);  	else @@ -3756,7 +3994,7 @@ bad_type:  static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,  				 int value_regno, int insn_idx, -				 struct bpf_map_value_off_desc *off_desc) +				 struct btf_field *kptr_field)  {  	struct bpf_insn *insn = &env->prog->insnsi[insn_idx];  	int class = BPF_CLASS(insn->code); @@ -3766,7 +4004,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,  	 *  - Reject cases where variable offset may touch kptr  	 *  - size of access (must be BPF_DW)  	 *  - tnum_is_const(reg->var_off) -	 *  - off_desc->offset == off + reg->var_off.value +	 *  - kptr_field->offset == off + reg->var_off.value  	 */  	/* Only BPF_[LDX,STX,ST] | BPF_MEM | BPF_DW is supported */  	if (BPF_MODE(insn->code) != BPF_MEM) { @@ -3777,7 +4015,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,  	/* We only allow loading referenced kptr, since it will be marked as  	 * untrusted, similar to unreferenced kptr.  	 */ -	if (class != BPF_LDX && off_desc->type == BPF_KPTR_REF) { +	if (class != BPF_LDX && kptr_field->type == BPF_KPTR_REF) {  		verbose(env, "store to referenced kptr disallowed\n");  		return -EACCES;  	} @@ -3787,19 +4025,19 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,  		/* We can simply mark the value_regno receiving the pointer  		 * value from map as PTR_TO_BTF_ID, with the correct type.  		 
*/ -		mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, off_desc->kptr.btf, -				off_desc->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED); +		mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf, +				kptr_field->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED);  		/* For mark_ptr_or_null_reg */  		val_reg->id = ++env->id_gen;  	} else if (class == BPF_STX) {  		val_reg = reg_state(env, value_regno);  		if (!register_is_null(val_reg) && -		    map_kptr_match_type(env, off_desc, val_reg, value_regno)) +		    map_kptr_match_type(env, kptr_field, val_reg, value_regno))  			return -EACCES;  	} else if (class == BPF_ST) {  		if (insn->imm) {  			verbose(env, "BPF_ST imm must be 0 when storing to kptr at off=%u\n", -				off_desc->offset); +				kptr_field->offset);  			return -EACCES;  		}  	} else { @@ -3818,45 +4056,30 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,  	struct bpf_func_state *state = vstate->frame[vstate->curframe];  	struct bpf_reg_state *reg = &state->regs[regno];  	struct bpf_map *map = reg->map_ptr; -	int err; +	struct btf_record *rec; +	int err, i;  	err = check_mem_region_access(env, regno, off, size, map->value_size,  				      zero_size_allowed);  	if (err)  		return err; -	if (map_value_has_spin_lock(map)) { -		u32 lock = map->spin_lock_off; +	if (IS_ERR_OR_NULL(map->record)) +		return 0; +	rec = map->record; +	for (i = 0; i < rec->cnt; i++) { +		struct btf_field *field = &rec->fields[i]; +		u32 p = field->offset; -		/* if any part of struct bpf_spin_lock can be touched by -		 * load/store reject this program. -		 * To check that [x1, x2) overlaps with [y1, y2) +		/* If any part of a field  can be touched by load/store, reject +		 * this program. To check that [x1, x2) overlaps with [y1, y2),  		 * it is sufficient to check x1 < y2 && y1 < x2.  		 */ -		if (reg->smin_value + off < lock + sizeof(struct bpf_spin_lock) && -		     lock < reg->umax_value + off + size) { -			verbose(env, "bpf_spin_lock cannot be accessed directly by load/store\n"); -			return -EACCES; -		} -	} -	if (map_value_has_timer(map)) { -		u32 t = map->timer_off; - -		if (reg->smin_value + off < t + sizeof(struct bpf_timer) && -		     t < reg->umax_value + off + size) { -			verbose(env, "bpf_timer cannot be accessed directly by load/store\n"); -			return -EACCES; -		} -	} -	if (map_value_has_kptrs(map)) { -		struct bpf_map_value_off *tab = map->kptr_off_tab; -		int i; - -		for (i = 0; i < tab->nr_off; i++) { -			u32 p = tab->off[i].offset; - -			if (reg->smin_value + off < p + sizeof(u64) && -			    p < reg->umax_value + off + size) { +		if (reg->smin_value + off < p + btf_field_type_size(field->type) && +		    p < reg->umax_value + off + size) { +			switch (field->type) { +			case BPF_KPTR_UNREF: +			case BPF_KPTR_REF:  				if (src != ACCESS_DIRECT) {  					verbose(env, "kptr cannot be accessed indirectly by helper\n");  					return -EACCES; @@ -3875,10 +4098,14 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,  					return -EACCES;  				}  				break; +			default: +				verbose(env, "%s cannot be accessed directly by load/store\n", +					btf_field_type_name(field->type)); +				return -EACCES;  			}  		}  	} -	return err; +	return 0;  }  #define MAX_PACKET_OFF 0xffff @@ -4095,6 +4322,30 @@ static bool is_flow_key_reg(struct bpf_verifier_env *env, int regno)  	return reg->type == PTR_TO_FLOW_KEYS;  } +static bool is_trusted_reg(const struct bpf_reg_state *reg) +{ +	/* A referenced register is always trusted. 
*/ +	if (reg->ref_obj_id) +		return true; + +	/* If a register is not referenced, it is trusted if it has the +	 * MEM_ALLOC or PTR_TRUSTED type modifiers, and no others. Some of the +	 * other type modifiers may be safe, but we elect to take an opt-in +	 * approach here as some (e.g. PTR_UNTRUSTED and PTR_MAYBE_NULL) are +	 * not. +	 * +	 * Eventually, we should make PTR_TRUSTED the single source of truth +	 * for whether a register is trusted. +	 */ +	return type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS && +	       !bpf_type_has_unsafe_modifiers(reg->type); +} + +static bool is_rcu_reg(const struct bpf_reg_state *reg) +{ +	return reg->type & MEM_RCU; +} +  static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,  				   const struct bpf_reg_state *reg,  				   int off, int size, bool strict) @@ -4511,6 +4762,18 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,  	u32 btf_id;  	int ret; +	if (!env->allow_ptr_leaks) { +		verbose(env, +			"'struct %s' access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n", +			tname); +		return -EPERM; +	} +	if (!env->prog->gpl_compatible && btf_is_kernel(reg->btf)) { +		verbose(env, +			"Cannot access kernel 'struct %s' from non-GPL compatible program\n", +			tname); +		return -EINVAL; +	}  	if (off < 0) {  		verbose(env,  			"R%d is ptr_%s invalid negative access: off=%d\n", @@ -4541,17 +4804,28 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,  		return -EACCES;  	} -	if (env->ops->btf_struct_access) { -		ret = env->ops->btf_struct_access(&env->log, reg->btf, t, -						  off, size, atype, &btf_id, &flag); +	if (env->ops->btf_struct_access && !type_is_alloc(reg->type)) { +		if (!btf_is_kernel(reg->btf)) { +			verbose(env, "verifier internal error: reg->btf must be kernel btf\n"); +			return -EFAULT; +		} +		ret = env->ops->btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag);  	} else { -		if (atype != BPF_READ) { +		/* Writes are permitted with default btf_struct_access for +		 * program allocated objects (which always have ref_obj_id > 0), +		 * but not for untrusted PTR_TO_BTF_ID | MEM_ALLOC. +		 */ +		if (atype != BPF_READ && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {  			verbose(env, "only read is supported\n");  			return -EACCES;  		} -		ret = btf_struct_access(&env->log, reg->btf, t, off, size, -					atype, &btf_id, &flag); +		if (type_is_alloc(reg->type) && !reg->ref_obj_id) { +			verbose(env, "verifier internal error: ref_obj_id for allocated object must be non-zero\n"); +			return -EFAULT; +		} + +		ret = btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag);  	}  	if (ret < 0) @@ -4563,6 +4837,30 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,  	if (type_flag(reg->type) & PTR_UNTRUSTED)  		flag |= PTR_UNTRUSTED; +	/* By default any pointer obtained from walking a trusted pointer is +	 * no longer trusted except the rcu case below. +	 */ +	flag &= ~PTR_TRUSTED; + +	if (flag & MEM_RCU) { +		/* Mark value register as MEM_RCU only if it is protected by +		 * bpf_rcu_read_lock() and the ptr reg is rcu or trusted. MEM_RCU +		 * itself can already indicate trustedness inside the rcu +		 * read lock region. Also mark rcu pointer as PTR_MAYBE_NULL since +		 * it could be null in some cases. 
+		 */ +		if (!env->cur_state->active_rcu_lock || +		    !(is_trusted_reg(reg) || is_rcu_reg(reg))) +			flag &= ~MEM_RCU; +		else +			flag |= PTR_MAYBE_NULL; +	} else if (reg->type & MEM_RCU) { +		/* ptr (reg) is marked as MEM_RCU, but the struct field is not tagged +		 * with __rcu. Mark the flag as PTR_UNTRUSTED conservatively. +		 */ +		flag |= PTR_UNTRUSTED; +	} +  	if (atype == BPF_READ && value_regno >= 0)  		mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag); @@ -4577,6 +4875,7 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,  {  	struct bpf_reg_state *reg = regs + regno;  	struct bpf_map *map = reg->map_ptr; +	struct bpf_reg_state map_reg;  	enum bpf_type_flag flag = 0;  	const struct btf_type *t;  	const char *tname; @@ -4597,9 +4896,9 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,  	t = btf_type_by_id(btf_vmlinux, *map->ops->map_btf_id);  	tname = btf_name_by_offset(btf_vmlinux, t->name_off); -	if (!env->allow_ptr_to_map_access) { +	if (!env->allow_ptr_leaks) {  		verbose(env, -			"%s access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n", +			"'struct %s' access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n",  			tname);  		return -EPERM;  	} @@ -4615,7 +4914,10 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,  		return -EACCES;  	} -	ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id, &flag); +	/* Simulate access to a PTR_TO_BTF_ID */ +	memset(&map_reg, 0, sizeof(map_reg)); +	mark_btf_ld_reg(env, &map_reg, 0, PTR_TO_BTF_ID, btf_vmlinux, *map->ops->map_btf_id, 0); +	ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag);  	if (ret < 0)  		return ret; @@ -4751,7 +5053,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn  		if (value_regno >= 0)  			mark_reg_unknown(env, regs, value_regno);  	} else if (reg->type == PTR_TO_MAP_VALUE) { -		struct bpf_map_value_off_desc *kptr_off_desc = NULL; +		struct btf_field *kptr_field = NULL;  		if (t == BPF_WRITE && value_regno >= 0 &&  		    is_pointer_value(env, value_regno)) { @@ -4765,11 +5067,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn  		if (err)  			return err;  		if (tnum_is_const(reg->var_off)) -			kptr_off_desc = bpf_map_kptr_off_contains(reg->map_ptr, -								  off + reg->var_off.value); -		if (kptr_off_desc) { -			err = check_map_kptr_access(env, regno, value_regno, insn_idx, -						    kptr_off_desc); +			kptr_field = btf_record_find(reg->map_ptr->record, +						     off + reg->var_off.value, BPF_KPTR); +		if (kptr_field) { +			err = check_map_kptr_access(env, regno, value_regno, insn_idx, kptr_field);  		} else if (t == BPF_READ && value_regno >= 0) {  			struct bpf_map *map = reg->map_ptr; @@ -5160,10 +5461,6 @@ static int check_stack_range_initialized(  		}  		if (is_spilled_reg(&state->stack[spi]) && -		    base_type(state->stack[spi].spilled_ptr.type) == PTR_TO_BTF_ID) -			goto mark; - -		if (is_spilled_reg(&state->stack[spi]) &&  		    (state->stack[spi].spilled_ptr.type == SCALAR_VALUE ||  		     env->allow_ptr_leaks)) {  			if (clobber) { @@ -5193,6 +5490,11 @@ mark:  		mark_reg_read(env, &state->stack[spi].spilled_ptr,  			      state->stack[spi].spilled_ptr.parent,  			      REG_LIVE_READ64); +		/* We do not set REG_LIVE_WRITTEN for stack slot, as we cannot +		 * be sure whether the stack slot is written to or not. 
Hence, +		 * we must still conservatively propagate reads upwards even if +		 * helper may write to the entire memory range. +		 */  	}  	return update_stack_depth(env, state, min_off);  } @@ -5374,8 +5676,8 @@ int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,  	return err;  } -int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, -			     u32 regno) +static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, +				    u32 regno)  {  	struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1];  	bool may_be_null = type_may_be_null(mem_reg->type); @@ -5403,23 +5705,26 @@ int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state  }  /* Implementation details: - * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL + * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL. + * bpf_obj_new returns PTR_TO_BTF_ID | MEM_ALLOC | PTR_MAYBE_NULL.   * Two bpf_map_lookups (even with the same key) will have different reg->id. - * For traditional PTR_TO_MAP_VALUE the verifier clears reg->id after - * value_or_null->value transition, since the verifier only cares about - * the range of access to valid map value pointer and doesn't care about actual - * address of the map element. + * Two separate bpf_obj_new will also have different reg->id. + * For traditional PTR_TO_MAP_VALUE or PTR_TO_BTF_ID | MEM_ALLOC, the verifier + * clears reg->id after value_or_null->value transition, since the verifier only + * cares about the range of access to valid map value pointer and doesn't care + * about actual address of the map element.   * For maps with 'struct bpf_spin_lock' inside map value the verifier keeps   * reg->id > 0 after value_or_null->value transition. By doing so   * two bpf_map_lookups will be considered two different pointers that - * point to different bpf_spin_locks. + * point to different bpf_spin_locks. Likewise for pointers to allocated objects + * returned from bpf_obj_new.   * The verifier allows taking only one bpf_spin_lock at a time to avoid   * dead-locks.   * Since only one bpf_spin_lock is allowed the checks are simpler than   * reg_is_refcounted() logic. The verifier needs to remember only   * one spin_lock instead of array of acquired_refs. - * cur_state->active_spin_lock remembers which map value element got locked - * and clears it after bpf_spin_unlock. + * cur_state->active_lock remembers which map value element or allocated + * object got locked and clears it after bpf_spin_unlock.   
*/  static int process_spin_lock(struct bpf_verifier_env *env, int regno,  			     bool is_lock) @@ -5427,8 +5732,10 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,  	struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno];  	struct bpf_verifier_state *cur = env->cur_state;  	bool is_const = tnum_is_const(reg->var_off); -	struct bpf_map *map = reg->map_ptr;  	u64 val = reg->var_off.value; +	struct bpf_map *map = NULL; +	struct btf *btf = NULL; +	struct btf_record *rec;  	if (!is_const) {  		verbose(env, @@ -5436,49 +5743,78 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,  			regno);  		return -EINVAL;  	} -	if (!map->btf) { -		verbose(env, -			"map '%s' has to have BTF in order to use bpf_spin_lock\n", -			map->name); -		return -EINVAL; -	} -	if (!map_value_has_spin_lock(map)) { -		if (map->spin_lock_off == -E2BIG) +	if (reg->type == PTR_TO_MAP_VALUE) { +		map = reg->map_ptr; +		if (!map->btf) {  			verbose(env, -				"map '%s' has more than one 'struct bpf_spin_lock'\n", -				map->name); -		else if (map->spin_lock_off == -ENOENT) -			verbose(env, -				"map '%s' doesn't have 'struct bpf_spin_lock'\n", -				map->name); -		else -			verbose(env, -				"map '%s' is not a struct type or bpf_spin_lock is mangled\n", +				"map '%s' has to have BTF in order to use bpf_spin_lock\n",  				map->name); +			return -EINVAL; +		} +	} else { +		btf = reg->btf; +	} + +	rec = reg_btf_record(reg); +	if (!btf_record_has_field(rec, BPF_SPIN_LOCK)) { +		verbose(env, "%s '%s' has no valid bpf_spin_lock\n", map ? "map" : "local", +			map ? map->name : "kptr");  		return -EINVAL;  	} -	if (map->spin_lock_off != val + reg->off) { -		verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock'\n", -			val + reg->off); +	if (rec->spin_lock_off != val + reg->off) { +		verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock' that is at %d\n", +			val + reg->off, rec->spin_lock_off);  		return -EINVAL;  	}  	if (is_lock) { -		if (cur->active_spin_lock) { +		if (cur->active_lock.ptr) {  			verbose(env,  				"Locking two bpf_spin_locks are not allowed\n");  			return -EINVAL;  		} -		cur->active_spin_lock = reg->id; +		if (map) +			cur->active_lock.ptr = map; +		else +			cur->active_lock.ptr = btf; +		cur->active_lock.id = reg->id;  	} else { -		if (!cur->active_spin_lock) { +		struct bpf_func_state *fstate = cur_func(env); +		void *ptr; +		int i; + +		if (map) +			ptr = map; +		else +			ptr = btf; + +		if (!cur->active_lock.ptr) {  			verbose(env, "bpf_spin_unlock without taking a lock\n");  			return -EINVAL;  		} -		if (cur->active_spin_lock != reg->id) { +		if (cur->active_lock.ptr != ptr || +		    cur->active_lock.id != reg->id) {  			verbose(env, "bpf_spin_unlock of different lock\n");  			return -EINVAL;  		} -		cur->active_spin_lock = 0; +		cur->active_lock.ptr = NULL; +		cur->active_lock.id = 0; + +		for (i = fstate->acquired_refs - 1; i >= 0; i--) { +			int err; + +			/* Complain on error because this reference state cannot +			 * be freed before this point, as bpf_spin_lock critical +			 * section does not allow functions that release the +			 * allocated object immediately. 
+			 */ +			if (!fstate->refs[i].release_on_unlock) +				continue; +			err = release_reference(env, fstate->refs[i].id); +			if (err) { +				verbose(env, "failed to release release_on_unlock reference"); +				return err; +			} +		}  	}  	return 0;  } @@ -5502,24 +5838,13 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno,  			map->name);  		return -EINVAL;  	} -	if (!map_value_has_timer(map)) { -		if (map->timer_off == -E2BIG) -			verbose(env, -				"map '%s' has more than one 'struct bpf_timer'\n", -				map->name); -		else if (map->timer_off == -ENOENT) -			verbose(env, -				"map '%s' doesn't have 'struct bpf_timer'\n", -				map->name); -		else -			verbose(env, -				"map '%s' is not a struct type or bpf_timer is mangled\n", -				map->name); +	if (!btf_record_has_field(map->record, BPF_TIMER)) { +		verbose(env, "map '%s' has no valid bpf_timer\n", map->name);  		return -EINVAL;  	} -	if (map->timer_off != val + reg->off) { +	if (map->record->timer_off != val + reg->off) {  		verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n", -			val + reg->off, map->timer_off); +			val + reg->off, map->record->timer_off);  		return -EINVAL;  	}  	if (meta->map_ptr) { @@ -5535,10 +5860,9 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno,  			     struct bpf_call_arg_meta *meta)  {  	struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; -	struct bpf_map_value_off_desc *off_desc;  	struct bpf_map *map_ptr = reg->map_ptr; +	struct btf_field *kptr_field;  	u32 kptr_off; -	int ret;  	if (!tnum_is_const(reg->var_off)) {  		verbose(env, @@ -5551,30 +5875,136 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno,  			map_ptr->name);  		return -EINVAL;  	} -	if (!map_value_has_kptrs(map_ptr)) { -		ret = PTR_ERR_OR_ZERO(map_ptr->kptr_off_tab); -		if (ret == -E2BIG) -			verbose(env, "map '%s' has more than %d kptr\n", map_ptr->name, -				BPF_MAP_VALUE_OFF_MAX); -		else if (ret == -EEXIST) -			verbose(env, "map '%s' has repeating kptr BTF tags\n", map_ptr->name); -		else -			verbose(env, "map '%s' has no valid kptr\n", map_ptr->name); +	if (!btf_record_has_field(map_ptr->record, BPF_KPTR)) { +		verbose(env, "map '%s' has no valid kptr\n", map_ptr->name);  		return -EINVAL;  	}  	meta->map_ptr = map_ptr;  	kptr_off = reg->off + reg->var_off.value; -	off_desc = bpf_map_kptr_off_contains(map_ptr, kptr_off); -	if (!off_desc) { +	kptr_field = btf_record_find(map_ptr->record, kptr_off, BPF_KPTR); +	if (!kptr_field) {  		verbose(env, "off=%d doesn't point to kptr\n", kptr_off);  		return -EACCES;  	} -	if (off_desc->type != BPF_KPTR_REF) { +	if (kptr_field->type != BPF_KPTR_REF) {  		verbose(env, "off=%d kptr isn't referenced kptr\n", kptr_off);  		return -EACCES;  	} -	meta->kptr_off_desc = off_desc; +	meta->kptr_field = kptr_field; +	return 0; +} + +/* There are two register types representing a bpf_dynptr, one is PTR_TO_STACK + * which points to a stack slot, and the other is CONST_PTR_TO_DYNPTR. + * + * In both cases we deal with the first 8 bytes, but need to mark the next 8 + * bytes as STACK_DYNPTR in case of PTR_TO_STACK. In case of + * CONST_PTR_TO_DYNPTR, we are guaranteed to get the beginning of the object. + * + * Mutability of bpf_dynptr is at two levels, one is at the level of struct + * bpf_dynptr itself, i.e. whether the helper is receiving a pointer to struct + * bpf_dynptr or pointer to const struct bpf_dynptr. In the former case, it can + * mutate the view of the dynptr and also possibly destroy it. 
In the latter + * case, it cannot mutate the bpf_dynptr itself but it can still mutate the + * memory that dynptr points to. + * + * The verifier will keep track of both levels of mutation (bpf_dynptr's in + * reg->type and the memory's in reg->dynptr.type), but there is no support for + * readonly dynptr view yet, hence only the first case is tracked and checked. + * + * This is consistent with how C applies the const modifier to a struct object, + * where the pointer itself inside bpf_dynptr becomes const but not what it + * points to. + * + * Helpers which do not mutate the bpf_dynptr set MEM_RDONLY in their argument + * type, and declare it as 'const struct bpf_dynptr *' in their prototype. + */ +int process_dynptr_func(struct bpf_verifier_env *env, int regno, +			enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) +{ +	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; + +	/* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an +	 * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*): +	 */ +	if ((arg_type & (MEM_UNINIT | MEM_RDONLY)) == (MEM_UNINIT | MEM_RDONLY)) { +		verbose(env, "verifier internal error: misconfigured dynptr helper type flags\n"); +		return -EFAULT; +	} +	/* CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to +	 * check_func_arg_reg_off's logic. We only need to check offset +	 * alignment for PTR_TO_STACK. +	 */ +	if (reg->type == PTR_TO_STACK && (reg->off % BPF_REG_SIZE)) { +		verbose(env, "cannot pass in dynptr at an offset=%d\n", reg->off); +		return -EINVAL; +	} +	/*  MEM_UNINIT - Points to memory that is an appropriate candidate for +	 *		 constructing a mutable bpf_dynptr object. +	 * +	 *		 Currently, this is only possible with PTR_TO_STACK +	 *		 pointing to a region of at least 16 bytes which doesn't +	 *		 contain an existing bpf_dynptr. +	 * +	 *  MEM_RDONLY - Points to an initialized bpf_dynptr that will not be +	 *		 mutated or destroyed. However, the memory it points to +	 *		 may be mutated. +	 * +	 *  None       - Points to an initialized dynptr that can be mutated and +	 *		 destroyed, including mutation of the memory it points +	 *		 to. +	 */ +	if (arg_type & MEM_UNINIT) { +		if (!is_dynptr_reg_valid_uninit(env, reg)) { +			verbose(env, "Dynptr has to be an uninitialized dynptr\n"); +			return -EINVAL; +		} + +		/* We only support one dynptr being uninitialized at the moment, +		 * which is sufficient for the helper functions we have right now. 
+		 */ +		if (meta->uninit_dynptr_regno) { +			verbose(env, "verifier internal error: multiple uninitialized dynptr args\n"); +			return -EFAULT; +		} + +		meta->uninit_dynptr_regno = regno; +	} else /* MEM_RDONLY and None case from above */ { +		/* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */ +		if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) { +			verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n"); +			return -EINVAL; +		} + +		if (!is_dynptr_reg_valid_init(env, reg)) { +			verbose(env, +				"Expected an initialized dynptr as arg #%d\n", +				regno); +			return -EINVAL; +		} + +		/* Fold modifiers (in this case, MEM_RDONLY) when checking expected type */ +		if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) { +			const char *err_extra = ""; + +			switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { +			case DYNPTR_TYPE_LOCAL: +				err_extra = "local"; +				break; +			case DYNPTR_TYPE_RINGBUF: +				err_extra = "ringbuf"; +				break; +			default: +				err_extra = "<unknown>"; +				break; +			} +			verbose(env, +				"Expected a dynptr of type %s as arg #%d\n", +				err_extra, regno); +			return -EINVAL; +		} +	}  	return 0;  } @@ -5639,16 +6069,6 @@ struct bpf_reg_types {  	u32 *btf_id;  }; -static const struct bpf_reg_types map_key_value_types = { -	.types = { -		PTR_TO_STACK, -		PTR_TO_PACKET, -		PTR_TO_PACKET_META, -		PTR_TO_MAP_KEY, -		PTR_TO_MAP_VALUE, -	}, -}; -  static const struct bpf_reg_types sock_types = {  	.types = {  		PTR_TO_SOCK_COMMON, @@ -5666,6 +6086,7 @@ static const struct bpf_reg_types btf_id_sock_common_types = {  		PTR_TO_TCP_SOCK,  		PTR_TO_XDP_SOCK,  		PTR_TO_BTF_ID, +		PTR_TO_BTF_ID | PTR_TRUSTED,  	},  	.btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],  }; @@ -5679,7 +6100,7 @@ static const struct bpf_reg_types mem_types = {  		PTR_TO_MAP_KEY,  		PTR_TO_MAP_VALUE,  		PTR_TO_MEM, -		PTR_TO_MEM | MEM_ALLOC, +		PTR_TO_MEM | MEM_RINGBUF,  		PTR_TO_BUF,  	},  }; @@ -5694,14 +6115,31 @@ static const struct bpf_reg_types int_ptr_types = {  	},  }; +static const struct bpf_reg_types spin_lock_types = { +	.types = { +		PTR_TO_MAP_VALUE, +		PTR_TO_BTF_ID | MEM_ALLOC, +	} +}; +  static const struct bpf_reg_types fullsock_types = { .types = { PTR_TO_SOCKET } };  static const struct bpf_reg_types scalar_types = { .types = { SCALAR_VALUE } };  static const struct bpf_reg_types context_types = { .types = { PTR_TO_CTX } }; -static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM | MEM_ALLOC } }; +static const struct bpf_reg_types ringbuf_mem_types = { .types = { PTR_TO_MEM | MEM_RINGBUF } };  static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } }; -static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; -static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; -static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_BTF_ID | MEM_PERCPU } }; +static const struct bpf_reg_types btf_ptr_types = { +	.types = { +		PTR_TO_BTF_ID, +		PTR_TO_BTF_ID | PTR_TRUSTED, +		PTR_TO_BTF_ID | MEM_RCU, +	}, +}; +static const struct bpf_reg_types percpu_btf_ptr_types = { +	.types = { +		PTR_TO_BTF_ID | MEM_PERCPU, +		PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED, +	} +};  static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };  static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };  static const struct bpf_reg_types const_str_ptr_types = { .types 
= { PTR_TO_MAP_VALUE } }; @@ -5710,13 +6148,13 @@ static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } }  static const struct bpf_reg_types dynptr_types = {  	.types = {  		PTR_TO_STACK, -		PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL, +		CONST_PTR_TO_DYNPTR,  	}  };  static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { -	[ARG_PTR_TO_MAP_KEY]		= &map_key_value_types, -	[ARG_PTR_TO_MAP_VALUE]		= &map_key_value_types, +	[ARG_PTR_TO_MAP_KEY]		= &mem_types, +	[ARG_PTR_TO_MAP_VALUE]		= &mem_types,  	[ARG_CONST_SIZE]		= &scalar_types,  	[ARG_CONST_SIZE_OR_ZERO]	= &scalar_types,  	[ARG_CONST_ALLOC_SIZE_OR_ZERO]	= &scalar_types, @@ -5730,7 +6168,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {  	[ARG_PTR_TO_BTF_ID]		= &btf_ptr_types,  	[ARG_PTR_TO_SPIN_LOCK]		= &spin_lock_types,  	[ARG_PTR_TO_MEM]		= &mem_types, -	[ARG_PTR_TO_ALLOC_MEM]		= &alloc_mem_types, +	[ARG_PTR_TO_RINGBUF_MEM]	= &ringbuf_mem_types,  	[ARG_PTR_TO_INT]		= &int_ptr_types,  	[ARG_PTR_TO_LONG]		= &int_ptr_types,  	[ARG_PTR_TO_PERCPU_BTF_ID]	= &percpu_btf_ptr_types, @@ -5789,7 +6227,7 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,  	return -EACCES;  found: -	if (reg->type == PTR_TO_BTF_ID) { +	if (reg->type == PTR_TO_BTF_ID || reg->type & PTR_TRUSTED) {  		/* For bpf_sk_release, it needs to match against first member  		 * 'struct sock_common', hence make an exception for it. This  		 * allows bpf_sk_release to work for multiple socket types. @@ -5806,7 +6244,7 @@ found:  		}  		if (meta->func_id == BPF_FUNC_kptr_xchg) { -			if (map_kptr_match_type(env, meta->kptr_off_desc, reg, regno)) +			if (map_kptr_match_type(env, meta->kptr_field, reg, regno))  				return -EACCES;  		} else {  			if (arg_btf_id == BPF_PTR_POISON) { @@ -5825,6 +6263,11 @@ found:  				return -EACCES;  			}  		} +	} else if (type_is_alloc(reg->type)) { +		if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock) { +			verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n"); +			return -EFAULT; +		}  	}  	return 0; @@ -5834,64 +6277,80 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,  			   const struct bpf_reg_state *reg, int regno,  			   enum bpf_arg_type arg_type)  { -	enum bpf_reg_type type = reg->type; -	bool fixed_off_ok = false; +	u32 type = reg->type; -	switch ((u32)type) { -	/* Pointer types where reg offset is explicitly allowed: */ -	case PTR_TO_STACK: -		if (arg_type_is_dynptr(arg_type) && reg->off % BPF_REG_SIZE) { -			verbose(env, "cannot pass in dynptr at an offset\n"); +	/* When referenced register is passed to release function, its fixed +	 * offset must be 0. +	 * +	 * We will check that a reg matching arg_type_is_release() has a +	 * ref_obj_id when storing meta->release_regno. +	 */ +	if (arg_type_is_release(arg_type)) { +		/* ARG_PTR_TO_DYNPTR with OBJ_RELEASE is a bit special, as it +		 * may not directly point to the object being released, but to +		 * dynptr pointing to such object, which might be at some offset +		 * on the stack. In that case, we simply fall back to the +		 * default handling. +		 */ +		if (arg_type_is_dynptr(arg_type) && type == PTR_TO_STACK) +			return 0; +		/* Doing check_ptr_off_reg check for the offset will catch this +		 * because fixed_off_ok is false, but checking here allows us +		 * to give the user a better error message. 
+		 */ +		if (reg->off) { +			verbose(env, "R%d must have zero offset when passed to release func or trusted arg to kfunc\n", +				regno);  			return -EINVAL;  		} -		fallthrough; +		return __check_ptr_off_reg(env, reg, regno, false); +	} + +	switch (type) { +	/* Pointer types where both fixed and variable offset is explicitly allowed: */ +	case PTR_TO_STACK:  	case PTR_TO_PACKET:  	case PTR_TO_PACKET_META:  	case PTR_TO_MAP_KEY:  	case PTR_TO_MAP_VALUE:  	case PTR_TO_MEM:  	case PTR_TO_MEM | MEM_RDONLY: -	case PTR_TO_MEM | MEM_ALLOC: +	case PTR_TO_MEM | MEM_RINGBUF:  	case PTR_TO_BUF:  	case PTR_TO_BUF | MEM_RDONLY:  	case SCALAR_VALUE: -		/* Some of the argument types nevertheless require a -		 * zero register offset. -		 */ -		if (base_type(arg_type) != ARG_PTR_TO_ALLOC_MEM) -			return 0; -		break; +		return 0;  	/* All the rest must be rejected, except PTR_TO_BTF_ID which allows  	 * fixed offset.  	 */  	case PTR_TO_BTF_ID: +	case PTR_TO_BTF_ID | MEM_ALLOC: +	case PTR_TO_BTF_ID | PTR_TRUSTED: +	case PTR_TO_BTF_ID | MEM_RCU: +	case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED:  		/* When referenced PTR_TO_BTF_ID is passed to release function, -		 * it's fixed offset must be 0.	In the other cases, fixed offset -		 * can be non-zero. -		 */ -		if (arg_type_is_release(arg_type) && reg->off) { -			verbose(env, "R%d must have zero offset when passed to release func\n", -				regno); -			return -EINVAL; -		} -		/* For arg is release pointer, fixed_off_ok must be false, but -		 * we already checked and rejected reg->off != 0 above, so set -		 * to true to allow fixed offset for all other cases. +		 * its fixed offset must be 0. In the other cases, fixed offset +		 * can be non-zero. This was already checked above. So pass +		 * fixed_off_ok as true to allow fixed offset for all other +		 * cases. var_off always must be 0 for PTR_TO_BTF_ID, hence we +		 * still need to do checks instead of returning.  		 */ -		fixed_off_ok = true; -		break; +		return __check_ptr_off_reg(env, reg, regno, true);  	default: -		break; +		return __check_ptr_off_reg(env, reg, regno, false);  	} -	return __check_ptr_off_reg(env, reg, regno, fixed_off_ok);  } -static u32 stack_slot_get_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +static u32 dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)  {  	struct bpf_func_state *state = func(env, reg); -	int spi = get_spi(reg->off); +	int spi; + +	if (reg->type == CONST_PTR_TO_DYNPTR) +		return reg->ref_obj_id; -	return state->stack[spi].spilled_ptr.id; +	spi = get_spi(reg->off); +	return state->stack[spi].spilled_ptr.ref_obj_id;  }  static int check_func_arg(struct bpf_verifier_env *env, u32 arg, @@ -5940,7 +6399,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,  		goto skip_type_check;  	/* arg_btf_id and arg_size are in a union. 
*/ -	if (base_type(arg_type) == ARG_PTR_TO_BTF_ID) +	if (base_type(arg_type) == ARG_PTR_TO_BTF_ID || +	    base_type(arg_type) == ARG_PTR_TO_SPIN_LOCK)  		arg_btf_id = fn->arg_btf_id[arg];  	err = check_reg_type(env, regno, arg_type, arg_btf_id, meta); @@ -5955,11 +6415,22 @@ skip_type_check:  	if (arg_type_is_release(arg_type)) {  		if (arg_type_is_dynptr(arg_type)) {  			struct bpf_func_state *state = func(env, reg); -			int spi = get_spi(reg->off); +			int spi; -			if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || -			    !state->stack[spi].spilled_ptr.id) { -				verbose(env, "arg %d is an unacquired reference\n", regno); +			/* Only dynptr created on stack can be released, thus +			 * the get_spi and stack state checks for spilled_ptr +			 * should only be done before process_dynptr_func for +			 * PTR_TO_STACK. +			 */ +			if (reg->type == PTR_TO_STACK) { +				spi = get_spi(reg->off); +				if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || +				    !state->stack[spi].spilled_ptr.ref_obj_id) { +					verbose(env, "arg %d is an unacquired reference\n", regno); +					return -EINVAL; +				} +			} else { +				verbose(env, "cannot release unowned const bpf_dynptr\n");  				return -EINVAL;  			}  		} else if (!reg->ref_obj_id && !register_is_null(reg)) { @@ -6056,19 +6527,22 @@ skip_type_check:  		break;  	case ARG_PTR_TO_SPIN_LOCK:  		if (meta->func_id == BPF_FUNC_spin_lock) { -			if (process_spin_lock(env, regno, true)) -				return -EACCES; +			err = process_spin_lock(env, regno, true); +			if (err) +				return err;  		} else if (meta->func_id == BPF_FUNC_spin_unlock) { -			if (process_spin_lock(env, regno, false)) -				return -EACCES; +			err = process_spin_lock(env, regno, false); +			if (err) +				return err;  		} else {  			verbose(env, "verifier internal error\n");  			return -EFAULT;  		}  		break;  	case ARG_PTR_TO_TIMER: -		if (process_timer_func(env, regno, meta)) -			return -EACCES; +		err = process_timer_func(env, regno, meta); +		if (err) +			return err;  		break;  	case ARG_PTR_TO_FUNC:  		meta->subprogno = reg->subprogno; @@ -6091,52 +6565,9 @@ skip_type_check:  		err = check_mem_size_reg(env, reg, regno, true, meta);  		break;  	case ARG_PTR_TO_DYNPTR: -		/* We only need to check for initialized / uninitialized helper -		 * dynptr args if the dynptr is not PTR_TO_DYNPTR, as the -		 * assumption is that if it is, that a helper function -		 * initialized the dynptr on behalf of the BPF program. -		 */ -		if (base_type(reg->type) == PTR_TO_DYNPTR) -			break; -		if (arg_type & MEM_UNINIT) { -			if (!is_dynptr_reg_valid_uninit(env, reg)) { -				verbose(env, "Dynptr has to be an uninitialized dynptr\n"); -				return -EINVAL; -			} - -			/* We only support one dynptr being uninitialized at the moment, -			 * which is sufficient for the helper functions we have right now. 
-			 */ -			if (meta->uninit_dynptr_regno) { -				verbose(env, "verifier internal error: multiple uninitialized dynptr args\n"); -				return -EFAULT; -			} - -			meta->uninit_dynptr_regno = regno; -		} else if (!is_dynptr_reg_valid_init(env, reg)) { -			verbose(env, -				"Expected an initialized dynptr as arg #%d\n", -				arg + 1); -			return -EINVAL; -		} else if (!is_dynptr_type_expected(env, reg, arg_type)) { -			const char *err_extra = ""; - -			switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { -			case DYNPTR_TYPE_LOCAL: -				err_extra = "local"; -				break; -			case DYNPTR_TYPE_RINGBUF: -				err_extra = "ringbuf"; -				break; -			default: -				err_extra = "<unknown>"; -				break; -			} -			verbose(env, -				"Expected a dynptr of type %s as arg #%d\n", -				err_extra, arg + 1); -			return -EINVAL; -		} +		err = process_dynptr_func(env, regno, arg_type, meta); +		if (err) +			return err;  		break;  	case ARG_CONST_ALLOC_SIZE_OR_ZERO:  		if (!tnum_is_const(reg->var_off)) { @@ -6203,8 +6634,9 @@ skip_type_check:  		break;  	}  	case ARG_PTR_TO_KPTR: -		if (process_kptr_func(env, regno, meta)) -			return -EACCES; +		err = process_kptr_func(env, regno, meta); +		if (err) +			return err;  		break;  	} @@ -6365,6 +6797,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,  		    func_id != BPF_FUNC_task_storage_delete)  			goto error;  		break; +	case BPF_MAP_TYPE_CGRP_STORAGE: +		if (func_id != BPF_FUNC_cgrp_storage_get && +		    func_id != BPF_FUNC_cgrp_storage_delete) +			goto error; +		break;  	case BPF_MAP_TYPE_BLOOM_FILTER:  		if (func_id != BPF_FUNC_map_peek_elem &&  		    func_id != BPF_FUNC_map_push_elem) @@ -6477,6 +6914,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,  		if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE)  			goto error;  		break; +	case BPF_FUNC_cgrp_storage_get: +	case BPF_FUNC_cgrp_storage_delete: +		if (map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) +			goto error; +		break;  	default:  		break;  	} @@ -6548,9 +6990,10 @@ static bool check_btf_id_ok(const struct bpf_func_proto *fn)  	int i;  	for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) { -		if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i]) -			return false; - +		if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID) +			return !!fn->arg_btf_id[i]; +		if (base_type(fn->arg_type[i]) == ARG_PTR_TO_SPIN_LOCK) +			return fn->arg_btf_id[i] == BPF_PTR_POISON;  		if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i] &&  		    /* arg_btf_id and arg_size are in a union. 
*/  		    (base_type(fn->arg_type[i]) != ARG_PTR_TO_MEM || @@ -6651,6 +7094,10 @@ typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env,  				   struct bpf_func_state *callee,  				   int insn_idx); +static int set_callee_state(struct bpf_verifier_env *env, +			    struct bpf_func_state *caller, +			    struct bpf_func_state *callee, int insn_idx); +  static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,  			     int *insn_idx, int subprog,  			     set_callee_state_fn set_callee_state_cb) @@ -6701,6 +7148,16 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn  		}  	} +	/* set_callee_state is used for direct subprog calls, but we are +	 * interested in validating only BPF helpers that can call subprogs as +	 * callbacks +	 */ +	if (set_callee_state_cb != set_callee_state && !is_callback_calling_function(insn->imm)) { +		verbose(env, "verifier bug: helper %s#%d is not marked as callback-calling\n", +			func_id_name(insn->imm), insn->imm); +		return -EFAULT; +	} +  	if (insn->code == (BPF_JMP | BPF_CALL) &&  	    insn->src_reg == 0 &&  	    insn->imm == BPF_FUNC_timer_set_callback) { @@ -6947,11 +7404,10 @@ static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env,  {  	/* bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void  	 *			  callback_ctx, u64 flags); -	 * callback_fn(struct bpf_dynptr_t* dynptr, void *callback_ctx); +	 * callback_fn(const struct bpf_dynptr_t* dynptr, void *callback_ctx);  	 */  	__mark_reg_not_init(env, &callee->regs[BPF_REG_0]); -	callee->regs[BPF_REG_1].type = PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL; -	__mark_reg_known_zero(&callee->regs[BPF_REG_1]); +	mark_dynptr_cb_reg(&callee->regs[BPF_REG_1], BPF_DYNPTR_TYPE_LOCAL);  	callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];  	/* unused */ @@ -7283,6 +7739,11 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn  		return -EINVAL;  	} +	if (!env->prog->aux->sleepable && fn->might_sleep) { +		verbose(env, "helper call might sleep in a non-sleepable prog\n"); +		return -EINVAL; +	} +  	/* With LD_ABS/IND some JITs save/restore skb from r1. */  	changes_data = bpf_helper_changes_pkt_data(fn->func);  	if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) { @@ -7301,6 +7762,17 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn  		return err;  	} +	if (env->cur_state->active_rcu_lock) { +		if (fn->might_sleep) { +			verbose(env, "sleepable helper %s#%d in rcu_read_lock region\n", +				func_id_name(func_id), func_id); +			return -EINVAL; +		} + +		if (env->prog->aux->sleepable && is_storage_get_function(func_id)) +			env->insn_aux_data[insn_idx].storage_get_func_atomic = true; +	} +  	meta.func_id = func_id;  	/* check args */  	for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { @@ -7329,7 +7801,15 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn  	regs = cur_regs(env); +	/* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot +	 * be reinitialized by any dynptr helper. Hence, mark_stack_slots_dynptr +	 * is safe to do directly. 
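+	 *
+	 * E.g. (an illustrative sketch; the ringbuf map name is a
+	 * placeholder) the stack dynptr below is the uninit arg that the
+	 * helper initializes before use:
+	 *
+	 *	struct bpf_dynptr ptr;
+	 *
+	 *	bpf_ringbuf_reserve_dynptr(&ringbuf, 16, 0, &ptr);
+	 *	bpf_ringbuf_submit_dynptr(&ptr, 0);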
+	 */
 	if (meta.uninit_dynptr_regno) {
+		if (regs[meta.uninit_dynptr_regno].type == CONST_PTR_TO_DYNPTR) {
+			verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be initialized\n");
+			return -EFAULT;
+		}
 		/* we write BPF_DW bits (8 bytes) at a time */
 		for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) {
 			err = check_mem_access(env, insn_idx, meta.uninit_dynptr_regno,
@@ -7347,15 +7827,24 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 
 	if (meta.release_regno) {
 		err = -EINVAL;
-		if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1]))
+		/* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot
+		 * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr
+		 * is safe to do directly.
+		 */
+		if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) {
+			if (regs[meta.release_regno].type == CONST_PTR_TO_DYNPTR) {
+				verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be released\n");
+				return -EFAULT;
+			}
 			err = unmark_stack_slots_dynptr(env, &regs[meta.release_regno]);
-		else if (meta.ref_obj_id)
+		} else if (meta.ref_obj_id) {
 			err = release_reference(env, meta.ref_obj_id);
-		/* meta.ref_obj_id can only be 0 if register that is meant to be
-		 * released is NULL, which must be > R0.
-		 */
-		else if (register_is_null(&regs[meta.release_regno]))
+		} else if (register_is_null(&regs[meta.release_regno])) {
+			/* meta.ref_obj_id can only be 0 if register that is meant to be
+			 * released is NULL, which must be > R0.
+			 */
 			err = 0;
+		}
 		if (err) {
 			verbose(env, "func %s#%d reference has not been acquired before\n",
 				func_id_name(func_id), func_id);
@@ -7429,11 +7918,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 					return -EFAULT;
 				}
 
-				if (base_type(reg->type) != PTR_TO_DYNPTR)
-					/* Find the id of the dynptr we're
-					 * tracking the reference of
-					 */
-					meta.ref_obj_id = stack_slot_get_id(env, reg);
+				meta.ref_obj_id = dynptr_ref_obj_id(env, reg);
 				break;
 			}
 		}
@@ -7488,7 +7973,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		regs[BPF_REG_0].map_uid = meta.map_uid;
 		regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag;
 		if (!type_may_be_null(ret_type) &&
-		    map_value_has_spin_lock(meta.map_ptr)) {
+		    btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK)) {
 			regs[BPF_REG_0].id = ++env->id_gen;
 		}
 		break;
@@ -7504,7 +7989,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		mark_reg_known_zero(env, regs, BPF_REG_0);
 		regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag;
 		break;
-	case RET_PTR_TO_ALLOC_MEM:
+	case RET_PTR_TO_MEM:
 		mark_reg_known_zero(env, regs, BPF_REG_0);
 		regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
 		regs[BPF_REG_0].mem_size = meta.mem_size;
@@ -7552,8 +8037,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		mark_reg_known_zero(env, regs, BPF_REG_0);
 		regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
 		if (func_id == BPF_FUNC_kptr_xchg) {
-			ret_btf = meta.kptr_off_desc->kptr.btf;
-			ret_btf_id = meta.kptr_off_desc->kptr.btf_id;
+			ret_btf = meta.kptr_field->kptr.btf;
+			ret_btf_id = meta.kptr_field->kptr.btf_id;
 		} else {
 			if (fn->ret_btf_id == BPF_PTR_POISON) {
 				verbose(env, "verifier internal error:");
@@ -7667,19 +8152,926 @@ static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32
regno,  	}  } +struct bpf_kfunc_call_arg_meta { +	/* In parameters */ +	struct btf *btf; +	u32 func_id; +	u32 kfunc_flags; +	const struct btf_type *func_proto; +	const char *func_name; +	/* Out parameters */ +	u32 ref_obj_id; +	u8 release_regno; +	bool r0_rdonly; +	u32 ret_btf_id; +	u64 r0_size; +	struct { +		u64 value; +		bool found; +	} arg_constant; +	struct { +		struct btf *btf; +		u32 btf_id; +	} arg_obj_drop; +	struct { +		struct btf_field *field; +	} arg_list_head; +}; + +static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta) +{ +	return meta->kfunc_flags & KF_ACQUIRE; +} + +static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) +{ +	return meta->kfunc_flags & KF_RET_NULL; +} + +static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta) +{ +	return meta->kfunc_flags & KF_RELEASE; +} + +static bool is_kfunc_trusted_args(struct bpf_kfunc_call_arg_meta *meta) +{ +	return meta->kfunc_flags & KF_TRUSTED_ARGS; +} + +static bool is_kfunc_sleepable(struct bpf_kfunc_call_arg_meta *meta) +{ +	return meta->kfunc_flags & KF_SLEEPABLE; +} + +static bool is_kfunc_destructive(struct bpf_kfunc_call_arg_meta *meta) +{ +	return meta->kfunc_flags & KF_DESTRUCTIVE; +} + +static bool is_kfunc_rcu(struct bpf_kfunc_call_arg_meta *meta) +{ +	return meta->kfunc_flags & KF_RCU; +} + +static bool is_kfunc_arg_kptr_get(struct bpf_kfunc_call_arg_meta *meta, int arg) +{ +	return arg == 0 && (meta->kfunc_flags & KF_KPTR_GET); +} + +static bool __kfunc_param_match_suffix(const struct btf *btf, +				       const struct btf_param *arg, +				       const char *suffix) +{ +	int suffix_len = strlen(suffix), len; +	const char *param_name; + +	/* In the future, this can be ported to use BTF tagging */ +	param_name = btf_name_by_offset(btf, arg->name_off); +	if (str_is_empty(param_name)) +		return false; +	len = strlen(param_name); +	if (len < suffix_len) +		return false; +	param_name += len - suffix_len; +	return !strncmp(param_name, suffix, suffix_len); +} + +static bool is_kfunc_arg_mem_size(const struct btf *btf, +				  const struct btf_param *arg, +				  const struct bpf_reg_state *reg) +{ +	const struct btf_type *t; + +	t = btf_type_skip_modifiers(btf, arg->type, NULL); +	if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE) +		return false; + +	return __kfunc_param_match_suffix(btf, arg, "__sz"); +} + +static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg) +{ +	return __kfunc_param_match_suffix(btf, arg, "__k"); +} + +static bool is_kfunc_arg_ignore(const struct btf *btf, const struct btf_param *arg) +{ +	return __kfunc_param_match_suffix(btf, arg, "__ign"); +} + +static bool is_kfunc_arg_alloc_obj(const struct btf *btf, const struct btf_param *arg) +{ +	return __kfunc_param_match_suffix(btf, arg, "__alloc"); +} + +static bool is_kfunc_arg_scalar_with_name(const struct btf *btf, +					  const struct btf_param *arg, +					  const char *name) +{ +	int len, target_len = strlen(name); +	const char *param_name; + +	param_name = btf_name_by_offset(btf, arg->name_off); +	if (str_is_empty(param_name)) +		return false; +	len = strlen(param_name); +	if (len != target_len) +		return false; +	if (strcmp(param_name, name)) +		return false; + +	return true; +} + +enum { +	KF_ARG_DYNPTR_ID, +	KF_ARG_LIST_HEAD_ID, +	KF_ARG_LIST_NODE_ID, +}; + +BTF_ID_LIST(kf_arg_btf_ids) +BTF_ID(struct, bpf_dynptr_kern) +BTF_ID(struct, bpf_list_head) +BTF_ID(struct, bpf_list_node) + +static bool __is_kfunc_ptr_arg_type(const struct btf *btf, +				    const struct btf_param *arg, 
int type) +{ +	const struct btf_type *t; +	u32 res_id; + +	t = btf_type_skip_modifiers(btf, arg->type, NULL); +	if (!t) +		return false; +	if (!btf_type_is_ptr(t)) +		return false; +	t = btf_type_skip_modifiers(btf, t->type, &res_id); +	if (!t) +		return false; +	return btf_types_are_same(btf, res_id, btf_vmlinux, kf_arg_btf_ids[type]); +} + +static bool is_kfunc_arg_dynptr(const struct btf *btf, const struct btf_param *arg) +{ +	return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_DYNPTR_ID); +} + +static bool is_kfunc_arg_list_head(const struct btf *btf, const struct btf_param *arg) +{ +	return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_LIST_HEAD_ID); +} + +static bool is_kfunc_arg_list_node(const struct btf *btf, const struct btf_param *arg) +{ +	return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_LIST_NODE_ID); +} + +/* Returns true if struct is composed of scalars, 4 levels of nesting allowed */ +static bool __btf_type_is_scalar_struct(struct bpf_verifier_env *env, +					const struct btf *btf, +					const struct btf_type *t, int rec) +{ +	const struct btf_type *member_type; +	const struct btf_member *member; +	u32 i; + +	if (!btf_type_is_struct(t)) +		return false; + +	for_each_member(i, t, member) { +		const struct btf_array *array; + +		member_type = btf_type_skip_modifiers(btf, member->type, NULL); +		if (btf_type_is_struct(member_type)) { +			if (rec >= 3) { +				verbose(env, "max struct nesting depth exceeded\n"); +				return false; +			} +			if (!__btf_type_is_scalar_struct(env, btf, member_type, rec + 1)) +				return false; +			continue; +		} +		if (btf_type_is_array(member_type)) { +			array = btf_array(member_type); +			if (!array->nelems) +				return false; +			member_type = btf_type_skip_modifiers(btf, array->type, NULL); +			if (!btf_type_is_scalar(member_type)) +				return false; +			continue; +		} +		if (!btf_type_is_scalar(member_type)) +			return false; +	} +	return true; +} + + +static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = { +#ifdef CONFIG_NET +	[PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK], +	[PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], +	[PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP], +#endif +}; + +enum kfunc_ptr_arg_type { +	KF_ARG_PTR_TO_CTX, +	KF_ARG_PTR_TO_ALLOC_BTF_ID,  /* Allocated object */ +	KF_ARG_PTR_TO_KPTR,	     /* PTR_TO_KPTR but type specific */ +	KF_ARG_PTR_TO_DYNPTR, +	KF_ARG_PTR_TO_LIST_HEAD, +	KF_ARG_PTR_TO_LIST_NODE, +	KF_ARG_PTR_TO_BTF_ID,	     /* Also covers reg2btf_ids conversions */ +	KF_ARG_PTR_TO_MEM, +	KF_ARG_PTR_TO_MEM_SIZE,	     /* Size derived from next argument, skip it */ +}; + +enum special_kfunc_type { +	KF_bpf_obj_new_impl, +	KF_bpf_obj_drop_impl, +	KF_bpf_list_push_front, +	KF_bpf_list_push_back, +	KF_bpf_list_pop_front, +	KF_bpf_list_pop_back, +	KF_bpf_cast_to_kern_ctx, +	KF_bpf_rdonly_cast, +	KF_bpf_rcu_read_lock, +	KF_bpf_rcu_read_unlock, +}; + +BTF_SET_START(special_kfunc_set) +BTF_ID(func, bpf_obj_new_impl) +BTF_ID(func, bpf_obj_drop_impl) +BTF_ID(func, bpf_list_push_front) +BTF_ID(func, bpf_list_push_back) +BTF_ID(func, bpf_list_pop_front) +BTF_ID(func, bpf_list_pop_back) +BTF_ID(func, bpf_cast_to_kern_ctx) +BTF_ID(func, bpf_rdonly_cast) +BTF_SET_END(special_kfunc_set) + +BTF_ID_LIST(special_kfunc_list) +BTF_ID(func, bpf_obj_new_impl) +BTF_ID(func, bpf_obj_drop_impl) +BTF_ID(func, bpf_list_push_front) +BTF_ID(func, bpf_list_push_back) +BTF_ID(func, bpf_list_pop_front) +BTF_ID(func, bpf_list_pop_back) +BTF_ID(func, bpf_cast_to_kern_ctx) +BTF_ID(func, bpf_rdonly_cast) +BTF_ID(func, bpf_rcu_read_lock) 
+BTF_ID(func, bpf_rcu_read_unlock)
+
+static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta)
+{
+	return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_lock];
+}
+
+static bool is_kfunc_bpf_rcu_read_unlock(struct bpf_kfunc_call_arg_meta *meta)
+{
+	return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_unlock];
+}
+
+static enum kfunc_ptr_arg_type
+get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
+		       struct bpf_kfunc_call_arg_meta *meta,
+		       const struct btf_type *t, const struct btf_type *ref_t,
+		       const char *ref_tname, const struct btf_param *args,
+		       int argno, int nargs)
+{
+	u32 regno = argno + 1;
+	struct bpf_reg_state *regs = cur_regs(env);
+	struct bpf_reg_state *reg = &regs[regno];
+	bool arg_mem_size = false;
+
+	if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx])
+		return KF_ARG_PTR_TO_CTX;
+
+	/* In this function, we verify the kfunc's BTF as per the argument type,
+	 * leaving the rest of the verification with respect to the register
+	 * type to our caller. When a set of conditions hold in the BTF type of
+	 * arguments, we resolve it to a known kfunc_ptr_arg_type.
+	 */
+	if (btf_get_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno))
+		return KF_ARG_PTR_TO_CTX;
+
+	if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno]))
+		return KF_ARG_PTR_TO_ALLOC_BTF_ID;
+
+	if (is_kfunc_arg_kptr_get(meta, argno)) {
+		if (!btf_type_is_ptr(ref_t)) {
+			verbose(env, "arg#0 BTF type must be a double pointer for kptr_get kfunc\n");
+			return -EINVAL;
+		}
+		ref_t = btf_type_by_id(meta->btf, ref_t->type);
+		ref_tname = btf_name_by_offset(meta->btf, ref_t->name_off);
+		if (!btf_type_is_struct(ref_t)) {
+			verbose(env, "kernel function %s args#0 pointer type %s %s is not supported\n",
+				meta->func_name, btf_type_str(ref_t), ref_tname);
+			return -EINVAL;
+		}
+		return KF_ARG_PTR_TO_KPTR;
+	}
+
+	if (is_kfunc_arg_dynptr(meta->btf, &args[argno]))
+		return KF_ARG_PTR_TO_DYNPTR;
+
+	if (is_kfunc_arg_list_head(meta->btf, &args[argno]))
+		return KF_ARG_PTR_TO_LIST_HEAD;
+
+	if (is_kfunc_arg_list_node(meta->btf, &args[argno]))
+		return KF_ARG_PTR_TO_LIST_NODE;
+
+	if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) {
+		if (!btf_type_is_struct(ref_t)) {
+			verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n",
+				meta->func_name, argno, btf_type_str(ref_t), ref_tname);
+			return -EINVAL;
+		}
+		return KF_ARG_PTR_TO_BTF_ID;
+	}
+
+	if (argno + 1 < nargs && is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]))
+		arg_mem_size = true;
+
+	/* This is the catch all argument type of register types supported by
+	 * check_helper_mem_access. However, we only allow when argument type is
+	 * pointer to scalar, or struct composed (recursively) of scalars. When
+	 * arg_mem_size is true, the pointer can be void *.
+	 */
+	if (!btf_type_is_scalar(ref_t) && !__btf_type_is_scalar_struct(env, meta->btf, ref_t, 0) &&
+	    (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) {
+		verbose(env, "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n",
+			argno, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : "");
+		return -EINVAL;
+	}
+	return arg_mem_size ? KF_ARG_PTR_TO_MEM_SIZE : KF_ARG_PTR_TO_MEM;
+}
+
+static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
+					struct bpf_reg_state *reg,
+					const struct btf_type *ref_t,
+					const char *ref_tname, u32 ref_id,
+					struct bpf_kfunc_call_arg_meta *meta,
+					int argno)
+{
+	const struct btf_type *reg_ref_t;
+	bool strict_type_match = false;
+	const struct btf *reg_btf;
+	const char *reg_ref_tname;
+	u32 reg_ref_id;
+
+	if (base_type(reg->type) == PTR_TO_BTF_ID) {
+		reg_btf = reg->btf;
+		reg_ref_id = reg->btf_id;
+	} else {
+		reg_btf = btf_vmlinux;
+		reg_ref_id = *reg2btf_ids[base_type(reg->type)];
+	}
+
+	if (is_kfunc_trusted_args(meta) || (is_kfunc_release(meta) && reg->ref_obj_id))
+		strict_type_match = true;
+
+	reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, &reg_ref_id);
+	reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off);
+	if (!btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->off, meta->btf, ref_id, strict_type_match)) {
+		verbose(env, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
+			meta->func_name, argno, btf_type_str(ref_t), ref_tname, argno + 1,
+			btf_type_str(reg_ref_t), reg_ref_tname);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int process_kf_arg_ptr_to_kptr(struct bpf_verifier_env *env,
+				      struct bpf_reg_state *reg,
+				      const struct btf_type *ref_t,
+				      const char *ref_tname,
+				      struct bpf_kfunc_call_arg_meta *meta,
+				      int argno)
+{
+	struct btf_field *kptr_field;
+
+	/* check_func_arg_reg_off allows var_off for
+	 * PTR_TO_MAP_VALUE, but we need fixed offset to find
+	 * off_desc.
+	 */
+	if (!tnum_is_const(reg->var_off)) {
+		verbose(env, "arg#0 must have constant offset\n");
+		return -EINVAL;
+	}
+
+	kptr_field = btf_record_find(reg->map_ptr->record, reg->off + reg->var_off.value, BPF_KPTR);
+	if (!kptr_field || kptr_field->type != BPF_KPTR_REF) {
+		verbose(env, "arg#0 no referenced kptr at map value offset=%llu\n",
+			reg->off + reg->var_off.value);
+		return -EINVAL;
+	}
+
+	if (!btf_struct_ids_match(&env->log, meta->btf, ref_t->type, 0, kptr_field->kptr.btf,
+				  kptr_field->kptr.btf_id, true)) {
+		verbose(env, "kernel function %s args#%d expected pointer to %s %s\n",
+			meta->func_name, argno, btf_type_str(ref_t), ref_tname);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int ref_set_release_on_unlock(struct bpf_verifier_env *env, u32 ref_obj_id)
+{
+	struct bpf_func_state *state = cur_func(env);
+	struct bpf_reg_state *reg;
+	int i;
+
+	/* bpf_spin_lock only allows calling list_push and list_pop, no BPF
+	 * subprogs, no global functions. This means that the references would
+	 * not be released inside the critical section but they may be added to
+	 * the reference state, and the acquired_refs are never copied out for a
+	 * different frame as BPF to BPF calls don't work in bpf_spin_lock
+	 * critical sections.
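+	 *
+	 * A sketch of the usage this enables (illustrative only; lock, head
+	 * and obj are placeholders):
+	 *
+	 *	bpf_spin_lock(&lock);
+	 *	bpf_list_push_front(&head, &obj->node); // obj becomes release_on_unlock
+	 *	bpf_spin_unlock(&lock);                 // ref released, obj now untrusted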
+	 */
+	if (!ref_obj_id) {
+		verbose(env, "verifier internal error: ref_obj_id is zero for release_on_unlock\n");
+		return -EFAULT;
+	}
+	for (i = 0; i < state->acquired_refs; i++) {
+		if (state->refs[i].id == ref_obj_id) {
+			if (state->refs[i].release_on_unlock) {
+				verbose(env, "verifier internal error: expected false release_on_unlock\n");
+				return -EFAULT;
+			}
+			state->refs[i].release_on_unlock = true;
+			/* Now mark everyone sharing same ref_obj_id as untrusted */
+			bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
+				if (reg->ref_obj_id == ref_obj_id)
+					reg->type |= PTR_UNTRUSTED;
+			}));
+			return 0;
+		}
+	}
+	verbose(env, "verifier internal error: ref state missing for ref_obj_id\n");
+	return -EFAULT;
+}
+
+/* Implementation details:
+ *
+ * Each register points to some region of memory, which we define as an
+ * allocation. Each allocation may embed a bpf_spin_lock which protects any
+ * special BPF objects (bpf_list_head, bpf_rb_root, etc.) part of the same
+ * allocation. The lock and the data it protects are colocated in the same
+ * memory region.
+ *
+ * Hence, every time a register holds a pointer value pointing to such
+ * allocation, the verifier preserves a unique reg->id for it.
+ *
+ * The verifier remembers the lock 'ptr' and the lock 'id' whenever
+ * bpf_spin_lock is called.
+ *
+ * To enable this, lock state in the verifier captures two values:
+ *	active_lock.ptr = Register's type specific pointer
+ *	active_lock.id  = A unique ID for each register pointer value
+ *
+ * Currently, PTR_TO_MAP_VALUE and PTR_TO_BTF_ID | MEM_ALLOC are the two
+ * supported register types.
+ *
+ * The active_lock.ptr in case of map values is the reg->map_ptr, and in case of
+ * allocated objects is the reg->btf pointer.
+ *
+ * The active_lock.id is non-unique for maps supporting direct_value_addr, as we
+ * can establish the provenance of the map value statically for each distinct
+ * lookup into such maps. They always contain a single map value hence unique
+ * IDs for each pseudo load pessimizes the algorithm and rejects valid programs.
+ *
+ * So, in case of global variables, they use array maps with max_entries = 1,
+ * hence their active_lock.ptr becomes map_ptr and id = 0 (since they all point
+ * into the same map value as max_entries is 1, as described above).
+ *
+ * In case of inner map lookups, the inner map pointer has same map_ptr as the
+ * outer map pointer (in verifier context), but each lookup into an inner map
+ * assigns a fresh reg->id to the lookup, so while lookups into distinct inner
+ * maps from the same outer map share the same map_ptr as active_lock.ptr, they
+ * will get different reg->id assigned to each lookup, hence different
+ * active_lock.id.
+ *
+ * In case of allocated objects, active_lock.ptr is the reg->btf, and the
+ * reg->id is a unique ID preserved after the NULL pointer check on the pointer
+ * returned from bpf_obj_new. Each allocation receives a new reg->id.
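+ *
+ * To make this concrete, a sketch of such an allocation (not part of this
+ * patch; struct and field names are placeholders, using the __contains()
+ * decl-tag convention from the BPF selftests):
+ *
+ *	struct elem {
+ *		struct bpf_list_node node;
+ *		int data;
+ *	};
+ *
+ *	struct map_value {
+ *		struct bpf_spin_lock lock;
+ *		struct bpf_list_head head __contains(elem, node);
+ *	};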
+ */ +static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ +	void *ptr; +	u32 id; + +	switch ((int)reg->type) { +	case PTR_TO_MAP_VALUE: +		ptr = reg->map_ptr; +		break; +	case PTR_TO_BTF_ID | MEM_ALLOC: +	case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED: +		ptr = reg->btf; +		break; +	default: +		verbose(env, "verifier internal error: unknown reg type for lock check\n"); +		return -EFAULT; +	} +	id = reg->id; + +	if (!env->cur_state->active_lock.ptr) +		return -EINVAL; +	if (env->cur_state->active_lock.ptr != ptr || +	    env->cur_state->active_lock.id != id) { +		verbose(env, "held lock and object are not in the same allocation\n"); +		return -EINVAL; +	} +	return 0; +} + +static bool is_bpf_list_api_kfunc(u32 btf_id) +{ +	return btf_id == special_kfunc_list[KF_bpf_list_push_front] || +	       btf_id == special_kfunc_list[KF_bpf_list_push_back] || +	       btf_id == special_kfunc_list[KF_bpf_list_pop_front] || +	       btf_id == special_kfunc_list[KF_bpf_list_pop_back]; +} + +static int process_kf_arg_ptr_to_list_head(struct bpf_verifier_env *env, +					   struct bpf_reg_state *reg, u32 regno, +					   struct bpf_kfunc_call_arg_meta *meta) +{ +	struct btf_field *field; +	struct btf_record *rec; +	u32 list_head_off; + +	if (meta->btf != btf_vmlinux || !is_bpf_list_api_kfunc(meta->func_id)) { +		verbose(env, "verifier internal error: bpf_list_head argument for unknown kfunc\n"); +		return -EFAULT; +	} + +	if (!tnum_is_const(reg->var_off)) { +		verbose(env, +			"R%d doesn't have constant offset. bpf_list_head has to be at the constant offset\n", +			regno); +		return -EINVAL; +	} + +	rec = reg_btf_record(reg); +	list_head_off = reg->off + reg->var_off.value; +	field = btf_record_find(rec, list_head_off, BPF_LIST_HEAD); +	if (!field) { +		verbose(env, "bpf_list_head not found at offset=%u\n", list_head_off); +		return -EINVAL; +	} + +	/* All functions require bpf_list_head to be protected using a bpf_spin_lock */ +	if (check_reg_allocation_locked(env, reg)) { +		verbose(env, "bpf_spin_lock at off=%d must be held for bpf_list_head\n", +			rec->spin_lock_off); +		return -EINVAL; +	} + +	if (meta->arg_list_head.field) { +		verbose(env, "verifier internal error: repeating bpf_list_head arg\n"); +		return -EFAULT; +	} +	meta->arg_list_head.field = field; +	return 0; +} + +static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env, +					   struct bpf_reg_state *reg, u32 regno, +					   struct bpf_kfunc_call_arg_meta *meta) +{ +	const struct btf_type *et, *t; +	struct btf_field *field; +	struct btf_record *rec; +	u32 list_node_off; + +	if (meta->btf != btf_vmlinux || +	    (meta->func_id != special_kfunc_list[KF_bpf_list_push_front] && +	     meta->func_id != special_kfunc_list[KF_bpf_list_push_back])) { +		verbose(env, "verifier internal error: bpf_list_node argument for unknown kfunc\n"); +		return -EFAULT; +	} + +	if (!tnum_is_const(reg->var_off)) { +		verbose(env, +			"R%d doesn't have constant offset. 
bpf_list_node has to be at the constant offset\n", +			regno); +		return -EINVAL; +	} + +	rec = reg_btf_record(reg); +	list_node_off = reg->off + reg->var_off.value; +	field = btf_record_find(rec, list_node_off, BPF_LIST_NODE); +	if (!field || field->offset != list_node_off) { +		verbose(env, "bpf_list_node not found at offset=%u\n", list_node_off); +		return -EINVAL; +	} + +	field = meta->arg_list_head.field; + +	et = btf_type_by_id(field->list_head.btf, field->list_head.value_btf_id); +	t = btf_type_by_id(reg->btf, reg->btf_id); +	if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, 0, field->list_head.btf, +				  field->list_head.value_btf_id, true)) { +		verbose(env, "operation on bpf_list_head expects arg#1 bpf_list_node at offset=%d " +			"in struct %s, but arg is at offset=%d in struct %s\n", +			field->list_head.node_offset, btf_name_by_offset(field->list_head.btf, et->name_off), +			list_node_off, btf_name_by_offset(reg->btf, t->name_off)); +		return -EINVAL; +	} + +	if (list_node_off != field->list_head.node_offset) { +		verbose(env, "arg#1 offset=%d, but expected bpf_list_node at offset=%d in struct %s\n", +			list_node_off, field->list_head.node_offset, +			btf_name_by_offset(field->list_head.btf, et->name_off)); +		return -EINVAL; +	} +	/* Set arg#1 for expiration after unlock */ +	return ref_set_release_on_unlock(env, reg->ref_obj_id); +} + +static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta) +{ +	const char *func_name = meta->func_name, *ref_tname; +	const struct btf *btf = meta->btf; +	const struct btf_param *args; +	u32 i, nargs; +	int ret; + +	args = (const struct btf_param *)(meta->func_proto + 1); +	nargs = btf_type_vlen(meta->func_proto); +	if (nargs > MAX_BPF_FUNC_REG_ARGS) { +		verbose(env, "Function %s has %d > %d args\n", func_name, nargs, +			MAX_BPF_FUNC_REG_ARGS); +		return -EINVAL; +	} + +	/* Check that BTF function arguments match actual types that the +	 * verifier sees. 
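+	 *
+	 * Matching is driven by parameter-name suffixes in the kfunc BTF (see
+	 * the __kfunc_param_match_suffix() helpers above). E.g. a hypothetical
+	 * kfunc
+	 *
+	 *	void *bpf_example(void *p__ign, void *buf, u32 buf__sz, u64 tid__k);
+	 *
+	 * has p skipped entirely, buf sized by the following __sz argument,
+	 * and tid required to be a known scalar constant.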
+	 */
+	for (i = 0; i < nargs; i++) {
+		struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[i + 1];
+		const struct btf_type *t, *ref_t, *resolve_ret;
+		enum bpf_arg_type arg_type = ARG_DONTCARE;
+		u32 regno = i + 1, ref_id, type_size;
+		bool is_ret_buf_sz = false;
+		int kf_arg_type;
+
+		t = btf_type_skip_modifiers(btf, args[i].type, NULL);
+
+		if (is_kfunc_arg_ignore(btf, &args[i]))
+			continue;
+
+		if (btf_type_is_scalar(t)) {
+			if (reg->type != SCALAR_VALUE) {
+				verbose(env, "R%d is not a scalar\n", regno);
+				return -EINVAL;
+			}
+
+			if (is_kfunc_arg_constant(meta->btf, &args[i])) {
+				if (meta->arg_constant.found) {
+					verbose(env, "verifier internal error: only one constant argument permitted\n");
+					return -EFAULT;
+				}
+				if (!tnum_is_const(reg->var_off)) {
+					verbose(env, "R%d must be a known constant\n", regno);
+					return -EINVAL;
+				}
+				ret = mark_chain_precision(env, regno);
+				if (ret < 0)
+					return ret;
+				meta->arg_constant.found = true;
+				meta->arg_constant.value = reg->var_off.value;
+			} else if (is_kfunc_arg_scalar_with_name(btf, &args[i], "rdonly_buf_size")) {
+				meta->r0_rdonly = true;
+				is_ret_buf_sz = true;
+			} else if (is_kfunc_arg_scalar_with_name(btf, &args[i], "rdwr_buf_size")) {
+				is_ret_buf_sz = true;
+			}
+
+			if (is_ret_buf_sz) {
+				if (meta->r0_size) {
+					verbose(env, "2 or more rdonly/rdwr_buf_size parameters for kfunc\n");
+					return -EINVAL;
+				}
+
+				if (!tnum_is_const(reg->var_off)) {
+					verbose(env, "R%d is not a const\n", regno);
+					return -EINVAL;
+				}
+
+				meta->r0_size = reg->var_off.value;
+				ret = mark_chain_precision(env, regno);
+				if (ret)
+					return ret;
+			}
+			continue;
+		}
+
+		if (!btf_type_is_ptr(t)) {
+			verbose(env, "Unrecognized arg#%d type %s\n", i, btf_type_str(t));
+			return -EINVAL;
+		}
+
+		if (reg->ref_obj_id) {
+			if (is_kfunc_release(meta) && meta->ref_obj_id) {
+				verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
+					regno, reg->ref_obj_id,
+					meta->ref_obj_id);
+				return -EFAULT;
+			}
+			meta->ref_obj_id = reg->ref_obj_id;
+			if (is_kfunc_release(meta))
+				meta->release_regno = regno;
+		}
+
+		ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
+		ref_tname = btf_name_by_offset(btf, ref_t->name_off);
+
+		kf_arg_type = get_kfunc_ptr_arg_type(env, meta, t, ref_t, ref_tname, args, i, nargs);
+		if (kf_arg_type < 0)
+			return kf_arg_type;
+
+		switch (kf_arg_type) {
+		case KF_ARG_PTR_TO_ALLOC_BTF_ID:
+		case KF_ARG_PTR_TO_BTF_ID:
+			if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta))
+				break;
+
+			if (!is_trusted_reg(reg)) {
+				if (!is_kfunc_rcu(meta)) {
+					verbose(env, "R%d must be referenced or trusted\n", regno);
+					return -EINVAL;
+				}
+				if (!is_rcu_reg(reg)) {
+					verbose(env, "R%d must be a rcu pointer\n", regno);
+					return -EINVAL;
+				}
+			}
+
+			fallthrough;
+		case KF_ARG_PTR_TO_CTX:
+			/* Trusted arguments have the same offset checks as release arguments */
+			arg_type |= OBJ_RELEASE;
+			break;
+		case KF_ARG_PTR_TO_KPTR:
+		case KF_ARG_PTR_TO_DYNPTR:
+		case KF_ARG_PTR_TO_LIST_HEAD:
+		case KF_ARG_PTR_TO_LIST_NODE:
+		case KF_ARG_PTR_TO_MEM:
+		case KF_ARG_PTR_TO_MEM_SIZE:
+			/* Trusted by default */
+			break;
+		default:
+			WARN_ON_ONCE(1);
+			return -EFAULT;
+		}
+
+		if (is_kfunc_release(meta) && reg->ref_obj_id)
+			arg_type |= OBJ_RELEASE;
+		ret = check_func_arg_reg_off(env, reg, regno, arg_type);
+		if (ret < 0)
return ret; + +		switch (kf_arg_type) { +		case KF_ARG_PTR_TO_CTX: +			if (reg->type != PTR_TO_CTX) { +				verbose(env, "arg#%d expected pointer to ctx, but got %s\n", i, btf_type_str(t)); +				return -EINVAL; +			} + +			if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) { +				ret = get_kern_ctx_btf_id(&env->log, resolve_prog_type(env->prog)); +				if (ret < 0) +					return -EINVAL; +				meta->ret_btf_id  = ret; +			} +			break; +		case KF_ARG_PTR_TO_ALLOC_BTF_ID: +			if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { +				verbose(env, "arg#%d expected pointer to allocated object\n", i); +				return -EINVAL; +			} +			if (!reg->ref_obj_id) { +				verbose(env, "allocated object must be referenced\n"); +				return -EINVAL; +			} +			if (meta->btf == btf_vmlinux && +			    meta->func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) { +				meta->arg_obj_drop.btf = reg->btf; +				meta->arg_obj_drop.btf_id = reg->btf_id; +			} +			break; +		case KF_ARG_PTR_TO_KPTR: +			if (reg->type != PTR_TO_MAP_VALUE) { +				verbose(env, "arg#0 expected pointer to map value\n"); +				return -EINVAL; +			} +			ret = process_kf_arg_ptr_to_kptr(env, reg, ref_t, ref_tname, meta, i); +			if (ret < 0) +				return ret; +			break; +		case KF_ARG_PTR_TO_DYNPTR: +			if (reg->type != PTR_TO_STACK && +			    reg->type != CONST_PTR_TO_DYNPTR) { +				verbose(env, "arg#%d expected pointer to stack or dynptr_ptr\n", i); +				return -EINVAL; +			} + +			ret = process_dynptr_func(env, regno, ARG_PTR_TO_DYNPTR | MEM_RDONLY, NULL); +			if (ret < 0) +				return ret; +			break; +		case KF_ARG_PTR_TO_LIST_HEAD: +			if (reg->type != PTR_TO_MAP_VALUE && +			    reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { +				verbose(env, "arg#%d expected pointer to map value or allocated object\n", i); +				return -EINVAL; +			} +			if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) { +				verbose(env, "allocated object must be referenced\n"); +				return -EINVAL; +			} +			ret = process_kf_arg_ptr_to_list_head(env, reg, regno, meta); +			if (ret < 0) +				return ret; +			break; +		case KF_ARG_PTR_TO_LIST_NODE: +			if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { +				verbose(env, "arg#%d expected pointer to allocated object\n", i); +				return -EINVAL; +			} +			if (!reg->ref_obj_id) { +				verbose(env, "allocated object must be referenced\n"); +				return -EINVAL; +			} +			ret = process_kf_arg_ptr_to_list_node(env, reg, regno, meta); +			if (ret < 0) +				return ret; +			break; +		case KF_ARG_PTR_TO_BTF_ID: +			/* Only base_type is checked, further checks are done here */ +			if ((base_type(reg->type) != PTR_TO_BTF_ID || +			     (bpf_type_has_unsafe_modifiers(reg->type) && !is_rcu_reg(reg))) && +			    !reg2btf_ids[base_type(reg->type)]) { +				verbose(env, "arg#%d is %s ", i, reg_type_str(env, reg->type)); +				verbose(env, "expected %s or socket\n", +					reg_type_str(env, base_type(reg->type) | +							  (type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS))); +				return -EINVAL; +			} +			ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i); +			if (ret < 0) +				return ret; +			break; +		case KF_ARG_PTR_TO_MEM: +			resolve_ret = btf_resolve_size(btf, ref_t, &type_size); +			if (IS_ERR(resolve_ret)) { +				verbose(env, "arg#%d reference type('%s %s') size cannot be determined: %ld\n", +					i, btf_type_str(ref_t), ref_tname, PTR_ERR(resolve_ret)); +				return -EINVAL; +			} +			ret = check_mem_reg(env, reg, regno, type_size); +			if (ret < 0) +				return ret; +			break; +		case 
KF_ARG_PTR_TO_MEM_SIZE:
+			ret = check_kfunc_mem_size_reg(env, &regs[regno + 1], regno + 1);
+			if (ret < 0) {
+				verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1);
+				return ret;
+			}
+			/* Skip next '__sz' argument */
+			i++;
+			break;
+		}
+	}
+
+	if (is_kfunc_release(meta) && !meta->release_regno) {
+		verbose(env, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n",
+			func_name);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			    int *insn_idx_p)
 {
 	const struct btf_type *t, *func, *func_proto, *ptr_type;
 	struct bpf_reg_state *regs = cur_regs(env);
-	struct bpf_kfunc_arg_meta meta = { 0 };
 	const char *func_name, *ptr_type_name;
+	bool sleepable, rcu_lock, rcu_unlock;
+	struct bpf_kfunc_call_arg_meta meta;
 	u32 i, nargs, func_id, ptr_type_id;
 	int err, insn_idx = *insn_idx_p;
 	const struct btf_param *args;
+	const struct btf_type *ret_t;
 	struct btf *desc_btf;
 	u32 *kfunc_flags;
-	bool acq;
 
 	/* skip for now, but return error when we find this in fixup_kfunc_call */
 	if (!insn->imm)
@@ -7700,24 +9092,68 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			func_name);
 		return -EACCES;
 	}
-	if (*kfunc_flags & KF_DESTRUCTIVE && !capable(CAP_SYS_BOOT)) {
-		verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capabilities\n");
+
+	/* Prepare kfunc call metadata */
+	memset(&meta, 0, sizeof(meta));
+	meta.btf = desc_btf;
+	meta.func_id = func_id;
+	meta.kfunc_flags = *kfunc_flags;
+	meta.func_proto = func_proto;
+	meta.func_name = func_name;
+
+	if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) {
+		verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n");
 		return -EACCES;
 	}
-	acq = *kfunc_flags & KF_ACQUIRE;
 
+	sleepable = is_kfunc_sleepable(&meta);
+	if (sleepable && !env->prog->aux->sleepable) {
+		verbose(env, "program must be sleepable to call sleepable kfunc %s\n", func_name);
+		return -EACCES;
+	}
+
+	rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta);
+	rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta);
+	if ((rcu_lock || rcu_unlock) && !env->rcu_tag_supported) {
+		verbose(env, "no vmlinux btf rcu tag support for kfunc %s\n", func_name);
+		return -EACCES;
+	}
 
-	meta.flags = *kfunc_flags;
+	if (env->cur_state->active_rcu_lock) {
+		struct bpf_func_state *state;
+		struct bpf_reg_state *reg;
+
+		if (rcu_lock) {
+			verbose(env, "nested rcu read lock (kernel function %s)\n", func_name);
+			return -EINVAL;
+		} else if (rcu_unlock) {
+			bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
+				if (reg->type & MEM_RCU) {
+					reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL);
+					reg->type |= PTR_UNTRUSTED;
+				}
+			}));
+			env->cur_state->active_rcu_lock = false;
+		} else if (sleepable) {
+			verbose(env, "kernel func %s is sleepable within rcu_read_lock region\n", func_name);
+			return -EACCES;
+		}
+	} else if (rcu_lock) {
+		env->cur_state->active_rcu_lock = true;
+	} else if (rcu_unlock) {
+		verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name);
+		return -EINVAL;
+	}
 
 	/* Check the arguments */
-	err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs, &meta);
+	err = check_kfunc_args(env, &meta);
 	if (err < 0)
 		return err;
 	/* In case of release function, we get register number of refcounted
-	 * PTR_TO_BTF_ID back from btf_check_kfunc_arg_match, do the release now
+	 * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release
now.  	 */ -	if (err) { -		err = release_reference(env, regs[err].ref_obj_id); +	if (meta.release_regno) { +		err = release_reference(env, regs[meta.release_regno].ref_obj_id);  		if (err) {  			verbose(env, "kfunc %s#%d reference has not been acquired before\n",  				func_name, func_id); @@ -7731,18 +9167,92 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,  	/* Check return type */  	t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL); -	if (acq && !btf_type_is_struct_ptr(desc_btf, t)) { -		verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n"); -		return -EINVAL; +	if (is_kfunc_acquire(&meta) && !btf_type_is_struct_ptr(meta.btf, t)) { +		/* Only exception is bpf_obj_new_impl */ +		if (meta.btf != btf_vmlinux || meta.func_id != special_kfunc_list[KF_bpf_obj_new_impl]) { +			verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n"); +			return -EINVAL; +		}  	}  	if (btf_type_is_scalar(t)) {  		mark_reg_unknown(env, regs, BPF_REG_0);  		mark_btf_func_reg_size(env, BPF_REG_0, t->size);  	} else if (btf_type_is_ptr(t)) { -		ptr_type = btf_type_skip_modifiers(desc_btf, t->type, -						   &ptr_type_id); -		if (!btf_type_is_struct(ptr_type)) { +		ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id); + +		if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) { +			if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl]) { +				struct btf *ret_btf; +				u32 ret_btf_id; + +				if (unlikely(!bpf_global_ma_set)) +					return -ENOMEM; + +				if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) { +					verbose(env, "local type ID argument must be in range [0, U32_MAX]\n"); +					return -EINVAL; +				} + +				ret_btf = env->prog->aux->btf; +				ret_btf_id = meta.arg_constant.value; + +				/* This may be NULL due to user not supplying a BTF */ +				if (!ret_btf) { +					verbose(env, "bpf_obj_new requires prog BTF\n"); +					return -EINVAL; +				} + +				ret_t = btf_type_by_id(ret_btf, ret_btf_id); +				if (!ret_t || !__btf_type_is_struct(ret_t)) { +					verbose(env, "bpf_obj_new type ID argument must be of a struct\n"); +					return -EINVAL; +				} + +				mark_reg_known_zero(env, regs, BPF_REG_0); +				regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; +				regs[BPF_REG_0].btf = ret_btf; +				regs[BPF_REG_0].btf_id = ret_btf_id; + +				env->insn_aux_data[insn_idx].obj_new_size = ret_t->size; +				env->insn_aux_data[insn_idx].kptr_struct_meta = +					btf_find_struct_meta(ret_btf, ret_btf_id); +			} else if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) { +				env->insn_aux_data[insn_idx].kptr_struct_meta = +					btf_find_struct_meta(meta.arg_obj_drop.btf, +							     meta.arg_obj_drop.btf_id); +			} else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] || +				   meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) { +				struct btf_field *field = meta.arg_list_head.field; + +				mark_reg_known_zero(env, regs, BPF_REG_0); +				regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; +				regs[BPF_REG_0].btf = field->list_head.btf; +				regs[BPF_REG_0].btf_id = field->list_head.value_btf_id; +				regs[BPF_REG_0].off = field->list_head.node_offset; +			} else if (meta.func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) { +				mark_reg_known_zero(env, regs, BPF_REG_0); +				regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED; +				regs[BPF_REG_0].btf = desc_btf; +				regs[BPF_REG_0].btf_id = meta.ret_btf_id; +			} else if (meta.func_id == 
special_kfunc_list[KF_bpf_rdonly_cast]) {
+				ret_t = btf_type_by_id(desc_btf, meta.arg_constant.value);
+				if (!ret_t || !btf_type_is_struct(ret_t)) {
+					verbose(env,
+						"kfunc bpf_rdonly_cast type ID argument must be of a struct\n");
+					return -EINVAL;
+				}
+
+				mark_reg_known_zero(env, regs, BPF_REG_0);
+				regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED;
+				regs[BPF_REG_0].btf = desc_btf;
+				regs[BPF_REG_0].btf_id = meta.arg_constant.value;
+			} else {
+				verbose(env, "kernel function %s unhandled dynamic return type\n",
+					meta.func_name);
+				return -EFAULT;
+			}
+		} else if (!__btf_type_is_struct(ptr_type)) {
 			if (!meta.r0_size) {
 				ptr_type_name = btf_name_by_offset(desc_btf,
 								   ptr_type->name_off);
@@ -7770,20 +9280,24 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			regs[BPF_REG_0].type = PTR_TO_BTF_ID;
 			regs[BPF_REG_0].btf_id = ptr_type_id;
 		}
-		if (*kfunc_flags & KF_RET_NULL) {
+
+		if (is_kfunc_ret_null(&meta)) {
 			regs[BPF_REG_0].type |= PTR_MAYBE_NULL;
 			/* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */
 			regs[BPF_REG_0].id = ++env->id_gen;
 		}
 		mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *));
-		if (acq) {
+		if (is_kfunc_acquire(&meta)) {
 			int id = acquire_reference_state(env, insn_idx);
 
 			if (id < 0)
 				return id;
-			regs[BPF_REG_0].id = id;
+			if (is_kfunc_ret_null(&meta))
+				regs[BPF_REG_0].id = id;
 			regs[BPF_REG_0].ref_obj_id = id;
 		}
+		if (reg_may_point_to_spin_lock(&regs[BPF_REG_0]) && !regs[BPF_REG_0].id)
+			regs[BPF_REG_0].id = ++env->id_gen;
 	} /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */
 
 	nargs = btf_type_vlen(func_proto);
@@ -9211,6 +10725,11 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 				return err;
 			return adjust_ptr_min_max_vals(env, insn,
 						       dst_reg, src_reg);
+		} else if (dst_reg->precise) {
+			/* if dst_reg is precise, src_reg should be precise as well */
+			err = mark_chain_precision(env, insn->src_reg);
+			if (err)
+				return err;
 		}
 	} else {
 		/* Pretend the src is a reg with a known value, since we only
@@ -9950,17 +11469,20 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
 				 bool is_null)
 {
 	if (type_may_be_null(reg->type) && reg->id == id &&
-	    !WARN_ON_ONCE(!reg->id)) {
-		if (WARN_ON_ONCE(reg->smin_value || reg->smax_value ||
-				 !tnum_equals_const(reg->var_off, 0) ||
-				 reg->off)) {
-			/* Old offset (both fixed and variable parts) should
-			 * have been known-zero, because we don't allow pointer
-			 * arithmetic on pointers that might be NULL. If we
-			 * see this happening, don't convert the register.
-			 */
+	    (is_rcu_reg(reg) || !WARN_ON_ONCE(!reg->id))) {
+		/* Old offset (both fixed and variable parts) should have been
+		 * known-zero, because we don't allow pointer arithmetic on
+		 * pointers that might be NULL. If we see this happening, don't
+		 * convert the register.
+		 *
+		 * But in some cases, some helpers that return local kptrs
+		 * advance offset for the returned pointer. In those cases, it
+		 * is fine to expect to see reg->off.
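+	 *
+	 * E.g. bpf_list_pop_front() above returns PTR_TO_BTF_ID | MEM_ALLOC |
+	 * PTR_MAYBE_NULL with reg->off preset to the embedded bpf_list_node's
+	 * offset, and the NULL check on that pointer must still succeed.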
+		 */ +		if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || !tnum_equals_const(reg->var_off, 0))) +			return; +		if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC | PTR_MAYBE_NULL) && WARN_ON_ONCE(reg->off))  			return; -		}  		if (is_null) {  			reg->type = SCALAR_VALUE;  			/* We don't need id and ref_obj_id from this point @@ -10134,6 +11656,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,  	struct bpf_verifier_state *other_branch;  	struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;  	struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL; +	struct bpf_reg_state *eq_branch_regs;  	u8 opcode = BPF_OP(insn->code);  	bool is_jmp32;  	int pred = -1; @@ -10243,8 +11766,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,  	/* detect if we are comparing against a constant value so we can adjust  	 * our min/max values for our dst register.  	 * this is only legit if both are scalars (or pointers to the same -	 * object, I suppose, but we don't support that right now), because -	 * otherwise the different base pointers mean the offsets aren't +	 * object, I suppose, see the PTR_MAYBE_NULL related if block below), +	 * because otherwise the different base pointers mean the offsets aren't  	 * comparable.  	 */  	if (BPF_SRC(insn->code) == BPF_X) { @@ -10293,6 +11816,36 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,  		find_equal_scalars(other_branch, &other_branch_regs[insn->dst_reg]);  	} +	/* if one pointer register is compared to another pointer +	 * register check if PTR_MAYBE_NULL could be lifted. +	 * E.g. register A - maybe null +	 *      register B - not null +	 * for JNE A, B, ... - A is not null in the false branch; +	 * for JEQ A, B, ... - A is not null in the true branch. +	 */ +	if (!is_jmp32 && BPF_SRC(insn->code) == BPF_X && +	    __is_pointer_value(false, src_reg) && __is_pointer_value(false, dst_reg) && +	    type_may_be_null(src_reg->type) != type_may_be_null(dst_reg->type)) { +		eq_branch_regs = NULL; +		switch (opcode) { +		case BPF_JEQ: +			eq_branch_regs = other_branch_regs; +			break; +		case BPF_JNE: +			eq_branch_regs = regs; +			break; +		default: +			/* do nothing */ +			break; +		} +		if (eq_branch_regs) { +			if (type_may_be_null(src_reg->type)) +				mark_ptr_not_null_reg(&eq_branch_regs[insn->src_reg]); +			else +				mark_ptr_not_null_reg(&eq_branch_regs[insn->dst_reg]); +		} +	} +  	/* detect if R == 0 where R is returned from bpf_map_lookup_elem().  	 * NOTE: these optimizations below are related with pointer comparison  	 *       which will never be JMP32. 
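The JEQ/JNE block added above can be exercised with a fragment like the following (an illustrative sketch, not from this patch; map and struct names are placeholders). One pointer is maybe-NULL from a map lookup, the other has already been null-checked, so the comparison lets the verifier lift PTR_MAYBE_NULL in the matching branch:

	struct foo *maybe = bpf_map_lookup_elem(&m1, &key);	/* maybe NULL */
	struct foo *known = bpf_map_lookup_elem(&m2, &key);

	if (!known)
		return 0;
	if (maybe == known)
		/* JEQ true branch: 'maybe' is proven non-NULL here */
		maybe->cnt++;
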
@@ -10399,8 +11952,8 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)  	    insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) {  		dst_reg->type = PTR_TO_MAP_VALUE;  		dst_reg->off = aux->map_off; -		if (map_value_has_spin_lock(map)) -			dst_reg->id = ++env->id_gen; +		WARN_ON_ONCE(map->max_entries != 1); +		/* We want reg->id to be same (0) as map_value is not distinct */  	} else if (insn->src_reg == BPF_PSEUDO_MAP_FD ||  		   insn->src_reg == BPF_PSEUDO_MAP_IDX) {  		dst_reg->type = CONST_PTR_TO_MAP; @@ -10478,11 +12031,16 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)  		return err;  	} -	if (env->cur_state->active_spin_lock) { +	if (env->cur_state->active_lock.ptr) {  		verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n");  		return -EINVAL;  	} +	if (env->cur_state->active_rcu_lock) { +		verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_rcu_read_lock-ed region\n"); +		return -EINVAL; +	} +  	if (regs[ctx_reg].type != PTR_TO_CTX) {  		verbose(env,  			"at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); @@ -10684,7 +12242,7 @@ static int check_return_code(struct bpf_verifier_env *env)   * 3      let S be a stack   * 4      S.push(v)   * 5      while S is not empty - * 6            t <- S.pop() + * 6            t <- S.peek()   * 7            if t is what we're looking for:   * 8                return t   * 9            for all edges e in G.adjacentEdges(t) do @@ -10733,11 +12291,16 @@ static struct bpf_verifier_state_list **explored_state(  	return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)];  } -static void init_explored_state(struct bpf_verifier_env *env, int idx) +static void mark_prune_point(struct bpf_verifier_env *env, int idx)  {  	env->insn_aux_data[idx].prune_point = true;  } +static bool is_prune_point(struct bpf_verifier_env *env, int insn_idx) +{ +	return env->insn_aux_data[insn_idx].prune_point; +} +  enum {  	DONE_EXPLORING = 0,  	KEEP_EXPLORING = 1, @@ -10766,9 +12329,11 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,  		return -EINVAL;  	} -	if (e == BRANCH) +	if (e == BRANCH) {  		/* mark branch target for state pruning */ -		init_explored_state(env, w); +		mark_prune_point(env, w); +		mark_jmp_point(env, w); +	}  	if (insn_state[w] == 0) {  		/* tree-edge */ @@ -10795,8 +12360,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,  	return DONE_EXPLORING;  } -static int visit_func_call_insn(int t, int insn_cnt, -				struct bpf_insn *insns, +static int visit_func_call_insn(int t, struct bpf_insn *insns,  				struct bpf_verifier_env *env,  				bool visit_callee)  { @@ -10806,10 +12370,12 @@ static int visit_func_call_insn(int t, int insn_cnt,  	if (ret)  		return ret; -	if (t + 1 < insn_cnt) -		init_explored_state(env, t + 1); +	mark_prune_point(env, t + 1); +	/* when we exit from subprog, we need to record non-linear history */ +	mark_jmp_point(env, t + 1); +  	if (visit_callee) { -		init_explored_state(env, t); +		mark_prune_point(env, t);  		ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env,  				/* It's ok to allow recursion from CFG point of  				 * view. 
__check_func_call() will do the actual @@ -10825,13 +12391,13 @@ static int visit_func_call_insn(int t, int insn_cnt,   *  DONE_EXPLORING - the instruction was fully explored   *  KEEP_EXPLORING - there is still work to be done before it is fully explored   */ -static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) +static int visit_insn(int t, struct bpf_verifier_env *env)  {  	struct bpf_insn *insns = env->prog->insnsi;  	int ret;  	if (bpf_pseudo_func(insns + t)) -		return visit_func_call_insn(t, insn_cnt, insns, env, true); +		return visit_func_call_insn(t, insns, env, true);  	/* All non-branch instructions have a single fall-through edge. */  	if (BPF_CLASS(insns[t].code) != BPF_JMP && @@ -10844,13 +12410,13 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)  	case BPF_CALL:  		if (insns[t].imm == BPF_FUNC_timer_set_callback) -			/* Mark this call insn to trigger is_state_visited() check -			 * before call itself is processed by __check_func_call(). -			 * Otherwise new async state will be pushed for further -			 * exploration. +			/* Mark this call insn as a prune point to trigger +			 * is_state_visited() check before call itself is +			 * processed by __check_func_call(). Otherwise new +			 * async state will be pushed for further exploration.  			 */ -			init_explored_state(env, t); -		return visit_func_call_insn(t, insn_cnt, insns, env, +			mark_prune_point(env, t); +		return visit_func_call_insn(t, insns, env,  					    insns[t].src_reg == BPF_PSEUDO_CALL);  	case BPF_JA: @@ -10863,22 +12429,15 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)  		if (ret)  			return ret; -		/* unconditional jmp is not a good pruning point, -		 * but it's marked, since backtracking needs -		 * to record jmp history in is_state_visited(). 
-		 */
-		init_explored_state(env, t + insns[t].off + 1);
-		/* tell verifier to check for equivalent states
-		 * after every call and jump
-		 */
-		if (t + 1 < insn_cnt)
-			init_explored_state(env, t + 1);
+		mark_prune_point(env, t + insns[t].off + 1);
+		mark_jmp_point(env, t + insns[t].off + 1);
 
 		return ret;
 
 	default:
 		/* conditional jump with two edges */
-		init_explored_state(env, t);
+		mark_prune_point(env, t);
+
 		ret = push_insn(t, t + 1, FALLTHROUGH, env, true);
 		if (ret)
 			return ret;
@@ -10914,7 +12473,7 @@ static int check_cfg(struct bpf_verifier_env *env)
 	while (env->cfg.cur_stack > 0) {
 		int t = insn_stack[env->cfg.cur_stack - 1];
 
-		ret = visit_insn(t, insn_cnt, env);
+		ret = visit_insn(t, env);
 		switch (ret) {
 		case DONE_EXPLORING:
 			insn_state[t] = EXPLORED;
@@ -11505,15 +13064,6 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
 
 	equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, parent)) == 0;
 
-	if (rold->type == PTR_TO_STACK)
-		/* two stack pointers are equal only if they're pointing to
-		 * the same stack frame, since fp-8 in foo != fp-8 in bar
-		 */
-		return equal && rold->frameno == rcur->frameno;
-
-	if (equal)
-		return true;
-
 	if (rold->type == NOT_INIT)
 		/* explored state can't have used this */
 		return true;
@@ -11521,10 +13071,12 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
 		return false;
 	switch (base_type(rold->type)) {
 	case SCALAR_VALUE:
+		if (equal)
+			return true;
 		if (env->explore_alu_limits)
 			return false;
 		if (rcur->type == SCALAR_VALUE) {
-			if (!rold->precise && !rcur->precise)
+			if (!rold->precise)
				return true;
 			/* new val must satisfy old val knowledge */
 			return range_within(rold, rcur) &&
@@ -11567,7 +13119,8 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
 		 */
 		return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
 		       range_within(rold, rcur) &&
-		       tnum_in(rold->var_off, rcur->var_off);
+		       tnum_in(rold->var_off, rcur->var_off) &&
+		       check_ids(rold->id, rcur->id, idmap);
 	case PTR_TO_PACKET_META:
 	case PTR_TO_PACKET:
 		if (rcur->type != rold->type)
@@ -11591,20 +13144,14 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
 		/* new val must satisfy old val knowledge */
 		return range_within(rold, rcur) &&
 		       tnum_in(rold->var_off, rcur->var_off);
-	case PTR_TO_CTX:
-	case CONST_PTR_TO_MAP:
-	case PTR_TO_PACKET_END:
-	case PTR_TO_FLOW_KEYS:
-	case PTR_TO_SOCKET:
-	case PTR_TO_SOCK_COMMON:
-	case PTR_TO_TCP_SOCK:
-	case PTR_TO_XDP_SOCK:
-		/* Only valid matches are exact, which memcmp() above
-		 * would have accepted
+	case PTR_TO_STACK:
+		/* two stack pointers are equal only if they're pointing to
+		 * the same stack frame, since fp-8 in foo != fp-8 in bar
 		 */
+		return equal && rold->frameno == rcur->frameno;
 	default:
-		/* Don't know what's going on, just say it's not safe */
-		return false;
+		/* Only valid matches are exact, which the memcmp() above checked */
+		return equal;
 	}
 
 	/* Shouldn't get here; if we do, say it's not safe */
@@ -11714,7 +13261,6 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
 {
 	int i;
 
-	memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch));
 	for (i = 0; i < MAX_BPF_REG; i++)
 		if (!regsafe(env, &old->regs[i], &cur->regs[i],
 			     env->idmap_scratch))
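The pop to peek correction in the DFS pseudocode earlier matches what check_cfg() actually does in the loop above: an instruction stays on the stack until every out-edge has been followed, which is what makes back-edge (loop) detection possible. A standalone sketch of that shape, with illustrative names, a fixed out-degree of two (fallthrough/branch, as in BPF), and at most 256 nodes; this is not kernel API:

#include <stdbool.h>

enum { DISCOVERED = 1, EXPLORED = 2 };

/* Peek-based DFS. A node is popped only once both out-edges were
 * followed; an edge into a still-DISCOVERED (on-stack) node is a
 * back-edge, i.e. a loop. Caller passes zeroed state[]/next_edge[].
 */
static bool has_back_edge(int start, int n, const int (*adj)[2],
			  int *state, int *next_edge)
{
	int stack[256], top = 0;

	stack[top++] = start;
	state[start] = DISCOVERED;
	while (top > 0) {
		int t = stack[top - 1];		/* peek, do not pop yet */
		int w;

		if (next_edge[t] == 2) {	/* both out-edges explored */
			state[t] = EXPLORED;
			top--;
			continue;
		}
		w = adj[t][next_edge[t]++];
		if (w < 0 || w >= n)
			continue;		/* edge absent */
		if (state[w] == DISCOVERED)
			return true;		/* back-edge found */
		if (state[w] == 0) {
			state[w] = DISCOVERED;
			stack[top++] = w;
		}
	}
	return false;
}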
@@ -11738,13 +13284,28 @@ static bool states_equal(struct bpf_verifier_env *env,
 	if (old->curframe != cur->curframe)
 		return false;
 
+	memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch));
+
 	/* Verification state from speculative execution simulation
 	 * must never prune a non-speculative execution one.
 	 */
 	if (old->speculative && !cur->speculative)
 		return false;
 
-	if (old->active_spin_lock != cur->active_spin_lock)
+	if (old->active_lock.ptr != cur->active_lock.ptr)
+		return false;
+
+	/* Old and cur active_lock IDs have to be either both present
+	 * or both absent.
+	 */
+	if (!!old->active_lock.id != !!cur->active_lock.id)
+		return false;
+
+	if (old->active_lock.id &&
+	    !check_ids(old->active_lock.id, cur->active_lock.id, env->idmap_scratch))
+		return false;
+
+	if (old->active_rcu_lock != cur->active_rcu_lock)
 		return false;
 
 	/* for states to be equal callsites have to be the same
@@ -11847,34 +13408,36 @@ static int propagate_precision(struct bpf_verifier_env *env,
 {
 	struct bpf_reg_state *state_reg;
 	struct bpf_func_state *state;
-	int i, err = 0;
+	int i, err = 0, fr;
 
-	state = old->frame[old->curframe];
-	state_reg = state->regs;
-	for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
-		if (state_reg->type != SCALAR_VALUE ||
-		    !state_reg->precise)
-			continue;
-		if (env->log.level & BPF_LOG_LEVEL2)
-			verbose(env, "propagating r%d\n", i);
-		err = mark_chain_precision(env, i);
-		if (err < 0)
-			return err;
-	}
+	for (fr = old->curframe; fr >= 0; fr--) {
+		state = old->frame[fr];
+		state_reg = state->regs;
+		for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
+			if (state_reg->type != SCALAR_VALUE ||
+			    !state_reg->precise)
+				continue;
+			if (env->log.level & BPF_LOG_LEVEL2)
+				verbose(env, "frame %d: propagating r%d\n", fr, i);
+			err = mark_chain_precision_frame(env, fr, i);
+			if (err < 0)
+				return err;
+		}
 
-	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
-		if (!is_spilled_reg(&state->stack[i]))
-			continue;
-		state_reg = &state->stack[i].spilled_ptr;
-		if (state_reg->type != SCALAR_VALUE ||
-		    !state_reg->precise)
-			continue;
-		if (env->log.level & BPF_LOG_LEVEL2)
-			verbose(env, "propagating fp%d\n",
-				(-i - 1) * BPF_REG_SIZE);
-		err = mark_chain_precision_stack(env, i);
-		if (err < 0)
-			return err;
+		for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+			if (!is_spilled_reg(&state->stack[i]))
+				continue;
+			state_reg = &state->stack[i].spilled_ptr;
+			if (state_reg->type != SCALAR_VALUE ||
+			    !state_reg->precise)
+				continue;
+			if (env->log.level & BPF_LOG_LEVEL2)
+				verbose(env, "frame %d: propagating fp%d\n",
+					fr, (-i - 1) * BPF_REG_SIZE);
+			err = mark_chain_precision_stack_frame(env, fr, i);
+			if (err < 0)
+				return err;
+		}
 	}
 
 	return 0;
 }
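Both the new check_ids() use in regsafe() and the active_lock.id comparison in states_equal() above rest on the same idea: register ids are opaque tokens, so an old state only subsumes the current one if there is a consistent one-to-one mapping between their ids. A toy version of that check, conceptual only; the kernel's bpf_id_pair scratch table differs in detail:

#include <stdbool.h>

struct id_pair { int old_id, cur_id; };

/* Accept the pairing (old_id, cur_id) iff it is consistent with every
 * pairing recorded so far: the same old id must always map to the same
 * cur id. A zeroed table means "no pairings yet".
 */
static bool check_ids_sketch(int old_id, int cur_id,
			     struct id_pair *map, int cap)
{
	int i;

	for (i = 0; i < cap; i++) {
		if (map[i].old_id == 0) {	/* free slot: record pairing */
			map[i].old_id = old_id;
			map[i].cur_id = cur_id;
			return true;
		}
		if (map[i].old_id == old_id)
			return map[i].cur_id == cur_id;
	}
	return false;				/* table full: assume unsafe */
}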
@@ -11906,13 +13469,6 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 	int i, j, err, states_cnt = 0;
 	bool add_new_state = env->test_state_freq ? true : false;
 
-	cur->last_insn_idx = env->prev_insn_idx;
-	if (!env->insn_aux_data[insn_idx].prune_point)
-		/* this 'insn_idx' instruction wasn't marked, so we will not
-		 * be doing state search here
-		 */
-		return 0;
-
 	/* bpf progs typically have pruning point every 4 instructions
 	 * http://vger.kernel.org/bpfconf2019.html#session-1
 	 * Do not add new state for future pruning if the verifier hasn't seen
@@ -12047,10 +13603,10 @@ next:
 		env->max_states_per_insn = states_cnt;
 
 	if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
-		return push_jmp_history(env, cur);
+		return 0;
 
 	if (!add_new_state)
-		return push_jmp_history(env, cur);
+		return 0;
 
 	/* There were no equivalent states, remember the current one.
 	 * Technically the current state is not proven to be safe yet,
@@ -12069,6 +13625,10 @@ next:
 	env->prev_jmps_processed = env->jmps_processed;
 	env->prev_insn_processed = env->insn_processed;
 
+	/* forget precise markings we inherited, see __mark_chain_precision */
+	if (env->bpf_capable)
+		mark_all_scalars_imprecise(env, cur);
+
 	/* add new state to the head of linked list */
 	new = &new_sl->state;
 	err = copy_verifier_state(new, cur);
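The add_new_state seed above is driven by env->test_state_freq, which a privileged caller can force on via the BPF_F_TEST_STATE_FREQ prog flag (set in the bpf_check() hunk at the end of this diff). A hedged sketch of exercising it with modern libbpf; the function name is illustrative and error handling is elided:

#include <linux/bpf.h>
#include <bpf/bpf.h>

/* Load a trivial "r0 = 0; exit" program with BPF_F_TEST_STATE_FREQ so
 * the verifier checkpoints state at every prune point; requires a
 * privileged caller.
 */
static int load_with_state_freq(void)
{
	const struct bpf_insn insns[] = {
		{ .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = BPF_REG_0 },
		{ .code = BPF_JMP | BPF_EXIT },
	};
	LIBBPF_OPTS(bpf_prog_load_opts, opts,
		    .prog_flags = BPF_F_TEST_STATE_FREQ);

	return bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, "freq_test", "GPL",
			     insns, sizeof(insns) / sizeof(insns[0]), &opts);
}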
@@ -12186,21 +13746,31 @@ static int do_check(struct bpf_verifier_env *env)
 			return -E2BIG;
 		}
 
-		err = is_state_visited(env, env->insn_idx);
-		if (err < 0)
-			return err;
-		if (err == 1) {
-			/* found equivalent state, can prune the search */
-			if (env->log.level & BPF_LOG_LEVEL) {
-				if (do_print_state)
-					verbose(env, "\nfrom %d to %d%s: safe\n",
-						env->prev_insn_idx, env->insn_idx,
-						env->cur_state->speculative ?
-						" (speculative execution)" : "");
-				else
-					verbose(env, "%d: safe\n", env->insn_idx);
+		state->last_insn_idx = env->prev_insn_idx;
+
+		if (is_prune_point(env, env->insn_idx)) {
+			err = is_state_visited(env, env->insn_idx);
+			if (err < 0)
+				return err;
+			if (err == 1) {
+				/* found equivalent state, can prune the search */
+				if (env->log.level & BPF_LOG_LEVEL) {
+					if (do_print_state)
+						verbose(env, "\nfrom %d to %d%s: safe\n",
+							env->prev_insn_idx, env->insn_idx,
+							env->cur_state->speculative ?
+							" (speculative execution)" : "");
+					else
+						verbose(env, "%d: safe\n", env->insn_idx);
+				}
+				goto process_bpf_exit;
 			}
-			goto process_bpf_exit;
+		}
+
+		if (is_jmp_point(env, env->insn_idx)) {
+			err = push_jmp_history(env, state);
+			if (err)
+				return err;
 		}
 
 		if (signal_pending(current))
@@ -12383,11 +13953,14 @@ static int do_check(struct bpf_verifier_env *env)
 					return -EINVAL;
 				}
 
-				if (env->cur_state->active_spin_lock &&
-				    (insn->src_reg == BPF_PSEUDO_CALL ||
-				     insn->imm != BPF_FUNC_spin_unlock)) {
-					verbose(env, "function calls are not allowed while holding a lock\n");
-					return -EINVAL;
+				if (env->cur_state->active_lock.ptr) {
+					if ((insn->src_reg == BPF_REG_0 && insn->imm != BPF_FUNC_spin_unlock) ||
+					    (insn->src_reg == BPF_PSEUDO_CALL) ||
+					    (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
+					     (insn->off != 0 || !is_bpf_list_api_kfunc(insn->imm)))) {
+						verbose(env, "function calls are not allowed while holding a lock\n");
+						return -EINVAL;
+					}
 				}
 				if (insn->src_reg == BPF_PSEUDO_CALL)
 					err = check_func_call(env, insn, &env->insn_idx);
@@ -12420,11 +13993,16 @@ static int do_check(struct bpf_verifier_env *env)
 					return -EINVAL;
 				}
 
-				if (env->cur_state->active_spin_lock) {
+				if (env->cur_state->active_lock.ptr) {
 					verbose(env, "bpf_spin_unlock is missing\n");
 					return -EINVAL;
 				}
 
+				if (env->cur_state->active_rcu_lock) {
+					verbose(env, "bpf_rcu_read_unlock is missing\n");
+					return -EINVAL;
+				}
+
 				/* We must do check_reference_leak here before
 				 * prepare_func_exit to handle the case when
 				 * state->curframe > 0, it may be a callback
@@ -12677,7 +14255,14 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
 {
 	enum bpf_prog_type prog_type = resolve_prog_type(prog);
 
-	if (map_value_has_spin_lock(map)) {
+	if (btf_record_has_field(map->record, BPF_LIST_HEAD)) {
+		if (is_tracing_prog_type(prog_type)) {
+			verbose(env, "tracing progs cannot use bpf_list_head yet\n");
+			return -EINVAL;
+		}
+	}
+
+	if (btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
 		if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) {
 			verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n");
 			return -EINVAL;
@@ -12694,7 +14279,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
 		}
 	}
 
-	if (map_value_has_timer(map)) {
+	if (btf_record_has_field(map->record, BPF_TIMER)) {
 		if (is_tracing_prog_type(prog_type)) {
 			verbose(env, "tracing progs cannot use bpf_timer yet\n");
 			return -EINVAL;
@@ -12727,10 +14312,11 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
 		case BPF_MAP_TYPE_INODE_STORAGE:
 		case BPF_MAP_TYPE_SK_STORAGE:
 		case BPF_MAP_TYPE_TASK_STORAGE:
+		case BPF_MAP_TYPE_CGRP_STORAGE:
 			break;
 		default:
 			verbose(env,
-				"Sleepable programs can only use array, hash, and ringbuf maps\n");
+				"Sleepable programs can only use array, hash, ringbuf and local storage maps\n");
 			return -EINVAL;
 		}
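The lock-region carve-out for is_bpf_list_api_kfunc() above, together with the new BPF_LIST_HEAD map-record checks, is what makes locked linked-list updates verifiable. A hedged sketch in BPF C following the selftests' global-lock pattern; bpf_obj_new(), bpf_list_push_back(), and the __contains tag are declared in the selftests' bpf_experimental.h and are kfunc-based, not stable UAPI, and the private() macro mirrors the one in the selftests' linked_list.c:

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include "bpf_experimental.h"

#define private(name) SEC(".data." #name) __hidden __attribute__((aligned(8)))

struct node {
	struct bpf_list_node link;
	int val;
};

private(lock_region) struct bpf_spin_lock glock;
private(lock_region) struct bpf_list_head ghead __contains(node, link);

SEC("tc")
int push_one(struct __sk_buff *skb)
{
	struct node *n;

	n = bpf_obj_new(typeof(*n));	/* allocated object, MEM_ALLOC */
	if (!n)
		return 0;
	n->val = skb->len;

	bpf_spin_lock(&glock);
	/* list kfuncs are the only calls allowed inside the lock region */
	bpf_list_push_back(&ghead, &n->link);
	bpf_spin_unlock(&glock);
	return 0;
}

char _license[] SEC("license") = "GPL";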
@@ -13386,6 +14972,10 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
 		if (!bpf_jit_needs_zext() && !is_cmpxchg_insn(&insn))
 			continue;
 
+		/* Zero-extension is done by the caller. */
+		if (bpf_pseudo_kfunc_call(&insn))
+			continue;
+
 		if (WARN_ON(load_reg == -1)) {
 			verbose(env, "verifier bug. zext_dst is set, but no reg is defined\n");
 			return -EFAULT;
@@ -13513,6 +15103,13 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 			break;
 		case PTR_TO_BTF_ID:
 		case PTR_TO_BTF_ID | PTR_UNTRUSTED:
+		/* PTR_TO_BTF_ID | MEM_ALLOC always has a valid lifetime and an
+		 * active ref_obj_id, unlike plain PTR_TO_BTF_ID; but the same
+		 * cannot be said once it is marked PTR_UNTRUSTED, hence we must
+		 * handle any faults for loads into such types. BPF_WRITE is
+		 * disallowed for this case.
+		 */
+		case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED:
 			if (type == BPF_READ) {
 				insn->code = BPF_LDX | BPF_PROBE_MEM |
 					BPF_SIZE((insn)->code);
@@ -13878,8 +15475,8 @@ static int fixup_call_args(struct bpf_verifier_env *env)
 	return err;
 }
 
-static int fixup_kfunc_call(struct bpf_verifier_env *env,
-			    struct bpf_insn *insn)
+static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+			    struct bpf_insn *insn_buf, int insn_idx, int *cnt)
 {
 	const struct bpf_kfunc_desc *desc;
 
@@ -13889,7 +15486,7 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env,
 	}
 
 	/* insn->imm has the btf func_id. Replace it with
-	 * an address (relative to __bpf_base_call).
+	 * an address (relative to __bpf_call_base).
 	 */
 	desc = find_kfunc_desc(env->prog, insn->imm, insn->off);
 	if (!desc) {
@@ -13898,8 +15495,33 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env,
 		return -EFAULT;
 	}
 
+	*cnt = 0;
 	insn->imm = desc->imm;
-
+	if (insn->off)
+		return 0;
+	if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl]) {
+		struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
+		struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
+		u64 obj_new_size = env->insn_aux_data[insn_idx].obj_new_size;
+
+		insn_buf[0] = BPF_MOV64_IMM(BPF_REG_1, obj_new_size);
+		insn_buf[1] = addr[0];
+		insn_buf[2] = addr[1];
+		insn_buf[3] = *insn;
+		*cnt = 4;
+	} else if (desc->func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
+		struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
+		struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
+
+		insn_buf[0] = addr[0];
+		insn_buf[1] = addr[1];
+		insn_buf[2] = *insn;
+		*cnt = 3;
+	} else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
+		   desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
+		insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
+		*cnt = 1;
+	}
	return 0;
 }
 
@@ -14041,9 +15663,19 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 		if (insn->src_reg == BPF_PSEUDO_CALL)
 			continue;
 		if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
-			ret = fixup_kfunc_call(env, insn);
+			ret = fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt);
 			if (ret)
 				return ret;
+			if (cnt == 0)
+				continue;
+
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta	 += cnt - 1;
+			env->prog = prog = new_prog;
+			insn	  = new_prog->insnsi + i + delta;
 			continue;
 		}
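On the program side, the four-instruction rewrite above (R1 = object size, R2 = struct meta pointer, then the patched call) hides behind the bpf_obj_new() convenience macro, roughly as the selftests' bpf_experimental.h defines it; the fragment below assumes bpf_helpers.h for __ksym and bpf_core_read.h for bpf_core_type_id_local():

extern void *bpf_obj_new_impl(__u64 local_type_id, void *meta) __ksym;

/* The type id is resolved at compile time via CO-RE; the verifier
 * supplies the real size (insn_aux_data.obj_new_size) and the
 * btf_struct_meta pointer when it patches the call site, so the
 * program passes NULL for meta.
 */
#define bpf_obj_new(type)						\
	((type *)bpf_obj_new_impl(bpf_core_type_id_local(type), NULL))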
@@ -14161,13 +15793,12 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			goto patch_call_imm;
 		}
 
-		if (insn->imm == BPF_FUNC_task_storage_get ||
-		    insn->imm == BPF_FUNC_sk_storage_get ||
-		    insn->imm == BPF_FUNC_inode_storage_get) {
-			if (env->prog->aux->sleepable)
-				insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL);
-			else
+		if (is_storage_get_function(insn->imm)) {
+			if (!env->prog->aux->sleepable ||
+			    env->insn_aux_data[i + delta].storage_get_func_atomic)
 				insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC);
+			else
+				insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL);
 			insn_buf[1] = *insn;
 			cnt = 2;
 
@@ -14237,7 +15868,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			BUILD_BUG_ON(!__same_type(ops->map_peek_elem,
 				     (int (*)(struct bpf_map *map, void *value))NULL));
 			BUILD_BUG_ON(!__same_type(ops->map_redirect,
-				     (int (*)(struct bpf_map *map, u32 ifindex, u64 flags))NULL));
+				     (int (*)(struct bpf_map *map, u64 index, u64 flags))NULL));
 			BUILD_BUG_ON(!__same_type(ops->map_for_each_callback,
 				     (int (*)(struct bpf_map *map,
 					      bpf_callback_t callback_fn,
@@ -14616,6 +16247,8 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
 			BPF_MAIN_FUNC /* callsite */,
 			0 /* frameno */,
 			subprog);
+	state->first_insn_idx = env->subprog_info[subprog].start;
+	state->last_insn_idx = -1;
 
 	regs = state->frame[state->curframe]->regs;
 	if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) {
@@ -15025,12 +16658,22 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 			ret = -EINVAL;
 			switch (prog->type) {
 			case BPF_PROG_TYPE_TRACING:
+
 				/* fentry/fexit/fmod_ret progs can be sleepable if they are
 				 * attached to ALLOW_ERROR_INJECTION and are not in denylist.
 				 */
 				if (!check_non_sleepable_error_inject(btf_id) &&
 				    within_error_injection_list(addr))
 					ret = 0;
+				/* fentry/fexit/fmod_ret progs can also be sleepable if they are
+				 * in the fmodret id set with the KF_SLEEPABLE flag.
+				 */
+				else {
+					u32 *flags = btf_kfunc_is_modify_return(btf, btf_id);
+
+					if (flags && (*flags & KF_SLEEPABLE))
+						ret = 0;
+				}
 				break;
 			case BPF_PROG_TYPE_LSM:
 				/* LSM progs check that they are attached to bpf_lsm_*() funcs.
@@ -15051,7 +16694,10 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 				bpf_log(log, "can't modify return codes of BPF programs\n");
 				return -EINVAL;
 			}
-			ret = check_attach_modify_return(addr, tname);
+			ret = -EINVAL;
+			if (btf_kfunc_is_modify_return(btf, btf_id) ||
+			    !check_attach_modify_return(addr, tname))
+				ret = 0;
 			if (ret) {
 				bpf_log(log, "%s() is not modifiable\n", tname);
 				return ret;
@@ -15240,10 +16886,11 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
 
 	env->allow_ptr_leaks = bpf_allow_ptr_leaks();
 	env->allow_uninit_stack = bpf_allow_uninit_stack();
-	env->allow_ptr_to_map_access = bpf_allow_ptr_to_map_access();
 	env->bypass_spec_v1 = bpf_bypass_spec_v1();
 	env->bypass_spec_v4 = bpf_bypass_spec_v4();
 	env->bpf_capable = bpf_capable();
+	env->rcu_tag_supported = btf_vmlinux &&
+		btf_find_by_name_kind(btf_vmlinux, "rcu", BTF_KIND_TYPE_TAG) > 0;
 
 	if (is_priv)
 		env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
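The new rcu_tag_supported probe looks for a BTF type tag literally named "rcu" in vmlinux BTF. That tag comes from the kernel's __rcu sparse annotation, which recent compilers can emit as a BTF type tag; a sketch of the mechanism, assuming macros along the lines of those in include/linux/compiler_types.h (gated there on CONFIG_DEBUG_INFO_BTF and compiler support):

/* With BTF enabled and a compiler that understands the attribute,
 * the kernel defines roughly:
 */
#define BTF_TYPE_TAG(value) __attribute__((btf_type_tag(#value)))
#define __rcu BTF_TYPE_TAG(rcu)

/* Any __rcu-annotated member then carries a BTF_KIND_TYPE_TAG named
 * "rcu", which btf_find_by_name_kind(btf_vmlinux, "rcu",
 * BTF_KIND_TYPE_TAG) can discover, enabling bpf_rcu_read_lock()
 * tracking for loads of such fields.
 */
struct demo {
	struct demo __rcu *next;	/* illustrative tagged pointer */
};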