From 3feb263bb516ee7e1da0acd22b15afbb9a7daa19 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 9 Nov 2023 16:26:36 -0800 Subject: bpf: handle ldimm64 properly in check_cfg() ldimm64 instructions are 16-byte long, and so have to be handled appropriately in check_cfg(), just like the rest of BPF verifier does. This has implications in three places: - when determining next instruction for non-jump instructions; - when determining next instruction for callback address ldimm64 instructions (in visit_func_call_insn()); - when checking for unreachable instructions, where second half of ldimm64 is expected to be unreachable; We take this also as an opportunity to report jump into the middle of ldimm64. And adjust few test_verifier tests accordingly. Acked-by: Eduard Zingerman Reported-by: Hao Sun Fixes: 475fb78fbf48 ("bpf: verifier (add branch/goto checks)") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231110002638.4168352-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bd1c42eb540f..b87715b364fd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15439,15 +15439,16 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, struct bpf_verifier_env *env, bool visit_callee) { - int ret; + int ret, insn_sz; - ret = push_insn(t, t + 1, FALLTHROUGH, env, false); + insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1; + ret = push_insn(t, t + insn_sz, FALLTHROUGH, env, false); if (ret) return ret; - mark_prune_point(env, t + 1); + mark_prune_point(env, t + insn_sz); /* when we exit from subprog, we need to record non-linear history */ - mark_jmp_point(env, t + 1); + mark_jmp_point(env, t + insn_sz); if (visit_callee) { mark_prune_point(env, t); @@ -15469,15 +15470,17 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, static int visit_insn(int t, struct bpf_verifier_env *env) { struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t]; - int ret, off; + int ret, off, insn_sz; if (bpf_pseudo_func(insn)) return visit_func_call_insn(t, insns, env, true); /* All non-branch instructions have a single fall-through edge. */ if (BPF_CLASS(insn->code) != BPF_JMP && - BPF_CLASS(insn->code) != BPF_JMP32) - return push_insn(t, t + 1, FALLTHROUGH, env, false); + BPF_CLASS(insn->code) != BPF_JMP32) { + insn_sz = bpf_is_ldimm64(insn) ? 2 : 1; + return push_insn(t, t + insn_sz, FALLTHROUGH, env, false); + } switch (BPF_OP(insn->code)) { case BPF_EXIT: @@ -15607,11 +15610,21 @@ walk_cfg: } for (i = 0; i < insn_cnt; i++) { + struct bpf_insn *insn = &env->prog->insnsi[i]; + if (insn_state[i] != EXPLORED) { verbose(env, "unreachable insn %d\n", i); ret = -EINVAL; goto err_free; } + if (bpf_is_ldimm64(insn)) { + if (insn_state[i + 1] != 0) { + verbose(env, "jump into the middle of ldimm64 insn %d\n", i); + ret = -EINVAL; + goto err_free; + } + i++; /* skip second half of ldimm64 */ + } } ret = 0; /* cfg looks good */ -- cgit v1.2.3-73-gaa49b From 4bb7ea946a370707315ab774432963ce47291946 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 9 Nov 2023 16:26:37 -0800 Subject: bpf: fix precision backtracking instruction iteration Fix an edge case in __mark_chain_precision() which prematurely stops backtracking instructions in a state if it happens that state's first and last instruction indexes are the same. This situations doesn't necessarily mean that there were no instructions simulated in a state, but rather that we starting from the instruction, jumped around a bit, and then ended up at the same instruction before checkpointing or marking precision. To distinguish between these two possible situations, we need to consult jump history. If it's empty or contain a single record "bridging" parent state and first instruction of processed state, then we indeed backtracked all instructions in this state. But if history is not empty, we are definitely not done yet. Move this logic inside get_prev_insn_idx() to contain it more nicely. Use -ENOENT return code to denote "we are out of instructions" situation. This bug was exposed by verifier_loop1.c's bounded_recursion subtest, once the next fix in this patch set is applied. Acked-by: Eduard Zingerman Fixes: b5dc0163d8fd ("bpf: precise scalar_value tracking") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231110002638.4168352-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b87715b364fd..484c742f733e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3516,12 +3516,29 @@ static int push_jmp_history(struct bpf_verifier_env *env, /* Backtrack one insn at a time. If idx is not at the top of recorded * history then previous instruction came from straight line execution. + * Return -ENOENT if we exhausted all instructions within given state. + * + * It's legal to have a bit of a looping with the same starting and ending + * insn index within the same state, e.g.: 3->4->5->3, so just because current + * instruction index is the same as state's first_idx doesn't mean we are + * done. If there is still some jump history left, we should keep going. We + * need to take into account that we might have a jump history between given + * state's parent and itself, due to checkpointing. In this case, we'll have + * history entry recording a jump from last instruction of parent state and + * first instruction of given state. */ static int get_prev_insn_idx(struct bpf_verifier_state *st, int i, u32 *history) { u32 cnt = *history; + if (i == st->first_insn_idx) { + if (cnt == 0) + return -ENOENT; + if (cnt == 1 && st->jmp_history[0].idx == i) + return -ENOENT; + } + if (cnt && st->jmp_history[cnt - 1].idx == i) { i = st->jmp_history[cnt - 1].prev_idx; (*history)--; @@ -4401,10 +4418,10 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) * Nothing to be tracked further in the parent state. */ return 0; - if (i == first_idx) - break; subseq_idx = i; i = get_prev_insn_idx(st, i, &history); + if (i == -ENOENT) + break; if (i >= env->prog->len) { /* This can happen if backtracking reached insn 0 * and there are still reg_mask or stack_mask -- cgit v1.2.3-73-gaa49b From 10e14e9652bf9e8104151bfd9200433083deae3d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 9 Nov 2023 22:14:10 -0800 Subject: bpf: fix control-flow graph checking in privileged mode When BPF program is verified in privileged mode, BPF verifier allows bounded loops. This means that from CFG point of view there are definitely some back-edges. Original commit adjusted check_cfg() logic to not detect back-edges in control flow graph if they are resulting from conditional jumps, which the idea that subsequent full BPF verification process will determine whether such loops are bounded or not, and either accept or reject the BPF program. At least that's my reading of the intent. Unfortunately, the implementation of this idea doesn't work correctly in all possible situations. Conditional jump might not result in immediate back-edge, but just a few unconditional instructions later we can arrive at back-edge. In such situations check_cfg() would reject BPF program even in privileged mode, despite it might be bounded loop. Next patch adds one simple program demonstrating such scenario. To keep things simple, instead of trying to detect back edges in privileged mode, just assume every back edge is valid and let subsequent BPF verification prove or reject bounded loops. Note a few test changes. For unknown reason, we have a few tests that are specified to detect a back-edge in a privileged mode, but looking at their code it seems like the right outcome is passing check_cfg() and letting subsequent verification to make a decision about bounded or not bounded looping. Bounded recursion case is also interesting. The example should pass, as recursion is limited to just a few levels and so we never reach maximum number of nested frames and never exhaust maximum stack depth. But the way that max stack depth logic works today it falsely detects this as exceeding max nested frame count. This patch series doesn't attempt to fix this orthogonal problem, so we just adjust expected verifier failure. Suggested-by: Alexei Starovoitov Fixes: 2589726d12a1 ("bpf: introduce bounded loops") Reported-by: Hao Sun Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231110061412.2995786-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 23 ++++++++-------------- .../testing/selftests/bpf/progs/verifier_loops1.c | 9 ++++++--- tools/testing/selftests/bpf/verifier/calls.c | 6 +++--- 3 files changed, 17 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 484c742f733e..a2267d5ed14e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15403,8 +15403,7 @@ enum { * w - next instruction * e - edge */ -static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, - bool loop_ok) +static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) { int *insn_stack = env->cfg.insn_stack; int *insn_state = env->cfg.insn_state; @@ -15436,7 +15435,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, insn_stack[env->cfg.cur_stack++] = w; return KEEP_EXPLORING; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { - if (loop_ok && env->bpf_capable) + if (env->bpf_capable) return DONE_EXPLORING; verbose_linfo(env, t, "%d: ", t); verbose_linfo(env, w, "%d: ", w); @@ -15459,7 +15458,7 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, int ret, insn_sz; insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1; - ret = push_insn(t, t + insn_sz, FALLTHROUGH, env, false); + ret = push_insn(t, t + insn_sz, FALLTHROUGH, env); if (ret) return ret; @@ -15469,12 +15468,7 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, if (visit_callee) { mark_prune_point(env, t); - ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env, - /* It's ok to allow recursion from CFG point of - * view. __check_func_call() will do the actual - * check. - */ - bpf_pseudo_func(insns + t)); + ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env); } return ret; } @@ -15496,7 +15490,7 @@ static int visit_insn(int t, struct bpf_verifier_env *env) if (BPF_CLASS(insn->code) != BPF_JMP && BPF_CLASS(insn->code) != BPF_JMP32) { insn_sz = bpf_is_ldimm64(insn) ? 2 : 1; - return push_insn(t, t + insn_sz, FALLTHROUGH, env, false); + return push_insn(t, t + insn_sz, FALLTHROUGH, env); } switch (BPF_OP(insn->code)) { @@ -15543,8 +15537,7 @@ static int visit_insn(int t, struct bpf_verifier_env *env) off = insn->imm; /* unconditional jump with single edge */ - ret = push_insn(t, t + off + 1, FALLTHROUGH, env, - true); + ret = push_insn(t, t + off + 1, FALLTHROUGH, env); if (ret) return ret; @@ -15557,11 +15550,11 @@ static int visit_insn(int t, struct bpf_verifier_env *env) /* conditional jump with two edges */ mark_prune_point(env, t); - ret = push_insn(t, t + 1, FALLTHROUGH, env, true); + ret = push_insn(t, t + 1, FALLTHROUGH, env); if (ret) return ret; - return push_insn(t, t + insn->off + 1, BRANCH, env, true); + return push_insn(t, t + insn->off + 1, BRANCH, env); } } diff --git a/tools/testing/selftests/bpf/progs/verifier_loops1.c b/tools/testing/selftests/bpf/progs/verifier_loops1.c index 5bc86af80a9a..71735dbf33d4 100644 --- a/tools/testing/selftests/bpf/progs/verifier_loops1.c +++ b/tools/testing/selftests/bpf/progs/verifier_loops1.c @@ -75,9 +75,10 @@ l0_%=: r0 += 1; \ " ::: __clobber_all); } -SEC("tracepoint") +SEC("socket") __description("bounded loop, start in the middle") -__failure __msg("back-edge") +__success +__failure_unpriv __msg_unpriv("back-edge") __naked void loop_start_in_the_middle(void) { asm volatile (" \ @@ -136,7 +137,9 @@ l0_%=: exit; \ SEC("tracepoint") __description("bounded recursion") -__failure __msg("back-edge") +__failure +/* verifier limitation in detecting max stack depth */ +__msg("the call stack of 8 frames is too deep !") __naked void bounded_recursion(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index 1bdf2b43e49e..3d5cd51071f0 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -442,7 +442,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_TRACEPOINT, - .errstr = "back-edge from insn 0 to 0", + .errstr = "the call stack of 9 frames is too deep", .result = REJECT, }, { @@ -799,7 +799,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_TRACEPOINT, - .errstr = "back-edge", + .errstr = "the call stack of 9 frames is too deep", .result = REJECT, }, { @@ -811,7 +811,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_TRACEPOINT, - .errstr = "back-edge", + .errstr = "the call stack of 9 frames is too deep", .result = REJECT, }, { -- cgit v1.2.3-73-gaa49b From 1fda5bb66ad8fb24ecb3858e61a13a6548428898 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 10 Nov 2023 17:39:28 -0800 Subject: bpf: Do not allocate percpu memory at init stage Kirill Shutemov reported significant percpu memory consumption increase after booting in 288-cpu VM ([1]) due to commit 41a5db8d8161 ("bpf: Add support for non-fix-size percpu mem allocation"). The percpu memory consumption is increased from 111MB to 969MB. The number is from /proc/meminfo. I tried to reproduce the issue with my local VM which at most supports upto 255 cpus. With 252 cpus, without the above commit, the percpu memory consumption immediately after boot is 57MB while with the above commit the percpu memory consumption is 231MB. This is not good since so far percpu memory from bpf memory allocator is not widely used yet. Let us change pre-allocation in init stage to on-demand allocation when verifier detects there is a need of percpu memory for bpf program. With this change, percpu memory consumption after boot can be reduced signicantly. [1] https://lore.kernel.org/lkml/20231109154934.4saimljtqx625l3v@box.shutemov.name/ Fixes: 41a5db8d8161 ("bpf: Add support for non-fix-size percpu mem allocation") Reported-and-tested-by: Kirill A. Shutemov Signed-off-by: Yonghong Song Acked-by: Hou Tao Link: https://lore.kernel.org/r/20231111013928.948838-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- kernel/bpf/core.c | 8 +++----- kernel/bpf/verifier.c | 20 ++++++++++++++++++-- 3 files changed, 22 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 35bff17396c0..6762dac3ef76 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -56,7 +56,7 @@ extern struct idr btf_idr; extern spinlock_t btf_idr_lock; extern struct kobject *btf_kobj; extern struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma; -extern bool bpf_global_ma_set, bpf_global_percpu_ma_set; +extern bool bpf_global_ma_set; typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64); typedef int (*bpf_iter_init_seq_priv_t)(void *private_data, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 08626b519ce2..cd3afe57ece3 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -64,8 +64,8 @@ #define OFF insn->off #define IMM insn->imm -struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma; -bool bpf_global_ma_set, bpf_global_percpu_ma_set; +struct bpf_mem_alloc bpf_global_ma; +bool bpf_global_ma_set; /* No hurry in this branch * @@ -2934,9 +2934,7 @@ static int __init bpf_global_ma_init(void) ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false); bpf_global_ma_set = !ret; - ret = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true); - bpf_global_percpu_ma_set = !ret; - return !bpf_global_ma_set || !bpf_global_percpu_ma_set; + return ret; } late_initcall(bpf_global_ma_init); #endif diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a2267d5ed14e..6da370a047fe 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include "disasm.h" @@ -41,6 +42,9 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { #undef BPF_LINK_TYPE }; +struct bpf_mem_alloc bpf_global_percpu_ma; +static bool bpf_global_percpu_ma_set; + /* bpf_check() is a static code analyzer that walks eBPF program * instruction by instruction and updates register/stack state. * All paths of conditional branches are analyzed until 'bpf_exit' insn. @@ -336,6 +340,7 @@ struct bpf_kfunc_call_arg_meta { struct btf *btf_vmlinux; static DEFINE_MUTEX(bpf_verifier_lock); +static DEFINE_MUTEX(bpf_percpu_ma_lock); static const struct bpf_line_info * find_linfo(const struct bpf_verifier_env *env, u32 insn_off) @@ -12091,8 +12096,19 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set) return -ENOMEM; - if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] && !bpf_global_percpu_ma_set) - return -ENOMEM; + if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { + if (!bpf_global_percpu_ma_set) { + mutex_lock(&bpf_percpu_ma_lock); + if (!bpf_global_percpu_ma_set) { + err = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true); + if (!err) + bpf_global_percpu_ma_set = true; + } + mutex_unlock(&bpf_percpu_ma_lock); + if (err) + return err; + } + } if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) { verbose(env, "local type ID argument must be in range [0, U32_MAX]\n"); -- cgit v1.2.3-73-gaa49b