From 30c90f6757a7b38bc95069725657a647873e2ab3 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Mon, 21 Mar 2022 11:28:48 -0400 Subject: arm64, insn: Add ldr/str with immediate offset This patch introduces ldr/str with immediate offset support to simplify the JIT implementation of BPF LDX/STX instructions on arm64. Although arm64 ldr/str immediate is available in pre-index, post-index and unsigned offset forms, the unsigned offset form is sufficient for BPF, so this patch only adds this type. Signed-off-by: Xu Kuohai Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220321152852.2334294-2-xukuohai@huawei.com --- arch/arm64/include/asm/insn.h | 9 ++++++ arch/arm64/lib/insn.c | 67 ++++++++++++++++++++++++++++++++++--------- 2 files changed, 62 insertions(+), 14 deletions(-) diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index 1e5760d567ae..6aa2dc836db1 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -201,6 +201,8 @@ enum aarch64_insn_size_type { enum aarch64_insn_ldst_type { AARCH64_INSN_LDST_LOAD_REG_OFFSET, AARCH64_INSN_LDST_STORE_REG_OFFSET, + AARCH64_INSN_LDST_LOAD_IMM_OFFSET, + AARCH64_INSN_LDST_STORE_IMM_OFFSET, AARCH64_INSN_LDST_LOAD_PAIR_PRE_INDEX, AARCH64_INSN_LDST_STORE_PAIR_PRE_INDEX, AARCH64_INSN_LDST_LOAD_PAIR_POST_INDEX, @@ -335,6 +337,7 @@ __AARCH64_INSN_FUNCS(load_pre, 0x3FE00C00, 0x38400C00) __AARCH64_INSN_FUNCS(store_post, 0x3FE00C00, 0x38000400) __AARCH64_INSN_FUNCS(load_post, 0x3FE00C00, 0x38400400) __AARCH64_INSN_FUNCS(str_reg, 0x3FE0EC00, 0x38206800) +__AARCH64_INSN_FUNCS(str_imm, 0x3FC00000, 0x39000000) __AARCH64_INSN_FUNCS(ldadd, 0x3F20FC00, 0x38200000) __AARCH64_INSN_FUNCS(ldclr, 0x3F20FC00, 0x38201000) __AARCH64_INSN_FUNCS(ldeor, 0x3F20FC00, 0x38202000) @@ -342,6 +345,7 @@ __AARCH64_INSN_FUNCS(ldset, 0x3F20FC00, 0x38203000) __AARCH64_INSN_FUNCS(swp, 0x3F20FC00, 0x38208000) __AARCH64_INSN_FUNCS(cas, 0x3FA07C00, 0x08A07C00) __AARCH64_INSN_FUNCS(ldr_reg, 0x3FE0EC00, 0x38606800) +__AARCH64_INSN_FUNCS(ldr_imm, 0x3FC00000, 0x39400000) __AARCH64_INSN_FUNCS(ldr_lit, 0xBF000000, 0x18000000) __AARCH64_INSN_FUNCS(ldrsw_lit, 0xFF000000, 0x98000000) __AARCH64_INSN_FUNCS(exclusive, 0x3F800000, 0x08000000) @@ -501,6 +505,11 @@ u32 aarch64_insn_gen_load_store_reg(enum aarch64_insn_register reg, enum aarch64_insn_register offset, enum aarch64_insn_size_type size, enum aarch64_insn_ldst_type type); +u32 aarch64_insn_gen_load_store_imm(enum aarch64_insn_register reg, + enum aarch64_insn_register base, + unsigned int imm, + enum aarch64_insn_size_type size, + enum aarch64_insn_ldst_type type); u32 aarch64_insn_gen_load_store_pair(enum aarch64_insn_register reg1, enum aarch64_insn_register reg2, enum aarch64_insn_register base, diff --git a/arch/arm64/lib/insn.c b/arch/arm64/lib/insn.c index 5e90887deec4..695d7368fadc 100644 --- a/arch/arm64/lib/insn.c +++ b/arch/arm64/lib/insn.c @@ -299,29 +299,24 @@ static u32 aarch64_insn_encode_register(enum aarch64_insn_register_type type, return insn; } +static const u32 aarch64_insn_ldst_size[] = { + [AARCH64_INSN_SIZE_8] = 0, + [AARCH64_INSN_SIZE_16] = 1, + [AARCH64_INSN_SIZE_32] = 2, + [AARCH64_INSN_SIZE_64] = 3, +}; + static u32 aarch64_insn_encode_ldst_size(enum aarch64_insn_size_type type, u32 insn) { u32 size; - switch (type) { - case AARCH64_INSN_SIZE_8: - size = 0; - break; - case AARCH64_INSN_SIZE_16: - size = 1; - break; - case AARCH64_INSN_SIZE_32: - size = 2; - break; - case AARCH64_INSN_SIZE_64: - size = 3; - break; - default: + if (type < AARCH64_INSN_SIZE_8 
|| type > AARCH64_INSN_SIZE_64) {
 		pr_err("%s: unknown size encoding %d\n", __func__, type);
 		return AARCH64_BREAK_FAULT;
 	}
+	size = aarch64_insn_ldst_size[type];
 	insn &= ~GENMASK(31, 30);
 	insn |= size << 30;
@@ -504,6 +499,50 @@ u32 aarch64_insn_gen_load_store_reg(enum aarch64_insn_register reg,
 				    offset);
 }
+u32 aarch64_insn_gen_load_store_imm(enum aarch64_insn_register reg,
+				    enum aarch64_insn_register base,
+				    unsigned int imm,
+				    enum aarch64_insn_size_type size,
+				    enum aarch64_insn_ldst_type type)
+{
+	u32 insn;
+	u32 shift;
+
+	if (size < AARCH64_INSN_SIZE_8 || size > AARCH64_INSN_SIZE_64) {
+		pr_err("%s: unknown size encoding %d\n", __func__, size);
+		return AARCH64_BREAK_FAULT;
+	}
+
+	shift = aarch64_insn_ldst_size[size];
+	if (imm & ~(BIT(12 + shift) - BIT(shift))) {
+		pr_err("%s: invalid imm: %d\n", __func__, imm);
+		return AARCH64_BREAK_FAULT;
+	}
+
+	imm >>= shift;
+
+	switch (type) {
+	case AARCH64_INSN_LDST_LOAD_IMM_OFFSET:
+		insn = aarch64_insn_get_ldr_imm_value();
+		break;
+	case AARCH64_INSN_LDST_STORE_IMM_OFFSET:
+		insn = aarch64_insn_get_str_imm_value();
+		break;
+	default:
+		pr_err("%s: unknown load/store encoding %d\n", __func__, type);
+		return AARCH64_BREAK_FAULT;
+	}
+
+	insn = aarch64_insn_encode_ldst_size(size, insn);
+
+	insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, reg);
+
+	insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn,
+					    base);
+
+	return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_12, insn, imm);
+}
+
 u32 aarch64_insn_gen_load_store_pair(enum aarch64_insn_register reg1,
 				     enum aarch64_insn_register reg2,
 				     enum aarch64_insn_register base,
-- 
cgit 

From 7db6c0f1d8ee051e0a7d8c58c5982990e4491f39 Mon Sep 17 00:00:00 2001
From: Xu Kuohai 
Date: Mon, 21 Mar 2022 11:28:49 -0400
Subject: bpf, arm64: Optimize BPF store/load using arm64 str/ldr(immediate offset)

The current BPF store/load instruction is translated by the JIT into two instructions. The first instruction moves the immediate offset into a temporary register. The second instruction uses this temporary register to do the real store/load.

In fact, arm64 supports addressing with immediate offsets, so this patch introduces an optimization that uses the arm64 str/ldr instructions with an immediate offset whenever the offset fits.

Example of the generated instructions for r2 = *(u64 *)(r1 + 0):

without optimization:
mov x10, 0
ldr x1, [x0, x10]

with optimization:
ldr x1, [x0, 0]

If the offset is negative, not aligned to the access size, or exceeds the maximum encodable value, the JIT falls back to using a temporary register. 
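The "offset fits" condition is implemented by the is_lsi_offset() helper added below; as a standalone sketch (helper name illustrative; scale is log2 of the access size, so that offset = imm12 << scale):

static bool fits_unsigned_offset(int offset, int scale)
{
	if (offset < 0)				/* negative: fall back */
		return false;
	if (offset > (0xFFF << scale))		/* too large: fall back */
		return false;
	if (offset & ((1 << scale) - 1))	/* misaligned: fall back */
		return false;
	return true;
}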
Signed-off-by: Xu Kuohai Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220321152852.2334294-3-xukuohai@huawei.com --- arch/arm64/net/bpf_jit.h | 14 +++++ arch/arm64/net/bpf_jit_comp.c | 128 +++++++++++++++++++++++++++++++++++++----- 2 files changed, 127 insertions(+), 15 deletions(-) diff --git a/arch/arm64/net/bpf_jit.h b/arch/arm64/net/bpf_jit.h index dd59b5ad8fe4..3920213244f0 100644 --- a/arch/arm64/net/bpf_jit.h +++ b/arch/arm64/net/bpf_jit.h @@ -66,6 +66,20 @@ #define A64_STR64(Xt, Xn, Xm) A64_LS_REG(Xt, Xn, Xm, 64, STORE) #define A64_LDR64(Xt, Xn, Xm) A64_LS_REG(Xt, Xn, Xm, 64, LOAD) +/* Load/store register (immediate offset) */ +#define A64_LS_IMM(Rt, Rn, imm, size, type) \ + aarch64_insn_gen_load_store_imm(Rt, Rn, imm, \ + AARCH64_INSN_SIZE_##size, \ + AARCH64_INSN_LDST_##type##_IMM_OFFSET) +#define A64_STRBI(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 8, STORE) +#define A64_LDRBI(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 8, LOAD) +#define A64_STRHI(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 16, STORE) +#define A64_LDRHI(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 16, LOAD) +#define A64_STR32I(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 32, STORE) +#define A64_LDR32I(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 32, LOAD) +#define A64_STR64I(Xt, Xn, imm) A64_LS_IMM(Xt, Xn, imm, 64, STORE) +#define A64_LDR64I(Xt, Xn, imm) A64_LS_IMM(Xt, Xn, imm, 64, LOAD) + /* Load/store register pair */ #define A64_LS_PAIR(Rt, Rt2, Rn, offset, ls, type) \ aarch64_insn_gen_load_store_pair(Rt, Rt2, Rn, offset, \ diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index fcc675aa1670..4bf50458b35d 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -191,6 +191,47 @@ static bool is_addsub_imm(u32 imm) return !(imm & ~0xfff) || !(imm & ~0xfff000); } +/* + * There are 3 types of AArch64 LDR/STR (immediate) instruction: + * Post-index, Pre-index, Unsigned offset. + * + * For BPF ldr/str, the "unsigned offset" type is sufficient. 
+ * + * "Unsigned offset" type LDR(immediate) format: + * + * 3 2 1 0 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |x x|1 1 1 0 0 1 0 1| imm12 | Rn | Rt | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * scale + * + * "Unsigned offset" type STR(immediate) format: + * 3 2 1 0 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |x x|1 1 1 0 0 1 0 0| imm12 | Rn | Rt | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * scale + * + * The offset is calculated from imm12 and scale in the following way: + * + * offset = (u64)imm12 << scale + */ +static bool is_lsi_offset(s16 offset, int scale) +{ + if (offset < 0) + return false; + + if (offset > (0xFFF << scale)) + return false; + + if (offset & ((1 << scale) - 1)) + return false; + + return true; +} + /* Tail call offset to jump into */ #if IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) #define PROLOGUE_OFFSET 8 @@ -971,19 +1012,38 @@ emit_cond_jmp: case BPF_LDX | BPF_PROBE_MEM | BPF_W: case BPF_LDX | BPF_PROBE_MEM | BPF_H: case BPF_LDX | BPF_PROBE_MEM | BPF_B: - emit_a64_mov_i(1, tmp, off, ctx); switch (BPF_SIZE(code)) { case BPF_W: - emit(A64_LDR32(dst, src, tmp), ctx); + if (is_lsi_offset(off, 2)) { + emit(A64_LDR32I(dst, src, off), ctx); + } else { + emit_a64_mov_i(1, tmp, off, ctx); + emit(A64_LDR32(dst, src, tmp), ctx); + } break; case BPF_H: - emit(A64_LDRH(dst, src, tmp), ctx); + if (is_lsi_offset(off, 1)) { + emit(A64_LDRHI(dst, src, off), ctx); + } else { + emit_a64_mov_i(1, tmp, off, ctx); + emit(A64_LDRH(dst, src, tmp), ctx); + } break; case BPF_B: - emit(A64_LDRB(dst, src, tmp), ctx); + if (is_lsi_offset(off, 0)) { + emit(A64_LDRBI(dst, src, off), ctx); + } else { + emit_a64_mov_i(1, tmp, off, ctx); + emit(A64_LDRB(dst, src, tmp), ctx); + } break; case BPF_DW: - emit(A64_LDR64(dst, src, tmp), ctx); + if (is_lsi_offset(off, 3)) { + emit(A64_LDR64I(dst, src, off), ctx); + } else { + emit_a64_mov_i(1, tmp, off, ctx); + emit(A64_LDR64(dst, src, tmp), ctx); + } break; } @@ -1011,20 +1071,39 @@ emit_cond_jmp: case BPF_ST | BPF_MEM | BPF_B: case BPF_ST | BPF_MEM | BPF_DW: /* Load imm to a register then store it */ - emit_a64_mov_i(1, tmp2, off, ctx); emit_a64_mov_i(1, tmp, imm, ctx); switch (BPF_SIZE(code)) { case BPF_W: - emit(A64_STR32(tmp, dst, tmp2), ctx); + if (is_lsi_offset(off, 2)) { + emit(A64_STR32I(tmp, dst, off), ctx); + } else { + emit_a64_mov_i(1, tmp2, off, ctx); + emit(A64_STR32(tmp, dst, tmp2), ctx); + } break; case BPF_H: - emit(A64_STRH(tmp, dst, tmp2), ctx); + if (is_lsi_offset(off, 1)) { + emit(A64_STRHI(tmp, dst, off), ctx); + } else { + emit_a64_mov_i(1, tmp2, off, ctx); + emit(A64_STRH(tmp, dst, tmp2), ctx); + } break; case BPF_B: - emit(A64_STRB(tmp, dst, tmp2), ctx); + if (is_lsi_offset(off, 0)) { + emit(A64_STRBI(tmp, dst, off), ctx); + } else { + emit_a64_mov_i(1, tmp2, off, ctx); + emit(A64_STRB(tmp, dst, tmp2), ctx); + } break; case BPF_DW: - emit(A64_STR64(tmp, dst, tmp2), ctx); + if (is_lsi_offset(off, 3)) { + emit(A64_STR64I(tmp, dst, off), ctx); + } else { + emit_a64_mov_i(1, tmp2, off, ctx); + emit(A64_STR64(tmp, dst, tmp2), ctx); + } break; } break; @@ -1034,19 +1113,38 @@ emit_cond_jmp: case BPF_STX | BPF_MEM | BPF_H: case BPF_STX | BPF_MEM | BPF_B: case BPF_STX | BPF_MEM | BPF_DW: - emit_a64_mov_i(1, tmp, off, ctx); switch (BPF_SIZE(code)) { case BPF_W: - emit(A64_STR32(src, dst, 
tmp), ctx); + if (is_lsi_offset(off, 2)) { + emit(A64_STR32I(src, dst, off), ctx); + } else { + emit_a64_mov_i(1, tmp, off, ctx); + emit(A64_STR32(src, dst, tmp), ctx); + } break; case BPF_H: - emit(A64_STRH(src, dst, tmp), ctx); + if (is_lsi_offset(off, 1)) { + emit(A64_STRHI(src, dst, off), ctx); + } else { + emit_a64_mov_i(1, tmp, off, ctx); + emit(A64_STRH(src, dst, tmp), ctx); + } break; case BPF_B: - emit(A64_STRB(src, dst, tmp), ctx); + if (is_lsi_offset(off, 0)) { + emit(A64_STRBI(src, dst, off), ctx); + } else { + emit_a64_mov_i(1, tmp, off, ctx); + emit(A64_STRB(src, dst, tmp), ctx); + } break; case BPF_DW: - emit(A64_STR64(src, dst, tmp), ctx); + if (is_lsi_offset(off, 3)) { + emit(A64_STR64I(src, dst, off), ctx); + } else { + emit_a64_mov_i(1, tmp, off, ctx); + emit(A64_STR64(src, dst, tmp), ctx); + } break; } break; -- cgit From 5b3d19b9bd4080d7f5e260f91ce8f639e19eb499 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Mon, 21 Mar 2022 11:28:50 -0400 Subject: bpf, arm64: Adjust the offset of str/ldr(immediate) to positive number The BPF STX/LDX instruction uses offset relative to the FP to address stack space. Since the BPF_FP locates at the top of the frame, the offset is usually a negative number. However, arm64 str/ldr immediate instruction requires that offset be a positive number. Therefore, this patch tries to convert the offsets. The method is to find the negative offset furthest from the FP firstly. Then add it to the FP, calculate a bottom position, called FPB, and then adjust the offsets in other STR/LDX instructions relative to FPB. FPB is saved using the callee-saved register x27 of arm64 which is not used yet. Before adjusting the offset, the patch checks every instruction to ensure that the FP does not change in run-time. If the FP may change, no offset is adjusted. For example, for the following bpftrace command: bpftrace -e 'kprobe:do_sys_open { printf("opening: %s\n", str(arg1)); }' Without this patch, jited code(fragment): 0: bti c 4: stp x29, x30, [sp, #-16]! 8: mov x29, sp c: stp x19, x20, [sp, #-16]! 10: stp x21, x22, [sp, #-16]! 14: stp x25, x26, [sp, #-16]! 18: mov x25, sp 1c: mov x26, #0x0 // #0 20: bti j 24: sub sp, sp, #0x90 28: add x19, x0, #0x0 2c: mov x0, #0x0 // #0 30: mov x10, #0xffffffffffffff78 // #-136 34: str x0, [x25, x10] 38: mov x10, #0xffffffffffffff80 // #-128 3c: str x0, [x25, x10] 40: mov x10, #0xffffffffffffff88 // #-120 44: str x0, [x25, x10] 48: mov x10, #0xffffffffffffff90 // #-112 4c: str x0, [x25, x10] 50: mov x10, #0xffffffffffffff98 // #-104 54: str x0, [x25, x10] 58: mov x10, #0xffffffffffffffa0 // #-96 5c: str x0, [x25, x10] 60: mov x10, #0xffffffffffffffa8 // #-88 64: str x0, [x25, x10] 68: mov x10, #0xffffffffffffffb0 // #-80 6c: str x0, [x25, x10] 70: mov x10, #0xffffffffffffffb8 // #-72 74: str x0, [x25, x10] 78: mov x10, #0xffffffffffffffc0 // #-64 7c: str x0, [x25, x10] 80: mov x10, #0xffffffffffffffc8 // #-56 84: str x0, [x25, x10] 88: mov x10, #0xffffffffffffffd0 // #-48 8c: str x0, [x25, x10] 90: mov x10, #0xffffffffffffffd8 // #-40 94: str x0, [x25, x10] 98: mov x10, #0xffffffffffffffe0 // #-32 9c: str x0, [x25, x10] a0: mov x10, #0xffffffffffffffe8 // #-24 a4: str x0, [x25, x10] a8: mov x10, #0xfffffffffffffff0 // #-16 ac: str x0, [x25, x10] b0: mov x10, #0xfffffffffffffff8 // #-8 b4: str x0, [x25, x10] b8: mov x10, #0x8 // #8 bc: ldr x2, [x19, x10] [...] With this patch, jited code(fragment): 0: bti c 4: stp x29, x30, [sp, #-16]! 8: mov x29, sp c: stp x19, x20, [sp, #-16]! 10: stp x21, x22, [sp, #-16]! 
14: stp x25, x26, [sp, #-16]! 18: stp x27, x28, [sp, #-16]! 1c: mov x25, sp 20: sub x27, x25, #0x88 24: mov x26, #0x0 // #0 28: bti j 2c: sub sp, sp, #0x90 30: add x19, x0, #0x0 34: mov x0, #0x0 // #0 38: str x0, [x27] 3c: str x0, [x27, #8] 40: str x0, [x27, #16] 44: str x0, [x27, #24] 48: str x0, [x27, #32] 4c: str x0, [x27, #40] 50: str x0, [x27, #48] 54: str x0, [x27, #56] 58: str x0, [x27, #64] 5c: str x0, [x27, #72] 60: str x0, [x27, #80] 64: str x0, [x27, #88] 68: str x0, [x27, #96] 6c: str x0, [x27, #104] 70: str x0, [x27, #112] 74: str x0, [x27, #120] 78: str x0, [x27, #128] 7c: ldr x2, [x19, #8] [...] Signed-off-by: Xu Kuohai Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220321152852.2334294-4-xukuohai@huawei.com --- arch/arm64/net/bpf_jit_comp.c | 165 +++++++++++++++++++++++++++++++++++------- 1 file changed, 138 insertions(+), 27 deletions(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 4bf50458b35d..093fa9ea1083 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -26,6 +26,7 @@ #define TMP_REG_2 (MAX_BPF_JIT_REG + 1) #define TCALL_CNT (MAX_BPF_JIT_REG + 2) #define TMP_REG_3 (MAX_BPF_JIT_REG + 3) +#define FP_BOTTOM (MAX_BPF_JIT_REG + 4) #define check_imm(bits, imm) do { \ if ((((imm) > 0) && ((imm) >> (bits))) || \ @@ -63,6 +64,7 @@ static const int bpf2a64[] = { [TCALL_CNT] = A64_R(26), /* temporary register for blinding constants */ [BPF_REG_AX] = A64_R(9), + [FP_BOTTOM] = A64_R(27), }; struct jit_ctx { @@ -73,6 +75,7 @@ struct jit_ctx { int exentry_idx; __le32 *image; u32 stack_size; + int fpb_offset; }; static inline void emit(const u32 insn, struct jit_ctx *ctx) @@ -218,7 +221,7 @@ static bool is_addsub_imm(u32 imm) * * offset = (u64)imm12 << scale */ -static bool is_lsi_offset(s16 offset, int scale) +static bool is_lsi_offset(int offset, int scale) { if (offset < 0) return false; @@ -234,9 +237,9 @@ static bool is_lsi_offset(s16 offset, int scale) /* Tail call offset to jump into */ #if IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) -#define PROLOGUE_OFFSET 8 +#define PROLOGUE_OFFSET 9 #else -#define PROLOGUE_OFFSET 7 +#define PROLOGUE_OFFSET 8 #endif static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf) @@ -248,6 +251,7 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf) const u8 r9 = bpf2a64[BPF_REG_9]; const u8 fp = bpf2a64[BPF_REG_FP]; const u8 tcc = bpf2a64[TCALL_CNT]; + const u8 fpb = bpf2a64[FP_BOTTOM]; const int idx0 = ctx->idx; int cur_offset; @@ -286,6 +290,7 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf) emit(A64_PUSH(r6, r7, A64_SP), ctx); emit(A64_PUSH(r8, r9, A64_SP), ctx); emit(A64_PUSH(fp, tcc, A64_SP), ctx); + emit(A64_PUSH(fpb, A64_R(28), A64_SP), ctx); /* Set up BPF prog stack base register */ emit(A64_MOV(1, fp, A64_SP), ctx); @@ -306,6 +311,8 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf) emit(A64_BTI_J, ctx); } + emit(A64_SUB_I(1, fpb, fp, ctx->fpb_offset), ctx); + /* Stack must be multiples of 16B */ ctx->stack_size = round_up(prog->aux->stack_depth, 16); @@ -553,10 +560,13 @@ static void build_epilogue(struct jit_ctx *ctx) const u8 r8 = bpf2a64[BPF_REG_8]; const u8 r9 = bpf2a64[BPF_REG_9]; const u8 fp = bpf2a64[BPF_REG_FP]; + const u8 fpb = bpf2a64[FP_BOTTOM]; /* We're done with BPF stack */ emit(A64_ADD_I(1, A64_SP, A64_SP, ctx->stack_size), ctx); + /* Restore x27 and x28 */ + emit(A64_POP(fpb, A64_R(28), A64_SP), ctx); /* Restore fs (x25) and x26 */ emit(A64_POP(fp, A64_R(26), A64_SP), ctx); 
@@ -650,6 +660,8 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, const u8 src = bpf2a64[insn->src_reg]; const u8 tmp = bpf2a64[TMP_REG_1]; const u8 tmp2 = bpf2a64[TMP_REG_2]; + const u8 fp = bpf2a64[BPF_REG_FP]; + const u8 fpb = bpf2a64[FP_BOTTOM]; const s16 off = insn->off; const s32 imm = insn->imm; const int i = insn - ctx->prog->insnsi; @@ -658,6 +670,9 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, u8 jmp_cond; s32 jmp_offset; u32 a64_insn; + u8 src_adj; + u8 dst_adj; + int off_adj; int ret; switch (code) { @@ -1012,34 +1027,41 @@ emit_cond_jmp: case BPF_LDX | BPF_PROBE_MEM | BPF_W: case BPF_LDX | BPF_PROBE_MEM | BPF_H: case BPF_LDX | BPF_PROBE_MEM | BPF_B: + if (ctx->fpb_offset > 0 && src == fp) { + src_adj = fpb; + off_adj = off + ctx->fpb_offset; + } else { + src_adj = src; + off_adj = off; + } switch (BPF_SIZE(code)) { case BPF_W: - if (is_lsi_offset(off, 2)) { - emit(A64_LDR32I(dst, src, off), ctx); + if (is_lsi_offset(off_adj, 2)) { + emit(A64_LDR32I(dst, src_adj, off_adj), ctx); } else { emit_a64_mov_i(1, tmp, off, ctx); emit(A64_LDR32(dst, src, tmp), ctx); } break; case BPF_H: - if (is_lsi_offset(off, 1)) { - emit(A64_LDRHI(dst, src, off), ctx); + if (is_lsi_offset(off_adj, 1)) { + emit(A64_LDRHI(dst, src_adj, off_adj), ctx); } else { emit_a64_mov_i(1, tmp, off, ctx); emit(A64_LDRH(dst, src, tmp), ctx); } break; case BPF_B: - if (is_lsi_offset(off, 0)) { - emit(A64_LDRBI(dst, src, off), ctx); + if (is_lsi_offset(off_adj, 0)) { + emit(A64_LDRBI(dst, src_adj, off_adj), ctx); } else { emit_a64_mov_i(1, tmp, off, ctx); emit(A64_LDRB(dst, src, tmp), ctx); } break; case BPF_DW: - if (is_lsi_offset(off, 3)) { - emit(A64_LDR64I(dst, src, off), ctx); + if (is_lsi_offset(off_adj, 3)) { + emit(A64_LDR64I(dst, src_adj, off_adj), ctx); } else { emit_a64_mov_i(1, tmp, off, ctx); emit(A64_LDR64(dst, src, tmp), ctx); @@ -1070,36 +1092,43 @@ emit_cond_jmp: case BPF_ST | BPF_MEM | BPF_H: case BPF_ST | BPF_MEM | BPF_B: case BPF_ST | BPF_MEM | BPF_DW: + if (ctx->fpb_offset > 0 && dst == fp) { + dst_adj = fpb; + off_adj = off + ctx->fpb_offset; + } else { + dst_adj = dst; + off_adj = off; + } /* Load imm to a register then store it */ emit_a64_mov_i(1, tmp, imm, ctx); switch (BPF_SIZE(code)) { case BPF_W: - if (is_lsi_offset(off, 2)) { - emit(A64_STR32I(tmp, dst, off), ctx); + if (is_lsi_offset(off_adj, 2)) { + emit(A64_STR32I(tmp, dst_adj, off_adj), ctx); } else { emit_a64_mov_i(1, tmp2, off, ctx); emit(A64_STR32(tmp, dst, tmp2), ctx); } break; case BPF_H: - if (is_lsi_offset(off, 1)) { - emit(A64_STRHI(tmp, dst, off), ctx); + if (is_lsi_offset(off_adj, 1)) { + emit(A64_STRHI(tmp, dst_adj, off_adj), ctx); } else { emit_a64_mov_i(1, tmp2, off, ctx); emit(A64_STRH(tmp, dst, tmp2), ctx); } break; case BPF_B: - if (is_lsi_offset(off, 0)) { - emit(A64_STRBI(tmp, dst, off), ctx); + if (is_lsi_offset(off_adj, 0)) { + emit(A64_STRBI(tmp, dst_adj, off_adj), ctx); } else { emit_a64_mov_i(1, tmp2, off, ctx); emit(A64_STRB(tmp, dst, tmp2), ctx); } break; case BPF_DW: - if (is_lsi_offset(off, 3)) { - emit(A64_STR64I(tmp, dst, off), ctx); + if (is_lsi_offset(off_adj, 3)) { + emit(A64_STR64I(tmp, dst_adj, off_adj), ctx); } else { emit_a64_mov_i(1, tmp2, off, ctx); emit(A64_STR64(tmp, dst, tmp2), ctx); @@ -1113,34 +1142,41 @@ emit_cond_jmp: case BPF_STX | BPF_MEM | BPF_H: case BPF_STX | BPF_MEM | BPF_B: case BPF_STX | BPF_MEM | BPF_DW: + if (ctx->fpb_offset > 0 && dst == fp) { + dst_adj = fpb; + off_adj = off + ctx->fpb_offset; + } else { + dst_adj = dst; + 
off_adj = off; + } switch (BPF_SIZE(code)) { case BPF_W: - if (is_lsi_offset(off, 2)) { - emit(A64_STR32I(src, dst, off), ctx); + if (is_lsi_offset(off_adj, 2)) { + emit(A64_STR32I(src, dst_adj, off_adj), ctx); } else { emit_a64_mov_i(1, tmp, off, ctx); emit(A64_STR32(src, dst, tmp), ctx); } break; case BPF_H: - if (is_lsi_offset(off, 1)) { - emit(A64_STRHI(src, dst, off), ctx); + if (is_lsi_offset(off_adj, 1)) { + emit(A64_STRHI(src, dst_adj, off_adj), ctx); } else { emit_a64_mov_i(1, tmp, off, ctx); emit(A64_STRH(src, dst, tmp), ctx); } break; case BPF_B: - if (is_lsi_offset(off, 0)) { - emit(A64_STRBI(src, dst, off), ctx); + if (is_lsi_offset(off_adj, 0)) { + emit(A64_STRBI(src, dst_adj, off_adj), ctx); } else { emit_a64_mov_i(1, tmp, off, ctx); emit(A64_STRB(src, dst, tmp), ctx); } break; case BPF_DW: - if (is_lsi_offset(off, 3)) { - emit(A64_STR64I(src, dst, off), ctx); + if (is_lsi_offset(off_adj, 3)) { + emit(A64_STR64I(src, dst_adj, off_adj), ctx); } else { emit_a64_mov_i(1, tmp, off, ctx); emit(A64_STR64(src, dst, tmp), ctx); @@ -1167,6 +1203,79 @@ emit_cond_jmp: return 0; } +/* + * Return 0 if FP may change at runtime, otherwise find the minimum negative + * offset to FP, converts it to positive number, and align down to 8 bytes. + */ +static int find_fpb_offset(struct bpf_prog *prog) +{ + int i; + int offset = 0; + + for (i = 0; i < prog->len; i++) { + const struct bpf_insn *insn = &prog->insnsi[i]; + const u8 class = BPF_CLASS(insn->code); + const u8 mode = BPF_MODE(insn->code); + const u8 src = insn->src_reg; + const u8 dst = insn->dst_reg; + const s32 imm = insn->imm; + const s16 off = insn->off; + + switch (class) { + case BPF_STX: + case BPF_ST: + /* fp holds atomic operation result */ + if (class == BPF_STX && mode == BPF_ATOMIC && + ((imm == BPF_XCHG || + imm == (BPF_FETCH | BPF_ADD) || + imm == (BPF_FETCH | BPF_AND) || + imm == (BPF_FETCH | BPF_XOR) || + imm == (BPF_FETCH | BPF_OR)) && + src == BPF_REG_FP)) + return 0; + + if (mode == BPF_MEM && dst == BPF_REG_FP && + off < offset) + offset = insn->off; + break; + + case BPF_JMP32: + case BPF_JMP: + break; + + case BPF_LDX: + case BPF_LD: + /* fp holds load result */ + if (dst == BPF_REG_FP) + return 0; + + if (class == BPF_LDX && mode == BPF_MEM && + src == BPF_REG_FP && off < offset) + offset = off; + break; + + case BPF_ALU: + case BPF_ALU64: + default: + /* fp holds ALU result */ + if (dst == BPF_REG_FP) + return 0; + } + } + + if (offset < 0) { + /* + * safely be converted to a positive 'int', since insn->off + * is 's16' + */ + offset = -offset; + /* align down to 8 bytes */ + offset = ALIGN_DOWN(offset, 8); + } + + return offset; +} + static int build_body(struct jit_ctx *ctx, bool extra_pass) { const struct bpf_prog *prog = ctx->prog; @@ -1288,6 +1397,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) goto out_off; } + ctx.fpb_offset = find_fpb_offset(prog); + /* * 1. Initial fake pass to compute ctx->idx and ctx->offset. * -- cgit From f516420f683d147d8f4cfd83bbc7c3c6ad1c61b5 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Mon, 21 Mar 2022 11:28:51 -0400 Subject: bpf, tests: Add tests for BPF_LDX/BPF_STX with different offsets This patch adds tests to verify the behavior of BPF_LDX/BPF_STX + BPF_B/BPF_H/BPF_W/BPF_DW with negative offset, small positive offset, large positive offset, and misaligned offset. 
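On arm64, these offset classes are expected to exercise both of the JIT's emission paths from the previous patches. An illustrative sketch for BPF_DW loads (scale 3, so the largest encodable immediate offset is 0xfff << 3 = 32760), using the register numbering of the earlier r2 = *(u64 *)(r1 + 0) example:

off = -256  -> fallback:       mov x10, #-256 ; ldr x1, [x0, x10]
off = 256   -> immediate form: ldr x1, [x0, #256]
off = 32760 -> immediate form: ldr x1, [x0, #32760]
off = 13    -> fallback (not 8-byte aligned): mov x10, #13 ; ldr x1, [x0, x10]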
Tested on both big-endian and little-endian arm64 qemu, result: test_bpf: Summary: 1026 PASSED, 0 FAILED, [1014/1014 JIT'ed]'] test_bpf: test_tail_calls: Summary: 8 PASSED, 0 FAILED, [8/8 JIT'ed] test_bpf: test_skb_segment: Summary: 2 PASSED, 0 FAILED Signed-off-by: Xu Kuohai Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220321152852.2334294-5-xukuohai@huawei.com --- lib/test_bpf.c | 285 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 281 insertions(+), 4 deletions(-) diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 0c5cb2d6436a..aa0c7c68b2be 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -53,6 +53,7 @@ #define FLAG_EXPECTED_FAIL BIT(1) #define FLAG_SKB_FRAG BIT(2) #define FLAG_VERIFIER_ZEXT BIT(3) +#define FLAG_LARGE_MEM BIT(4) enum { CLASSIC = BIT(6), /* Old BPF instructions only. */ @@ -7838,7 +7839,7 @@ static struct bpf_test tests[] = { }, /* BPF_LDX_MEM B/H/W/DW */ { - "BPF_LDX_MEM | BPF_B", + "BPF_LDX_MEM | BPF_B, base", .u.insns_int = { BPF_LD_IMM64(R1, 0x0102030405060708ULL), BPF_LD_IMM64(R2, 0x0000000000000008ULL), @@ -7878,7 +7879,56 @@ static struct bpf_test tests[] = { .stack_depth = 8, }, { - "BPF_LDX_MEM | BPF_H", + "BPF_LDX_MEM | BPF_B, negative offset", + .u.insns_int = { + BPF_LD_IMM64(R2, 0x8182838485868788ULL), + BPF_LD_IMM64(R3, 0x0000000000000088ULL), + BPF_ALU64_IMM(BPF_ADD, R1, 512), + BPF_STX_MEM(BPF_B, R1, R2, -256), + BPF_LDX_MEM(BPF_B, R0, R1, -256), + BPF_JMP_REG(BPF_JNE, R0, R3, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL | FLAG_LARGE_MEM, + { }, + { { 512, 0 } }, + .stack_depth = 0, + }, + { + "BPF_LDX_MEM | BPF_B, small positive offset", + .u.insns_int = { + BPF_LD_IMM64(R2, 0x8182838485868788ULL), + BPF_LD_IMM64(R3, 0x0000000000000088ULL), + BPF_STX_MEM(BPF_B, R1, R2, 256), + BPF_LDX_MEM(BPF_B, R0, R1, 256), + BPF_JMP_REG(BPF_JNE, R0, R3, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL | FLAG_LARGE_MEM, + { }, + { { 512, 0 } }, + .stack_depth = 0, + }, + { + "BPF_LDX_MEM | BPF_B, large positive offset", + .u.insns_int = { + BPF_LD_IMM64(R2, 0x8182838485868788ULL), + BPF_LD_IMM64(R3, 0x0000000000000088ULL), + BPF_STX_MEM(BPF_B, R1, R2, 4096), + BPF_LDX_MEM(BPF_B, R0, R1, 4096), + BPF_JMP_REG(BPF_JNE, R0, R3, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL | FLAG_LARGE_MEM, + { }, + { { 4096 + 16, 0 } }, + .stack_depth = 0, + }, + { + "BPF_LDX_MEM | BPF_H, base", .u.insns_int = { BPF_LD_IMM64(R1, 0x0102030405060708ULL), BPF_LD_IMM64(R2, 0x0000000000000708ULL), @@ -7918,7 +7968,72 @@ static struct bpf_test tests[] = { .stack_depth = 8, }, { - "BPF_LDX_MEM | BPF_W", + "BPF_LDX_MEM | BPF_H, negative offset", + .u.insns_int = { + BPF_LD_IMM64(R2, 0x8182838485868788ULL), + BPF_LD_IMM64(R3, 0x0000000000008788ULL), + BPF_ALU64_IMM(BPF_ADD, R1, 512), + BPF_STX_MEM(BPF_H, R1, R2, -256), + BPF_LDX_MEM(BPF_H, R0, R1, -256), + BPF_JMP_REG(BPF_JNE, R0, R3, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL | FLAG_LARGE_MEM, + { }, + { { 512, 0 } }, + .stack_depth = 0, + }, + { + "BPF_LDX_MEM | BPF_H, small positive offset", + .u.insns_int = { + BPF_LD_IMM64(R2, 0x8182838485868788ULL), + BPF_LD_IMM64(R3, 0x0000000000008788ULL), + BPF_STX_MEM(BPF_H, R1, R2, 256), + BPF_LDX_MEM(BPF_H, R0, R1, 256), + BPF_JMP_REG(BPF_JNE, R0, R3, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL | FLAG_LARGE_MEM, + { }, + { { 512, 0 } }, + .stack_depth = 0, + }, + { + "BPF_LDX_MEM | BPF_H, large positive offset", + 
.u.insns_int = { + BPF_LD_IMM64(R2, 0x8182838485868788ULL), + BPF_LD_IMM64(R3, 0x0000000000008788ULL), + BPF_STX_MEM(BPF_H, R1, R2, 8192), + BPF_LDX_MEM(BPF_H, R0, R1, 8192), + BPF_JMP_REG(BPF_JNE, R0, R3, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL | FLAG_LARGE_MEM, + { }, + { { 8192 + 16, 0 } }, + .stack_depth = 0, + }, + { + "BPF_LDX_MEM | BPF_H, unaligned positive offset", + .u.insns_int = { + BPF_LD_IMM64(R2, 0x8182838485868788ULL), + BPF_LD_IMM64(R3, 0x0000000000008788ULL), + BPF_STX_MEM(BPF_H, R1, R2, 13), + BPF_LDX_MEM(BPF_H, R0, R1, 13), + BPF_JMP_REG(BPF_JNE, R0, R3, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL | FLAG_LARGE_MEM, + { }, + { { 32, 0 } }, + .stack_depth = 0, + }, + { + "BPF_LDX_MEM | BPF_W, base", .u.insns_int = { BPF_LD_IMM64(R1, 0x0102030405060708ULL), BPF_LD_IMM64(R2, 0x0000000005060708ULL), @@ -7957,6 +8072,162 @@ static struct bpf_test tests[] = { { { 0, 0 } }, .stack_depth = 8, }, + { + "BPF_LDX_MEM | BPF_W, negative offset", + .u.insns_int = { + BPF_LD_IMM64(R2, 0x8182838485868788ULL), + BPF_LD_IMM64(R3, 0x0000000085868788ULL), + BPF_ALU64_IMM(BPF_ADD, R1, 512), + BPF_STX_MEM(BPF_W, R1, R2, -256), + BPF_LDX_MEM(BPF_W, R0, R1, -256), + BPF_JMP_REG(BPF_JNE, R0, R3, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL | FLAG_LARGE_MEM, + { }, + { { 512, 0 } }, + .stack_depth = 0, + }, + { + "BPF_LDX_MEM | BPF_W, small positive offset", + .u.insns_int = { + BPF_LD_IMM64(R2, 0x8182838485868788ULL), + BPF_LD_IMM64(R3, 0x0000000085868788ULL), + BPF_STX_MEM(BPF_W, R1, R2, 256), + BPF_LDX_MEM(BPF_W, R0, R1, 256), + BPF_JMP_REG(BPF_JNE, R0, R3, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL | FLAG_LARGE_MEM, + { }, + { { 512, 0 } }, + .stack_depth = 0, + }, + { + "BPF_LDX_MEM | BPF_W, large positive offset", + .u.insns_int = { + BPF_LD_IMM64(R2, 0x8182838485868788ULL), + BPF_LD_IMM64(R3, 0x0000000085868788ULL), + BPF_STX_MEM(BPF_W, R1, R2, 16384), + BPF_LDX_MEM(BPF_W, R0, R1, 16384), + BPF_JMP_REG(BPF_JNE, R0, R3, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL | FLAG_LARGE_MEM, + { }, + { { 16384 + 16, 0 } }, + .stack_depth = 0, + }, + { + "BPF_LDX_MEM | BPF_W, unaligned positive offset", + .u.insns_int = { + BPF_LD_IMM64(R2, 0x8182838485868788ULL), + BPF_LD_IMM64(R3, 0x0000000085868788ULL), + BPF_STX_MEM(BPF_W, R1, R2, 13), + BPF_LDX_MEM(BPF_W, R0, R1, 13), + BPF_JMP_REG(BPF_JNE, R0, R3, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL | FLAG_LARGE_MEM, + { }, + { { 32, 0 } }, + .stack_depth = 0, + }, + { + "BPF_LDX_MEM | BPF_DW, base", + .u.insns_int = { + BPF_LD_IMM64(R1, 0x0102030405060708ULL), + BPF_STX_MEM(BPF_DW, R10, R1, -8), + BPF_LDX_MEM(BPF_DW, R0, R10, -8), + BPF_JMP_REG(BPF_JNE, R0, R1, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0 } }, + .stack_depth = 8, + }, + { + "BPF_LDX_MEM | BPF_DW, MSB set", + .u.insns_int = { + BPF_LD_IMM64(R1, 0x8182838485868788ULL), + BPF_STX_MEM(BPF_DW, R10, R1, -8), + BPF_LDX_MEM(BPF_DW, R0, R10, -8), + BPF_JMP_REG(BPF_JNE, R0, R1, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0 } }, + .stack_depth = 8, + }, + { + "BPF_LDX_MEM | BPF_DW, negative offset", + .u.insns_int = { + BPF_LD_IMM64(R2, 0x8182838485868788ULL), + BPF_ALU64_IMM(BPF_ADD, R1, 512), + BPF_STX_MEM(BPF_DW, R1, R2, -256), + BPF_LDX_MEM(BPF_DW, R0, R1, -256), + BPF_JMP_REG(BPF_JNE, R0, R2, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + 
BPF_EXIT_INSN(),
+	},
+	INTERNAL | FLAG_LARGE_MEM,
+	{ },
+	{ { 512, 0 } },
+	.stack_depth = 0,
+	},
+	{
+	"BPF_LDX_MEM | BPF_DW, small positive offset",
+	.u.insns_int = {
+	BPF_LD_IMM64(R2, 0x8182838485868788ULL),
+	BPF_STX_MEM(BPF_DW, R1, R2, 256),
+	BPF_LDX_MEM(BPF_DW, R0, R1, 256),
+	BPF_JMP_REG(BPF_JNE, R0, R2, 1),
+	BPF_ALU64_IMM(BPF_MOV, R0, 0),
+	BPF_EXIT_INSN(),
+	},
+	INTERNAL | FLAG_LARGE_MEM,
+	{ },
+	{ { 512, 0 } },
+	.stack_depth = 8,
+	},
+	{
+	"BPF_LDX_MEM | BPF_DW, large positive offset",
+	.u.insns_int = {
+	BPF_LD_IMM64(R2, 0x8182838485868788ULL),
+	BPF_STX_MEM(BPF_DW, R1, R2, 32760),
+	BPF_LDX_MEM(BPF_DW, R0, R1, 32760),
+	BPF_JMP_REG(BPF_JNE, R0, R2, 1),
+	BPF_ALU64_IMM(BPF_MOV, R0, 0),
+	BPF_EXIT_INSN(),
+	},
+	INTERNAL | FLAG_LARGE_MEM,
+	{ },
+	{ { 32768, 0 } },
+	.stack_depth = 0,
+	},
+	{
+	"BPF_LDX_MEM | BPF_DW, unaligned positive offset",
+	.u.insns_int = {
+	BPF_LD_IMM64(R2, 0x8182838485868788ULL),
+	BPF_STX_MEM(BPF_DW, R1, R2, 13),
+	BPF_LDX_MEM(BPF_DW, R0, R1, 13),
+	BPF_JMP_REG(BPF_JNE, R0, R2, 1),
+	BPF_ALU64_IMM(BPF_MOV, R0, 0),
+	BPF_EXIT_INSN(),
+	},
+	INTERNAL | FLAG_LARGE_MEM,
+	{ },
+	{ { 32, 0 } },
+	.stack_depth = 0,
+	},
 	/* BPF_STX_MEM B/H/W/DW */
 	{
 		"BPF_STX_MEM | BPF_B",
@@ -14094,6 +14365,9 @@ static void *generate_test_data(struct bpf_test *test, int sub)
 	if (test->aux & FLAG_NO_DATA)
 		return NULL;
+	if (test->aux & FLAG_LARGE_MEM)
+		return kmalloc(test->test[sub].data_size, GFP_KERNEL);
+
 	/* Test case expects an skb, so populate one. Various
 	 * subtests generate skbs of different sizes based on
 	 * the same data.
@@ -14137,7 +14411,10 @@ static void release_test_data(const struct bpf_test *test, void *data)
 	if (test->aux & FLAG_NO_DATA)
 		return;
-	kfree_skb(data);
+	if (test->aux & FLAG_LARGE_MEM)
+		kfree(data);
+	else
+		kfree_skb(data);
 }
 static int filter_length(int which)
-- 
cgit 

From f516420f683d147d8f4cfd83bbc7c3c6ad1c61b5 Mon Sep 17 00:00:00 2001
From: Xu Kuohai 
Date: Mon, 21 Mar 2022 11:28:52 -0400
Subject: bpf, tests: Add load store test case for tail call

Add test cases to ensure that the caller's and callee's fp offsets are correct during tail calls (mainly asserting for the arm64 JIT). 
Tested on both big-endian and little-endian arm64 qemu, result: test_bpf: Summary: 1026 PASSED, 0 FAILED, [1014/1014 JIT'ed] test_bpf: test_tail_calls: Summary: 10 PASSED, 0 FAILED, [10/10 JIT'ed] test_bpf: test_skb_segment: Summary: 2 PASSED, 0 FAILED Signed-off-by: Xu Kuohai Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220321152852.2334294-6-xukuohai@huawei.com --- lib/test_bpf.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/lib/test_bpf.c b/lib/test_bpf.c index aa0c7c68b2be..2a7836e115b4 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -14950,6 +14950,36 @@ static struct tail_call_test tail_call_tests[] = { }, .result = 10, }, + { + "Tail call load/store leaf", + .insns = { + BPF_ALU64_IMM(BPF_MOV, R1, 1), + BPF_ALU64_IMM(BPF_MOV, R2, 2), + BPF_ALU64_REG(BPF_MOV, R3, BPF_REG_FP), + BPF_STX_MEM(BPF_DW, R3, R1, -8), + BPF_STX_MEM(BPF_DW, R3, R2, -16), + BPF_LDX_MEM(BPF_DW, R0, BPF_REG_FP, -8), + BPF_JMP_REG(BPF_JNE, R0, R1, 3), + BPF_LDX_MEM(BPF_DW, R0, BPF_REG_FP, -16), + BPF_JMP_REG(BPF_JNE, R0, R2, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + .result = 0, + .stack_depth = 32, + }, + { + "Tail call load/store", + .insns = { + BPF_ALU64_IMM(BPF_MOV, R0, 3), + BPF_STX_MEM(BPF_DW, BPF_REG_FP, R0, -8), + TAIL_CALL(-1), + BPF_ALU64_IMM(BPF_MOV, R0, -1), + BPF_EXIT_INSN(), + }, + .result = 0, + .stack_depth = 16, + }, { "Tail call error path, max count reached", .insns = { -- cgit From 11e17ae423778f48c84da6a2e215f140610e1973 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Tue, 22 Mar 2022 14:21:49 +0800 Subject: bpf: Use swap() instead of open coding it Clean the following coccicheck warning: ./kernel/trace/bpf_trace.c:2263:34-35: WARNING opportunity for swap(). ./kernel/trace/bpf_trace.c:2264:40-41: WARNING opportunity for swap(). Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220322062149.109180-1-jiapeng.chong@linux.alibaba.com --- kernel/trace/bpf_trace.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 7fa2ebc07f60..836f021cb609 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2254,15 +2254,13 @@ static void bpf_kprobe_multi_cookie_swap(void *a, void *b, int size, const void const struct bpf_kprobe_multi_link *link = priv; unsigned long *addr_a = a, *addr_b = b; u64 *cookie_a, *cookie_b; - unsigned long tmp1; - u64 tmp2; cookie_a = link->cookies + (addr_a - link->addrs); cookie_b = link->cookies + (addr_b - link->addrs); /* swap addr_a/addr_b and cookie_a/cookie_b values */ - tmp1 = *addr_a; *addr_a = *addr_b; *addr_b = tmp1; - tmp2 = *cookie_a; *cookie_a = *cookie_b; *cookie_b = tmp2; + swap(*addr_a, *addr_b); + swap(*cookie_a, *cookie_b); } static int __bpf_kprobe_multi_cookie_cmp(const void *a, const void *b) -- cgit From fe4625d8b0536c0098845065a6bb3fdeaa6ad940 Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Tue, 29 Mar 2022 18:49:14 +0300 Subject: selftests/bpf: Remove unused variable from bpf_sk_assign test Was never used in bpf_sk_assign_test(), and was removed from handle_{tcp,udp}() in commit 0b9ad56b1ea6 ("selftests/bpf: Use SOCKMAP for server sockets in bpf_sk_assign test"). 
Fixes: 0b9ad56b1ea6 ("selftests/bpf: Use SOCKMAP for server sockets in bpf_sk_assign test") Signed-off-by: Eyal Birger Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220329154914.3718658-1-eyal.birger@gmail.com --- tools/testing/selftests/bpf/progs/test_sk_assign.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_sk_assign.c b/tools/testing/selftests/bpf/progs/test_sk_assign.c index 02f79356d5eb..98c6493d9b91 100644 --- a/tools/testing/selftests/bpf/progs/test_sk_assign.c +++ b/tools/testing/selftests/bpf/progs/test_sk_assign.c @@ -89,7 +89,6 @@ get_tuple(struct __sk_buff *skb, bool *ipv4, bool *tcp) static inline int handle_udp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, bool ipv4) { - struct bpf_sock_tuple ln = {0}; struct bpf_sock *sk; const int zero = 0; size_t tuple_len; @@ -121,7 +120,6 @@ assign: static inline int handle_tcp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, bool ipv4) { - struct bpf_sock_tuple ln = {0}; struct bpf_sock *sk; const int zero = 0; size_t tuple_len; @@ -161,7 +159,7 @@ assign: SEC("tc") int bpf_sk_assign_test(struct __sk_buff *skb) { - struct bpf_sock_tuple *tuple, ln = {0}; + struct bpf_sock_tuple *tuple; bool ipv4 = false; bool tcp = false; int tuple_len; -- cgit From 8eb943fc5e5fa62751248302fe8f6c9856d81c09 Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Wed, 23 Mar 2022 15:36:26 +0800 Subject: bpf: Remove redundant assignment to smap->map.value_size The attr->value_size is already assigned to smap->map.value_size in bpf_map_init_from_attr(), there is no need to do it again in stack_map_alloc(). Signed-off-by: Yuntao Wang Signed-off-by: Daniel Borkmann Acked-by: Joanne Koong Link: https://lore.kernel.org/bpf/20220323073626.958652-1-ytcoode@gmail.com --- kernel/bpf/stackmap.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 34725bfa1e97..6131b4a19572 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -106,7 +106,6 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&smap->map, attr); - smap->map.value_size = value_size; smap->n_buckets = n_buckets; err = get_callchain_buffers(sysctl_perf_event_max_stack); -- cgit From 891663ace74c23321a40f4bae13e45a1803d5e20 Mon Sep 17 00:00:00 2001 From: Yauheni Kaliuta Date: Tue, 29 Mar 2022 11:11:00 +0300 Subject: bpf, test_offload.py: Skip base maps without names The test fails: # ./test_offload.py [...] Test bpftool bound info reporting (own ns)... FAIL: 3 BPF maps loaded, expected 2 File "/root/bpf-next/tools/testing/selftests/bpf/./test_offload.py", line 1177, in check_dev_info(False, "") File "/root/bpf-next/tools/testing/selftests/bpf/./test_offload.py", line 645, in check_dev_info maps = bpftool_map_list(expected=2, ns=ns) File "/root/bpf-next/tools/testing/selftests/bpf/./test_offload.py", line 190, in bpftool_map_list fail(True, "%d BPF maps loaded, expected %d" % File "/root/bpf-next/tools/testing/selftests/bpf/./test_offload.py", line 86, in fail tb = "".join(traceback.extract_stack().format()) Some base maps do not have names and they cannot be added due to compatibility with older kernels, see [0]. So, just skip the unnamed maps. 
[0] https://lore.kernel.org/bpf/CAEf4BzY66WPKQbDe74AKZ6nFtZjq5e+G3Ji2egcVytB9R6_sGQ@mail.gmail.com/ Signed-off-by: Yauheni Kaliuta Signed-off-by: Daniel Borkmann Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20220329081100.9705-1-ykaliuta@redhat.com --- tools/testing/selftests/bpf/test_offload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_offload.py b/tools/testing/selftests/bpf/test_offload.py index edaffd43da83..6cd6ef9fc20b 100755 --- a/tools/testing/selftests/bpf/test_offload.py +++ b/tools/testing/selftests/bpf/test_offload.py @@ -184,7 +184,7 @@ def bpftool_prog_list(expected=None, ns=""): def bpftool_map_list(expected=None, ns=""): _, maps = bpftool("map show", JSON=True, ns=ns, fail=True) # Remove the base maps - maps = [m for m in maps if m not in base_maps and m.get('name') not in base_map_names] + maps = [m for m in maps if m not in base_maps and m.get('name') and m.get('name') not in base_map_names] if expected is not None: if len(maps) != expected: fail(True, "%d BPF maps loaded, expected %d" % -- cgit From 185da3da9379948ffbe45051b16d526c428fb06e Mon Sep 17 00:00:00 2001 From: Jakob Koschel Date: Thu, 31 Mar 2022 11:19:29 +0200 Subject: bpf: Replace usage of supported with dedicated list iterator variable To move the list iterator variable into the list_for_each_entry_*() macro in the future it should be avoided to use the list iterator variable after the loop body. To *never* use the list iterator variable after the loop it was concluded to use a separate iterator variable instead of a found boolean [1]. This removes the need to use the found variable (existed & supported) and simply checking if the variable was set, can determine if the break/goto was hit. [1] https://lore.kernel.org/all/CAHk-=wgRr_D8CB-D9Kg-c=EHreAsk5SqXPwr9Y7k9sA6cWXJ6w@mail.gmail.com/ Signed-off-by: Jakob Koschel Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220331091929.647057-1-jakobkoschel@gmail.com --- kernel/bpf/bpf_iter.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 110029ede71e..dea920b3b840 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -330,35 +330,34 @@ static void cache_btf_id(struct bpf_iter_target_info *tinfo, bool bpf_iter_prog_supported(struct bpf_prog *prog) { const char *attach_fname = prog->aux->attach_func_name; + struct bpf_iter_target_info *tinfo = NULL, *iter; u32 prog_btf_id = prog->aux->attach_btf_id; const char *prefix = BPF_ITER_FUNC_PREFIX; - struct bpf_iter_target_info *tinfo; int prefix_len = strlen(prefix); - bool supported = false; if (strncmp(attach_fname, prefix, prefix_len)) return false; mutex_lock(&targets_mutex); - list_for_each_entry(tinfo, &targets, list) { - if (tinfo->btf_id && tinfo->btf_id == prog_btf_id) { - supported = true; + list_for_each_entry(iter, &targets, list) { + if (iter->btf_id && iter->btf_id == prog_btf_id) { + tinfo = iter; break; } - if (!strcmp(attach_fname + prefix_len, tinfo->reg_info->target)) { - cache_btf_id(tinfo, prog); - supported = true; + if (!strcmp(attach_fname + prefix_len, iter->reg_info->target)) { + cache_btf_id(iter, prog); + tinfo = iter; break; } } mutex_unlock(&targets_mutex); - if (supported) { + if (tinfo) { prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size; prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info; } - return supported; + return tinfo != NULL; } const 
struct bpf_func_proto * @@ -499,12 +498,11 @@ bool bpf_link_is_iter(struct bpf_link *link) int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_prog *prog) { + struct bpf_iter_target_info *tinfo = NULL, *iter; struct bpf_link_primer link_primer; - struct bpf_iter_target_info *tinfo; union bpf_iter_link_info linfo; struct bpf_iter_link *link; u32 prog_btf_id, linfo_len; - bool existed = false; bpfptr_t ulinfo; int err; @@ -530,14 +528,14 @@ int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, prog_btf_id = prog->aux->attach_btf_id; mutex_lock(&targets_mutex); - list_for_each_entry(tinfo, &targets, list) { - if (tinfo->btf_id == prog_btf_id) { - existed = true; + list_for_each_entry(iter, &targets, list) { + if (iter->btf_id == prog_btf_id) { + tinfo = iter; break; } } mutex_unlock(&targets_mutex); - if (!existed) + if (!tinfo) return -ENOENT; link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN); -- cgit From e299bcd4d16ff86f46c48df1062c8aae0eca1ed8 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 31 Mar 2022 17:09:49 +0300 Subject: selftests/bpf: Fix vfs_link kprobe definition Since commit 6521f8917082 ("namei: prepare for idmapped mounts") vfs_link's prototype was changed, the kprobe definition in profiler selftest in turn wasn't updated. The result is that all argument after the first are now stored in different registers. This means that self-test has been broken ever since. Fix it by updating the kprobe definition accordingly. Signed-off-by: Nikolay Borisov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220331140949.1410056-1-nborisov@suse.com --- tools/testing/selftests/bpf/progs/profiler.inc.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/profiler.inc.h b/tools/testing/selftests/bpf/progs/profiler.inc.h index 4896fdf816f7..92331053dba3 100644 --- a/tools/testing/selftests/bpf/progs/profiler.inc.h +++ b/tools/testing/selftests/bpf/progs/profiler.inc.h @@ -826,8 +826,9 @@ out: SEC("kprobe/vfs_link") int BPF_KPROBE(kprobe__vfs_link, - struct dentry* old_dentry, struct inode* dir, - struct dentry* new_dentry, struct inode** delegated_inode) + struct dentry* old_dentry, struct user_namespace *mnt_userns, + struct inode* dir, struct dentry* new_dentry, + struct inode** delegated_inode) { struct bpf_func_stats_ctx stats_ctx; bpf_stats_enter(&stats_ctx, profiler_bpf_vfs_link); -- cgit From f6d60facd9b65614594f1feaa4eee18ac60a9a18 Mon Sep 17 00:00:00 2001 From: Haowen Bai Date: Fri, 1 Apr 2022 10:15:54 +0800 Subject: selftests/bpf: Return true/false (not 1/0) from bool functions Return boolean values ("true" or "false") instead of 1 or 0 from bool functions. 
This fixes the following warnings from coccicheck:

./tools/testing/selftests/bpf/progs/test_xdp_noinline.c:567:9-10: WARNING:
return of 0/1 in function 'get_packet_dst' with return type bool
./tools/testing/selftests/bpf/progs/test_l4lb_noinline.c:221:9-10: WARNING:
return of 0/1 in function 'get_packet_dst' with return type bool

Signed-off-by: Haowen Bai 
Signed-off-by: Andrii Nakryiko 
Reviewed-by: Shuah Khan 
Acked-by: Yonghong Song 
Link: https://lore.kernel.org/bpf/1648779354-14700-1-git-send-email-baihaowen@meizu.com
---
 tools/testing/selftests/bpf/progs/test_l4lb_noinline.c |  2 +-
 tools/testing/selftests/bpf/progs/test_xdp_noinline.c  | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c b/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c
index 19e4d2071c60..c8bc0c6947aa 100644
--- a/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c
+++ b/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c
@@ -218,7 +218,7 @@ static __noinline bool get_packet_dst(struct real_definition **real,
 	if (hash != 0x358459b7 /* jhash of ipv4 packet */  &&
 	    hash != 0x2f4bc6bb /* jhash of ipv6 packet */)
-		return 0;
+		return false;
 	real_pos = bpf_map_lookup_elem(&ch_rings, &key);
 	if (!real_pos)
diff --git a/tools/testing/selftests/bpf/progs/test_xdp_noinline.c b/tools/testing/selftests/bpf/progs/test_xdp_noinline.c
index 596c4e71bf3a..125d872d7981 100644
--- a/tools/testing/selftests/bpf/progs/test_xdp_noinline.c
+++ b/tools/testing/selftests/bpf/progs/test_xdp_noinline.c
@@ -564,22 +564,22 @@ static bool get_packet_dst(struct real_definition **real,
 	hash = get_packet_hash(pckt, hash_16bytes);
 	if (hash != 0x358459b7 /* jhash of ipv4 packet */  &&
 	    hash != 0x2f4bc6bb /* jhash of ipv6 packet */)
-		return 0;
+		return false;
 	key = 2 * vip_info->vip_num + hash % 2;
 	real_pos = bpf_map_lookup_elem(&ch_rings, &key);
 	if (!real_pos)
-		return 0;
+		return false;
 	key = *real_pos;
 	*real = bpf_map_lookup_elem(&reals, &key);
 	if (!(*real))
-		return 0;
+		return false;
 	if (!(vip_info->flags & (1 << 1))) {
 		__u32 conn_rate_key = 512 + 2;
 		struct lb_stats *conn_rate_stats =
 		    bpf_map_lookup_elem(&stats, &conn_rate_key);
 		if (!conn_rate_stats)
-			return 1;
+			return true;
 		cur_time = bpf_ktime_get_ns();
 		if ((cur_time - conn_rate_stats->v2) >> 32 > 0xffFFFF) {
 			conn_rate_stats->v1 = 1;
@@ -587,14 +587,14 @@ static bool get_packet_dst(struct real_definition **real,
 		} else {
 			conn_rate_stats->v1 += 1;
 			if (conn_rate_stats->v1 >= 1)
-				return 1;
+				return true;
 		}
 		if (pckt->flow.proto == IPPROTO_UDP)
 			new_dst_lru.atime = cur_time;
 		new_dst_lru.pos = key;
 		bpf_map_update_elem(lru_map, &pckt->flow, &new_dst_lru, 0);
 	}
-	return 1;
+	return true;
 }
 __attribute__ ((noinline))
-- 
cgit 

From 9bbad6dab8279905c4593be69b06704b77b31403 Mon Sep 17 00:00:00 2001
From: Yuntao Wang 
Date: Sun, 3 Apr 2022 21:52:45 +0800
Subject: selftests/bpf: Fix cd_flavor_subdir() of test_progs

Currently, when we run test_progs with just the executable file name, for example 'PATH=. test_progs-no_alu32', cd_flavor_subdir() does not detect that test_progs is running as a flavored test runner and does not switch into the corresponding sub-directory. As a result, test_progs-no_alu32 invoked this way runs in the wrong directory and loads the wrong BPF objects. 
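An illustrative walk-through of the fixed flavor detection:

exec_name = "./test_progs-no_alu32" -> basename "test_progs-no_alu32" -> flavor "no_alu32" -> chdir("no_alu32")
exec_name = "test_progs-no_alu32" (found via PATH, no '/') -> flavor "no_alu32" -> chdir("no_alu32")
exec_name = "./test_progs" -> no '-' in the name -> stay in the current directory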
Signed-off-by: Yuntao Wang Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220403135245.1713283-1-ytcoode@gmail.com --- tools/testing/selftests/bpf/test_progs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 2ecb73a65206..0a4b45d7b515 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -761,8 +761,10 @@ int cd_flavor_subdir(const char *exec_name) const char *flavor = strrchr(exec_name, '/'); if (!flavor) - return 0; - flavor++; + flavor = exec_name; + else + flavor++; + flavor = strrchr(flavor, '-'); if (!flavor) return 0; -- cgit From 66df0fdb5981052f3ad97c9879eda93712bdefc2 Mon Sep 17 00:00:00 2001 From: Haiyue Wang Date: Sun, 3 Apr 2022 19:53:26 +0800 Subject: bpf: Correct the comment for BTF kind bitfield The commit 8fd886911a6a ("bpf: Add BTF_KIND_FLOAT to uapi") has extended the BTF kind bitfield from 4 to 5 bits, correct the comment. Signed-off-by: Haiyue Wang Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220403115327.205964-1-haiyue.wang@intel.com --- include/uapi/linux/btf.h | 4 ++-- tools/include/uapi/linux/btf.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index b0d8fea1951d..a9162a6c0284 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -33,8 +33,8 @@ struct btf_type { /* "info" bits arrangement * bits 0-15: vlen (e.g. # of struct's members) * bits 16-23: unused - * bits 24-27: kind (e.g. int, ptr, array...etc) - * bits 28-30: unused + * bits 24-28: kind (e.g. int, ptr, array...etc) + * bits 29-30: unused * bit 31: kind_flag, currently used by * struct, union and fwd */ diff --git a/tools/include/uapi/linux/btf.h b/tools/include/uapi/linux/btf.h index b0d8fea1951d..a9162a6c0284 100644 --- a/tools/include/uapi/linux/btf.h +++ b/tools/include/uapi/linux/btf.h @@ -33,8 +33,8 @@ struct btf_type { /* "info" bits arrangement * bits 0-15: vlen (e.g. # of struct's members) * bits 16-23: unused - * bits 24-27: kind (e.g. int, ptr, array...etc) - * bits 28-30: unused + * bits 24-28: kind (e.g. int, ptr, array...etc) + * bits 29-30: unused * bit 31: kind_flag, currently used by * struct, union and fwd */ -- cgit From 85bf1f51691c078e77f524a7addf37faa6635a6e Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Wed, 16 Mar 2022 08:13:23 +0100 Subject: samples: bpf: Convert xdp_router_ipv4 to XDP samples helper Rely on the libbpf skeleton facility and other utilities provided by XDP sample helpers in xdp_router_ipv4 sample. 
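For context, a minimal sketch of the skeleton flow the converted sample follows (error handling and the xdp_sample helper plumbing omitted; the xdp_router_ipv4__* names are the ones bpftool generates from xdp_router_ipv4.bpf.o):

#include "xdp_router_ipv4.skel.h"

static int run(void)
{
	struct xdp_router_ipv4 *skel;
	int err;

	skel = xdp_router_ipv4__open();		/* open the embedded BPF object */
	if (!skel)
		return -1;
	err = xdp_router_ipv4__load(skel);	/* verify and load into the kernel */
	/* on success: attach xdp_router_ipv4_prog to the interfaces, fill
	 * tx_port, mirror routes/neighbours into lpm_map/arp_table, and
	 * poll the rx_cnt stats
	 */
	xdp_router_ipv4__destroy(skel);
	return err;
}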
Signed-off-by: Lorenzo Bianconi Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/7f4d98ee2c13c04d5eb924eebf79ced32fee8418.1647414711.git.lorenzo@kernel.org --- samples/bpf/Makefile | 9 +- samples/bpf/xdp_router_ipv4.bpf.c | 180 ++++++++++++++ samples/bpf/xdp_router_ipv4_kern.c | 186 --------------- samples/bpf/xdp_router_ipv4_user.c | 465 ++++++++++++++++--------------------- 4 files changed, 381 insertions(+), 459 deletions(-) create mode 100644 samples/bpf/xdp_router_ipv4.bpf.c delete mode 100644 samples/bpf/xdp_router_ipv4_kern.c diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 38638845db9d..b4fa0e69aa14 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -96,7 +96,6 @@ test_cgrp2_sock2-objs := test_cgrp2_sock2.o xdp1-objs := xdp1_user.o # reuse xdp1 source intentionally xdp2-objs := xdp1_user.o -xdp_router_ipv4-objs := xdp_router_ipv4_user.o test_current_task_under_cgroup-objs := $(CGROUP_HELPERS) \ test_current_task_under_cgroup_user.o trace_event-objs := trace_event_user.o $(TRACE_HELPERS) @@ -124,6 +123,7 @@ xdp_redirect_cpu-objs := xdp_redirect_cpu_user.o $(XDP_SAMPLE) xdp_redirect_map-objs := xdp_redirect_map_user.o $(XDP_SAMPLE) xdp_redirect-objs := xdp_redirect_user.o $(XDP_SAMPLE) xdp_monitor-objs := xdp_monitor_user.o $(XDP_SAMPLE) +xdp_router_ipv4-objs := xdp_router_ipv4_user.o $(XDP_SAMPLE) # Tell kbuild to always build the programs always-y := $(tprogs-y) @@ -153,7 +153,6 @@ always-y += parse_varlen.o parse_simple.o parse_ldabs.o always-y += test_cgrp2_tc_kern.o always-y += xdp1_kern.o always-y += xdp2_kern.o -always-y += xdp_router_ipv4_kern.o always-y += test_current_task_under_cgroup_kern.o always-y += trace_event_kern.o always-y += sampleip_kern.o @@ -342,6 +341,7 @@ $(obj)/xdp_redirect_map_multi_user.o: $(obj)/xdp_redirect_map_multi.skel.h $(obj)/xdp_redirect_map_user.o: $(obj)/xdp_redirect_map.skel.h $(obj)/xdp_redirect_user.o: $(obj)/xdp_redirect.skel.h $(obj)/xdp_monitor_user.o: $(obj)/xdp_monitor.skel.h +$(obj)/xdp_router_ipv4_user.o: $(obj)/xdp_router_ipv4.skel.h $(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h $(obj)/hbm_out_kern.o: $(src)/hbm.h $(src)/hbm_kern.h @@ -399,6 +399,7 @@ $(obj)/xdp_redirect_map_multi.bpf.o: $(obj)/xdp_sample.bpf.o $(obj)/xdp_redirect_map.bpf.o: $(obj)/xdp_sample.bpf.o $(obj)/xdp_redirect.bpf.o: $(obj)/xdp_sample.bpf.o $(obj)/xdp_monitor.bpf.o: $(obj)/xdp_sample.bpf.o +$(obj)/xdp_router_ipv4.bpf.o: $(obj)/xdp_sample.bpf.o $(obj)/%.bpf.o: $(src)/%.bpf.c $(obj)/vmlinux.h $(src)/xdp_sample.bpf.h $(src)/xdp_sample_shared.h @echo " CLANG-BPF " $@ @@ -409,7 +410,8 @@ $(obj)/%.bpf.o: $(src)/%.bpf.c $(obj)/vmlinux.h $(src)/xdp_sample.bpf.h $(src)/x -c $(filter %.bpf.c,$^) -o $@ LINKED_SKELS := xdp_redirect_cpu.skel.h xdp_redirect_map_multi.skel.h \ - xdp_redirect_map.skel.h xdp_redirect.skel.h xdp_monitor.skel.h + xdp_redirect_map.skel.h xdp_redirect.skel.h xdp_monitor.skel.h \ + xdp_router_ipv4.skel.h clean-files += $(LINKED_SKELS) xdp_redirect_cpu.skel.h-deps := xdp_redirect_cpu.bpf.o xdp_sample.bpf.o @@ -417,6 +419,7 @@ xdp_redirect_map_multi.skel.h-deps := xdp_redirect_map_multi.bpf.o xdp_sample.bp xdp_redirect_map.skel.h-deps := xdp_redirect_map.bpf.o xdp_sample.bpf.o xdp_redirect.skel.h-deps := xdp_redirect.bpf.o xdp_sample.bpf.o xdp_monitor.skel.h-deps := xdp_monitor.bpf.o xdp_sample.bpf.o +xdp_router_ipv4.skel.h-deps := xdp_router_ipv4.bpf.o xdp_sample.bpf.o LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.bpf.c,$(foreach skel,$(LINKED_SKELS),$($(skel)-deps))) diff --git a/samples/bpf/xdp_router_ipv4.bpf.c 
b/samples/bpf/xdp_router_ipv4.bpf.c new file mode 100644 index 000000000000..248119ca7938 --- /dev/null +++ b/samples/bpf/xdp_router_ipv4.bpf.c @@ -0,0 +1,180 @@ +/* Copyright (C) 2017 Cavium, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. + */ + +#include "vmlinux.h" +#include "xdp_sample.bpf.h" +#include "xdp_sample_shared.h" + +#define ETH_ALEN 6 +#define ETH_P_8021Q 0x8100 +#define ETH_P_8021AD 0x88A8 + +struct trie_value { + __u8 prefix[4]; + __be64 value; + int ifindex; + int metric; + __be32 gw; +}; + +/* Key for lpm_trie */ +union key_4 { + u32 b32[2]; + u8 b8[8]; +}; + +struct arp_entry { + __be64 mac; + __be32 dst; +}; + +struct direct_map { + struct arp_entry arp; + int ifindex; + __be64 mac; +}; + +/* Map for trie implementation */ +struct { + __uint(type, BPF_MAP_TYPE_LPM_TRIE); + __uint(key_size, 8); + __uint(value_size, sizeof(struct trie_value)); + __uint(max_entries, 50); + __uint(map_flags, BPF_F_NO_PREALLOC); +} lpm_map SEC(".maps"); + +/* Map for ARP table */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, __be32); + __type(value, __be64); + __uint(max_entries, 50); +} arp_table SEC(".maps"); + +/* Map to keep the exact match entries in the route table */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, __be32); + __type(value, struct direct_map); + __uint(max_entries, 50); +} exact_match SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_DEVMAP); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); + __uint(max_entries, 100); +} tx_port SEC(".maps"); + +SEC("xdp") +int xdp_router_ipv4_prog(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + u64 nh_off = sizeof(*eth); + struct datarec *rec; + __be16 h_proto; + u32 key = 0; + + rec = bpf_map_lookup_elem(&rx_cnt, &key); + if (rec) + NO_TEAR_INC(rec->processed); + + if (data + nh_off > data_end) + goto drop; + + h_proto = eth->h_proto; + if (h_proto == bpf_htons(ETH_P_8021Q) || + h_proto == bpf_htons(ETH_P_8021AD)) { + struct vlan_hdr *vhdr; + + vhdr = data + nh_off; + nh_off += sizeof(struct vlan_hdr); + if (data + nh_off > data_end) + goto drop; + + h_proto = vhdr->h_vlan_encapsulated_proto; + } + + switch (bpf_ntohs(h_proto)) { + case ETH_P_ARP: + if (rec) + NO_TEAR_INC(rec->xdp_pass); + return XDP_PASS; + case ETH_P_IP: { + struct iphdr *iph = data + nh_off; + struct direct_map *direct_entry; + __be64 *dest_mac, *src_mac; + int forward_to; + + if (iph + 1 > data_end) + goto drop; + + direct_entry = bpf_map_lookup_elem(&exact_match, &iph->daddr); + + /* Check for exact match, this would give a faster lookup */ + if (direct_entry && direct_entry->mac && + direct_entry->arp.mac) { + src_mac = &direct_entry->mac; + dest_mac = &direct_entry->arp.mac; + forward_to = direct_entry->ifindex; + } else { + struct trie_value *prefix_value; + union key_4 key4; + + /* Look up in the trie for lpm */ + key4.b32[0] = 32; + key4.b8[4] = iph->daddr & 0xff; + key4.b8[5] = (iph->daddr >> 8) & 0xff; + key4.b8[6] = (iph->daddr >> 16) & 0xff; + key4.b8[7] = (iph->daddr >> 24) & 0xff; + + prefix_value = bpf_map_lookup_elem(&lpm_map, &key4); + if (!prefix_value) + goto drop; + + forward_to = prefix_value->ifindex; + src_mac = &prefix_value->value; + if (!src_mac) + goto drop; + + dest_mac = bpf_map_lookup_elem(&arp_table, &iph->daddr); + if (!dest_mac) { + if 
(!prefix_value->gw) + goto drop; + + dest_mac = bpf_map_lookup_elem(&arp_table, + &prefix_value->gw); + } + } + + if (src_mac && dest_mac) { + int ret; + + __builtin_memcpy(eth->h_dest, dest_mac, ETH_ALEN); + __builtin_memcpy(eth->h_source, src_mac, ETH_ALEN); + + ret = bpf_redirect_map(&tx_port, forward_to, 0); + if (ret == XDP_REDIRECT) { + if (rec) + NO_TEAR_INC(rec->xdp_redirect); + return ret; + } + } + } + default: + break; + } +drop: + if (rec) + NO_TEAR_INC(rec->xdp_drop); + + return XDP_DROP; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_router_ipv4_kern.c b/samples/bpf/xdp_router_ipv4_kern.c deleted file mode 100644 index b37ca2b13063..000000000000 --- a/samples/bpf/xdp_router_ipv4_kern.c +++ /dev/null @@ -1,186 +0,0 @@ -/* Copyright (C) 2017 Cavium, Inc. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU General Public License - * as published by the Free Software Foundation. - */ -#define KBUILD_MODNAME "foo" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct trie_value { - __u8 prefix[4]; - __be64 value; - int ifindex; - int metric; - __be32 gw; -}; - -/* Key for lpm_trie*/ -union key_4 { - u32 b32[2]; - u8 b8[8]; -}; - -struct arp_entry { - __be64 mac; - __be32 dst; -}; - -struct direct_map { - struct arp_entry arp; - int ifindex; - __be64 mac; -}; - -/* Map for trie implementation*/ -struct { - __uint(type, BPF_MAP_TYPE_LPM_TRIE); - __uint(key_size, 8); - __uint(value_size, sizeof(struct trie_value)); - __uint(max_entries, 50); - __uint(map_flags, BPF_F_NO_PREALLOC); -} lpm_map SEC(".maps"); - -/* Map for counter*/ -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __type(key, u32); - __type(value, u64); - __uint(max_entries, 256); -} rxcnt SEC(".maps"); - -/* Map for ARP table*/ -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __type(key, __be32); - __type(value, __be64); - __uint(max_entries, 50); -} arp_table SEC(".maps"); - -/* Map to keep the exact match entries in the route table*/ -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __type(key, __be32); - __type(value, struct direct_map); - __uint(max_entries, 50); -} exact_match SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_DEVMAP); - __uint(key_size, sizeof(int)); - __uint(value_size, sizeof(int)); - __uint(max_entries, 100); -} tx_port SEC(".maps"); - -/* Function to set source and destination mac of the packet */ -static inline void set_src_dst_mac(void *data, void *src, void *dst) -{ - unsigned short *source = src; - unsigned short *dest = dst; - unsigned short *p = data; - - __builtin_memcpy(p, dest, 6); - __builtin_memcpy(p + 3, source, 6); -} - -/* Parse IPV4 packet to get SRC, DST IP and protocol */ -static inline int parse_ipv4(void *data, u64 nh_off, void *data_end, - __be32 *src, __be32 *dest) -{ - struct iphdr *iph = data + nh_off; - - if (iph + 1 > data_end) - return 0; - *src = iph->saddr; - *dest = iph->daddr; - return iph->protocol; -} - -SEC("xdp_router_ipv4") -int xdp_router_ipv4_prog(struct xdp_md *ctx) -{ - void *data_end = (void *)(long)ctx->data_end; - __be64 *dest_mac = NULL, *src_mac = NULL; - void *data = (void *)(long)ctx->data; - struct trie_value *prefix_value; - int rc = XDP_DROP, forward_to; - struct ethhdr *eth = data; - union key_4 key4; - long *value; - u16 h_proto; - u32 ipproto; - u64 nh_off; - - nh_off = sizeof(*eth); - if (data + nh_off > data_end) - return rc; - - h_proto = eth->h_proto; - - if (h_proto == 
htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) { - struct vlan_hdr *vhdr; - - vhdr = data + nh_off; - nh_off += sizeof(struct vlan_hdr); - if (data + nh_off > data_end) - return rc; - h_proto = vhdr->h_vlan_encapsulated_proto; - } - if (h_proto == htons(ETH_P_ARP)) { - return XDP_PASS; - } else if (h_proto == htons(ETH_P_IP)) { - struct direct_map *direct_entry; - __be32 src_ip = 0, dest_ip = 0; - - ipproto = parse_ipv4(data, nh_off, data_end, &src_ip, &dest_ip); - direct_entry = bpf_map_lookup_elem(&exact_match, &dest_ip); - /* Check for exact match, this would give a faster lookup*/ - if (direct_entry && direct_entry->mac && direct_entry->arp.mac) { - src_mac = &direct_entry->mac; - dest_mac = &direct_entry->arp.mac; - forward_to = direct_entry->ifindex; - } else { - /* Look up in the trie for lpm*/ - key4.b32[0] = 32; - key4.b8[4] = dest_ip & 0xff; - key4.b8[5] = (dest_ip >> 8) & 0xff; - key4.b8[6] = (dest_ip >> 16) & 0xff; - key4.b8[7] = (dest_ip >> 24) & 0xff; - prefix_value = bpf_map_lookup_elem(&lpm_map, &key4); - if (!prefix_value) - return XDP_DROP; - src_mac = &prefix_value->value; - if (!src_mac) - return XDP_DROP; - dest_mac = bpf_map_lookup_elem(&arp_table, &dest_ip); - if (!dest_mac) { - if (!prefix_value->gw) - return XDP_DROP; - dest_ip = prefix_value->gw; - dest_mac = bpf_map_lookup_elem(&arp_table, &dest_ip); - } - forward_to = prefix_value->ifindex; - } - } else { - ipproto = 0; - } - if (src_mac && dest_mac) { - set_src_dst_mac(data, src_mac, dest_mac); - value = bpf_map_lookup_elem(&rxcnt, &ipproto); - if (value) - *value += 1; - return bpf_redirect_map(&tx_port, forward_to, 0); - } - return rc; -} - -char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_router_ipv4_user.c b/samples/bpf/xdp_router_ipv4_user.c index 6dae87d83e1c..7828784612ec 100644 --- a/samples/bpf/xdp_router_ipv4_user.c +++ b/samples/bpf/xdp_router_ipv4_user.c @@ -24,70 +24,36 @@ #include #include #include +#include +#include "xdp_sample_user.h" +#include "xdp_router_ipv4.skel.h" -int sock, sock_arp, flags = XDP_FLAGS_UPDATE_IF_NOEXIST; -static int total_ifindex; -static int *ifindex_list; -static __u32 *prog_id_list; -char buf[8192]; +static const char *__doc__ = +"XDP IPv4 router implementation\n" +"Usage: xdp_router_ipv4 ... 
\n"; + +static char buf[8192]; static int lpm_map_fd; -static int rxcnt_map_fd; static int arp_table_map_fd; static int exact_match_map_fd; static int tx_port_map_fd; -static int get_route_table(int rtm_family); -static void int_exit(int sig) -{ - __u32 prog_id = 0; - int i = 0; +static int mask = SAMPLE_RX_CNT | SAMPLE_REDIRECT_ERR_MAP_CNT | + SAMPLE_DEVMAP_XMIT_CNT_MULTI | SAMPLE_EXCEPTION_CNT; - for (i = 0; i < total_ifindex; i++) { - if (bpf_xdp_query_id(ifindex_list[i], flags, &prog_id)) { - printf("bpf_xdp_query_id on iface %d failed\n", - ifindex_list[i]); - exit(1); - } - if (prog_id_list[i] == prog_id) - bpf_xdp_detach(ifindex_list[i], flags, NULL); - else if (!prog_id) - printf("couldn't find a prog id on iface %d\n", - ifindex_list[i]); - else - printf("program on iface %d changed, not removing\n", - ifindex_list[i]); - prog_id = 0; - } - exit(0); -} +DEFINE_SAMPLE_INIT(xdp_router_ipv4); -static void close_and_exit(int sig) -{ - close(sock); - close(sock_arp); +static const struct option long_options[] = { + { "help", no_argument, NULL, 'h' }, + { "skb-mode", no_argument, NULL, 'S' }, + { "force", no_argument, NULL, 'F' }, + { "interval", required_argument, NULL, 'i' }, + { "verbose", no_argument, NULL, 'v' }, + { "stats", no_argument, NULL, 's' }, + {} +}; - int_exit(0); -} - -/* Get the mac address of the interface given interface name */ -static __be64 getmac(char *iface) -{ - struct ifreq ifr; - __be64 mac = 0; - int fd, i; - - fd = socket(AF_INET, SOCK_DGRAM, 0); - ifr.ifr_addr.sa_family = AF_INET; - strncpy(ifr.ifr_name, iface, IFNAMSIZ - 1); - if (ioctl(fd, SIOCGIFHWADDR, &ifr) < 0) { - printf("ioctl failed leaving....\n"); - return -1; - } - for (i = 0; i < 6 ; i++) - *((__u8 *)&mac + i) = (__u8)ifr.ifr_hwaddr.sa_data[i]; - close(fd); - return mac; -} +static int get_route_table(int rtm_family); static int recv_msg(struct sockaddr_nl sock_addr, int sock) { @@ -130,7 +96,6 @@ static void read_route(struct nlmsghdr *nh, int nll) int i; struct route_table { int dst_len, iface, metric; - char *iface_name; __be32 dst, gw; __be64 mac; } route; @@ -145,17 +110,7 @@ static void read_route(struct nlmsghdr *nh, int nll) __be64 mac; } direct_entry; - if (nh->nlmsg_type == RTM_DELROUTE) - printf("DELETING Route entry\n"); - else if (nh->nlmsg_type == RTM_GETROUTE) - printf("READING Route entry\n"); - else if (nh->nlmsg_type == RTM_NEWROUTE) - printf("NEW Route entry\n"); - else - printf("%d\n", nh->nlmsg_type); - memset(&route, 0, sizeof(route)); - printf("Destination Gateway Genmask Metric Iface\n"); for (; NLMSG_OK(nh, nll); nh = NLMSG_NEXT(nh, nll)) { rt_msg = (struct rtmsg *)NLMSG_DATA(nh); rtm_family = rt_msg->rtm_family; @@ -192,11 +147,7 @@ static void read_route(struct nlmsghdr *nh, int nll) route.gw = atoi(gws); route.iface = atoi(ifs); route.metric = atoi(metrics); - route.iface_name = alloca(sizeof(char *) * IFNAMSIZ); - route.iface_name = if_indextoname(route.iface, route.iface_name); - route.mac = getmac(route.iface_name); - if (route.mac == -1) - int_exit(0); + assert(get_mac_addr(route.iface, &route.mac) == 0); assert(bpf_map_update_elem(tx_port_map_fd, &route.iface, &route.iface, 0) == 0); if (rtm_family == AF_INET) { @@ -207,7 +158,6 @@ static void read_route(struct nlmsghdr *nh, int nll) int metric; __be32 gw; } *prefix_value; - struct in_addr dst_addr, gw_addr, mask_addr; prefix_key = alloca(sizeof(*prefix_key) + 3); prefix_value = alloca(sizeof(*prefix_value)); @@ -235,17 +185,6 @@ static void read_route(struct nlmsghdr *nh, int nll) for (i = 0; i < 4; i++) 
prefix_key->data[i] = (route.dst >> i * 8) & 0xff; - dst_addr.s_addr = route.dst; - printf("%-16s", inet_ntoa(dst_addr)); - - gw_addr.s_addr = route.gw; - printf("%-16s", inet_ntoa(gw_addr)); - - mask_addr.s_addr = htonl(~(0xffffffffU >> route.dst_len)); - printf("%-16s%-7d%s\n", inet_ntoa(mask_addr), - route.metric, - route.iface_name); - if (bpf_map_lookup_elem(lpm_map_fd, prefix_key, prefix_value) < 0) { for (i = 0; i < 4; i++) @@ -261,13 +200,6 @@ static void read_route(struct nlmsghdr *nh, int nll) ) == 0); } else { if (nh->nlmsg_type == RTM_DELROUTE) { - printf("deleting entry\n"); - printf("prefix key=%d.%d.%d.%d/%d", - prefix_key->data[0], - prefix_key->data[1], - prefix_key->data[2], - prefix_key->data[3], - prefix_key->prefixlen); assert(bpf_map_delete_elem(lpm_map_fd, prefix_key ) == 0); @@ -331,14 +263,14 @@ static int get_route_table(int rtm_family) sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); if (sock < 0) { - printf("open netlink socket: %s\n", strerror(errno)); - return -1; + fprintf(stderr, "open netlink socket: %s\n", strerror(errno)); + return -errno; } memset(&sa, 0, sizeof(sa)); sa.nl_family = AF_NETLINK; if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { - printf("bind to netlink: %s\n", strerror(errno)); - ret = -1; + fprintf(stderr, "bind netlink socket: %s\n", strerror(errno)); + ret = -errno; goto cleanup; } memset(&req, 0, sizeof(req)); @@ -357,15 +289,15 @@ static int get_route_table(int rtm_family) msg.msg_iovlen = 1; ret = sendmsg(sock, &msg, 0); if (ret < 0) { - printf("send to netlink: %s\n", strerror(errno)); - ret = -1; + fprintf(stderr, "send to netlink: %s\n", strerror(errno)); + ret = -errno; goto cleanup; } memset(buf, 0, sizeof(buf)); nll = recv_msg(sa, sock); if (nll < 0) { - printf("recv from netlink: %s\n", strerror(nll)); - ret = -1; + fprintf(stderr, "recv from netlink: %s\n", strerror(nll)); + ret = nll; goto cleanup; } nh = (struct nlmsghdr *)buf; @@ -395,14 +327,7 @@ static void read_arp(struct nlmsghdr *nh, int nll) __be64 mac; } direct_entry; - if (nh->nlmsg_type == RTM_GETNEIGH) - printf("READING arp entry\n"); - printf("Address HwAddress\n"); for (; NLMSG_OK(nh, nll); nh = NLMSG_NEXT(nh, nll)) { - struct in_addr dst_addr; - char mac_str[18]; - int len = 0, i; - rt_msg = (struct ndmsg *)NLMSG_DATA(nh); rt_attr = (struct rtattr *)RTM_RTA(rt_msg); ndm_family = rt_msg->ndm_family; @@ -424,13 +349,6 @@ static void read_arp(struct nlmsghdr *nh, int nll) arp_entry.dst = atoi(dsts); arp_entry.mac = atol(mac); - dst_addr.s_addr = arp_entry.dst; - for (i = 0; i < 6; i++) - len += snprintf(mac_str + len, 18 - len, "%02llx%s", - ((arp_entry.mac >> i * 8) & 0xff), - i < 5 ? 
":" : ""); - printf("%-16s%s\n", inet_ntoa(dst_addr), mac_str); - if (ndm_family == AF_INET) { if (bpf_map_lookup_elem(exact_match_map_fd, &arp_entry.dst, @@ -481,14 +399,14 @@ static int get_arp_table(int rtm_family) sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); if (sock < 0) { - printf("open netlink socket: %s\n", strerror(errno)); - return -1; + fprintf(stderr, "open netlink socket: %s\n", strerror(errno)); + return -errno; } memset(&sa, 0, sizeof(sa)); sa.nl_family = AF_NETLINK; if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { - printf("bind to netlink: %s\n", strerror(errno)); - ret = -1; + fprintf(stderr, "bind netlink socket: %s\n", strerror(errno)); + ret = -errno; goto cleanup; } memset(&req, 0, sizeof(req)); @@ -506,15 +424,15 @@ static int get_arp_table(int rtm_family) msg.msg_iovlen = 1; ret = sendmsg(sock, &msg, 0); if (ret < 0) { - printf("send to netlink: %s\n", strerror(errno)); - ret = -1; + fprintf(stderr, "send to netlink: %s\n", strerror(errno)); + ret = -errno; goto cleanup; } memset(buf, 0, sizeof(buf)); nll = recv_msg(sa, sock); if (nll < 0) { - printf("recv from netlink: %s\n", strerror(nll)); - ret = -1; + fprintf(stderr, "recv from netlink: %s\n", strerror(nll)); + ret = nll; goto cleanup; } nh = (struct nlmsghdr *)buf; @@ -527,24 +445,17 @@ cleanup: /* Function to keep track and update changes in route and arp table * Give regular statistics of packets forwarded */ -static int monitor_route(void) +static void monitor_route(void *ctx) { - unsigned int nr_cpus = bpf_num_possible_cpus(); - const unsigned int nr_keys = 256; struct pollfd fds_route, fds_arp; - __u64 prev[nr_keys][nr_cpus]; struct sockaddr_nl la, lr; - __u64 values[nr_cpus]; + int sock, sock_arp, nll; struct nlmsghdr *nh; - int nll, ret = 0; - int interval = 5; - __u32 key; - int i; sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); if (sock < 0) { - printf("open netlink socket: %s\n", strerror(errno)); - return -1; + fprintf(stderr, "open netlink socket: %s\n", strerror(errno)); + return; } fcntl(sock, F_SETFL, O_NONBLOCK); @@ -552,17 +463,19 @@ static int monitor_route(void) lr.nl_family = AF_NETLINK; lr.nl_groups = RTMGRP_IPV6_ROUTE | RTMGRP_IPV4_ROUTE | RTMGRP_NOTIFY; if (bind(sock, (struct sockaddr *)&lr, sizeof(lr)) < 0) { - printf("bind to netlink: %s\n", strerror(errno)); - ret = -1; - goto cleanup; + fprintf(stderr, "bind netlink socket: %s\n", strerror(errno)); + close(sock); + return; } + fds_route.fd = sock; fds_route.events = POLL_IN; sock_arp = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); if (sock_arp < 0) { - printf("open netlink socket: %s\n", strerror(errno)); - return -1; + fprintf(stderr, "open netlink socket: %s\n", strerror(errno)); + close(sock); + return; } fcntl(sock_arp, F_SETFL, O_NONBLOCK); @@ -570,184 +483,196 @@ static int monitor_route(void) la.nl_family = AF_NETLINK; la.nl_groups = RTMGRP_NEIGH | RTMGRP_NOTIFY; if (bind(sock_arp, (struct sockaddr *)&la, sizeof(la)) < 0) { - printf("bind to netlink: %s\n", strerror(errno)); - ret = -1; + fprintf(stderr, "bind netlink socket: %s\n", strerror(errno)); goto cleanup; } + fds_arp.fd = sock_arp; fds_arp.events = POLL_IN; - memset(prev, 0, sizeof(prev)); - do { - signal(SIGINT, close_and_exit); - signal(SIGTERM, close_and_exit); - - sleep(interval); - for (key = 0; key < nr_keys; key++) { - __u64 sum = 0; - - assert(bpf_map_lookup_elem(rxcnt_map_fd, - &key, values) == 0); - for (i = 0; i < nr_cpus; i++) - sum += (values[i] - prev[key][i]); - if (sum) - printf("proto %u: %10llu pkt/s\n", - key, sum / interval); - 
memcpy(prev[key], values, sizeof(values)); + memset(buf, 0, sizeof(buf)); + if (poll(&fds_route, 1, 3) == POLL_IN) { + nll = recv_msg(lr, sock); + if (nll < 0) { + fprintf(stderr, "recv from netlink: %s\n", + strerror(nll)); + goto cleanup; } - memset(buf, 0, sizeof(buf)); - if (poll(&fds_route, 1, 3) == POLL_IN) { - nll = recv_msg(lr, sock); - if (nll < 0) { - printf("recv from netlink: %s\n", strerror(nll)); - ret = -1; - goto cleanup; - } + nh = (struct nlmsghdr *)buf; + read_route(nh, nll); + } - nh = (struct nlmsghdr *)buf; - printf("Routing table updated.\n"); - read_route(nh, nll); + memset(buf, 0, sizeof(buf)); + if (poll(&fds_arp, 1, 3) == POLL_IN) { + nll = recv_msg(la, sock_arp); + if (nll < 0) { + fprintf(stderr, "recv from netlink: %s\n", + strerror(nll)); + goto cleanup; } - memset(buf, 0, sizeof(buf)); - if (poll(&fds_arp, 1, 3) == POLL_IN) { - nll = recv_msg(la, sock_arp); - if (nll < 0) { - printf("recv from netlink: %s\n", strerror(nll)); - ret = -1; - goto cleanup; - } - nh = (struct nlmsghdr *)buf; - read_arp(nh, nll); - } + nh = (struct nlmsghdr *)buf; + read_arp(nh, nll); + } - } while (1); cleanup: + close(sock_arp); close(sock); - return ret; } -static void usage(const char *prog) +static void usage(char *argv[], const struct option *long_options, + const char *doc, int mask, bool error, + struct bpf_object *obj) { - fprintf(stderr, - "%s: %s [OPTS] interface name list\n\n" - "OPTS:\n" - " -S use skb-mode\n" - " -F force loading prog\n", - __func__, prog); + sample_usage(argv, long_options, doc, mask, error); } -int main(int ac, char **argv) +int main(int argc, char **argv) { - struct bpf_prog_info info = {}; - __u32 info_len = sizeof(info); - const char *optstr = "SF"; - struct bpf_program *prog; - struct bpf_object *obj; - char filename[256]; - char **ifname_list; - int prog_fd, opt; - int err, i = 1; - - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - - total_ifindex = ac - 1; - ifname_list = (argv + 1); - - while ((opt = getopt(ac, argv, optstr)) != -1) { + bool error = true, generic = false, force = false; + int opt, interval = 5, ret = EXIT_FAIL_BPF; + struct xdp_router_ipv4 *skel; + int i, total_ifindex = argc - 1; + char **ifname_list = argv + 1; + int longindex = 0; + + if (libbpf_set_strict_mode(LIBBPF_STRICT_ALL) < 0) { + fprintf(stderr, "Failed to set libbpf strict mode: %s\n", + strerror(errno)); + goto end; + } + + skel = xdp_router_ipv4__open(); + if (!skel) { + fprintf(stderr, "Failed to xdp_router_ipv4__open: %s\n", + strerror(errno)); + goto end; + } + + ret = sample_init_pre_load(skel); + if (ret < 0) { + fprintf(stderr, "Failed to sample_init_pre_load: %s\n", + strerror(-ret)); + ret = EXIT_FAIL_BPF; + goto end_destroy; + } + + ret = xdp_router_ipv4__load(skel); + if (ret < 0) { + fprintf(stderr, "Failed to xdp_router_ipv4__load: %s\n", + strerror(errno)); + goto end_destroy; + } + + ret = sample_init(skel, mask); + if (ret < 0) { + fprintf(stderr, "Failed to initialize sample: %s\n", strerror(-ret)); + ret = EXIT_FAIL; + goto end_destroy; + } + + while ((opt = getopt_long(argc, argv, "si:SFvh", + long_options, &longindex)) != -1) { switch (opt) { + case 's': + mask |= SAMPLE_REDIRECT_MAP_CNT; + total_ifindex--; + ifname_list++; + break; + case 'i': + interval = strtoul(optarg, NULL, 0); + total_ifindex -= 2; + ifname_list += 2; + break; case 'S': - flags |= XDP_FLAGS_SKB_MODE; + generic = true; total_ifindex--; ifname_list++; break; case 'F': - flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; + force = true; + total_ifindex--; + 
ifname_list++; + break; + case 'v': + sample_switch_mode(); total_ifindex--; ifname_list++; break; + case 'h': + error = false; default: - usage(basename(argv[0])); - return 1; + usage(argv, long_options, __doc__, mask, error, skel->obj); + goto end_destroy; } } - if (!(flags & XDP_FLAGS_SKB_MODE)) - flags |= XDP_FLAGS_DRV_MODE; - - if (optind == ac) { - usage(basename(argv[0])); - return 1; + ret = EXIT_FAIL_OPTION; + if (optind == argc) { + usage(argv, long_options, __doc__, mask, true, skel->obj); + goto end_destroy; } - obj = bpf_object__open_file(filename, NULL); - if (libbpf_get_error(obj)) - return 1; - - prog = bpf_object__next_program(obj, NULL); - bpf_program__set_type(prog, BPF_PROG_TYPE_XDP); - - printf("\n******************loading bpf file*********************\n"); - err = bpf_object__load(obj); - if (err) { - printf("bpf_object__load(): %s\n", strerror(errno)); - return 1; + lpm_map_fd = bpf_map__fd(skel->maps.lpm_map); + if (lpm_map_fd < 0) { + fprintf(stderr, "Failed loading lpm_map %s\n", + strerror(-lpm_map_fd)); + goto end_destroy; } - prog_fd = bpf_program__fd(prog); - - lpm_map_fd = bpf_object__find_map_fd_by_name(obj, "lpm_map"); - rxcnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rxcnt"); - arp_table_map_fd = bpf_object__find_map_fd_by_name(obj, "arp_table"); - exact_match_map_fd = bpf_object__find_map_fd_by_name(obj, - "exact_match"); - tx_port_map_fd = bpf_object__find_map_fd_by_name(obj, "tx_port"); - if (lpm_map_fd < 0 || rxcnt_map_fd < 0 || arp_table_map_fd < 0 || - exact_match_map_fd < 0 || tx_port_map_fd < 0) { - printf("bpf_object__find_map_fd_by_name failed\n"); - return 1; + arp_table_map_fd = bpf_map__fd(skel->maps.arp_table); + if (arp_table_map_fd < 0) { + fprintf(stderr, "Failed loading arp_table_map_fd %s\n", + strerror(-arp_table_map_fd)); + goto end_destroy; + } + exact_match_map_fd = bpf_map__fd(skel->maps.exact_match); + if (exact_match_map_fd < 0) { + fprintf(stderr, "Failed loading exact_match_map_fd %s\n", + strerror(-exact_match_map_fd)); + goto end_destroy; + } + tx_port_map_fd = bpf_map__fd(skel->maps.tx_port); + if (tx_port_map_fd < 0) { + fprintf(stderr, "Failed loading tx_port_map_fd %s\n", + strerror(-tx_port_map_fd)); + goto end_destroy; } - ifindex_list = (int *)calloc(total_ifindex, sizeof(int *)); + ret = EXIT_FAIL_XDP; for (i = 0; i < total_ifindex; i++) { - ifindex_list[i] = if_nametoindex(ifname_list[i]); - if (!ifindex_list[i]) { - printf("Couldn't translate interface name: %s", - strerror(errno)); - return 1; + int index = if_nametoindex(ifname_list[i]); + + if (!index) { + fprintf(stderr, "Interface %s not found %s\n", + ifname_list[i], strerror(-tx_port_map_fd)); + goto end_destroy; } + if (sample_install_xdp(skel->progs.xdp_router_ipv4_prog, + index, generic, force) < 0) + goto end_destroy; } - prog_id_list = (__u32 *)calloc(total_ifindex, sizeof(__u32 *)); - for (i = 0; i < total_ifindex; i++) { - if (bpf_xdp_attach(ifindex_list[i], prog_fd, flags, NULL) < 0) { - printf("link set xdp fd failed\n"); - int recovery_index = i; - for (i = 0; i < recovery_index; i++) - bpf_xdp_detach(ifindex_list[i], flags, NULL); + if (get_route_table(AF_INET) < 0) { + fprintf(stderr, "Failed reading routing table\n"); + goto end_destroy; + } - return 1; - } - err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); - if (err) { - printf("can't get prog info - %s\n", strerror(errno)); - return err; - } - prog_id_list[i] = info.id; - memset(&info, 0, sizeof(info)); - printf("Attached to %d\n", ifindex_list[i]); + if (get_arp_table(AF_INET) < 0) 
{ + fprintf(stderr, "Failed reading arptable\n"); + goto end_destroy; } - signal(SIGINT, int_exit); - signal(SIGTERM, int_exit); - printf("\n*******************ROUTE TABLE*************************\n"); - get_route_table(AF_INET); - printf("\n*******************ARP TABLE***************************\n"); - get_arp_table(AF_INET); - if (monitor_route() < 0) { - printf("Error in receiving route update"); - return 1; + ret = sample_run(interval, monitor_route, NULL); + if (ret < 0) { + fprintf(stderr, "Failed during sample run: %s\n", strerror(-ret)); + ret = EXIT_FAIL; + goto end_destroy; } + ret = EXIT_OK; - return 0; +end_destroy: + xdp_router_ipv4__destroy(skel); +end: + sample_exit(ret); } -- cgit From 1ce3a60e3c287479f15147ffc61c35b2e436a0d5 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Wed, 30 Mar 2022 16:26:36 +0100 Subject: libbpf: auto-resolve programs/libraries when necessary for uprobes bpf_program__attach_uprobe_opts() requires a binary_path argument specifying binary to instrument. Supporting simply specifying "libc.so.6" or "foo" should be possible too. Library search checks LD_LIBRARY_PATH, then /usr/lib64, /usr/lib. This allows users to run BPF programs prefixed with LD_LIBRARY_PATH=/path2/lib while still searching standard locations. Similarly for non .so files, we check PATH and /usr/bin, /usr/sbin. Path determination will be useful for auto-attach of BPF uprobe programs using SEC() definition. Signed-off-by: Alan Maguire Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1648654000-21758-2-git-send-email-alan.maguire@oracle.com --- tools/lib/bpf/libbpf.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 809fe209cdcc..ba6b61336bc7 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -10517,6 +10517,46 @@ static int perf_event_uprobe_open_legacy(const char *probe_name, bool retprobe, return pfd; } +/* Get full path to program/shared library. */ +static int resolve_full_path(const char *file, char *result, size_t result_sz) +{ + const char *search_paths[2]; + int i; + + if (strstr(file, ".so")) { + search_paths[0] = getenv("LD_LIBRARY_PATH"); + search_paths[1] = "/usr/lib64:/usr/lib"; + } else { + search_paths[0] = getenv("PATH"); + search_paths[1] = "/usr/bin:/usr/sbin"; + } + + for (i = 0; i < ARRAY_SIZE(search_paths); i++) { + const char *s; + + if (!search_paths[i]) + continue; + for (s = search_paths[i]; s != NULL; s = strchr(s, ':')) { + char *next_path; + int seg_len; + + if (s[0] == ':') + s++; + next_path = strchr(s, ':'); + seg_len = next_path ? 
next_path - s : strlen(s); + if (!seg_len) + continue; + snprintf(result, result_sz, "%.*s/%s", seg_len, s, file); + /* ensure it is an executable file/link */ + if (access(result, R_OK | X_OK) < 0) + continue; + pr_debug("resolved '%s' to '%s'\n", file, result); + return 0; + } + } + return -ENOENT; +} + LIBBPF_API struct bpf_link * bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid, const char *binary_path, size_t func_offset, @@ -10524,6 +10564,7 @@ bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid, { DECLARE_LIBBPF_OPTS(bpf_perf_event_opts, pe_opts); char errmsg[STRERR_BUFSIZE], *legacy_probe = NULL; + char full_binary_path[PATH_MAX]; struct bpf_link *link; size_t ref_ctr_off; int pfd, err; @@ -10536,12 +10577,23 @@ bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid, ref_ctr_off = OPTS_GET(opts, ref_ctr_offset, 0); pe_opts.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0); + if (binary_path && !strchr(binary_path, '/')) { + err = resolve_full_path(binary_path, full_binary_path, + sizeof(full_binary_path)); + if (err) { + pr_warn("prog '%s': failed to resolve full path for '%s'\n", + prog->name, binary_path); + return libbpf_err_ptr(err); + } + binary_path = full_binary_path; + } + legacy = determine_uprobe_perf_type() < 0; if (!legacy) { pfd = perf_event_open_probe(true /* uprobe */, retprobe, binary_path, func_offset, pid, ref_ctr_off); } else { - char probe_name[512]; + char probe_name[PATH_MAX + 64]; if (ref_ctr_off) return libbpf_err_ptr(-EINVAL); -- cgit From 433966e3ae04165811b116af492a684bad7a158c Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Wed, 30 Mar 2022 16:26:37 +0100 Subject: libbpf: Support function name-based attach uprobes kprobe attach is name-based, using lookups of kallsyms to translate a function name to an address. Currently uprobe attach is done via an offset value as described in [1]. Extend uprobe opts for attach to include a function name which can then be converted into a uprobe-friendly offset. The calculation is done in several steps: 1. First, determine the symbol address using libelf; this gives us the offset as reported by objdump 2. If the function is a shared library function - and the binary provided is a shared library - no further work is required; the address found is the required address 3. Finally, if the function is local, subtract the base address associated with the object, retrieved from ELF program headers. The resultant value is then added to the func_offset value passed in to specify the uprobe attach address. So specifying a func_offset of 0 along with a function name "printf" will attach to printf entry. The modes of operation supported are then 1. to attach to a local function in a binary; function "foo1" in "/usr/bin/foo" 2. to attach to a shared library function in a shared library - function "malloc" in libc.
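As a usage sketch (mirroring the selftests added later in this series, with prog standing for the uprobe program to attach), attaching to malloc in libc by name could look like:

    DECLARE_LIBBPF_OPTS(bpf_uprobe_opts, uprobe_opts);
    struct bpf_link *link;

    uprobe_opts.func_name = "malloc"; /* resolved to an offset via ELF symtab/dynsym */
    uprobe_opts.retprobe = false;
    link = bpf_program__attach_uprobe_opts(prog, -1 /* any pid */,
                                           "libc.so.6", 0 /* func_offset */,
                                           &uprobe_opts);

The "libc.so.6" argument is expanded to a full path by the resolution logic added in the previous patch.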
[1] https://www.kernel.org/doc/html/latest/trace/uprobetracer.html Signed-off-by: Alan Maguire Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1648654000-21758-3-git-send-email-alan.maguire@oracle.com --- tools/lib/bpf/libbpf.c | 204 +++++++++++++++++++++++++++++++++++++++++++++++++ tools/lib/bpf/libbpf.h | 10 ++- 2 files changed, 213 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index ba6b61336bc7..f6e5a0217841 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -10517,6 +10517,195 @@ static int perf_event_uprobe_open_legacy(const char *probe_name, bool retprobe, return pfd; } +/* uprobes deal in relative offsets; subtract the base address associated with + * the mapped binary. See Documentation/trace/uprobetracer.rst for more + * details. + */ +static long elf_find_relative_offset(const char *filename, Elf *elf, long addr) +{ + size_t n; + int i; + + if (elf_getphdrnum(elf, &n)) { + pr_warn("elf: failed to find program headers for '%s': %s\n", filename, + elf_errmsg(-1)); + return -ENOENT; + } + + for (i = 0; i < n; i++) { + int seg_start, seg_end, seg_offset; + GElf_Phdr phdr; + + if (!gelf_getphdr(elf, i, &phdr)) { + pr_warn("elf: failed to get program header %d from '%s': %s\n", i, filename, + elf_errmsg(-1)); + return -ENOENT; + } + if (phdr.p_type != PT_LOAD || !(phdr.p_flags & PF_X)) + continue; + + seg_start = phdr.p_vaddr; + seg_end = seg_start + phdr.p_memsz; + seg_offset = phdr.p_offset; + if (addr >= seg_start && addr < seg_end) + return addr - seg_start + seg_offset; + } + pr_warn("elf: failed to find prog header containing 0x%lx in '%s'\n", addr, filename); + return -ENOENT; +} + +/* Return next ELF section of sh_type after scn, or first of that type if scn is NULL. */ +static Elf_Scn *elf_find_next_scn_by_type(Elf *elf, int sh_type, Elf_Scn *scn) +{ + while ((scn = elf_nextscn(elf, scn)) != NULL) { + GElf_Shdr sh; + + if (!gelf_getshdr(scn, &sh)) + continue; + if (sh.sh_type == sh_type) + return scn; + } + return NULL; +} + +/* Find offset of function name in object specified by path. "name" matches + * symbol name or name@@LIB for library functions. + */ +static long elf_find_func_offset(const char *binary_path, const char *name) +{ + int fd, i, sh_types[2] = { SHT_DYNSYM, SHT_SYMTAB }; + bool is_shared_lib, is_name_qualified; + char errmsg[STRERR_BUFSIZE]; + long ret = -ENOENT; + size_t name_len; + GElf_Ehdr ehdr; + Elf *elf; + + fd = open(binary_path, O_RDONLY | O_CLOEXEC); + if (fd < 0) { + ret = -errno; + pr_warn("failed to open %s: %s\n", binary_path, + libbpf_strerror_r(ret, errmsg, sizeof(errmsg))); + return ret; + } + elf = elf_begin(fd, ELF_C_READ_MMAP, NULL); + if (!elf) { + pr_warn("elf: could not read elf from %s: %s\n", binary_path, elf_errmsg(-1)); + close(fd); + return -LIBBPF_ERRNO__FORMAT; + } + if (!gelf_getehdr(elf, &ehdr)) { + pr_warn("elf: failed to get ehdr from %s: %s\n", binary_path, elf_errmsg(-1)); + ret = -LIBBPF_ERRNO__FORMAT; + goto out; + } + /* for shared lib case, we do not need to calculate relative offset */ + is_shared_lib = ehdr.e_type == ET_DYN; + + name_len = strlen(name); + /* Does name specify "@@LIB"? */ + is_name_qualified = strstr(name, "@@") != NULL; + + /* Search SHT_DYNSYM, SHT_SYMTAB for symbol. This search order is used because if + * a binary is stripped, it may only have SHT_DYNSYM, and a fully-statically + * linked binary may not have SHT_DYMSYM, so absence of a section should not be + * reported as a warning/error. 
+ */ + for (i = 0; i < ARRAY_SIZE(sh_types); i++) { + size_t nr_syms, strtabidx, idx; + Elf_Data *symbols = NULL; + Elf_Scn *scn = NULL; + int last_bind = -1; + const char *sname; + GElf_Shdr sh; + + scn = elf_find_next_scn_by_type(elf, sh_types[i], NULL); + if (!scn) { + pr_debug("elf: failed to find symbol table ELF sections in '%s'\n", + binary_path); + continue; + } + if (!gelf_getshdr(scn, &sh)) + continue; + strtabidx = sh.sh_link; + symbols = elf_getdata(scn, 0); + if (!symbols) { + pr_warn("elf: failed to get symbols for symtab section in '%s': %s\n", + binary_path, elf_errmsg(-1)); + ret = -LIBBPF_ERRNO__FORMAT; + goto out; + } + nr_syms = symbols->d_size / sh.sh_entsize; + + for (idx = 0; idx < nr_syms; idx++) { + int curr_bind; + GElf_Sym sym; + + if (!gelf_getsym(symbols, idx, &sym)) + continue; + + if (GELF_ST_TYPE(sym.st_info) != STT_FUNC) + continue; + + sname = elf_strptr(elf, strtabidx, sym.st_name); + if (!sname) + continue; + + curr_bind = GELF_ST_BIND(sym.st_info); + + /* User can specify func, func@@LIB or func@@LIB_VERSION. */ + if (strncmp(sname, name, name_len) != 0) + continue; + /* ...but we don't want a search for "foo" to match 'foo2" also, so any + * additional characters in sname should be of the form "@@LIB". + */ + if (!is_name_qualified && sname[name_len] != '\0' && sname[name_len] != '@') + continue; + + if (ret >= 0) { + /* handle multiple matches */ + if (last_bind != STB_WEAK && curr_bind != STB_WEAK) { + /* Only accept one non-weak bind. */ + pr_warn("elf: ambiguous match for '%s', '%s' in '%s'\n", + sname, name, binary_path); + ret = -LIBBPF_ERRNO__FORMAT; + goto out; + } else if (curr_bind == STB_WEAK) { + /* already have a non-weak bind, and + * this is a weak bind, so ignore. + */ + continue; + } + } + ret = sym.st_value; + last_bind = curr_bind; + } + /* For binaries that are not shared libraries, we need relative offset */ + if (ret > 0 && !is_shared_lib) + ret = elf_find_relative_offset(binary_path, elf, ret); + if (ret > 0) + break; + } + + if (ret > 0) { + pr_debug("elf: symbol address match for '%s' in '%s': 0x%lx\n", name, binary_path, + ret); + } else { + if (ret == 0) { + pr_warn("elf: '%s' is 0 in symtab for '%s': %s\n", name, binary_path, + is_shared_lib ? "should not be 0 in a shared library" : + "try using shared library path instead"); + ret = -ENOENT; + } else { + pr_warn("elf: failed to find symbol '%s' in '%s'\n", name, binary_path); + } + } +out: + elf_end(elf); + close(fd); + return ret; +} + /* Get full path to program/shared library. 
*/ static int resolve_full_path(const char *file, char *result, size_t result_sz) { @@ -10569,6 +10758,7 @@ bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid, size_t ref_ctr_off; int pfd, err; bool retprobe, legacy; + const char *func_name; if (!OPTS_VALID(opts, bpf_uprobe_opts)) return libbpf_err_ptr(-EINVAL); @@ -10587,6 +10777,20 @@ bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid, } binary_path = full_binary_path; } + func_name = OPTS_GET(opts, func_name, NULL); + if (func_name) { + long sym_off; + + if (!binary_path) { + pr_warn("prog '%s': name-based attach requires binary_path\n", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + sym_off = elf_find_func_offset(binary_path, func_name); + if (sym_off < 0) + return libbpf_err_ptr(sym_off); + func_offset += sym_off; + } legacy = determine_uprobe_perf_type() < 0; if (!legacy) { diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 05dde85e19a6..28cd2062d0df 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -459,9 +459,17 @@ struct bpf_uprobe_opts { __u64 bpf_cookie; /* uprobe is return probe, invoked at function return time */ bool retprobe; + /* Function name to attach to. Could be an unqualified ("abc") or library-qualified + * "abc@LIBXYZ" name. To specify function entry, func_name should be set while + * func_offset argument to bpf_prog__attach_uprobe_opts() should be 0. To trace an + * offset within a function, specify func_name and use func_offset argument to specify + * offset within the function. Shared library functions must specify the shared library + * binary_path. + */ + const char *func_name; size_t :0; }; -#define bpf_uprobe_opts__last_field retprobe +#define bpf_uprobe_opts__last_field func_name /** * @brief **bpf_program__attach_uprobe()** attaches a BPF program -- cgit From 39f8dc43b7a05ab0e352655a14a9d613c2308b92 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Wed, 30 Mar 2022 16:26:38 +0100 Subject: libbpf: Add auto-attach for uprobes based on section name Now that u[ret]probes can use name-based specification, it makes sense to add support for auto-attach based on SEC() definition. The format proposed is SEC("u[ret]probe/binary:[raw_offset|[function_name[+offset]]") For example, to trace malloc() in libc: SEC("uprobe/libc.so.6:malloc") ...or to trace function foo2 in /usr/bin/foo: SEC("uprobe//usr/bin/foo:foo2") Auto-attach is done for all tasks (pid -1). prog can be an absolute path or simply a program/library name; in the latter case, we use PATH/LD_LIBRARY_PATH to resolve the full path, falling back to standard locations (/usr/bin:/usr/sbin or /usr/lib64:/usr/lib) if the file is not found via environment-variable specified locations. 
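A minimal sketch of a program using the new format (the handler name here is illustrative; the SEC() spec matches the selftests added later in this series):

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_tracing.h>

    /* auto-attaches to malloc() entry in libc, for all tasks (pid -1) */
    SEC("uprobe/libc.so.6:malloc")
    int handle_malloc_entry(struct pt_regs *ctx)
    {
        unsigned long size = PT_REGS_PARM1(ctx); /* first arg: requested size */

        bpf_printk("malloc(%lu)", size);
        return 0;
    }

    char _license[] SEC("license") = "GPL";

With this, bpf_program__attach() (or skeleton auto-attach) needs no extra options; the binary:function spec in SEC() drives attachment.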
Signed-off-by: Alan Maguire Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1648654000-21758-4-git-send-email-alan.maguire@oracle.com --- tools/lib/bpf/libbpf.c | 74 +++++++++++++++++++++- .../testing/selftests/bpf/progs/test_bpf_cookie.c | 4 +- .../selftests/bpf/progs/test_task_pt_regs.c | 2 +- tools/testing/selftests/bpf/progs/trigger_bench.c | 2 +- 4 files changed, 76 insertions(+), 6 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index f6e5a0217841..6d2be53e4ba9 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -8630,6 +8630,7 @@ int bpf_program__set_log_buf(struct bpf_program *prog, char *log_buf, size_t log } static int attach_kprobe(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_uprobe(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_raw_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_trace(const struct bpf_program *prog, long cookie, struct bpf_link **link); @@ -8642,9 +8643,9 @@ static const struct bpf_sec_def section_defs[] = { SEC_DEF("sk_reuseport/migrate", SK_REUSEPORT, BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, SEC_ATTACHABLE | SEC_SLOPPY_PFX), SEC_DEF("sk_reuseport", SK_REUSEPORT, BPF_SK_REUSEPORT_SELECT, SEC_ATTACHABLE | SEC_SLOPPY_PFX), SEC_DEF("kprobe/", KPROBE, 0, SEC_NONE, attach_kprobe), - SEC_DEF("uprobe/", KPROBE, 0, SEC_NONE), + SEC_DEF("uprobe+", KPROBE, 0, SEC_NONE, attach_uprobe), SEC_DEF("kretprobe/", KPROBE, 0, SEC_NONE, attach_kprobe), - SEC_DEF("uretprobe/", KPROBE, 0, SEC_NONE), + SEC_DEF("uretprobe+", KPROBE, 0, SEC_NONE, attach_uprobe), SEC_DEF("kprobe.multi/", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi), SEC_DEF("kretprobe.multi/", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi), SEC_DEF("tc", SCHED_CLS, 0, SEC_NONE), @@ -10845,6 +10846,75 @@ err_out: } +/* Format of u[ret]probe section definition supporting auto-attach: + * u[ret]probe/binary:function[+offset] + * + * binary can be an absolute/relative path or a filename; the latter is resolved to a + * full binary path via bpf_program__attach_uprobe_opts. + * + * Specifying uprobe+ ensures we carry out strict matching; either "uprobe" must be + * specified (and auto-attach is not possible) or the above format is specified for + * auto-attach. + */ +static int attach_uprobe(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + DECLARE_LIBBPF_OPTS(bpf_uprobe_opts, opts); + char *func, *probe_name, *func_end; + char *func_name, binary_path[512]; + unsigned long long raw_offset; + size_t offset = 0; + int n; + + *link = NULL; + + opts.retprobe = str_has_pfx(prog->sec_name, "uretprobe"); + if (opts.retprobe) + probe_name = prog->sec_name + sizeof("uretprobe") - 1; + else + probe_name = prog->sec_name + sizeof("uprobe") - 1; + if (probe_name[0] == '/') + probe_name++; + + /* handle SEC("u[ret]probe") - format is valid, but auto-attach is impossible. 
*/ + if (strlen(probe_name) == 0) + return 0; + + snprintf(binary_path, sizeof(binary_path), "%s", probe_name); + /* ':' should be prior to function+offset */ + func_name = strrchr(binary_path, ':'); + if (!func_name) { + pr_warn("section '%s' missing ':function[+offset]' specification\n", + prog->sec_name); + return -EINVAL; + } + func_name[0] = '\0'; + func_name++; + n = sscanf(func_name, "%m[a-zA-Z0-9_.]+%li", &func, &offset); + if (n < 1) { + pr_warn("uprobe name '%s' is invalid\n", func_name); + return -EINVAL; + } + if (opts.retprobe && offset != 0) { + free(func); + pr_warn("uretprobes do not support offset specification\n"); + return -EINVAL; + } + + /* Is func a raw address? */ + errno = 0; + raw_offset = strtoull(func, &func_end, 0); + if (!errno && !*func_end) { + free(func); + func = NULL; + offset = (size_t)raw_offset; + } + opts.func_name = func; + + *link = bpf_program__attach_uprobe_opts(prog, -1, binary_path, offset, &opts); + free(func); + return libbpf_get_error(*link); +} + struct bpf_link *bpf_program__attach_uprobe(const struct bpf_program *prog, bool retprobe, pid_t pid, const char *binary_path, diff --git a/tools/testing/selftests/bpf/progs/test_bpf_cookie.c b/tools/testing/selftests/bpf/progs/test_bpf_cookie.c index 2d3a7710e2ce..0e2222968918 100644 --- a/tools/testing/selftests/bpf/progs/test_bpf_cookie.c +++ b/tools/testing/selftests/bpf/progs/test_bpf_cookie.c @@ -37,14 +37,14 @@ int handle_kretprobe(struct pt_regs *ctx) return 0; } -SEC("uprobe/trigger_func") +SEC("uprobe") int handle_uprobe(struct pt_regs *ctx) { update(ctx, &uprobe_res); return 0; } -SEC("uretprobe/trigger_func") +SEC("uretprobe") int handle_uretprobe(struct pt_regs *ctx) { update(ctx, &uretprobe_res); diff --git a/tools/testing/selftests/bpf/progs/test_task_pt_regs.c b/tools/testing/selftests/bpf/progs/test_task_pt_regs.c index e6cb09259408..1926facba122 100644 --- a/tools/testing/selftests/bpf/progs/test_task_pt_regs.c +++ b/tools/testing/selftests/bpf/progs/test_task_pt_regs.c @@ -14,7 +14,7 @@ char current_regs[PT_REGS_SIZE] = {}; char ctx_regs[PT_REGS_SIZE] = {}; int uprobe_res = 0; -SEC("uprobe/trigger_func") +SEC("uprobe") int handle_uprobe(struct pt_regs *ctx) { struct task_struct *current; diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c index 2ab049b54d6c..694e7cec1823 100644 --- a/tools/testing/selftests/bpf/progs/trigger_bench.c +++ b/tools/testing/selftests/bpf/progs/trigger_bench.c @@ -54,7 +54,7 @@ int bench_trigger_fmodret(void *ctx) return -22; } -SEC("uprobe/self/uprobe_target") +SEC("uprobe") int bench_trigger_uprobe(void *ctx) { __sync_add_and_fetch(&hits, 1); -- cgit From ba7499bc9d52f7ea93301a48c05caa370c68b213 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Wed, 30 Mar 2022 16:26:39 +0100 Subject: selftests/bpf: Add tests for u[ret]probe attach by name add tests that verify attaching by name for 1. local functions in a program 2. library functions in a shared object ...succeed for uprobe and uretprobes using new "func_name" option for bpf_program__attach_uprobe_opts(). Also verify auto-attach works where uprobe, path to binary and function name are specified, but fails with -EOPNOTSUPP with a SEC name that does not specify binary path/function. 
Signed-off-by: Alan Maguire Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1648654000-21758-5-git-send-email-alan.maguire@oracle.com --- .../selftests/bpf/prog_tests/attach_probe.c | 85 ++++++++++++++++++---- .../selftests/bpf/progs/test_attach_probe.c | 41 ++++++++++- 2 files changed, 109 insertions(+), 17 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/attach_probe.c b/tools/testing/selftests/bpf/prog_tests/attach_probe.c index d48f6e533e1e..c0c6d410751d 100644 --- a/tools/testing/selftests/bpf/prog_tests/attach_probe.c +++ b/tools/testing/selftests/bpf/prog_tests/attach_probe.c @@ -11,15 +11,22 @@ static void trigger_func(void) asm volatile (""); } +/* attach point for byname uprobe */ +static void trigger_func2(void) +{ + asm volatile (""); +} + void test_attach_probe(void) { DECLARE_LIBBPF_OPTS(bpf_uprobe_opts, uprobe_opts); - int duration = 0; struct bpf_link *kprobe_link, *kretprobe_link; struct bpf_link *uprobe_link, *uretprobe_link; struct test_attach_probe* skel; ssize_t uprobe_offset, ref_ctr_offset; + struct bpf_link *uprobe_err_link; bool legacy; + char *mem; /* Check if new-style kprobe/uprobe API is supported. * Kernels that support new FD-based kprobe and uprobe BPF attachment @@ -43,9 +50,9 @@ void test_attach_probe(void) return; skel = test_attach_probe__open_and_load(); - if (CHECK(!skel, "skel_open", "failed to open skeleton\n")) + if (!ASSERT_OK_PTR(skel, "skel_open")) return; - if (CHECK(!skel->bss, "check_bss", ".bss wasn't mmap()-ed\n")) + if (!ASSERT_OK_PTR(skel->bss, "check_bss")) goto cleanup; kprobe_link = bpf_program__attach_kprobe(skel->progs.handle_kprobe, @@ -90,25 +97,73 @@ void test_attach_probe(void) goto cleanup; skel->links.handle_uretprobe = uretprobe_link; - /* trigger & validate kprobe && kretprobe */ - usleep(1); + /* verify auto-attach fails for old-style uprobe definition */ + uprobe_err_link = bpf_program__attach(skel->progs.handle_uprobe_byname); + if (!ASSERT_EQ(libbpf_get_error(uprobe_err_link), -EOPNOTSUPP, + "auto-attach should fail for old-style name")) + goto cleanup; + + uprobe_opts.func_name = "trigger_func2"; + uprobe_opts.retprobe = false; + uprobe_opts.ref_ctr_offset = 0; + skel->links.handle_uprobe_byname = + bpf_program__attach_uprobe_opts(skel->progs.handle_uprobe_byname, + 0 /* this pid */, + "/proc/self/exe", + 0, &uprobe_opts); + if (!ASSERT_OK_PTR(skel->links.handle_uprobe_byname, "attach_uprobe_byname")) + goto cleanup; + + /* verify auto-attach works */ + skel->links.handle_uretprobe_byname = + bpf_program__attach(skel->progs.handle_uretprobe_byname); + if (!ASSERT_OK_PTR(skel->links.handle_uretprobe_byname, "attach_uretprobe_byname")) + goto cleanup; - if (CHECK(skel->bss->kprobe_res != 1, "check_kprobe_res", - "wrong kprobe res: %d\n", skel->bss->kprobe_res)) + /* test attach by name for a library function, using the library + * as the binary argument. libc.so.6 will be resolved via dlopen()/dlinfo(). 
+ */ + uprobe_opts.func_name = "malloc"; + uprobe_opts.retprobe = false; + skel->links.handle_uprobe_byname2 = + bpf_program__attach_uprobe_opts(skel->progs.handle_uprobe_byname2, + 0 /* this pid */, + "libc.so.6", + 0, &uprobe_opts); + if (!ASSERT_OK_PTR(skel->links.handle_uprobe_byname2, "attach_uprobe_byname2")) goto cleanup; - if (CHECK(skel->bss->kretprobe_res != 2, "check_kretprobe_res", - "wrong kretprobe res: %d\n", skel->bss->kretprobe_res)) + + uprobe_opts.func_name = "free"; + uprobe_opts.retprobe = true; + skel->links.handle_uretprobe_byname2 = + bpf_program__attach_uprobe_opts(skel->progs.handle_uretprobe_byname2, + -1 /* any pid */, + "libc.so.6", + 0, &uprobe_opts); + if (!ASSERT_OK_PTR(skel->links.handle_uretprobe_byname2, "attach_uretprobe_byname2")) goto cleanup; + /* trigger & validate kprobe && kretprobe */ + usleep(1); + + /* trigger & validate shared library u[ret]probes attached by name */ + mem = malloc(1); + free(mem); + /* trigger & validate uprobe & uretprobe */ trigger_func(); - if (CHECK(skel->bss->uprobe_res != 3, "check_uprobe_res", - "wrong uprobe res: %d\n", skel->bss->uprobe_res)) - goto cleanup; - if (CHECK(skel->bss->uretprobe_res != 4, "check_uretprobe_res", - "wrong uretprobe res: %d\n", skel->bss->uretprobe_res)) - goto cleanup; + /* trigger & validate uprobe attached by name */ + trigger_func2(); + + ASSERT_EQ(skel->bss->kprobe_res, 1, "check_kprobe_res"); + ASSERT_EQ(skel->bss->kretprobe_res, 2, "check_kretprobe_res"); + ASSERT_EQ(skel->bss->uprobe_res, 3, "check_uprobe_res"); + ASSERT_EQ(skel->bss->uretprobe_res, 4, "check_uretprobe_res"); + ASSERT_EQ(skel->bss->uprobe_byname_res, 5, "check_uprobe_byname_res"); + ASSERT_EQ(skel->bss->uretprobe_byname_res, 6, "check_uretprobe_byname_res"); + ASSERT_EQ(skel->bss->uprobe_byname2_res, 7, "check_uprobe_byname2_res"); + ASSERT_EQ(skel->bss->uretprobe_byname2_res, 8, "check_uretprobe_byname2_res"); cleanup: test_attach_probe__destroy(skel); diff --git a/tools/testing/selftests/bpf/progs/test_attach_probe.c b/tools/testing/selftests/bpf/progs/test_attach_probe.c index 8056a4c6d918..af994d16bb10 100644 --- a/tools/testing/selftests/bpf/progs/test_attach_probe.c +++ b/tools/testing/selftests/bpf/progs/test_attach_probe.c @@ -10,6 +10,10 @@ int kprobe_res = 0; int kretprobe_res = 0; int uprobe_res = 0; int uretprobe_res = 0; +int uprobe_byname_res = 0; +int uretprobe_byname_res = 0; +int uprobe_byname2_res = 0; +int uretprobe_byname2_res = 0; SEC("kprobe/sys_nanosleep") int handle_kprobe(struct pt_regs *ctx) @@ -25,18 +29,51 @@ int BPF_KRETPROBE(handle_kretprobe) return 0; } -SEC("uprobe/trigger_func") +SEC("uprobe") int handle_uprobe(struct pt_regs *ctx) { uprobe_res = 3; return 0; } -SEC("uretprobe/trigger_func") +SEC("uretprobe") int handle_uretprobe(struct pt_regs *ctx) { uretprobe_res = 4; return 0; } +SEC("uprobe") +int handle_uprobe_byname(struct pt_regs *ctx) +{ + uprobe_byname_res = 5; + return 0; +} + +/* use auto-attach format for section definition. 
*/ +SEC("uretprobe//proc/self/exe:trigger_func2") +int handle_uretprobe_byname(struct pt_regs *ctx) +{ + uretprobe_byname_res = 6; + return 0; +} + +SEC("uprobe") +int handle_uprobe_byname2(struct pt_regs *ctx) +{ + unsigned int size = PT_REGS_PARM1(ctx); + + /* verify malloc size */ + if (size == 1) + uprobe_byname2_res = 7; + return 0; +} + +SEC("uretprobe") +int handle_uretprobe_byname2(struct pt_regs *ctx) +{ + uretprobe_byname2_res = 8; + return 0; +} + char _license[] SEC("license") = "GPL"; -- cgit From 579c3196b21800538e0cfd512a05619ee80fb19a Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Wed, 30 Mar 2022 16:26:40 +0100 Subject: selftests/bpf: Add tests for uprobe auto-attach via skeleton tests that verify auto-attach works for function entry/return for local functions in program and library functions in a library. Signed-off-by: Alan Maguire Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1648654000-21758-6-git-send-email-alan.maguire@oracle.com --- .../selftests/bpf/prog_tests/uprobe_autoattach.c | 38 ++++++++++++++++ .../selftests/bpf/progs/test_uprobe_autoattach.c | 52 ++++++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/uprobe_autoattach.c create mode 100644 tools/testing/selftests/bpf/progs/test_uprobe_autoattach.c diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_autoattach.c b/tools/testing/selftests/bpf/prog_tests/uprobe_autoattach.c new file mode 100644 index 000000000000..03b15d632be4 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_autoattach.c @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022, Oracle and/or its affiliates. */ + +#include +#include "test_uprobe_autoattach.skel.h" + +/* uprobe attach point */ +static void autoattach_trigger_func(void) +{ + asm volatile (""); +} + +void test_uprobe_autoattach(void) +{ + struct test_uprobe_autoattach *skel; + char *mem; + + skel = test_uprobe_autoattach__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + if (!ASSERT_OK(test_uprobe_autoattach__attach(skel), "skel_attach")) + goto cleanup; + + /* trigger & validate uprobe & uretprobe */ + autoattach_trigger_func(); + + /* trigger & validate shared library u[ret]probes attached by name */ + mem = malloc(1); + free(mem); + + ASSERT_EQ(skel->bss->uprobe_byname_res, 1, "check_uprobe_byname_res"); + ASSERT_EQ(skel->bss->uretprobe_byname_res, 2, "check_uretprobe_byname_res"); + ASSERT_EQ(skel->bss->uprobe_byname2_res, 3, "check_uprobe_byname2_res"); + ASSERT_EQ(skel->bss->uretprobe_byname2_res, 4, "check_uretprobe_byname2_res"); +cleanup: + test_uprobe_autoattach__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_uprobe_autoattach.c b/tools/testing/selftests/bpf/progs/test_uprobe_autoattach.c new file mode 100644 index 000000000000..b442fb5ef54b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_uprobe_autoattach.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022, Oracle and/or its affiliates. */ + +#include +#include +#include +#include + +int uprobe_byname_res = 0; +int uretprobe_byname_res = 0; +int uprobe_byname2_res = 0; +int uretprobe_byname2_res = 0; + +/* This program cannot auto-attach, but that should not stop other + * programs from attaching. 
+ */ +SEC("uprobe") +int handle_uprobe_noautoattach(struct pt_regs *ctx) +{ + return 0; +} + +SEC("uprobe//proc/self/exe:autoattach_trigger_func") +int handle_uprobe_byname(struct pt_regs *ctx) +{ + uprobe_byname_res = 1; + return 0; +} + +SEC("uretprobe//proc/self/exe:autoattach_trigger_func") +int handle_uretprobe_byname(struct pt_regs *ctx) +{ + uretprobe_byname_res = 2; + return 0; +} + + +SEC("uprobe/libc.so.6:malloc") +int handle_uprobe_byname2(struct pt_regs *ctx) +{ + uprobe_byname2_res = 3; + return 0; +} + +SEC("uretprobe/libc.so.6:free") +int handle_uretprobe_byname2(struct pt_regs *ctx) +{ + uretprobe_byname2_res = 4; + return 0; +} + +char _license[] SEC("license") = "GPL"; -- cgit From e93f39998d8f8ed456dfbb4ca68f9a159906cc6f Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Mon, 4 Apr 2022 08:53:20 +0800 Subject: libbpf: Don't return -EINVAL if hdr_len < offsetofend(core_relo_len) Since core relos is an optional part of the .BTF.ext ELF section, we should skip parsing it instead of returning -EINVAL if header size is less than offsetofend(struct btf_ext_header, core_relo_len). Signed-off-by: Yuntao Wang Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220404005320.1723055-1-ytcoode@gmail.com --- tools/lib/bpf/btf.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 1383e26c5d1f..d124e9e533f0 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -2826,10 +2826,8 @@ struct btf_ext *btf_ext__new(const __u8 *data, __u32 size) if (err) goto done; - if (btf_ext->hdr->hdr_len < offsetofend(struct btf_ext_header, core_relo_len)) { - err = -EINVAL; - goto done; - } + if (btf_ext->hdr->hdr_len < offsetofend(struct btf_ext_header, core_relo_len)) + goto done; /* skip core relos parsing */ err = btf_ext_setup_core_relos(btf_ext); if (err) -- cgit From 35f91d1fe106cd474ac0463157d012c675385e18 Mon Sep 17 00:00:00 2001 From: Song Chen Date: Sat, 2 Apr 2022 16:57:08 +0800 Subject: sample: bpf: syscall_tp_user: Print result of verify_map At the end of the test, we already print out prog : map ids <...> <...> Value is the number read from kernel through bpf map, further print out verify map: val:<...> will help users to understand the program runs successfully. Signed-off-by: Song Chen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1648889828-12417-1-git-send-email-chensong_2000@189.cn --- samples/bpf/syscall_tp_user.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/samples/bpf/syscall_tp_user.c b/samples/bpf/syscall_tp_user.c index a0ebf1833ed3..c55383068384 100644 --- a/samples/bpf/syscall_tp_user.c +++ b/samples/bpf/syscall_tp_user.c @@ -36,6 +36,9 @@ static void verify_map(int map_id) fprintf(stderr, "failed: map #%d returns value 0\n", map_id); return; } + + printf("verify map:%d val: %d\n", map_id, val); + val = 0; if (bpf_map_update_elem(map_id, &key, &val, BPF_ANY) != 0) { fprintf(stderr, "map_update failed: %s\n", strerror(errno)); -- cgit From fc843ccd8e4c084fa5d605a876db2d3a3c38be88 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Mon, 4 Apr 2022 13:54:51 +0200 Subject: samples: bpf: Fix linking xdp_router_ipv4 after migration Users of the xdp_sample_user infra should be explicitly linked with the standard math library (`-lm`). 
Otherwise, the following happens: /usr/bin/ld: xdp_sample_user.c:(.text+0x59fc): undefined reference to `ceil' /usr/bin/ld: xdp_sample_user.c:(.text+0x5a0d): undefined reference to `ceil' /usr/bin/ld: xdp_sample_user.c:(.text+0x5adc): undefined reference to `floor' /usr/bin/ld: xdp_sample_user.c:(.text+0x5b01): undefined reference to `ceil' /usr/bin/ld: xdp_sample_user.c:(.text+0x5c1e): undefined reference to `floor' /usr/bin/ld: xdp_sample_user.c:(.text+0x5c43): undefined reference to `ceil [...] That happened previously, so there's a block of linkage flags in the Makefile. xdp_router_ipv4 has been transferred to this infra quite recently, but hasn't been added to it. Fix. Fixes: 85bf1f51691c ("samples: bpf: Convert xdp_router_ipv4 to XDP samples helper") Signed-off-by: Alexander Lobakin Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220404115451.1116478-1-alexandr.lobakin@intel.com --- samples/bpf/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index b4fa0e69aa14..342a41a10356 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -219,6 +219,7 @@ TPROGLDLIBS_xdp_redirect += -lm TPROGLDLIBS_xdp_redirect_cpu += -lm TPROGLDLIBS_xdp_redirect_map += -lm TPROGLDLIBS_xdp_redirect_map_multi += -lm +TPROGLDLIBS_xdp_router_ipv4 += -lm TPROGLDLIBS_tracex4 += -lrt TPROGLDLIBS_trace_output += -lrt TPROGLDLIBS_map_perf_test += -lrt -- cgit From 4eeebce6ac4ad80ee8243bb847c98e0e55848d47 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Mon, 4 Apr 2022 15:09:44 +0100 Subject: selftests/bpf: Fix parsing of prog types in UAPI hdr for bpftool sync The script for checking that various lists of types in bpftool remain in sync with the UAPI BPF header uses a regex to parse enum bpf_prog_type. If this enum contains a set of values different from the list of program types in bpftool, it complains. This script should have reported the addition, some time ago, of the new BPF_PROG_TYPE_SYSCALL, which was not reported to bpftool's program types list. It failed to do so, because it failed to parse that new type from the enum. This is because the new value, in the BPF header, has an explicative comment on the same line, and the regex does not support that. Let's update the script to support parsing enum values when they have comments on the same line. 
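For illustration only (not part of the patch), the same "optional trailing comment"
matching idea can be sketched in C with POSIX regex; the sample enum lines are the
kind found in the UAPI header:

#include <regex.h>
#include <stdio.h>

int main(void)
{
	/* name, optional comma, then an optional trailing C comment */
	static const char *pat =
		"^[[:space:]]*(BPF_[[:alnum:]_]+),?([[:space:]]+/\\*.*\\*/)?$";
	const char *lines[] = {
		"	BPF_PROG_TYPE_SK_LOOKUP,",
		"	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */",
	};
	regex_t re;
	regmatch_t m[2];

	if (regcomp(&re, pat, REG_EXTENDED))
		return 1;
	for (int i = 0; i < 2; i++) {
		if (regexec(&re, lines[i], 2, m, 0) == 0)
			printf("matched: %.*s\n",
			       (int)(m[1].rm_eo - m[1].rm_so),
			       lines[i] + m[1].rm_so);
	}
	regfree(&re);
	return 0;
}

Both lines match, and group 1 still captures just the enumerator name, which is
exactly what the updated Python pattern achieves.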
Signed-off-by: Quentin Monnet Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220404140944.64744-1-quentin@isovalent.com --- tools/testing/selftests/bpf/test_bpftool_synctypes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_bpftool_synctypes.py b/tools/testing/selftests/bpf/test_bpftool_synctypes.py index 6bf21e47882a..c0e7acd698ed 100755 --- a/tools/testing/selftests/bpf/test_bpftool_synctypes.py +++ b/tools/testing/selftests/bpf/test_bpftool_synctypes.py @@ -180,7 +180,7 @@ class FileExtractor(object): @enum_name: name of the enum to parse """ start_marker = re.compile(f'enum {enum_name} {{\n') - pattern = re.compile('^\s*(BPF_\w+),?$') + pattern = re.compile('^\s*(BPF_\w+),?(\s+/\*.*\*/)?$') end_marker = re.compile('^};') parser = BlockParser(self.reader) parser.search_block(start_marker) -- cgit From 380341637ebb41f56031a0fd0779e85413a1da59 Mon Sep 17 00:00:00 2001 From: Milan Landaverde Date: Thu, 31 Mar 2022 11:45:53 -0400 Subject: bpftool: Add syscall prog type In addition to displaying the program type in bpftool prog show this enables us to be able to query bpf_prog_type_syscall availability through feature probe as well as see which helpers are available in those programs (such as bpf_sys_bpf and bpf_sys_close) Signed-off-by: Milan Landaverde Signed-off-by: Andrii Nakryiko Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20220331154555.422506-2-milan@mdaverde.com --- tools/bpf/bpftool/prog.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index bc4e05542c2b..8643b37d4e43 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -68,6 +68,7 @@ const char * const prog_type_name[] = { [BPF_PROG_TYPE_EXT] = "ext", [BPF_PROG_TYPE_LSM] = "lsm", [BPF_PROG_TYPE_SK_LOOKUP] = "sk_lookup", + [BPF_PROG_TYPE_SYSCALL] = "syscall", }; const size_t prog_type_name_size = ARRAY_SIZE(prog_type_name); -- cgit From fff3dfab17866f6ac5c5666839f6132b6c52f306 Mon Sep 17 00:00:00 2001 From: Milan Landaverde Date: Thu, 31 Mar 2022 11:45:54 -0400 Subject: bpftool: Add missing link types Will display the link type names in bpftool link show output Signed-off-by: Milan Landaverde Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220331154555.422506-3-milan@mdaverde.com --- tools/bpf/bpftool/link.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c index 97dec81950e5..8fb0116f9136 100644 --- a/tools/bpf/bpftool/link.c +++ b/tools/bpf/bpftool/link.c @@ -20,6 +20,9 @@ static const char * const link_type_name[] = { [BPF_LINK_TYPE_CGROUP] = "cgroup", [BPF_LINK_TYPE_ITER] = "iter", [BPF_LINK_TYPE_NETNS] = "netns", + [BPF_LINK_TYPE_XDP] = "xdp", + [BPF_LINK_TYPE_PERF_EVENT] = "perf_event", + [BPF_LINK_TYPE_KPROBE_MULTI] = "kprobe_multi", }; static struct hashmap *link_table; -- cgit From 7b53eaa656c34d5b521e245cbfc3aee2d7b89eac Mon Sep 17 00:00:00 2001 From: Milan Landaverde Date: Thu, 31 Mar 2022 11:45:55 -0400 Subject: bpftool: Handle libbpf_probe_prog_type errors Previously [1], we were using bpf_probe_prog_type which returned a bool, but the new libbpf_probe_bpf_prog_type can return a negative error code on failure. This change decides for bpftool to declare a program type is not available on probe failure. 
[1] https://lore.kernel.org/bpf/20220202225916.3313522-3-andrii@kernel.org/ Signed-off-by: Milan Landaverde Signed-off-by: Andrii Nakryiko Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20220331154555.422506-4-milan@mdaverde.com --- tools/bpf/bpftool/feature.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 290998c82de1..f041c4a6a1f2 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -567,7 +567,7 @@ probe_prog_type(enum bpf_prog_type prog_type, bool *supported_types, res = probe_prog_type_ifindex(prog_type, ifindex); } else { - res = libbpf_probe_bpf_prog_type(prog_type, NULL); + res = libbpf_probe_bpf_prog_type(prog_type, NULL) > 0; } #ifdef USE_LIBCAP -- cgit From d298761746d59ca169dfe68b4d0a983c3053573b Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Mon, 4 Apr 2022 16:21:01 +0200 Subject: selftests/bpf: Define SYS_NANOSLEEP_KPROBE_NAME for aarch64 attach_probe selftest fails on aarch64 with `failed to create kprobe 'sys_nanosleep+0x0' perf event: No such file or directory`. This is because, like on several other architectures, nanosleep has a prefix. Signed-off-by: Ilya Leoshkevich Signed-off-by: Andrii Nakryiko Tested-by: Alan Maguire Link: https://lore.kernel.org/bpf/20220404142101.27900-1-iii@linux.ibm.com --- tools/testing/selftests/bpf/test_progs.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index 93c1ff705533..eec4c7385b14 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -332,6 +332,8 @@ int trigger_module_test_write(int write_sz); #define SYS_NANOSLEEP_KPROBE_NAME "__x64_sys_nanosleep" #elif defined(__s390x__) #define SYS_NANOSLEEP_KPROBE_NAME "__s390x_sys_nanosleep" +#elif defined(__aarch64__) +#define SYS_NANOSLEEP_KPROBE_NAME "__arm64_sys_nanosleep" #else #define SYS_NANOSLEEP_KPROBE_NAME "sys_nanosleep" #endif -- cgit From 568189310c2096e204674edd2f0da036cd50676a Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 5 Apr 2022 00:50:20 +0200 Subject: libbpf: Support Debian in resolve_full_path() attach_probe selftest fails on Debian-based distros with `failed to resolve full path for 'libc.so.6'`. The reason is that these distros embraced multiarch to the point where even for the "main" architecture they store libc in /lib/. This is configured in /etc/ld.so.conf and in theory it's possible to replicate the loader's parsing and processing logic in libbpf, however a much simpler solution is to just enumerate the known library paths. Signed-off-by: Ilya Leoshkevich Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220404225020.51029-1-iii@linux.ibm.com --- tools/lib/bpf/libbpf.c | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 6d2be53e4ba9..91ce94b61f7f 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -10707,15 +10707,53 @@ out: return ret; } +static const char *arch_specific_lib_paths(void) +{ + /* + * Based on https://packages.debian.org/sid/libc6. + * + * Assume that the traced program is built for the same architecture + * as libbpf, which should cover the vast majority of cases. 
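+ * If that assumption doesn't hold for a particular setup, LD_LIBRARY_PATH
+ * (consulted first by resolve_full_path() below) can still be used to
+ * point libbpf at the right library directory.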
+ */ +#if defined(__x86_64__) + return "/lib/x86_64-linux-gnu"; +#elif defined(__i386__) + return "/lib/i386-linux-gnu"; +#elif defined(__s390x__) + return "/lib/s390x-linux-gnu"; +#elif defined(__s390__) + return "/lib/s390-linux-gnu"; +#elif defined(__arm__) && defined(__SOFTFP__) + return "/lib/arm-linux-gnueabi"; +#elif defined(__arm__) && !defined(__SOFTFP__) + return "/lib/arm-linux-gnueabihf"; +#elif defined(__aarch64__) + return "/lib/aarch64-linux-gnu"; +#elif defined(__mips__) && defined(__MIPSEL__) && _MIPS_SZLONG == 64 + return "/lib/mips64el-linux-gnuabi64"; +#elif defined(__mips__) && defined(__MIPSEL__) && _MIPS_SZLONG == 32 + return "/lib/mipsel-linux-gnu"; +#elif defined(__powerpc64__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return "/lib/powerpc64le-linux-gnu"; +#elif defined(__sparc__) && defined(__arch64__) + return "/lib/sparc64-linux-gnu"; +#elif defined(__riscv) && __riscv_xlen == 64 + return "/lib/riscv64-linux-gnu"; +#else + return NULL; +#endif +} + /* Get full path to program/shared library. */ static int resolve_full_path(const char *file, char *result, size_t result_sz) { - const char *search_paths[2]; + const char *search_paths[3] = {}; int i; if (strstr(file, ".so")) { search_paths[0] = getenv("LD_LIBRARY_PATH"); search_paths[1] = "/usr/lib64:/usr/lib"; + search_paths[2] = arch_specific_lib_paths(); } else { search_paths[0] = getenv("PATH"); search_paths[1] = "/usr/bin:/usr/sbin"; -- cgit From d72e2968fb2583460062e15d53760d44e2d09ae6 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 4 Apr 2022 16:41:56 -0700 Subject: libbpf: Add BPF-side of USDT support Add BPF-side implementation of libbpf-provided USDT support. This consists of single header library, usdt.bpf.h, which is meant to be used from user's BPF-side source code. This header is added to the list of installed libbpf header, along bpf_helpers.h and others. BPF-side implementation consists of two BPF maps: - spec map, which contains "a USDT spec" which encodes information necessary to be able to fetch USDT arguments and other information (argument count, user-provided cookie value, etc) at runtime; - IP-to-spec-ID map, which is only used on kernels that don't support BPF cookie feature. It allows to lookup spec ID based on the place in user application that triggers USDT program. These maps have default sizes, 256 and 1024, which are chosen conservatively to not waste a lot of space, but handling a lot of common cases. But there could be cases when user application needs to either trace a lot of different USDTs, or USDTs are heavily inlined and their arguments are located in a lot of differing locations. For such cases it might be necessary to size those maps up, which libbpf allows to do by overriding BPF_USDT_MAX_SPEC_CNT and BPF_USDT_MAX_IP_CNT macros. It is an important aspect to keep in mind. Single USDT (user-space equivalent of kernel tracepoint) can have multiple USDT "call sites". That is, single logical USDT is triggered from multiple places in user application. This can happen due to function inlining. Each such inlined instance of USDT invocation can have its own unique USDT argument specification (instructions about the location of the value of each of USDT arguments). So while USDT looks very similar to usual uprobe or kernel tracepoint, under the hood it's actually a collection of uprobes, each potentially needing different spec to know how to fetch arguments. 
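Before walking through that API, here is a hypothetical BPF-side handler to give
the flavor (provider, probe, and argument names are made up; this is a sketch, not
part of the patch):

#include <linux/ptrace.h>
#include <bpf/bpf_helpers.h>
#include <bpf/usdt.bpf.h>

/* handles my_usdt_provider:my_usdt_probe_name with an int and a pointer arg */
SEC("usdt")
int BPF_USDT(handle_my_probe, int x, void *y)
{
	/* original struct pt_regs * context is preserved as 'ctx' */
	bpf_printk("x=%d args=%d cookie=%ld",
		   x, bpf_usdt_arg_cnt(ctx), bpf_usdt_cookie(ctx));
	return 0;
}

char LICENSE[] SEC("license") = "GPL";

With plain SEC("usdt") such a program is not auto-attached by the skeleton and has
to be attached programmatically, which a later patch in this series wires up.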
User-visible API consists of three helper functions: - bpf_usdt_arg_cnt(), which returns number of arguments of current USDT; - bpf_usdt_arg(), which reads value of specified USDT argument (by it's zero-indexed position) and returns it as 64-bit value; - bpf_usdt_cookie(), which functions like BPF cookie for USDT programs; this is necessary as libbpf doesn't allow specifying actual BPF cookie and utilizes it internally for USDT support implementation. Each bpf_usdt_xxx() APIs expect struct pt_regs * context, passed into BPF program. On kernels that don't support BPF cookie it is used to fetch absolute IP address of the underlying uprobe. usdt.bpf.h also provides BPF_USDT() macro, which functions like BPF_PROG() and BPF_KPROBE() and allows much more user-friendly way to get access to USDT arguments, if USDT definition is static and known to the user. It is expected that majority of use cases won't have to use bpf_usdt_arg_cnt() and bpf_usdt_arg() directly and BPF_USDT() will cover all their needs. Last, usdt.bpf.h is utilizing BPF CO-RE for one single purpose: to detect kernel support for BPF cookie. If BPF CO-RE dependency is undesirable, user application can redefine BPF_USDT_HAS_BPF_COOKIE to either a boolean constant (or equivalently zero and non-zero), or even point it to its own .rodata variable that can be specified from user's application user-space code. It is important that BPF_USDT_HAS_BPF_COOKIE is known to BPF verifier as static value (thus .rodata and not just .data), as otherwise BPF code will still contain bpf_get_attach_cookie() BPF helper call and will fail validation at runtime, if not dead-code eliminated. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Reviewed-by: Alan Maguire Link: https://lore.kernel.org/bpf/20220404234202.331384-2-andrii@kernel.org --- tools/lib/bpf/Makefile | 2 +- tools/lib/bpf/usdt.bpf.h | 256 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 257 insertions(+), 1 deletion(-) create mode 100644 tools/lib/bpf/usdt.bpf.h diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index b8b37fe76006..b4fbe8bed555 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -239,7 +239,7 @@ install_lib: all_cmd SRC_HDRS := bpf.h libbpf.h btf.h libbpf_common.h libbpf_legacy.h xsk.h \ bpf_helpers.h bpf_tracing.h bpf_endian.h bpf_core_read.h \ - skel_internal.h libbpf_version.h + skel_internal.h libbpf_version.h usdt.bpf.h GEN_HDRS := $(BPF_GENERATED) INSTALL_PFX := $(DESTDIR)$(prefix)/include/bpf diff --git a/tools/lib/bpf/usdt.bpf.h b/tools/lib/bpf/usdt.bpf.h new file mode 100644 index 000000000000..60237acf6b02 --- /dev/null +++ b/tools/lib/bpf/usdt.bpf.h @@ -0,0 +1,256 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ +#ifndef __USDT_BPF_H__ +#define __USDT_BPF_H__ + +#include +#include +#include +#include + +/* Below types and maps are internal implementation details of libbpf's USDT + * support and are subjects to change. Also, bpf_usdt_xxx() API helpers should + * be considered an unstable API as well and might be adjusted based on user + * feedback from using libbpf's USDT support in production. + */ + +/* User can override BPF_USDT_MAX_SPEC_CNT to change default size of internal + * map that keeps track of USDT argument specifications. This might be + * necessary if there are a lot of USDT attachments. 
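+ * For example, building the BPF object with -DBPF_USDT_MAX_SPEC_CNT=1024
+ * would take effect through the #ifndef guard below.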
+ */ +#ifndef BPF_USDT_MAX_SPEC_CNT +#define BPF_USDT_MAX_SPEC_CNT 256 +#endif +/* User can override BPF_USDT_MAX_IP_CNT to change default size of internal + * map that keeps track of IP (memory address) mapping to USDT argument + * specification. + * Note, if kernel supports BPF cookies, this map is not used and could be + * resized all the way to 1 to save a bit of memory. + */ +#ifndef BPF_USDT_MAX_IP_CNT +#define BPF_USDT_MAX_IP_CNT (4 * BPF_USDT_MAX_SPEC_CNT) +#endif +/* We use BPF CO-RE to detect support for BPF cookie from BPF side. This is + * the only dependency on CO-RE, so if it's undesirable, user can override + * BPF_USDT_HAS_BPF_COOKIE to specify whether to BPF cookie is supported or not. + */ +#ifndef BPF_USDT_HAS_BPF_COOKIE +#define BPF_USDT_HAS_BPF_COOKIE \ + bpf_core_enum_value_exists(enum bpf_func_id___usdt, BPF_FUNC_get_attach_cookie___usdt) +#endif + +enum __bpf_usdt_arg_type { + BPF_USDT_ARG_CONST, + BPF_USDT_ARG_REG, + BPF_USDT_ARG_REG_DEREF, +}; + +struct __bpf_usdt_arg_spec { + /* u64 scalar interpreted depending on arg_type, see below */ + __u64 val_off; + /* arg location case, see bpf_udst_arg() for details */ + enum __bpf_usdt_arg_type arg_type; + /* offset of referenced register within struct pt_regs */ + short reg_off; + /* whether arg should be interpreted as signed value */ + bool arg_signed; + /* number of bits that need to be cleared and, optionally, + * sign-extended to cast arguments that are 1, 2, or 4 bytes + * long into final 8-byte u64/s64 value returned to user + */ + char arg_bitshift; +}; + +/* should match USDT_MAX_ARG_CNT in usdt.c exactly */ +#define BPF_USDT_MAX_ARG_CNT 12 +struct __bpf_usdt_spec { + struct __bpf_usdt_arg_spec args[BPF_USDT_MAX_ARG_CNT]; + __u64 usdt_cookie; + short arg_cnt; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, BPF_USDT_MAX_SPEC_CNT); + __type(key, int); + __type(value, struct __bpf_usdt_spec); +} __bpf_usdt_specs SEC(".maps") __weak; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, BPF_USDT_MAX_IP_CNT); + __type(key, long); + __type(value, __u32); +} __bpf_usdt_ip_to_spec_id SEC(".maps") __weak; + +/* don't rely on user's BPF code to have latest definition of bpf_func_id */ +enum bpf_func_id___usdt { + BPF_FUNC_get_attach_cookie___usdt = 0xBAD, /* value doesn't matter */ +}; + +static __always_inline +int __bpf_usdt_spec_id(struct pt_regs *ctx) +{ + if (!BPF_USDT_HAS_BPF_COOKIE) { + long ip = PT_REGS_IP(ctx); + int *spec_id_ptr; + + spec_id_ptr = bpf_map_lookup_elem(&__bpf_usdt_ip_to_spec_id, &ip); + return spec_id_ptr ? *spec_id_ptr : -ESRCH; + } + + return bpf_get_attach_cookie(ctx); +} + +/* Return number of USDT arguments defined for currently traced USDT. */ +static inline __noinline +int bpf_usdt_arg_cnt(struct pt_regs *ctx) +{ + struct __bpf_usdt_spec *spec; + int spec_id; + + spec_id = __bpf_usdt_spec_id(ctx); + if (spec_id < 0) + return -ESRCH; + + spec = bpf_map_lookup_elem(&__bpf_usdt_specs, &spec_id); + if (!spec) + return -ESRCH; + + return spec->arg_cnt; +} + +/* Fetch USDT argument #*arg_num* (zero-indexed) and put its value into *res. + * Returns 0 on success; negative error, otherwise. + * On error *res is guaranteed to be set to zero. 
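+ *
+ * Illustrative use from a SEC("usdt") program:
+ *
+ *    long x;
+ *
+ *    if (bpf_usdt_arg(ctx, 0, &x) == 0)
+ *        bpf_printk("arg #0: %ld", x);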
+ */ +static inline __noinline +int bpf_usdt_arg(struct pt_regs *ctx, __u64 arg_num, long *res) +{ + struct __bpf_usdt_spec *spec; + struct __bpf_usdt_arg_spec *arg_spec; + unsigned long val; + int err, spec_id; + + *res = 0; + + spec_id = __bpf_usdt_spec_id(ctx); + if (spec_id < 0) + return -ESRCH; + + spec = bpf_map_lookup_elem(&__bpf_usdt_specs, &spec_id); + if (!spec) + return -ESRCH; + + if (arg_num >= BPF_USDT_MAX_ARG_CNT || arg_num >= spec->arg_cnt) + return -ENOENT; + + arg_spec = &spec->args[arg_num]; + switch (arg_spec->arg_type) { + case BPF_USDT_ARG_CONST: + /* Arg is just a constant ("-4@$-9" in USDT arg spec). + * value is recorded in arg_spec->val_off directly. + */ + val = arg_spec->val_off; + break; + case BPF_USDT_ARG_REG: + /* Arg is in a register (e.g, "8@%rax" in USDT arg spec), + * so we read the contents of that register directly from + * struct pt_regs. To keep things simple user-space parts + * record offsetof(struct pt_regs, ) in arg_spec->reg_off. + */ + err = bpf_probe_read_kernel(&val, sizeof(val), (void *)ctx + arg_spec->reg_off); + if (err) + return err; + break; + case BPF_USDT_ARG_REG_DEREF: + /* Arg is in memory addressed by register, plus some offset + * (e.g., "-4@-1204(%rbp)" in USDT arg spec). Register is + * identified lik with BPF_USDT_ARG_REG case, and the offset + * is in arg_spec->val_off. We first fetch register contents + * from pt_regs, then do another user-space probe read to + * fetch argument value itself. + */ + err = bpf_probe_read_kernel(&val, sizeof(val), (void *)ctx + arg_spec->reg_off); + if (err) + return err; + err = bpf_probe_read_user(&val, sizeof(val), (void *)val + arg_spec->val_off); + if (err) + return err; + break; + default: + return -EINVAL; + } + + /* cast arg from 1, 2, or 4 bytes to final 8 byte size clearing + * necessary upper arg_bitshift bits, with sign extension if argument + * is signed + */ + val <<= arg_spec->arg_bitshift; + if (arg_spec->arg_signed) + val = ((long)val) >> arg_spec->arg_bitshift; + else + val = val >> arg_spec->arg_bitshift; + *res = val; + return 0; +} + +/* Retrieve user-specified cookie value provided during attach as + * bpf_usdt_opts.usdt_cookie. This serves the same purpose as BPF cookie + * returned by bpf_get_attach_cookie(). Libbpf's support for USDT is itself + * utilizaing BPF cookies internally, so user can't use BPF cookie directly + * for USDT programs and has to use bpf_usdt_cookie() API instead. + */ +static inline __noinline +long bpf_usdt_cookie(struct pt_regs *ctx) +{ + struct __bpf_usdt_spec *spec; + int spec_id; + + spec_id = __bpf_usdt_spec_id(ctx); + if (spec_id < 0) + return 0; + + spec = bpf_map_lookup_elem(&__bpf_usdt_specs, &spec_id); + if (!spec) + return 0; + + return spec->usdt_cookie; +} + +/* we rely on ___bpf_apply() and ___bpf_narg() macros already defined in bpf_tracing.h */ +#define ___bpf_usdt_args0() ctx +#define ___bpf_usdt_args1(x) ___bpf_usdt_args0(), ({ long _x; bpf_usdt_arg(ctx, 0, &_x); (void *)_x; }) +#define ___bpf_usdt_args2(x, args...) ___bpf_usdt_args1(args), ({ long _x; bpf_usdt_arg(ctx, 1, &_x); (void *)_x; }) +#define ___bpf_usdt_args3(x, args...) ___bpf_usdt_args2(args), ({ long _x; bpf_usdt_arg(ctx, 2, &_x); (void *)_x; }) +#define ___bpf_usdt_args4(x, args...) ___bpf_usdt_args3(args), ({ long _x; bpf_usdt_arg(ctx, 3, &_x); (void *)_x; }) +#define ___bpf_usdt_args5(x, args...) ___bpf_usdt_args4(args), ({ long _x; bpf_usdt_arg(ctx, 4, &_x); (void *)_x; }) +#define ___bpf_usdt_args6(x, args...) 
___bpf_usdt_args5(args), ({ long _x; bpf_usdt_arg(ctx, 5, &_x); (void *)_x; }) +#define ___bpf_usdt_args7(x, args...) ___bpf_usdt_args6(args), ({ long _x; bpf_usdt_arg(ctx, 6, &_x); (void *)_x; }) +#define ___bpf_usdt_args8(x, args...) ___bpf_usdt_args7(args), ({ long _x; bpf_usdt_arg(ctx, 7, &_x); (void *)_x; }) +#define ___bpf_usdt_args9(x, args...) ___bpf_usdt_args8(args), ({ long _x; bpf_usdt_arg(ctx, 8, &_x); (void *)_x; }) +#define ___bpf_usdt_args10(x, args...) ___bpf_usdt_args9(args), ({ long _x; bpf_usdt_arg(ctx, 9, &_x); (void *)_x; }) +#define ___bpf_usdt_args11(x, args...) ___bpf_usdt_args10(args), ({ long _x; bpf_usdt_arg(ctx, 10, &_x); (void *)_x; }) +#define ___bpf_usdt_args12(x, args...) ___bpf_usdt_args11(args), ({ long _x; bpf_usdt_arg(ctx, 11, &_x); (void *)_x; }) +#define ___bpf_usdt_args(args...) ___bpf_apply(___bpf_usdt_args, ___bpf_narg(args))(args) + +/* + * BPF_USDT serves the same purpose for USDT handlers as BPF_PROG for + * tp_btf/fentry/fexit BPF programs and BPF_KPROBE for kprobes. + * Original struct pt_regs * context is preserved as 'ctx' argument. + */ +#define BPF_USDT(name, args...) \ +name(struct pt_regs *ctx); \ +static __attribute__((always_inline)) typeof(name(0)) \ +____##name(struct pt_regs *ctx, ##args); \ +typeof(name(0)) name(struct pt_regs *ctx) \ +{ \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + return ____##name(___bpf_usdt_args(args)); \ + _Pragma("GCC diagnostic pop") \ +} \ +static __attribute__((always_inline)) typeof(name(0)) \ +____##name(struct pt_regs *ctx, ##args) + +#endif /* __USDT_BPF_H__ */ -- cgit From 2e4913e025fdef740972ac70277297436cccb27f Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 4 Apr 2022 16:41:57 -0700 Subject: libbpf: Wire up USDT API and bpf_link integration Wire up libbpf USDT support APIs without yet implementing all the nitty-gritty details of USDT discovery, spec parsing, and BPF map initialization. User-visible user-space API is simple and is conceptually very similar to uprobe API. bpf_program__attach_usdt() API allows to programmatically attach given BPF program to a USDT, specified through binary path (executable or shared lib), USDT provider and name. Also, just like in uprobe case, PID filter is specified (0 - self, -1 - any process, or specific PID). Optionally, USDT cookie value can be specified. Such single API invocation will try to discover given USDT in specified binary and will use (potentially many) BPF uprobes to attach this program in correct locations. Just like any bpf_program__attach_xxx() APIs, bpf_link is returned that represents this attachment. It is a virtual BPF link that doesn't have direct kernel object, as it can consist of multiple underlying BPF uprobe links. As such, attachment is not atomic operation and there can be brief moment when some USDT call sites are attached while others are still in the process of attaching. This should be taken into consideration by user. But bpf_program__attach_usdt() guarantees that in the case of success all USDT call sites are successfully attached, or all the successfuly attachments will be detached as soon as some USDT call sites failed to be attached. So, in theory, there could be cases of failed bpf_program__attach_usdt() call which did trigger few USDT program invocations. This is unavoidable due to multi-uprobe nature of USDT and has to be handled by user, if it's important to create an illusion of atomicity. 
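For orientation, a minimal user-space sketch of this flow (library path, program
name, provider, and probe name are hypothetical; error handling trimmed):

#include <errno.h>
#include <stdio.h>
#include <bpf/libbpf.h>

static struct bpf_link *attach_my_usdt(struct bpf_object *obj)
{
	LIBBPF_OPTS(bpf_usdt_opts, opts, .usdt_cookie = 0x100);
	struct bpf_program *prog;
	struct bpf_link *link;

	prog = bpf_object__find_program_by_name(obj, "handle_my_probe");
	if (!prog)
		return NULL;

	/* pid -1 means "any process" */
	link = bpf_program__attach_usdt(prog, -1, "/usr/lib64/libmylib.so",
					"my_usdt_provider",
					"my_usdt_probe_name", &opts);
	if (!link)
		fprintf(stderr, "attach failed: %d\n", -errno);
	return link;
}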
USDT BPF programs themselves are marked in BPF source code as either SEC("usdt"), in which case they won't be auto-attached through skeleton's __attach() method, or it can have a full definition, which follows the spirit of fully-specified uprobes: SEC("usdt/::"). In the latter case skeleton's attach method will attempt auto-attachment. Similarly, generic bpf_program__attach() will have enought information to go off of for parameterless attachment. USDT BPF programs are actually uprobes, and as such for kernel they are marked as BPF_PROG_TYPE_KPROBE. Another part of this patch is USDT-related feature probing: - BPF cookie support detection from user-space; - detection of kernel support for auto-refcounting of USDT semaphore. The latter is optional. If kernel doesn't support such feature and USDT doesn't rely on USDT semaphores, no error is returned. But if libbpf detects that USDT requires setting semaphores and kernel doesn't support this, libbpf errors out with explicit pr_warn() message. Libbpf doesn't support poking process's memory directly to increment semaphore value, like BCC does on legacy kernels, due to inherent raciness and danger of such process memory manipulation. Libbpf let's kernel take care of this properly or gives up. Logistically, all the extra USDT-related infrastructure of libbpf is put into a separate usdt.c file and abstracted behind struct usdt_manager. Each bpf_object has lazily-initialized usdt_manager pointer, which is only instantiated if USDT programs are attempted to be attached. Closing BPF object frees up usdt_manager resources. usdt_manager keeps track of USDT spec ID assignment and few other small things. Subsequent patches will fill out remaining missing pieces of USDT initialization and setup logic. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Reviewed-by: Alan Maguire Link: https://lore.kernel.org/bpf/20220404234202.331384-3-andrii@kernel.org --- tools/lib/bpf/Build | 3 +- tools/lib/bpf/libbpf.c | 115 ++++++++++- tools/lib/bpf/libbpf.h | 31 +++ tools/lib/bpf/libbpf.map | 1 + tools/lib/bpf/libbpf_internal.h | 19 ++ tools/lib/bpf/usdt.c | 429 ++++++++++++++++++++++++++++++++++++++++ 6 files changed, 587 insertions(+), 11 deletions(-) create mode 100644 tools/lib/bpf/usdt.c diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build index 94f0a146bb7b..31a1a9015902 100644 --- a/tools/lib/bpf/Build +++ b/tools/lib/bpf/Build @@ -1,3 +1,4 @@ libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o \ netlink.o bpf_prog_linfo.o libbpf_probes.o xsk.o hashmap.o \ - btf_dump.o ringbuf.o strset.o linker.o gen_loader.o relo_core.o + btf_dump.o ringbuf.o strset.o linker.o gen_loader.o relo_core.o \ + usdt.o diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 91ce94b61f7f..1111e9d16e01 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -483,6 +483,8 @@ struct elf_state { int st_ops_shndx; }; +struct usdt_manager; + struct bpf_object { char name[BPF_OBJ_NAME_LEN]; char license[64]; @@ -545,6 +547,8 @@ struct bpf_object { size_t fd_array_cap; size_t fd_array_cnt; + struct usdt_manager *usdt_man; + char path[]; }; @@ -4678,6 +4682,18 @@ static int probe_perf_link(void) return link_fd < 0 && err == -EBADF; } +static int probe_kern_bpf_cookie(void) +{ + struct bpf_insn insns[] = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_attach_cookie), + BPF_EXIT_INSN(), + }; + int ret, insn_cnt = ARRAY_SIZE(insns); + + ret = bpf_prog_load(BPF_PROG_TYPE_KPROBE, NULL, "GPL", insns, insn_cnt, NULL); + return 
probe_fd(ret); +} + enum kern_feature_result { FEAT_UNKNOWN = 0, FEAT_SUPPORTED = 1, @@ -4740,6 +4756,9 @@ static struct kern_feature_desc { [FEAT_MEMCG_ACCOUNT] = { "memcg-based memory accounting", probe_memcg_account, }, + [FEAT_BPF_COOKIE] = { + "BPF cookie support", probe_kern_bpf_cookie, + }, }; bool kernel_supports(const struct bpf_object *obj, enum kern_feature_id feat_id) @@ -8200,6 +8219,9 @@ void bpf_object__close(struct bpf_object *obj) if (obj->clear_priv) obj->clear_priv(obj, obj->priv); + usdt_manager_free(obj->usdt_man); + obj->usdt_man = NULL; + bpf_gen__free(obj->gen_loader); bpf_object__elf_finish(obj); bpf_object_unload(obj); @@ -8631,6 +8653,7 @@ int bpf_program__set_log_buf(struct bpf_program *prog, char *log_buf, size_t log static int attach_kprobe(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_uprobe(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_usdt(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_raw_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_trace(const struct bpf_program *prog, long cookie, struct bpf_link **link); @@ -8648,6 +8671,7 @@ static const struct bpf_sec_def section_defs[] = { SEC_DEF("uretprobe+", KPROBE, 0, SEC_NONE, attach_uprobe), SEC_DEF("kprobe.multi/", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi), SEC_DEF("kretprobe.multi/", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi), + SEC_DEF("usdt+", KPROBE, 0, SEC_NONE, attach_usdt), SEC_DEF("tc", SCHED_CLS, 0, SEC_NONE), SEC_DEF("classifier", SCHED_CLS, 0, SEC_NONE | SEC_SLOPPY_PFX | SEC_DEPRECATED), SEC_DEF("action", SCHED_ACT, 0, SEC_NONE | SEC_SLOPPY_PFX), @@ -9693,14 +9717,6 @@ int bpf_prog_load_deprecated(const char *file, enum bpf_prog_type type, return bpf_prog_load_xattr2(&attr, pobj, prog_fd); } -struct bpf_link { - int (*detach)(struct bpf_link *link); - void (*dealloc)(struct bpf_link *link); - char *pin_path; /* NULL, if not pinned */ - int fd; /* hook FD, -1 if not applicable */ - bool disconnected; -}; - /* Replace link's underlying BPF program with the new one */ int bpf_link__update_program(struct bpf_link *link, struct bpf_program *prog) { @@ -10810,8 +10826,8 @@ bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid, err = resolve_full_path(binary_path, full_binary_path, sizeof(full_binary_path)); if (err) { - pr_warn("prog '%s': failed to resolve full path for '%s'\n", - prog->name, binary_path); + pr_warn("prog '%s': failed to resolve full path for '%s': %d\n", + prog->name, binary_path, err); return libbpf_err_ptr(err); } binary_path = full_binary_path; @@ -10963,6 +10979,85 @@ struct bpf_link *bpf_program__attach_uprobe(const struct bpf_program *prog, return bpf_program__attach_uprobe_opts(prog, pid, binary_path, func_offset, &opts); } +struct bpf_link *bpf_program__attach_usdt(const struct bpf_program *prog, + pid_t pid, const char *binary_path, + const char *usdt_provider, const char *usdt_name, + const struct bpf_usdt_opts *opts) +{ + char resolved_path[512]; + struct bpf_object *obj = prog->obj; + struct bpf_link *link; + long usdt_cookie; + int err; + + if (!OPTS_VALID(opts, bpf_uprobe_opts)) + return libbpf_err_ptr(-EINVAL); + + if (bpf_program__fd(prog) < 0) { + pr_warn("prog '%s': can't attach BPF program w/o FD (did you load it?)\n", + prog->name); + return 
libbpf_err_ptr(-EINVAL); + } + + if (!strchr(binary_path, '/')) { + err = resolve_full_path(binary_path, resolved_path, sizeof(resolved_path)); + if (err) { + pr_warn("prog '%s': failed to resolve full path for '%s': %d\n", + prog->name, binary_path, err); + return libbpf_err_ptr(err); + } + binary_path = resolved_path; + } + + /* USDT manager is instantiated lazily on first USDT attach. It will + * be destroyed together with BPF object in bpf_object__close(). + */ + if (IS_ERR(obj->usdt_man)) + return libbpf_ptr(obj->usdt_man); + if (!obj->usdt_man) { + obj->usdt_man = usdt_manager_new(obj); + if (IS_ERR(obj->usdt_man)) + return libbpf_ptr(obj->usdt_man); + } + + usdt_cookie = OPTS_GET(opts, usdt_cookie, 0); + link = usdt_manager_attach_usdt(obj->usdt_man, prog, pid, binary_path, + usdt_provider, usdt_name, usdt_cookie); + err = libbpf_get_error(link); + if (err) + return libbpf_err_ptr(err); + return link; +} + +static int attach_usdt(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + char *path = NULL, *provider = NULL, *name = NULL; + const char *sec_name; + int n, err; + + sec_name = bpf_program__section_name(prog); + if (strcmp(sec_name, "usdt") == 0) { + /* no auto-attach for just SEC("usdt") */ + *link = NULL; + return 0; + } + + n = sscanf(sec_name, "usdt/%m[^:]:%m[^:]:%m[^:]", &path, &provider, &name); + if (n != 3) { + pr_warn("invalid section '%s', expected SEC(\"usdt/::\")\n", + sec_name); + err = -EINVAL; + } else { + *link = bpf_program__attach_usdt(prog, -1 /* any process */, path, + provider, name, NULL); + err = libbpf_get_error(*link); + } + free(path); + free(provider); + free(name); + return err; +} + static int determine_tracepoint_id(const char *tp_category, const char *tp_name) { diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 28cd2062d0df..63d66f1adf1a 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -511,6 +511,37 @@ bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid, const char *binary_path, size_t func_offset, const struct bpf_uprobe_opts *opts); +struct bpf_usdt_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + /* custom user-provided value accessible through usdt_cookie() */ + __u64 usdt_cookie; + size_t :0; +}; +#define bpf_usdt_opts__last_field usdt_cookie + +/** + * @brief **bpf_program__attach_usdt()** is just like + * bpf_program__attach_uprobe_opts() except it covers USDT (User-space + * Statically Defined Tracepoint) attachment, instead of attaching to + * user-space function entry or exit. 
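+ * Note that a single USDT may expand to multiple call sites, in which case
+ * the returned link manages one underlying uprobe per call site.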
+ * + * @param prog BPF program to attach + * @param pid Process ID to attach the uprobe to, 0 for self (own process), + * -1 for all processes + * @param binary_path Path to binary that contains provided USDT probe + * @param usdt_provider USDT provider name + * @param usdt_name USDT probe name + * @param opts Options for altering program attachment + * @return Reference to the newly created BPF link; or NULL is returned on error, + * error code is stored in errno + */ +LIBBPF_API struct bpf_link * +bpf_program__attach_usdt(const struct bpf_program *prog, + pid_t pid, const char *binary_path, + const char *usdt_provider, const char *usdt_name, + const struct bpf_usdt_opts *opts); + struct bpf_tracepoint_opts { /* size of this struct, for forward/backward compatiblity */ size_t sz; diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index dd35ee58bfaa..82f6d62176dd 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -444,6 +444,7 @@ LIBBPF_0.8.0 { global: bpf_object__destroy_subskeleton; bpf_object__open_subskeleton; + bpf_program__attach_usdt; libbpf_register_prog_handler; libbpf_unregister_prog_handler; bpf_program__attach_kprobe_multi_opts; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index b6247dc7f8eb..dd0d4ccfa649 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -148,6 +148,15 @@ do { \ #ifndef __has_builtin #define __has_builtin(x) 0 #endif + +struct bpf_link { + int (*detach)(struct bpf_link *link); + void (*dealloc)(struct bpf_link *link); + char *pin_path; /* NULL, if not pinned */ + int fd; /* hook FD, -1 if not applicable */ + bool disconnected; +}; + /* * Re-implement glibc's reallocarray() for libbpf internal-only use. * reallocarray(), unfortunately, is not available in all versions of glibc, @@ -329,6 +338,8 @@ enum kern_feature_id { FEAT_BTF_TYPE_TAG, /* memcg-based accounting for BPF maps and progs */ FEAT_MEMCG_ACCOUNT, + /* BPF cookie (bpf_get_attach_cookie() BPF helper) support */ + FEAT_BPF_COOKIE, __FEAT_CNT, }; @@ -543,4 +554,12 @@ int bpf_core_add_cands(struct bpf_core_cand *local_cand, struct bpf_core_cand_list *cands); void bpf_core_free_cands(struct bpf_core_cand_list *cands); +struct usdt_manager *usdt_manager_new(struct bpf_object *obj); +void usdt_manager_free(struct usdt_manager *man); +struct bpf_link * usdt_manager_attach_usdt(struct usdt_manager *man, + const struct bpf_program *prog, + pid_t pid, const char *path, + const char *usdt_provider, const char *usdt_name, + long usdt_cookie); + #endif /* __LIBBPF_LIBBPF_INTERNAL_H */ diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c new file mode 100644 index 000000000000..781aa1d128f1 --- /dev/null +++ b/tools/lib/bpf/usdt.c @@ -0,0 +1,429 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bpf.h" +#include "libbpf.h" +#include "libbpf_common.h" +#include "libbpf_internal.h" +#include "hashmap.h" + +/* libbpf's USDT support consists of BPF-side state/code and user-space + * state/code working together in concert. BPF-side parts are defined in + * usdt.bpf.h header library. User-space state is encapsulated by struct + * usdt_manager and all the supporting code centered around usdt_manager. 
+ *
+ * usdt.bpf.h defines two BPF maps that usdt_manager expects: USDT spec map
+ * and IP-to-spec-ID map, which is an auxiliary map necessary for kernels that
+ * don't support BPF cookie (see below). These two maps are implicitly
+ * embedded into user's end BPF object file when user's code includes
+ * usdt.bpf.h. This means that libbpf doesn't do anything special to create
+ * these USDT support maps. They are created by normal libbpf logic of
+ * instantiating BPF maps when opening and loading BPF object.
+ *
+ * As such, libbpf is basically unaware of the need to do anything
+ * USDT-related until the very first call to bpf_program__attach_usdt(), which
+ * can be called by user explicitly or happen automatically during skeleton
+ * attach (or, equivalently, through generic bpf_program__attach() call). At
+ * this point, libbpf will instantiate and initialize struct usdt_manager and
+ * store it in bpf_object. USDT manager is a per-BPF object construct, as each
+ * independent BPF object might or might not have USDT programs, and thus all
+ * the expected USDT-related state. There is no coordination between two
+ * bpf_objects in matters of USDT attachment; they are oblivious of each
+ * other's existence, and libbpf simply deals with bpf_object-specific USDT
+ * state.
+ *
+ * Quick crash course on USDTs.
+ *
+ * From user-space application's point of view, USDT is essentially just
+ * a slightly special function call that normally has zero overhead, unless it
+ * is being traced by some external entity (e.g., a BPF-based tool). Here's
+ * how a typical application can trigger a USDT probe:
+ *
+ * #include <sys/sdt.h>  // provided by systemtap-sdt-devel package
+ * // folly also provides similar functionality in folly/tracing/StaticTracepoint.h
+ *
+ * STAP_PROBE3(my_usdt_provider, my_usdt_probe_name, 123, x, &y);
+ *
+ * USDT is identified by its <provider>:<name> pair of names. Each
+ * individual USDT has a fixed number of arguments (3 in the above example)
+ * and specifies values of each argument as if it was a function call.
+ *
+ * USDT call is actually not a function call, but is instead replaced by
+ * a single NOP instruction (thus zero overhead, effectively). But in addition
+ * to that, those USDT macros generate special SHT_NOTE ELF records in
+ * a .note.stapsdt ELF section. Here's an example USDT definition as emitted
+ * by `readelf -n <binary>`:
+ *
+ *   stapsdt              0x00000089       NT_STAPSDT (SystemTap probe descriptors)
+ *     Provider: test
+ *     Name: usdt12
+ *     Location: 0x0000000000549df3, Base: 0x00000000008effa4, Semaphore: 0x0000000000a4606e
+ *     Arguments: -4@-1204(%rbp) -4@%edi -8@-1216(%rbp) -8@%r8 -4@$5 -8@%r9 8@%rdx 8@%r10 -4@$-9 -2@%cx -2@%ax -1@%sil
+ *
+ * In this case we have USDT test:usdt12 with 12 arguments.
+ *
+ * Location and base are offsets used to calculate absolute IP address of that
+ * NOP instruction that kernel can replace with an interrupt instruction to
+ * trigger instrumentation code (BPF program for all that we care about).
+ *
+ * Semaphore above is an optional feature. It records an address of a 2-byte
+ * refcount variable (normally in '.probes' ELF section) used for signaling if
+ * there is anything that is attached to USDT. This is useful for user
+ * applications if, for example, they need to prepare some arguments that are
+ * passed only to USDTs and preparation is expensive. By checking if USDT is
+ * "activated", an application can avoid paying those costs unnecessarily.
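+ * For example (illustrative only; the *_ENABLED() guard macro is of the
+ * kind that systemtap's dtrace -h generates for semaphore-enabled probes):
+ *
+ *    if (MY_USDT_PROVIDER_MY_USDT_PROBE_NAME_ENABLED()) {
+ *            char *s = expensive_to_prepare();
+ *            MY_USDT_PROVIDER_MY_USDT_PROBE_NAME(s);
+ *    }
+ *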
+ * A recent enough kernel has built-in support for automatically managing this
+ * refcount, which libbpf expects and relies on. If USDT is defined without
+ * an associated semaphore, this value will be zero. See selftests for
+ * semaphore examples.
+ *
+ * Arguments is the most interesting part. This USDT specification string
+ * provides information about all the USDT arguments and their locations. The
+ * part before the @ sign defines the byte size of the argument (1, 2, 4, or
+ * 8) and whether the argument is signed or unsigned (negative size means
+ * signed). The part after the @ sign is an assembly-like definition of the
+ * argument location (see [0] for more details). Technically, the assembler
+ * can provide some pretty advanced definitions, but libbpf currently supports
+ * the three most common cases:
+ * 1) immediate constant, see 5th and 9th args above (-4@$5 and -4@-9);
+ * 2) register value, e.g., 8@%rdx, which means "unsigned 8-byte integer
+ *    whose value is in register %rdx";
+ * 3) memory dereference addressed by register, e.g., -4@-1204(%rbp), which
+ *    specifies a signed 32-bit integer stored at offset -1204 bytes from
+ *    the memory address stored in %rbp.
+ *
+ * [0] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation
+ *
+ * During attachment, libbpf parses all the relevant USDT specifications and
+ * prepares `struct usdt_spec` (USDT spec), which is then provided to BPF-side
+ * code through the spec map. This allows BPF applications to quickly fetch
+ * the actual argument value at runtime using simple BPF-side code.
+ *
+ * With basics out of the way, let's go over less immediately obvious aspects
+ * of supporting USDTs.
+ *
+ * First, there is no special USDT BPF program type. It is actually just
+ * a uprobe BPF program (which for kernel, at least currently, is just a kprobe
+ * program, so BPF_PROG_TYPE_KPROBE program type), with the only difference
+ * that a uprobe is usually attached at the function entry, while a USDT will
+ * normally be somewhere inside the function. But it should always be pointing
+ * to a NOP instruction, which makes such uprobes the fastest uprobe kind.
+ *
+ * Second, it's important to realize that such STAP_PROBEn(provider, name, ...)
+ * macro invocations can end up being inlined many, many times, depending on
+ * specifics of each individual user application. So a single conceptual USDT
+ * (identified by its provider:name pair of identifiers) is, generally
+ * speaking, multiple uprobe locations (USDT call sites) in different places
+ * in the user application. Further, again due to inlining, each USDT call
+ * site might end up having the same argument #N located in a different place.
+ * In one call site it could be a constant, in another it will end up in
+ * a register, and in yet another it could be some other register or even
+ * somewhere on the stack. For example (hypothetical specs), three inlined
+ * call sites might describe the very same argument as "-4@$5", "-4@%edi",
+ * and "-4@-8(%rbp)".
+ *
+ * As such, "attaching to USDT" means (in the general case) attaching the same
+ * uprobe BPF program to multiple target locations in the user application,
+ * each potentially having a completely different USDT spec associated with
+ * it. To wire all this up together, libbpf allocates a unique integer spec ID
+ * for each unique USDT spec. Spec IDs are allocated as sequential small
+ * integers so that they can be used as keys in an array BPF map (for
+ * performance reasons). Spec ID allocation and accounting is a big part of
+ * what usdt_manager is about. This state has to be maintained per BPF object
+ * and coordinated across different USDT attachments within the same BPF
+ * object.
+ *
+ * Spec ID is the key in the spec BPF map, the value is the actual USDT spec
+ * laid out as struct usdt_spec. Each invocation of the BPF program at runtime
+ * needs to know its associated spec ID. It gets it either through BPF cookie,
+ * which libbpf sets to the spec ID at attach time, or, if the kernel is too
+ * old to support BPF cookie, through the IP-to-spec-ID map that libbpf
+ * maintains in such a case. The latter means that some modes of operation
+ * can't be supported without BPF cookie. One such mode is attaching to
+ * a shared library "generically", without specifying a target process. In
+ * that case, it's impossible to calculate absolute IP addresses for the
+ * IP-to-spec-ID map, and thus such a mode is not supported without BPF cookie
+ * support.
+ *
+ * Note that libbpf is using BPF cookie functionality for its own internal
+ * needs, so users themselves can't rely on the BPF cookie feature. To that
+ * end, libbpf provides conceptually equivalent USDT cookie support. It's
+ * still a u64 user-provided value that can be associated with a USDT
+ * attachment. Note that this will be the same value for all USDT call sites
+ * within the same single *logical* USDT attachment. This makes sense because,
+ * to the user, attaching to a USDT means a single BPF program triggered for
+ * a singular USDT probe. The fact that this is done at multiple actual
+ * locations is a mostly hidden implementation detail. This USDT cookie value
+ * can be fetched with the bpf_usdt_cookie(ctx) API provided by usdt.bpf.h.
+ *
+ * Lastly, while a single USDT can have tons of USDT call sites, it doesn't
+ * necessarily have that many different USDT specs. It very well might be
+ * that 1000 USDT call sites only need 5 different USDT specs, because all the
+ * arguments are typically contained in a small set of registers or stack
+ * locations. As such, it's wasteful to allocate as many USDT spec IDs as
+ * there are USDT call sites. So libbpf tries to be frugal and performs
+ * on-the-fly deduplication during a single USDT attachment to only allocate
+ * the minimal required number of unique USDT specs (and thus spec IDs). This
+ * is trivially achieved by using the USDT spec string (Arguments string from
+ * the USDT note) as a lookup key in a hashmap. The USDT spec string uniquely
+ * defines everything about how to fetch USDT arguments, so two USDT call
+ * sites sharing a USDT spec string can safely share the same USDT spec and
+ * spec ID. Note, this spec string deduplication happens only within the same
+ * USDT attachment, so each USDT spec shares the same USDT cookie value. This
+ * is not generally true for other USDT attachments within the same BPF
+ * object, as even if the USDT spec string is the same, the USDT cookie value
+ * can be different. It was deemed excessive to try to deduplicate across
+ * independent USDT attachments by taking into account USDT spec string *and*
+ * USDT cookie value, which would complicate spec ID accounting significantly
+ * for little gain.
+ */ + +struct usdt_target { + long abs_ip; + long rel_ip; + long sema_off; +}; + +struct usdt_manager { + struct bpf_map *specs_map; + struct bpf_map *ip_to_spec_id_map; + + bool has_bpf_cookie; + bool has_sema_refcnt; +}; + +struct usdt_manager *usdt_manager_new(struct bpf_object *obj) +{ + static const char *ref_ctr_sysfs_path = "/sys/bus/event_source/devices/uprobe/format/ref_ctr_offset"; + struct usdt_manager *man; + struct bpf_map *specs_map, *ip_to_spec_id_map; + + specs_map = bpf_object__find_map_by_name(obj, "__bpf_usdt_specs"); + ip_to_spec_id_map = bpf_object__find_map_by_name(obj, "__bpf_usdt_ip_to_spec_id"); + if (!specs_map || !ip_to_spec_id_map) { + pr_warn("usdt: failed to find USDT support BPF maps, did you forget to include bpf/usdt.bpf.h?\n"); + return ERR_PTR(-ESRCH); + } + + man = calloc(1, sizeof(*man)); + if (!man) + return ERR_PTR(-ENOMEM); + + man->specs_map = specs_map; + man->ip_to_spec_id_map = ip_to_spec_id_map; + + /* Detect if BPF cookie is supported for kprobes. + * We don't need IP-to-ID mapping if we can use BPF cookies. + * Added in: 7adfc6c9b315 ("bpf: Add bpf_get_attach_cookie() BPF helper to access bpf_cookie value") + */ + man->has_bpf_cookie = kernel_supports(obj, FEAT_BPF_COOKIE); + + /* Detect kernel support for automatic refcounting of USDT semaphore. + * If this is not supported, USDTs with semaphores will not be supported. + * Added in: a6ca88b241d5 ("trace_uprobe: support reference counter in fd-based uprobe") + */ + man->has_sema_refcnt = access(ref_ctr_sysfs_path, F_OK) == 0; + + return man; +} + +void usdt_manager_free(struct usdt_manager *man) +{ + if (IS_ERR_OR_NULL(man)) + return; + + free(man); +} + +static int sanity_check_usdt_elf(Elf *elf, const char *path) +{ + GElf_Ehdr ehdr; + int endianness; + + if (elf_kind(elf) != ELF_K_ELF) { + pr_warn("usdt: unrecognized ELF kind %d for '%s'\n", elf_kind(elf), path); + return -EBADF; + } + + switch (gelf_getclass(elf)) { + case ELFCLASS64: + if (sizeof(void *) != 8) { + pr_warn("usdt: attaching to 64-bit ELF binary '%s' is not supported\n", path); + return -EBADF; + } + break; + case ELFCLASS32: + if (sizeof(void *) != 4) { + pr_warn("usdt: attaching to 32-bit ELF binary '%s' is not supported\n", path); + return -EBADF; + } + break; + default: + pr_warn("usdt: unsupported ELF class for '%s'\n", path); + return -EBADF; + } + + if (!gelf_getehdr(elf, &ehdr)) + return -EINVAL; + + if (ehdr.e_type != ET_EXEC && ehdr.e_type != ET_DYN) { + pr_warn("usdt: unsupported type of ELF binary '%s' (%d), only ET_EXEC and ET_DYN are supported\n", + path, ehdr.e_type); + return -EBADF; + } + +#if __BYTE_ORDER == __LITTLE_ENDIAN + endianness = ELFDATA2LSB; +#elif __BYTE_ORDER == __BIG_ENDIAN + endianness = ELFDATA2MSB; +#else +# error "Unrecognized __BYTE_ORDER__" +#endif + if (endianness != ehdr.e_ident[EI_DATA]) { + pr_warn("usdt: ELF endianness mismatch for '%s'\n", path); + return -EBADF; + } + + return 0; +} + +static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char *path, pid_t pid, + const char *usdt_provider, const char *usdt_name, long usdt_cookie, + struct usdt_target **out_targets, size_t *out_target_cnt) +{ + return -ENOTSUP; +} + +struct bpf_link_usdt { + struct bpf_link link; + + struct usdt_manager *usdt_man; + + size_t uprobe_cnt; + struct { + long abs_ip; + struct bpf_link *link; + } *uprobes; +}; + +static int bpf_link_usdt_detach(struct bpf_link *link) +{ + struct bpf_link_usdt *usdt_link = container_of(link, struct bpf_link_usdt, link); + int i; + + for (i = 0; i < 
usdt_link->uprobe_cnt; i++) { + /* detach underlying uprobe link */ + bpf_link__destroy(usdt_link->uprobes[i].link); + } + + return 0; +} + +static void bpf_link_usdt_dealloc(struct bpf_link *link) +{ + struct bpf_link_usdt *usdt_link = container_of(link, struct bpf_link_usdt, link); + + free(usdt_link->uprobes); + free(usdt_link); +} + +struct bpf_link *usdt_manager_attach_usdt(struct usdt_manager *man, const struct bpf_program *prog, + pid_t pid, const char *path, + const char *usdt_provider, const char *usdt_name, + long usdt_cookie) +{ + LIBBPF_OPTS(bpf_uprobe_opts, opts); + struct bpf_link_usdt *link = NULL; + struct usdt_target *targets = NULL; + size_t target_cnt; + int i, fd, err; + Elf *elf; + + /* TODO: perform path resolution similar to uprobe's */ + fd = open(path, O_RDONLY); + if (fd < 0) { + err = -errno; + pr_warn("usdt: failed to open ELF binary '%s': %d\n", path, err); + return libbpf_err_ptr(err); + } + + elf = elf_begin(fd, ELF_C_READ_MMAP, NULL); + if (!elf) { + err = -EBADF; + pr_warn("usdt: failed to parse ELF binary '%s': %s\n", path, elf_errmsg(-1)); + goto err_out; + } + + err = sanity_check_usdt_elf(elf, path); + if (err) + goto err_out; + + /* normalize PID filter */ + if (pid < 0) + pid = -1; + else if (pid == 0) + pid = getpid(); + + /* discover USDT in given binary, optionally limiting + * activations to a given PID, if pid > 0 + */ + err = collect_usdt_targets(man, elf, path, pid, usdt_provider, usdt_name, + usdt_cookie, &targets, &target_cnt); + if (err <= 0) { + err = (err == 0) ? -ENOENT : err; + goto err_out; + } + + link = calloc(1, sizeof(*link)); + if (!link) { + err = -ENOMEM; + goto err_out; + } + + link->usdt_man = man; + link->link.detach = &bpf_link_usdt_detach; + link->link.dealloc = &bpf_link_usdt_dealloc; + + link->uprobes = calloc(target_cnt, sizeof(*link->uprobes)); + if (!link->uprobes) { + err = -ENOMEM; + goto err_out; + } + + for (i = 0; i < target_cnt; i++) { + struct usdt_target *target = &targets[i]; + struct bpf_link *uprobe_link; + + opts.ref_ctr_offset = target->sema_off; + uprobe_link = bpf_program__attach_uprobe_opts(prog, pid, path, + target->rel_ip, &opts); + err = libbpf_get_error(uprobe_link); + if (err) { + pr_warn("usdt: failed to attach uprobe #%d for '%s:%s' in '%s': %d\n", + i, usdt_provider, usdt_name, path, err); + goto err_out; + } + + link->uprobes[i].link = uprobe_link; + link->uprobes[i].abs_ip = target->abs_ip; + link->uprobe_cnt++; + } + + elf_end(elf); + close(fd); + + return &link->link; + +err_out: + bpf_link__destroy(&link->link); + + if (elf) + elf_end(elf); + close(fd); + return libbpf_err_ptr(err); +} -- cgit From 74cc6311cec906daf1d64cefe4922dbf79c416c9 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 4 Apr 2022 16:41:58 -0700 Subject: libbpf: Add USDT notes parsing and resolution logic Implement architecture-agnostic parts of USDT parsing logic. The code is the documentation in this case, it's futile to try to succinctly describe how USDT parsing is done in any sort of concreteness. But still, USDTs are recorded in special ELF notes section (.note.stapsdt), where each USDT call site is described separately. Along with USDT provider and USDT name, each such note contains USDT argument specification, which uses assembly-like syntax to describe how to fetch value of USDT argument. 
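To make the encoding concrete up front (a sketch under this patch's scheme, using
a made-up x86-64 argument): a note argument recorded as "-4@-1204(%rbp)" would be
encoded as arg_type USDT_ARG_REG_DEREF, reg_off = offsetof(struct pt_regs, rbp),
val_off = -1204, arg_signed = true, and arg_bitshift = 64 - 8 * 4 = 32. The
shift-based sign extension that the BPF side then performs can be demonstrated in
plain C (assuming 64-bit long, as on the targeted architectures):

#include <stdio.h>

int main(void)
{
	unsigned long val = 0xfffffffeUL;	/* raw read: low 4 bytes hold 32-bit -2 */
	int arg_bitshift = 32;

	val <<= arg_bitshift;				/* 0xfffffffe00000000 */
	val = (unsigned long)(((long)val) >> arg_bitshift);	/* arithmetic shift sign-extends */
	printf("%ld\n", (long)val);			/* prints -2 */
	return 0;
}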
USDT arg spec could be just a constant, or a register, or a register dereference (most common cases in x86_64), but it technically can be much more complicated cases, like offset relative to global symbol and stuff like that. One of the later patches will implement most common subset of this for x86 and x86-64 architectures, which seems to handle a lot of real-world production application. USDT arg spec contains a compact encoding allowing usdt.bpf.h from previous patch to handle the above 3 cases. Instead of recording which register might be needed, we encode register's offset within struct pt_regs to simplify BPF-side implementation. USDT argument can be of different byte sizes (1, 2, 4, and 8) and signed or unsigned. To handle this, libbpf pre-calculates necessary bit shifts to do proper casting and sign-extension in a short sequences of left and right shifts. The rest is in the code with sometimes extensive comments and references to external "documentation" for USDTs. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Reviewed-by: Alan Maguire Reviewed-by: Dave Marchevsky Link: https://lore.kernel.org/bpf/20220404234202.331384-4-andrii@kernel.org --- tools/lib/bpf/usdt.c | 582 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 581 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c index 781aa1d128f1..f1670e3014ed 100644 --- a/tools/lib/bpf/usdt.c +++ b/tools/lib/bpf/usdt.c @@ -183,10 +183,56 @@ * gain. */ +#define USDT_BASE_SEC ".stapsdt.base" +#define USDT_SEMA_SEC ".probes" +#define USDT_NOTE_SEC ".note.stapsdt" +#define USDT_NOTE_TYPE 3 +#define USDT_NOTE_NAME "stapsdt" + +/* should match exactly enum __bpf_usdt_arg_type from bpf_usdt.bpf.h */ +enum usdt_arg_type { + USDT_ARG_CONST, + USDT_ARG_REG, + USDT_ARG_REG_DEREF, +}; + +/* should match exactly struct __bpf_usdt_arg_spec from bpf_usdt.bpf.h */ +struct usdt_arg_spec { + __u64 val_off; + enum usdt_arg_type arg_type; + short reg_off; + bool arg_signed; + char arg_bitshift; +}; + +/* should match BPF_USDT_MAX_ARG_CNT in usdt.bpf.h */ +#define USDT_MAX_ARG_CNT 12 + +/* should match struct __bpf_usdt_spec from usdt.bpf.h */ +struct usdt_spec { + struct usdt_arg_spec args[USDT_MAX_ARG_CNT]; + __u64 usdt_cookie; + short arg_cnt; +}; + +struct usdt_note { + const char *provider; + const char *name; + /* USDT args specification string, e.g.: + * "-4@%esi -4@-24(%rbp) -4@%ecx 2@%ax 8@%rdx" + */ + const char *args; + long loc_addr; + long base_addr; + long sema_addr; +}; + struct usdt_target { long abs_ip; long rel_ip; long sema_off; + struct usdt_spec spec; + const char *spec_str; }; struct usdt_manager { @@ -292,11 +338,450 @@ static int sanity_check_usdt_elf(Elf *elf, const char *path) return 0; } +static int find_elf_sec_by_name(Elf *elf, const char *sec_name, GElf_Shdr *shdr, Elf_Scn **scn) +{ + Elf_Scn *sec = NULL; + size_t shstrndx; + + if (elf_getshdrstrndx(elf, &shstrndx)) + return -EINVAL; + + /* check if ELF is corrupted and avoid calling elf_strptr if yes */ + if (!elf_rawdata(elf_getscn(elf, shstrndx), NULL)) + return -EINVAL; + + while ((sec = elf_nextscn(elf, sec)) != NULL) { + char *name; + + if (!gelf_getshdr(sec, shdr)) + return -EINVAL; + + name = elf_strptr(elf, shstrndx, shdr->sh_name); + if (name && strcmp(sec_name, name) == 0) { + *scn = sec; + return 0; + } + } + + return -ENOENT; +} + +struct elf_seg { + long start; + long end; + long offset; + bool is_exec; +}; + +static int cmp_elf_segs(const void *_a, const void *_b) +{ + const struct elf_seg *a 
--- tools/lib/bpf/usdt.c | 582 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 581 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c index 781aa1d128f1..f1670e3014ed 100644 --- a/tools/lib/bpf/usdt.c +++ b/tools/lib/bpf/usdt.c @@ -183,10 +183,56 @@ * gain. */ +#define USDT_BASE_SEC ".stapsdt.base" +#define USDT_SEMA_SEC ".probes" +#define USDT_NOTE_SEC ".note.stapsdt" +#define USDT_NOTE_TYPE 3 +#define USDT_NOTE_NAME "stapsdt" + +/* should match exactly enum __bpf_usdt_arg_type from usdt.bpf.h */ +enum usdt_arg_type { + USDT_ARG_CONST, + USDT_ARG_REG, + USDT_ARG_REG_DEREF, +}; + +/* should match exactly struct __bpf_usdt_arg_spec from usdt.bpf.h */ +struct usdt_arg_spec { + __u64 val_off; + enum usdt_arg_type arg_type; + short reg_off; + bool arg_signed; + char arg_bitshift; +}; + +/* should match BPF_USDT_MAX_ARG_CNT in usdt.bpf.h */ +#define USDT_MAX_ARG_CNT 12 + +/* should match struct __bpf_usdt_spec from usdt.bpf.h */ +struct usdt_spec { + struct usdt_arg_spec args[USDT_MAX_ARG_CNT]; + __u64 usdt_cookie; + short arg_cnt; +}; + +struct usdt_note { + const char *provider; + const char *name; + /* USDT args specification string, e.g.: + * "-4@%esi -4@-24(%rbp) -4@%ecx 2@%ax 8@%rdx" + */ + const char *args; + long loc_addr; + long base_addr; + long sema_addr; +}; + struct usdt_target { long abs_ip; long rel_ip; long sema_off; + struct usdt_spec spec; + const char *spec_str; }; struct usdt_manager { @@ -292,11 +338,450 @@ static int sanity_check_usdt_elf(Elf *elf, const char *path) return 0; } +static int find_elf_sec_by_name(Elf *elf, const char *sec_name, GElf_Shdr *shdr, Elf_Scn **scn) +{ + Elf_Scn *sec = NULL; + size_t shstrndx; + + if (elf_getshdrstrndx(elf, &shstrndx)) + return -EINVAL; + + /* check if ELF is corrupted and avoid calling elf_strptr if yes */ + if (!elf_rawdata(elf_getscn(elf, shstrndx), NULL)) + return -EINVAL; + + while ((sec = elf_nextscn(elf, sec)) != NULL) { + char *name; + + if (!gelf_getshdr(sec, shdr)) + return -EINVAL; + + name = elf_strptr(elf, shstrndx, shdr->sh_name); + if (name && strcmp(sec_name, name) == 0) { + *scn = sec; + return 0; + } + } + + return -ENOENT; +} + +struct elf_seg { + long start; + long end; + long offset; + bool is_exec; +}; + +static int cmp_elf_segs(const void *_a, const void *_b) +{ + const struct elf_seg *a = _a; + const struct elf_seg *b = _b; + + return a->start < b->start ? -1 : 1; +} + +static int parse_elf_segs(Elf *elf, const char *path, struct elf_seg **segs, size_t *seg_cnt) +{ + GElf_Phdr phdr; + size_t n; + int i, err; + struct elf_seg *seg; + void *tmp; + + *seg_cnt = 0; + + if (elf_getphdrnum(elf, &n)) { + err = -errno; + return err; + } + + for (i = 0; i < n; i++) { + if (!gelf_getphdr(elf, i, &phdr)) { + err = -errno; + return err; + } + + pr_debug("usdt: discovered PHDR #%d in '%s': vaddr 0x%lx memsz 0x%lx offset 0x%lx type 0x%lx flags 0x%lx\n", + i, path, (long)phdr.p_vaddr, (long)phdr.p_memsz, (long)phdr.p_offset, + (long)phdr.p_type, (long)phdr.p_flags); + if (phdr.p_type != PT_LOAD) + continue; + + tmp = libbpf_reallocarray(*segs, *seg_cnt + 1, sizeof(**segs)); + if (!tmp) + return -ENOMEM; + + *segs = tmp; + seg = *segs + *seg_cnt; + (*seg_cnt)++; + + seg->start = phdr.p_vaddr; + seg->end = phdr.p_vaddr + phdr.p_memsz; + seg->offset = phdr.p_offset; + seg->is_exec = phdr.p_flags & PF_X; + } + + if (*seg_cnt == 0) { + pr_warn("usdt: failed to find PT_LOAD program headers in '%s'\n", path); + return -ESRCH; + } + + qsort(*segs, *seg_cnt, sizeof(**segs), cmp_elf_segs); + return 0; +} + +static int parse_lib_segs(int pid, const char *lib_path, struct elf_seg **segs, size_t *seg_cnt) +{ + char path[PATH_MAX], line[PATH_MAX], mode[16]; + size_t seg_start, seg_end, seg_off; + struct elf_seg *seg; + int tmp_pid, i, err; + FILE *f; + + *seg_cnt = 0; + + /* Handle containerized binaries only accessible from + * /proc/<pid>/root/<path>. They will be reported as just /<path> in + * /proc/<pid>/maps. + */ + if (sscanf(lib_path, "/proc/%d/root%s", &tmp_pid, path) == 2 && pid == tmp_pid) + goto proceed; + + if (!realpath(lib_path, path)) { + pr_warn("usdt: failed to get absolute path of '%s' (err %d), using path as is...\n", + lib_path, -errno); + strcpy(path, lib_path); + } + +proceed: + sprintf(line, "/proc/%d/maps", pid); + f = fopen(line, "r"); + if (!f) { + err = -errno; + pr_warn("usdt: failed to open '%s' to get base addr of '%s': %d\n", + line, lib_path, err); + return err; + } + + /* We need to handle lines with no path at the end: + * + * 7f5c6f5d1000-7f5c6f5d3000 rw-p 001c7000 08:04 21238613 /usr/lib64/libc-2.17.so + * 7f5c6f5d3000-7f5c6f5d8000 rw-p 00000000 00:00 0 + * 7f5c6f5d8000-7f5c6f5d9000 r-xp 00000000 103:01 362990598 /data/users/andriin/linux/tools/bpf/usdt/libhello_usdt.so + */ + while (fscanf(f, "%zx-%zx %s %zx %*s %*d%[^\n]\n", + &seg_start, &seg_end, mode, &seg_off, line) == 5) { + void *tmp; + + /* to handle no path case (see above) we need to capture line + * without skipping any whitespaces.
So we need to strip + * leading whitespaces manually here + */ + i = 0; + while (isblank(line[i])) + i++; + if (strcmp(line + i, path) != 0) + continue; + + pr_debug("usdt: discovered segment for lib '%s': addrs %zx-%zx mode %s offset %zx\n", + path, seg_start, seg_end, mode, seg_off); + + /* ignore non-executable sections for shared libs */ + if (mode[2] != 'x') + continue; + + tmp = libbpf_reallocarray(*segs, *seg_cnt + 1, sizeof(**segs)); + if (!tmp) { + err = -ENOMEM; + goto err_out; + } + + *segs = tmp; + seg = *segs + *seg_cnt; + *seg_cnt += 1; + + seg->start = seg_start; + seg->end = seg_end; + seg->offset = seg_off; + seg->is_exec = true; + } + + if (*seg_cnt == 0) { + pr_warn("usdt: failed to find '%s' (resolved to '%s') within PID %d memory mappings\n", + lib_path, path, pid); + err = -ESRCH; + goto err_out; + } + + qsort(*segs, *seg_cnt, sizeof(**segs), cmp_elf_segs); + err = 0; +err_out: + fclose(f); + return err; +} + +static struct elf_seg *find_elf_seg(struct elf_seg *segs, size_t seg_cnt, long addr, bool relative) +{ + struct elf_seg *seg; + int i; + + if (relative) { + /* for shared libraries, address is relative offset and thus + * should fall within logical offset-based range of + * [offset_start, offset_end) + */ + for (i = 0, seg = segs; i < seg_cnt; i++, seg++) { + if (seg->offset <= addr && addr < seg->offset + (seg->end - seg->start)) + return seg; + } + } else { + /* for binaries, address is absolute and thus should be within + * absolute address range of [seg_start, seg_end) + */ + for (i = 0, seg = segs; i < seg_cnt; i++, seg++) { + if (seg->start <= addr && addr < seg->end) + return seg; + } + } + + return NULL; +} + +static int parse_usdt_note(Elf *elf, const char *path, long base_addr, + GElf_Nhdr *nhdr, const char *data, size_t name_off, size_t desc_off, + struct usdt_note *usdt_note); + +static int parse_usdt_spec(struct usdt_spec *spec, const struct usdt_note *note, long usdt_cookie); + static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char *path, pid_t pid, const char *usdt_provider, const char *usdt_name, long usdt_cookie, struct usdt_target **out_targets, size_t *out_target_cnt) { - return -ENOTSUP; + size_t off, name_off, desc_off, seg_cnt = 0, lib_seg_cnt = 0, target_cnt = 0; + struct elf_seg *segs = NULL, *lib_segs = NULL; + struct usdt_target *targets = NULL, *target; + long base_addr = 0; + Elf_Scn *notes_scn, *base_scn; + GElf_Shdr base_shdr, notes_shdr; + GElf_Ehdr ehdr; + GElf_Nhdr nhdr; + Elf_Data *data; + int err; + + *out_targets = NULL; + *out_target_cnt = 0; + + err = find_elf_sec_by_name(elf, USDT_NOTE_SEC, &notes_shdr, &notes_scn); + if (err) { + pr_warn("usdt: no USDT notes section (%s) found in '%s'\n", USDT_NOTE_SEC, path); + return err; + } + + if (notes_shdr.sh_type != SHT_NOTE || !gelf_getehdr(elf, &ehdr)) { + pr_warn("usdt: invalid USDT notes section (%s) in '%s'\n", USDT_NOTE_SEC, path); + return -EINVAL; + } + + err = parse_elf_segs(elf, path, &segs, &seg_cnt); + if (err) { + pr_warn("usdt: failed to process ELF program segments for '%s': %d\n", path, err); + goto err_out; + } + + /* .stapsdt.base ELF section is optional, but is used for prelink + * offset compensation (see a big comment further below) + */ + if (find_elf_sec_by_name(elf, USDT_BASE_SEC, &base_shdr, &base_scn) == 0) + base_addr = base_shdr.sh_addr; + + data = elf_getdata(notes_scn, 0); + off = 0; + while ((off = gelf_getnote(data, off, &nhdr, &name_off, &desc_off)) > 0) { + long usdt_abs_ip, usdt_rel_ip, usdt_sema_off = 0; + struct usdt_note
note; + struct elf_seg *seg = NULL; + void *tmp; + + err = parse_usdt_note(elf, path, base_addr, &nhdr, + data->d_buf, name_off, desc_off, &note); + if (err) + goto err_out; + + if (strcmp(note.provider, usdt_provider) != 0 || strcmp(note.name, usdt_name) != 0) + continue; + + /* We need to compensate "prelink effect". See [0] for details, + * relevant parts quoted here: + * + * Each SDT probe also expands into a non-allocated ELF note. You can + * find this by looking at SHT_NOTE sections and decoding the format; + * see below for details. Because the note is non-allocated, it means + * there is no runtime cost, and also preserved in both stripped files + * and .debug files. + * + * However, this means that prelink won't adjust the note's contents + * for address offsets. Instead, this is done via the .stapsdt.base + * section. This is a special section that is added to the text. We + * will only ever have one of these sections in a final link and it + * will only ever be one byte long. Nothing about this section itself + * matters, we just use it as a marker to detect prelink address + * adjustments. + * + * Each probe note records the link-time address of the .stapsdt.base + * section alongside the probe PC address. The decoder compares the + * base address stored in the note with the .stapsdt.base section's + * sh_addr. Initially these are the same, but the section header will + * be adjusted by prelink. So the decoder applies the difference to + * the probe PC address to get the correct prelinked PC address; the + * same adjustment is applied to the semaphore address, if any. + * + * [0] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation + */
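+ /* Worked example with made-up numbers: if this note recorded + * loc_addr 0x1000 and base_addr 0x600, while the current .stapsdt.base + * sh_addr is 0x700, then prelink shifted the image by 0x100 and the + * effective probe address is 0x1000 + (0x700 - 0x600) = 0x1100, which + * is exactly the adjustment computed below. + */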
+ usdt_rel_ip = usdt_abs_ip = note.loc_addr; + if (base_addr) { + usdt_abs_ip += base_addr - note.base_addr; + usdt_rel_ip += base_addr - note.base_addr; + } + + if (ehdr.e_type == ET_EXEC) { + /* When attaching uprobes (which is what USDTs basically + * are) the kernel expects a relative IP to be specified, + * so if we are attaching to an executable ELF binary + * (i.e., not a shared library), we need to calculate + * proper relative IP based on ELF's load address + */ + seg = find_elf_seg(segs, seg_cnt, usdt_abs_ip, false /* relative */); + if (!seg) { + err = -ESRCH; + pr_warn("usdt: failed to find ELF program segment for '%s:%s' in '%s' at IP 0x%lx\n", + usdt_provider, usdt_name, path, usdt_abs_ip); + goto err_out; + } + if (!seg->is_exec) { + err = -ESRCH; + pr_warn("usdt: matched ELF binary '%s' segment [0x%lx, 0x%lx) for '%s:%s' at IP 0x%lx is not executable\n", + path, seg->start, seg->end, usdt_provider, usdt_name, + usdt_abs_ip); + goto err_out; + } + + usdt_rel_ip = usdt_abs_ip - (seg->start - seg->offset); + } else if (!man->has_bpf_cookie) { /* ehdr.e_type == ET_DYN */ + /* If we don't have BPF cookie support but need to + * attach to a shared library, we'll need to know and + * record absolute addresses of attach points due to + * the need to lookup USDT spec by absolute IP of + * triggered uprobe. Doing this resolution is only + * possible when we have a specific PID of the process + * that's using specified shared library. BPF cookie + * removes the absolute address limitation as we don't + * need to do this lookup (we just use BPF cookie as + * an index of USDT spec), so for newer kernels with + * BPF cookie support libbpf supports USDT attachment + * to shared libraries with no PID filter. + */ + if (pid < 0) { + pr_warn("usdt: attaching to shared libraries without specific PID is not supported on current kernel\n"); + err = -ENOTSUP; + goto err_out; + } + + /* lib_segs are lazily initialized only if necessary */ + if (lib_seg_cnt == 0) { + err = parse_lib_segs(pid, path, &lib_segs, &lib_seg_cnt); + if (err) { + pr_warn("usdt: failed to get memory segments in PID %d for shared library '%s': %d\n", + pid, path, err); + goto err_out; + } + } + + seg = find_elf_seg(lib_segs, lib_seg_cnt, usdt_rel_ip, true /* relative */); + if (!seg) { + err = -ESRCH; + pr_warn("usdt: failed to find shared lib memory segment for '%s:%s' in '%s' at relative IP 0x%lx\n", + usdt_provider, usdt_name, path, usdt_rel_ip); + goto err_out; + } + + usdt_abs_ip = seg->start + (usdt_rel_ip - seg->offset); + } + + pr_debug("usdt: probe for '%s:%s' in %s '%s': addr 0x%lx base 0x%lx (resolved abs_ip 0x%lx rel_ip 0x%lx) args '%s' in segment [0x%lx, 0x%lx) at offset 0x%lx\n", + usdt_provider, usdt_name, ehdr.e_type == ET_EXEC ? "exec" : "lib ", path, + note.loc_addr, note.base_addr, usdt_abs_ip, usdt_rel_ip, note.args, + seg ? seg->start : 0, seg ? seg->end : 0, seg ? seg->offset : 0); + + /* Adjust semaphore address to be a relative offset */ + if (note.sema_addr) { + if (!man->has_sema_refcnt) { + pr_warn("usdt: kernel doesn't support USDT semaphore refcounting for '%s:%s' in '%s'\n", + usdt_provider, usdt_name, path); + err = -ENOTSUP; + goto err_out; + } + + seg = find_elf_seg(segs, seg_cnt, note.sema_addr, false /* relative */); + if (!seg) { + err = -ESRCH; + pr_warn("usdt: failed to find ELF loadable segment with semaphore of '%s:%s' in '%s' at 0x%lx\n", + usdt_provider, usdt_name, path, note.sema_addr); + goto err_out; + } + if (seg->is_exec) { + err = -ESRCH; + pr_warn("usdt: matched ELF binary '%s' segment [0x%lx, 0x%lx] for semaphore of '%s:%s' at 0x%lx is executable\n", + path, seg->start, seg->end, usdt_provider, usdt_name, + note.sema_addr); + goto err_out; + } + + usdt_sema_off = note.sema_addr - (seg->start - seg->offset); + + pr_debug("usdt: sema for '%s:%s' in %s '%s': addr 0x%lx base 0x%lx (resolved 0x%lx) in segment [0x%lx, 0x%lx] at offset 0x%lx\n", + usdt_provider, usdt_name, ehdr.e_type == ET_EXEC ?
"exec" : "lib ", + path, note.sema_addr, note.base_addr, usdt_sema_off, + seg->start, seg->end, seg->offset); + } + + /* Record adjusted addresses and offsets and parse USDT spec */ + tmp = libbpf_reallocarray(targets, target_cnt + 1, sizeof(*targets)); + if (!tmp) { + err = -ENOMEM; + goto err_out; + } + targets = tmp; + + target = &targets[target_cnt]; + memset(target, 0, sizeof(*target)); + + target->abs_ip = usdt_abs_ip; + target->rel_ip = usdt_rel_ip; + target->sema_off = usdt_sema_off; + + /* notes->args references strings from Elf itself, so they can + * be referenced safely until elf_end() call + */ + target->spec_str = note.args; + + err = parse_usdt_spec(&target->spec, ¬e, usdt_cookie); + if (err) + goto err_out; + + target_cnt++; + } + + *out_targets = targets; + *out_target_cnt = target_cnt; + err = target_cnt; + +err_out: + free(segs); + free(lib_segs); + if (err < 0) + free(targets); + return err; } struct bpf_link_usdt { @@ -414,6 +899,7 @@ struct bpf_link *usdt_manager_attach_usdt(struct usdt_manager *man, const struct link->uprobe_cnt++; } + free(targets); elf_end(elf); close(fd); @@ -422,8 +908,102 @@ struct bpf_link *usdt_manager_attach_usdt(struct usdt_manager *man, const struct err_out: bpf_link__destroy(&link->link); + free(targets); if (elf) elf_end(elf); close(fd); return libbpf_err_ptr(err); } + +/* Parse out USDT ELF note from '.note.stapsdt' section. + * Logic inspired by perf's code. + */ +static int parse_usdt_note(Elf *elf, const char *path, long base_addr, + GElf_Nhdr *nhdr, const char *data, size_t name_off, size_t desc_off, + struct usdt_note *note) +{ + const char *provider, *name, *args; + long addrs[3]; + size_t len; + + /* sanity check USDT note name and type first */ + if (strncmp(data + name_off, USDT_NOTE_NAME, nhdr->n_namesz) != 0) + return -EINVAL; + if (nhdr->n_type != USDT_NOTE_TYPE) + return -EINVAL; + + /* sanity check USDT note contents ("description" in ELF terminology) */ + len = nhdr->n_descsz; + data = data + desc_off; + + /* +3 is the very minimum required to store three empty strings */ + if (len < sizeof(addrs) + 3) + return -EINVAL; + + /* get location, base, and semaphore addrs */ + memcpy(&addrs, data, sizeof(addrs)); + + /* parse string fields: provider, name, args */ + provider = data + sizeof(addrs); + + name = (const char *)memchr(provider, '\0', data + len - provider); + if (!name) /* non-zero-terminated provider */ + return -EINVAL; + name++; + if (name >= data + len || *name == '\0') /* missing or empty name */ + return -EINVAL; + + args = memchr(name, '\0', data + len - name); + if (!args) /* non-zero-terminated name */ + return -EINVAL; + ++args; + if (args >= data + len) /* missing arguments spec */ + return -EINVAL; + + note->provider = provider; + note->name = name; + if (*args == '\0' || *args == ':') + note->args = ""; + else + note->args = args; + note->loc_addr = addrs[0]; + note->base_addr = addrs[1]; + note->sema_addr = addrs[2]; + + return 0; +} + +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg); + +static int parse_usdt_spec(struct usdt_spec *spec, const struct usdt_note *note, long usdt_cookie) +{ + const char *s; + int len; + + spec->usdt_cookie = usdt_cookie; + spec->arg_cnt = 0; + + s = note->args; + while (s[0]) { + if (spec->arg_cnt >= USDT_MAX_ARG_CNT) { + pr_warn("usdt: too many USDT arguments (> %d) for '%s:%s' with args spec '%s'\n", + USDT_MAX_ARG_CNT, note->provider, note->name, note->args); + return -E2BIG; + } + + len = parse_usdt_arg(s, spec->arg_cnt, 
&spec->args[spec->arg_cnt]); + if (len < 0) + return len; + + s += len; + spec->arg_cnt++; + } + + return 0; +} + +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg) +{ + pr_warn("usdt: libbpf doesn't support USDTs on current architecture\n"); + return -ENOTSUP; +} -- cgit From 999783c8bbda2e82390cb8c39ed9e3954cf51b82 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 4 Apr 2022 16:41:59 -0700 Subject: libbpf: Wire up spec management and other arch-independent USDT logic The last part of the architecture-agnostic user-space USDT handling logic is to set up the BPF spec and, optionally, IP-to-spec-ID maps from user-space. usdt_manager performs compact spec ID allocation to utilize fixed-sized BPF maps as efficiently as possible. We also use a hashmap to deduplicate USDT arg spec strings and map identical strings to a single USDT spec, minimizing the necessary BPF map size. usdt_manager supports arbitrary sequences of attachment and detachment, both of the same USDT and of multiple different USDTs, and internally maintains a free list of unused spec IDs. bpf_link_usdt's logic is extended with proper setup and teardown of this spec ID free list and the supporting BPF maps. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Reviewed-by: Alan Maguire Reviewed-by: Dave Marchevsky Link: https://lore.kernel.org/bpf/20220404234202.331384-5-andrii@kernel.org
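The ID-recycling policy described above (prefer IDs returned by earlier detachments, otherwise hand out the next never-used ID, bounded by the BPF map capacity) can be sketched generically in C; this is an illustration of the idea only, simplified from what usdt_manager actually does:

/* Generic sketch of a bounded ID pool with a free list (illustrative). */
struct id_pool {
	int *free_ids;    /* IDs returned by previous detachments */
	size_t free_cnt;
	size_t next_id;   /* next never-allocated ID */
	size_t max_ids;   /* e.g., BPF specs map max_entries */
};

static int id_pool_get(struct id_pool *p)
{
	if (p->free_cnt)                /* prefer recycling freed IDs */
		return p->free_ids[--p->free_cnt];
	if (p->next_id >= p->max_ids)   /* pool (specs map) is full */
		return -1;
	return p->next_id++;
}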
--- tools/lib/bpf/usdt.c | 168 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 167 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c index f1670e3014ed..2799387c5465 100644 --- a/tools/lib/bpf/usdt.c +++ b/tools/lib/bpf/usdt.c @@ -239,6 +239,10 @@ struct usdt_manager { struct bpf_map *specs_map; struct bpf_map *ip_to_spec_id_map; + int *free_spec_ids; + size_t free_spec_cnt; + size_t next_free_spec_id; + bool has_bpf_cookie; bool has_sema_refcnt; }; @@ -283,6 +287,7 @@ void usdt_manager_free(struct usdt_manager *man) if (IS_ERR_OR_NULL(man)) return; + free(man->free_spec_ids); free(man); } @@ -789,6 +794,9 @@ struct bpf_link_usdt { struct usdt_manager *usdt_man; + size_t spec_cnt; + int *spec_ids; + size_t uprobe_cnt; struct { long abs_ip; @@ -799,11 +807,52 @@ struct bpf_link_usdt { static int bpf_link_usdt_detach(struct bpf_link *link) { struct bpf_link_usdt *usdt_link = container_of(link, struct bpf_link_usdt, link); + struct usdt_manager *man = usdt_link->usdt_man; int i; for (i = 0; i < usdt_link->uprobe_cnt; i++) { /* detach underlying uprobe link */ bpf_link__destroy(usdt_link->uprobes[i].link); + /* there is no need to update specs map because it will be + * unconditionally overwritten on subsequent USDT attaches, + * but if BPF cookies are not used we need to remove entry + * from ip_to_spec_id map, otherwise we'll run into false + * conflicting IP errors + */ + if (!man->has_bpf_cookie) { + /* not much we can do about errors here */ + (void)bpf_map_delete_elem(bpf_map__fd(man->ip_to_spec_id_map), + &usdt_link->uprobes[i].abs_ip); + } + } + + /* try to return the list of previously used spec IDs to usdt_manager + * for future reuse for subsequent USDT attaches + */ + if (!man->free_spec_ids) { + /* if there were no free spec IDs yet, just transfer our IDs */ + man->free_spec_ids = usdt_link->spec_ids; + man->free_spec_cnt = usdt_link->spec_cnt; + usdt_link->spec_ids = NULL; + } else { + /* otherwise concat IDs */ + size_t new_cnt = man->free_spec_cnt + usdt_link->spec_cnt; + int *new_free_ids; + + new_free_ids = libbpf_reallocarray(man->free_spec_ids, new_cnt, + sizeof(*new_free_ids)); + /* If we couldn't resize free_spec_ids, we'll just leak + * a bunch of free IDs; this is very unlikely to happen and if + * the system is so exhausted of memory, it's the least of the + * user's concerns, probably. + * So just do our best here to return those IDs to usdt_manager. + */ + if (new_free_ids) { + memcpy(new_free_ids + man->free_spec_cnt, usdt_link->spec_ids, + usdt_link->spec_cnt * sizeof(*usdt_link->spec_ids)); + man->free_spec_ids = new_free_ids; + man->free_spec_cnt = new_cnt; + } } return 0; @@ -813,22 +862,96 @@ static void bpf_link_usdt_dealloc(struct bpf_link *link) { struct bpf_link_usdt *usdt_link = container_of(link, struct bpf_link_usdt, link); + free(usdt_link->spec_ids); free(usdt_link->uprobes); free(usdt_link); } +static size_t specs_hash_fn(const void *key, void *ctx) +{ + const char *s = key; + + return str_hash(s); +} + +static bool specs_equal_fn(const void *key1, const void *key2, void *ctx) +{ + const char *s1 = key1; + const char *s2 = key2; + + return strcmp(s1, s2) == 0; +} + +static int allocate_spec_id(struct usdt_manager *man, struct hashmap *specs_hash, + struct bpf_link_usdt *link, struct usdt_target *target, + int *spec_id, bool *is_new) +{ + void *tmp; + int err; + + /* check if we already allocated spec ID for this spec string */ + if (hashmap__find(specs_hash, target->spec_str, &tmp)) { + *spec_id = (long)tmp; + *is_new = false; + return 0; + } + + /* otherwise it's a new ID that needs to be set up in specs map and + * returned back to usdt_manager when USDT link is detached + */ + tmp = libbpf_reallocarray(link->spec_ids, link->spec_cnt + 1, sizeof(*link->spec_ids)); + if (!tmp) + return -ENOMEM; + link->spec_ids = tmp; + + /* get next free spec ID, giving preference to free list, if not empty */ + if (man->free_spec_cnt) { + *spec_id = man->free_spec_ids[man->free_spec_cnt - 1]; + + /* cache spec ID for current spec string for future lookups */ + err = hashmap__add(specs_hash, target->spec_str, (void *)(long)*spec_id); + if (err) + return err; + + man->free_spec_cnt--; + } else { + /* don't allocate spec ID bigger than what fits in specs map */ + if (man->next_free_spec_id >= bpf_map__max_entries(man->specs_map)) + return -E2BIG; + + *spec_id = man->next_free_spec_id; + + /* cache spec ID for current spec string for future lookups */ + err = hashmap__add(specs_hash, target->spec_str, (void *)(long)*spec_id); + if (err) + return err; + + man->next_free_spec_id++; + } + + /* remember new spec ID in the link for later return back to free list on detach */ + link->spec_ids[link->spec_cnt] = *spec_id; + link->spec_cnt++; + *is_new = true; + return 0; +} + struct bpf_link *usdt_manager_attach_usdt(struct usdt_manager *man, const struct bpf_program *prog, pid_t pid, const char *path, const char *usdt_provider, const char *usdt_name, long usdt_cookie) { + int i, fd, err, spec_map_fd, ip_map_fd; LIBBPF_OPTS(bpf_uprobe_opts, opts); + struct hashmap *specs_hash = NULL; struct bpf_link_usdt *link = NULL; struct usdt_target *targets = NULL; size_t target_cnt; - int i, fd, err; Elf *elf; + spec_map_fd = bpf_map__fd(man->specs_map); + ip_map_fd = bpf_map__fd(man->ip_to_spec_id_map); + /* TODO: perform path resolution similar to uprobe's */ fd = open(path, O_RDONLY); if (fd < 0) { @@ -864,6 +987,12 @@ struct bpf_link *usdt_manager_attach_usdt(struct usdt_manager *man, const struct goto err_out; } + specs_hash = hashmap__new(specs_hash_fn, specs_equal_fn, NULL); + if (IS_ERR(specs_hash)) { + err = PTR_ERR(specs_hash); + goto
err_out; + } + link = calloc(1, sizeof(*link)); + if (!link) { + err = -ENOMEM; + goto err_out; + } @@ -883,8 +1012,43 @@ struct bpf_link *usdt_manager_attach_usdt(struct usdt_manager *man, const struct for (i = 0; i < target_cnt; i++) { struct usdt_target *target = &targets[i]; struct bpf_link *uprobe_link; + bool is_new; + int spec_id; + + /* Spec ID can be either reused or newly allocated. If it is + * newly allocated, we'll need to fill out the spec map, otherwise + * the entire spec should be valid and can be just used by a new + * uprobe. We reuse spec when USDT arg spec is identical. We + * also never share specs between two different USDT + * attachments ("links"), so all the reused specs already + * share USDT cookie value implicitly. + */ + err = allocate_spec_id(man, specs_hash, link, target, &spec_id, &is_new); + if (err) + goto err_out; + + if (is_new && bpf_map_update_elem(spec_map_fd, &spec_id, &target->spec, BPF_ANY)) { + err = -errno; + pr_warn("usdt: failed to set USDT spec #%d for '%s:%s' in '%s': %d\n", + spec_id, usdt_provider, usdt_name, path, err); + goto err_out; + } + if (!man->has_bpf_cookie && + bpf_map_update_elem(ip_map_fd, &target->abs_ip, &spec_id, BPF_NOEXIST)) { + err = -errno; + if (err == -EEXIST) { + pr_warn("usdt: IP collision detected for spec #%d for '%s:%s' in '%s'\n", + spec_id, usdt_provider, usdt_name, path); + } else { + pr_warn("usdt: failed to map IP 0x%lx to spec #%d for '%s:%s' in '%s': %d\n", + target->abs_ip, spec_id, usdt_provider, usdt_name, + path, err); + } + goto err_out; + } opts.ref_ctr_offset = target->sema_off; + opts.bpf_cookie = man->has_bpf_cookie ? spec_id : 0; uprobe_link = bpf_program__attach_uprobe_opts(prog, pid, path, target->rel_ip, &opts); err = libbpf_get_error(uprobe_link); @@ -900,6 +1064,7 @@ struct bpf_link *usdt_manager_attach_usdt(struct usdt_manager *man, const struct } free(targets); + hashmap__free(specs_hash); elf_end(elf); close(fd); @@ -909,6 +1074,7 @@ err_out: bpf_link__destroy(&link->link); free(targets); + hashmap__free(specs_hash); if (elf) elf_end(elf); close(fd); -- cgit From 4c59e584d1581b1bca143dda83d5c3e5baddbf20 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 4 Apr 2022 16:42:00 -0700 Subject: libbpf: Add x86-specific USDT arg spec parsing logic Add x86/x86_64-specific USDT argument specification parsing. Each architecture will require its own logic, as all this is arch-specific assembly-based notation. Architectures that libbpf doesn't support for USDTs will pr_warn() with a specific error and return -ENOTSUP. We use sscanf() as a very powerful and easy-to-use string parser. Those spaces in sscanf's format string mean "skip any whitespace", which is a pretty nifty (and somewhat little-known) feature. All this was tested on a little-endian architecture, so bit shifts are probably off on big-endian, which our CI will hopefully prove.
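For illustration, here is a stand-alone sketch of the same sscanf() pattern (a hypothetical test program, not part of this patch) parsing one dereference-style arg spec like "-4@-20(%rbp)":

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	int arg_sz, len;
	long off;
	char *reg = NULL;

	/* " " skips optional whitespace, %m[^)] makes glibc allocate the
	 * register name up to the closing paren, and %n records how many
	 * bytes were consumed so the caller can advance to the next spec.
	 */
	if (sscanf("-4@-20(%rbp)", " %d @ %ld ( %%%m[^)] ) %n",
		   &arg_sz, &off, &reg, &len) == 3)
		printf("size %d, offset %ld, reg %s, consumed %d bytes\n",
		       arg_sz, off, reg, len);
	free(reg);
	return 0;
}

Note that %n is not counted in sscanf()'s return value, which is why a successful match returns 3 here.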
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Reviewed-by: Alan Maguire Reviewed-by: Dave Marchevsky Link: https://lore.kernel.org/bpf/20220404234202.331384-6-andrii@kernel.org --- tools/lib/bpf/usdt.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c index 2799387c5465..1bce2eab5e89 100644 --- a/tools/lib/bpf/usdt.c +++ b/tools/lib/bpf/usdt.c @@ -1168,8 +1168,113 @@ static int parse_usdt_spec(struct usdt_spec *spec, const struct usdt_note *note, return 0; } +/* Architecture-specific logic for parsing USDT argument location specs */ + +#if defined(__x86_64__) || defined(__i386__) + +static int calc_pt_regs_off(const char *reg_name) +{ + static struct { + const char *names[4]; + size_t pt_regs_off; + } reg_map[] = { +#if __x86_64__ +#define reg_off(reg64, reg32) offsetof(struct pt_regs, reg64) +#else +#define reg_off(reg64, reg32) offsetof(struct pt_regs, reg32) +#endif + { {"rip", "eip", "", ""}, reg_off(rip, eip) }, + { {"rax", "eax", "ax", "al"}, reg_off(rax, eax) }, + { {"rbx", "ebx", "bx", "bl"}, reg_off(rbx, ebx) }, + { {"rcx", "ecx", "cx", "cl"}, reg_off(rcx, ecx) }, + { {"rdx", "edx", "dx", "dl"}, reg_off(rdx, edx) }, + { {"rsi", "esi", "si", "sil"}, reg_off(rsi, esi) }, + { {"rdi", "edi", "di", "dil"}, reg_off(rdi, edi) }, + { {"rbp", "ebp", "bp", "bpl"}, reg_off(rbp, ebp) }, + { {"rsp", "esp", "sp", "spl"}, reg_off(rsp, esp) }, +#undef reg_off +#if __x86_64__ + { {"r8", "r8d", "r8w", "r8b"}, offsetof(struct pt_regs, r8) }, + { {"r9", "r9d", "r9w", "r9b"}, offsetof(struct pt_regs, r9) }, + { {"r10", "r10d", "r10w", "r10b"}, offsetof(struct pt_regs, r10) }, + { {"r11", "r11d", "r11w", "r11b"}, offsetof(struct pt_regs, r11) }, + { {"r12", "r12d", "r12w", "r12b"}, offsetof(struct pt_regs, r12) }, + { {"r13", "r13d", "r13w", "r13b"}, offsetof(struct pt_regs, r13) }, + { {"r14", "r14d", "r14w", "r14b"}, offsetof(struct pt_regs, r14) }, + { {"r15", "r15d", "r15w", "r15b"}, offsetof(struct pt_regs, r15) }, +#endif + }; + int i, j; + + for (i = 0; i < ARRAY_SIZE(reg_map); i++) { + for (j = 0; j < ARRAY_SIZE(reg_map[i].names); j++) { + if (strcmp(reg_name, reg_map[i].names[j]) == 0) + return reg_map[i].pt_regs_off; + } + } + + pr_warn("usdt: unrecognized register '%s'\n", reg_name); + return -ENOENT; +} + +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg) +{ + char *reg_name = NULL; + int arg_sz, len, reg_off; + long off; + + if (sscanf(arg_str, " %d @ %ld ( %%%m[^)] ) %n", &arg_sz, &off, &reg_name, &len) == 3) { + /* Memory dereference case, e.g., -4@-20(%rbp) */ + arg->arg_type = USDT_ARG_REG_DEREF; + arg->val_off = off; + reg_off = calc_pt_regs_off(reg_name); + free(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else if (sscanf(arg_str, " %d @ %%%ms %n", &arg_sz, &reg_name, &len) == 2) { + /* Register read case, e.g., -4@%eax */ + arg->arg_type = USDT_ARG_REG; + arg->val_off = 0; + + reg_off = calc_pt_regs_off(reg_name); + free(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else if (sscanf(arg_str, " %d @ $%ld %n", &arg_sz, &off, &len) == 2) { + /* Constant value case, e.g., 4@$71 */ + arg->arg_type = USDT_ARG_CONST; + arg->val_off = off; + arg->reg_off = 0; + } else { + pr_warn("usdt: unrecognized arg #%d spec '%s'\n", arg_num, arg_str); + return -EINVAL; + } + + arg->arg_signed = arg_sz < 0; + if (arg_sz < 0) + arg_sz = -arg_sz; + + switch (arg_sz) { + case 1: case 2: case 4:
case 8: + arg->arg_bitshift = 64 - arg_sz * 8; + break; + default: + pr_warn("usdt: unsupported arg #%d (spec '%s') size: %d\n", + arg_num, arg_str, arg_sz); + return -EINVAL; + } + + return len; +} + +#else + +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg) +{ + pr_warn("usdt: libbpf doesn't support USDTs on current architecture\n"); + return -ENOTSUP; +} + +#endif -- cgit From 630301b0d59dd85e43cca29382c459f9880af5f0 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 4 Apr 2022 16:42:01 -0700 Subject: selftests/bpf: Add basic USDT selftests Add semaphore-based USDT to test_progs itself and write basic tests to validate both auto-attachment and manual attachment logic, as well as BPF-side functionality. Also add subtests to validate that libbpf properly deduplicates USDT specs and handles spec overflow situations correctly, as well as proper "rollback" of partially-attached multi-spec USDT. The BPF side of the selftest intentionally consists of two files to validate that the usdt.bpf.h header can be included from multiple source code files that are subsequently linked into the final BPF object file without causing any symbol duplication or other issues. We are validating that __weak maps and bpf_usdt_xxx() API functions defined in usdt.bpf.h do work as intended. USDT selftests utilize the sys/sdt.h header that on Ubuntu systems comes from the systemtap-sdt-dev package. But to simplify everyone's life, including CI but especially casual contributors to bpf/bpf-next that are trying to build selftests, I've checked in the sys/sdt.h header from [0] directly. This way it will work on all architectures and distros without having to figure it out for every relevant combination and adding any extra implicit package dependencies. [0] https://sourceware.org/git?p=systemtap.git;a=blob_plain;f=includes/sys/sdt.h;h=ca0162b4dc57520b96638c8ae79ad547eb1dd3a1;hb=HEAD Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Reviewed-by: Alan Maguire Acked-by: Dave Marchevsky Link: https://lore.kernel.org/bpf/20220404234202.331384-7-andrii@kernel.org
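For context, defining a USDT in an application with sys/sdt.h is as simple as the following minimal sketch (provider and probe names here are made up for illustration):

/* minimal sketch of defining and triggering a USDT with sys/sdt.h */
#include <sys/sdt.h>

int main(void)
{
	int x = 42;

	/* compiles to a single nop plus an ELF note describing where x lives */
	STAP_PROBE1(myapp, my_probe, x);
	return 0;
}

Until a tracer attaches a uprobe on top of that nop, the probe costs essentially nothing at runtime.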
--- tools/testing/selftests/bpf/Makefile | 14 +- tools/testing/selftests/bpf/prog_tests/usdt.c | 313 +++++++++++++ tools/testing/selftests/bpf/progs/test_usdt.c | 96 ++++ .../selftests/bpf/progs/test_usdt_multispec.c | 34 ++ tools/testing/selftests/bpf/sdt-config.h | 6 + tools/testing/selftests/bpf/sdt.h | 513 +++++++++++++++++++++ 6 files changed, 970 insertions(+), 6 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/usdt.c create mode 100644 tools/testing/selftests/bpf/progs/test_usdt.c create mode 100644 tools/testing/selftests/bpf/progs/test_usdt_multispec.c create mode 100644 tools/testing/selftests/bpf/sdt-config.h create mode 100644 tools/testing/selftests/bpf/sdt.h diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 3820608faf57..0f8c55dfd844 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -328,12 +328,8 @@ SKEL_BLACKLIST := btf__% test_pinning_invalid.c test_sk_assign.c LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h \ linked_vars.skel.h linked_maps.skel.h \ - test_subskeleton.skel.h test_subskeleton_lib.skel.h - -# In the subskeleton case, we want the test_subskeleton_lib.subskel.h file -# but that's created as a side-effect of the skel.h generation. -test_subskeleton.skel.h-deps := test_subskeleton_lib2.o test_subskeleton_lib.o test_subskeleton.o -test_subskeleton_lib.skel.h-deps := test_subskeleton_lib2.o test_subskeleton_lib.o + test_subskeleton.skel.h test_subskeleton_lib.skel.h \ + test_usdt.skel.h LSKELS := kfunc_call_test.c fentry_test.c fexit_test.c fexit_sleep.c \ test_ringbuf.c atomics.c trace_printk.c trace_vprintk.c \ @@ -346,6 +342,11 @@ test_static_linked.skel.h-deps := test_static_linked1.o test_static_linked2.o linked_funcs.skel.h-deps := linked_funcs1.o linked_funcs2.o linked_vars.skel.h-deps := linked_vars1.o linked_vars2.o linked_maps.skel.h-deps := linked_maps1.o linked_maps2.o +# In the subskeleton case, we want the test_subskeleton_lib.subskel.h file +# but that's created as a side-effect of the skel.h generation. +test_subskeleton.skel.h-deps := test_subskeleton_lib2.o test_subskeleton_lib.o test_subskeleton.o +test_subskeleton_lib.skel.h-deps := test_subskeleton_lib2.o test_subskeleton_lib.o +test_usdt.skel.h-deps := test_usdt.o test_usdt_multispec.o LINKED_BPF_SRCS := $(patsubst %.o,%.c,$(foreach skel,$(LINKED_SKELS),$($(skel)-deps))) @@ -400,6 +401,7 @@ $(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.o: \ $(TRUNNER_BPF_PROGS_DIR)/*.h \ $$(INCLUDE_DIR)/vmlinux.h \ $(wildcard $(BPFDIR)/bpf_*.h) \ + $(wildcard $(BPFDIR)/*.bpf.h) \ | $(TRUNNER_OUTPUT) $$(BPFOBJ) $$(call $(TRUNNER_BPF_BUILD_RULE),$$<,$$@, \ $(TRUNNER_BPF_CFLAGS)) diff --git a/tools/testing/selftests/bpf/prog_tests/usdt.c b/tools/testing/selftests/bpf/prog_tests/usdt.c new file mode 100644 index 000000000000..0677c9bfd40b --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/usdt.c @@ -0,0 +1,313 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ +#include <test_progs.h> + +#define _SDT_HAS_SEMAPHORES 1 +#include "../sdt.h" + +#include "test_usdt.skel.h" + +int lets_test_this(int); + +static volatile int idx = 2; +static volatile __u64 bla = 0xFEDCBA9876543210ULL; +static volatile short nums[] = {-1, -2, -3, }; + +static volatile struct { + int x; + signed char y; +} t1 = { 1, -127 }; + +#define SEC(name) __attribute__((section(name), used)) + +unsigned short test_usdt0_semaphore SEC(".probes"); +unsigned short test_usdt3_semaphore SEC(".probes"); +unsigned short test_usdt12_semaphore SEC(".probes"); + +static void __always_inline trigger_func(int x) { + long y = 42; + + if (test_usdt0_semaphore) + STAP_PROBE(test, usdt0); + if (test_usdt3_semaphore) + STAP_PROBE3(test, usdt3, x, y, &bla); + if (test_usdt12_semaphore) { + STAP_PROBE12(test, usdt12, + x, x + 1, y, x + y, 5, + y / 7, bla, &bla, -9, nums[x], + nums[idx], t1.y); + } +} + +static void subtest_basic_usdt(void) +{ + LIBBPF_OPTS(bpf_usdt_opts, opts); + struct test_usdt *skel; + struct test_usdt__bss *bss; + int err; + + skel = test_usdt__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + bss = skel->bss; + bss->my_pid = getpid(); + + err = test_usdt__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto cleanup; + + /* usdt0 won't be auto-attached */ + opts.usdt_cookie = 0xcafedeadbeeffeed; + skel->links.usdt0 = bpf_program__attach_usdt(skel->progs.usdt0, + 0 /*self*/, "/proc/self/exe", + "test", "usdt0", &opts); + if (!ASSERT_OK_PTR(skel->links.usdt0, "usdt0_link")) + goto cleanup; + + trigger_func(1); + + ASSERT_EQ(bss->usdt0_called, 1, "usdt0_called"); + ASSERT_EQ(bss->usdt3_called, 1, "usdt3_called"); + ASSERT_EQ(bss->usdt12_called, 1, "usdt12_called"); + + ASSERT_EQ(bss->usdt0_cookie, 0xcafedeadbeeffeed, "usdt0_cookie"); +
ASSERT_EQ(bss->usdt0_arg_cnt, 0, "usdt0_arg_cnt"); + ASSERT_EQ(bss->usdt0_arg_ret, -ENOENT, "usdt0_arg_ret"); + + /* auto-attached usdt3 gets default zero cookie value */ + ASSERT_EQ(bss->usdt3_cookie, 0, "usdt3_cookie"); + ASSERT_EQ(bss->usdt3_arg_cnt, 3, "usdt3_arg_cnt"); + + ASSERT_EQ(bss->usdt3_arg_rets[0], 0, "usdt3_arg1_ret"); + ASSERT_EQ(bss->usdt3_arg_rets[1], 0, "usdt3_arg2_ret"); + ASSERT_EQ(bss->usdt3_arg_rets[2], 0, "usdt3_arg3_ret"); + ASSERT_EQ(bss->usdt3_args[0], 1, "usdt3_arg1"); + ASSERT_EQ(bss->usdt3_args[1], 42, "usdt3_arg2"); + ASSERT_EQ(bss->usdt3_args[2], (uintptr_t)&bla, "usdt3_arg3"); + + /* auto-attached usdt12 gets default zero cookie value */ + ASSERT_EQ(bss->usdt12_cookie, 0, "usdt12_cookie"); + ASSERT_EQ(bss->usdt12_arg_cnt, 12, "usdt12_arg_cnt"); + + ASSERT_EQ(bss->usdt12_args[0], 1, "usdt12_arg1"); + ASSERT_EQ(bss->usdt12_args[1], 1 + 1, "usdt12_arg2"); + ASSERT_EQ(bss->usdt12_args[2], 42, "usdt12_arg3"); + ASSERT_EQ(bss->usdt12_args[3], 42 + 1, "usdt12_arg4"); + ASSERT_EQ(bss->usdt12_args[4], 5, "usdt12_arg5"); + ASSERT_EQ(bss->usdt12_args[5], 42 / 7, "usdt12_arg6"); + ASSERT_EQ(bss->usdt12_args[6], bla, "usdt12_arg7"); + ASSERT_EQ(bss->usdt12_args[7], (uintptr_t)&bla, "usdt12_arg8"); + ASSERT_EQ(bss->usdt12_args[8], -9, "usdt12_arg9"); + ASSERT_EQ(bss->usdt12_args[9], nums[1], "usdt12_arg10"); + ASSERT_EQ(bss->usdt12_args[10], nums[idx], "usdt12_arg11"); + ASSERT_EQ(bss->usdt12_args[11], t1.y, "usdt12_arg12"); + + /* trigger_func() is marked __always_inline, so USDT invocations will be + * inlined in two different places, meaning that each USDT will have + * at least 2 different places to be attached to. This verifies that + * bpf_program__attach_usdt() handles this properly and attaches to + * all possible places of USDT invocation. 
+ */ + trigger_func(2); + + ASSERT_EQ(bss->usdt0_called, 2, "usdt0_called"); + ASSERT_EQ(bss->usdt3_called, 2, "usdt3_called"); + ASSERT_EQ(bss->usdt12_called, 2, "usdt12_called"); + + /* only check values that depend on trigger_func()'s input value */ + ASSERT_EQ(bss->usdt3_args[0], 2, "usdt3_arg1"); + + ASSERT_EQ(bss->usdt12_args[0], 2, "usdt12_arg1"); + ASSERT_EQ(bss->usdt12_args[1], 2 + 1, "usdt12_arg2"); + ASSERT_EQ(bss->usdt12_args[3], 42 + 2, "usdt12_arg4"); + ASSERT_EQ(bss->usdt12_args[9], nums[2], "usdt12_arg10"); + + /* detach and re-attach usdt3 */ + bpf_link__destroy(skel->links.usdt3); + + opts.usdt_cookie = 0xBADC00C51E; + skel->links.usdt3 = bpf_program__attach_usdt(skel->progs.usdt3, -1 /* any pid */, + "/proc/self/exe", "test", "usdt3", &opts); + if (!ASSERT_OK_PTR(skel->links.usdt3, "usdt3_reattach")) + goto cleanup; + + trigger_func(3); + + ASSERT_EQ(bss->usdt3_called, 3, "usdt3_called"); + /* this time usdt3 has custom cookie */ + ASSERT_EQ(bss->usdt3_cookie, 0xBADC00C51E, "usdt3_cookie"); + ASSERT_EQ(bss->usdt3_arg_cnt, 3, "usdt3_arg_cnt"); + + ASSERT_EQ(bss->usdt3_arg_rets[0], 0, "usdt3_arg1_ret"); + ASSERT_EQ(bss->usdt3_arg_rets[1], 0, "usdt3_arg2_ret"); + ASSERT_EQ(bss->usdt3_arg_rets[2], 0, "usdt3_arg3_ret"); + ASSERT_EQ(bss->usdt3_args[0], 3, "usdt3_arg1"); + ASSERT_EQ(bss->usdt3_args[1], 42, "usdt3_arg2"); + ASSERT_EQ(bss->usdt3_args[2], (uintptr_t)&bla, "usdt3_arg3"); + +cleanup: + test_usdt__destroy(skel); +} + +unsigned short test_usdt_100_semaphore SEC(".probes"); +unsigned short test_usdt_300_semaphore SEC(".probes"); +unsigned short test_usdt_400_semaphore SEC(".probes"); + +#define R10(F, X) F(X+0); F(X+1);F(X+2); F(X+3); F(X+4); \ + F(X+5); F(X+6); F(X+7); F(X+8); F(X+9); +#define R100(F, X) R10(F,X+ 0);R10(F,X+10);R10(F,X+20);R10(F,X+30);R10(F,X+40); \ + R10(F,X+50);R10(F,X+60);R10(F,X+70);R10(F,X+80);R10(F,X+90); + +/* carefully control that we get exactly 100 inlines by preventing inlining */ +static void __always_inline f100(int x) +{ + STAP_PROBE1(test, usdt_100, x); +} + +__weak void trigger_100_usdts(void) +{ + R100(f100, 0); +} + +/* we shouldn't be able to attach to test:usdt_300 USDT as we don't have as + * many slots for specs.
It's important that each STAP_PROBE1() invocation + * (after unrolling) gets a different arg spec due to the compiler inlining x as + * a constant + */ +static void __always_inline f300(int x) +{ + STAP_PROBE1(test, usdt_300, x); +} + +__weak void trigger_300_usdts(void) +{ + R100(f300, 0); + R100(f300, 100); + R100(f300, 200); +} + +static void __always_inline f400(int x __attribute__((unused))) +{ + static int y; + + STAP_PROBE1(test, usdt_400, y++); +} + +/* this time we have 400 different USDT call sites, but they have uniform + * argument location, so libbpf's spec string deduplication logic should keep + * spec count use very small and so we should be able to attach to all 400 + * call sites + */ +__weak void trigger_400_usdts(void) +{ + R100(f400, 0); + R100(f400, 100); + R100(f400, 200); + R100(f400, 300); +} + +static void subtest_multispec_usdt(void) +{ + LIBBPF_OPTS(bpf_usdt_opts, opts); + struct test_usdt *skel; + struct test_usdt__bss *bss; + int err, i; + + skel = test_usdt__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + bss = skel->bss; + bss->my_pid = getpid(); + + err = test_usdt__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto cleanup; + + /* usdt_100 is auto-attached and there are 100 inlined call sites, + * let's validate that all of them are properly attached to and + * handled from BPF side + */ + trigger_100_usdts(); + + ASSERT_EQ(bss->usdt_100_called, 100, "usdt_100_called"); + ASSERT_EQ(bss->usdt_100_sum, 99 * 100 / 2, "usdt_100_sum"); + + /* Stress test free spec ID tracking. By default libbpf allows up to + * 256 specs to be used, so if we don't return free spec IDs back + * after few detachments and re-attachments we should run out of + * available spec IDs. + */ + for (i = 0; i < 2; i++) { + bpf_link__destroy(skel->links.usdt_100); + + skel->links.usdt_100 = bpf_program__attach_usdt(skel->progs.usdt_100, -1, + "/proc/self/exe", + "test", "usdt_100", NULL); + if (!ASSERT_OK_PTR(skel->links.usdt_100, "usdt_100_reattach")) + goto cleanup; + + bss->usdt_100_sum = 0; + trigger_100_usdts(); + + ASSERT_EQ(bss->usdt_100_called, (i + 1) * 100 + 100, "usdt_100_called"); + ASSERT_EQ(bss->usdt_100_sum, 99 * 100 / 2, "usdt_100_sum"); + } + + /* Now let's step it up and try to attach USDT that requires more than + * 256 attach points with different specs for each. + * Note that we need trigger_300_usdts() only to actually have 300 + * USDT call sites, we are not going to actually trace them. + */ + trigger_300_usdts(); + + /* we'll reuse usdt_100 BPF program for usdt_300 test */ + bpf_link__destroy(skel->links.usdt_100); + skel->links.usdt_100 = bpf_program__attach_usdt(skel->progs.usdt_100, -1, "/proc/self/exe", + "test", "usdt_300", NULL); + err = -errno; + if (!ASSERT_ERR_PTR(skel->links.usdt_100, "usdt_300_bad_attach")) + goto cleanup; + ASSERT_EQ(err, -E2BIG, "usdt_300_attach_err"); + + /* let's check that there are no "dangling" BPF programs attached due + * to partial success of the above test:usdt_300 attachment + */ + bss->usdt_100_called = 0; + bss->usdt_100_sum = 0; + + f300(777); /* this is 301st instance of usdt_300 */ + + ASSERT_EQ(bss->usdt_100_called, 0, "usdt_301_called"); + ASSERT_EQ(bss->usdt_100_sum, 0, "usdt_301_sum"); + + /* This time we have USDT with 400 inlined invocations, but arg specs + * should be the same across all sites, so libbpf will only need to + * use one spec and thus we'll be able to attach 400 uprobes + * successfully. + * + * Again, we are reusing usdt_100 BPF program.
+ */ + skel->links.usdt_100 = bpf_program__attach_usdt(skel->progs.usdt_100, -1, + "/proc/self/exe", + "test", "usdt_400", NULL); + if (!ASSERT_OK_PTR(skel->links.usdt_100, "usdt_400_attach")) + goto cleanup; + + trigger_400_usdts(); + + ASSERT_EQ(bss->usdt_100_called, 400, "usdt_400_called"); + ASSERT_EQ(bss->usdt_100_sum, 399 * 400 / 2, "usdt_400_sum"); + +cleanup: + test_usdt__destroy(skel); +} + +void test_usdt(void) +{ + if (test__start_subtest("basic")) + subtest_basic_usdt(); + if (test__start_subtest("multispec")) + subtest_multispec_usdt(); +} diff --git a/tools/testing/selftests/bpf/progs/test_usdt.c b/tools/testing/selftests/bpf/progs/test_usdt.c new file mode 100644 index 000000000000..505aab9a5234 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_usdt.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/usdt.bpf.h> + +int my_pid; + +int usdt0_called; +u64 usdt0_cookie; +int usdt0_arg_cnt; +int usdt0_arg_ret; + +SEC("usdt") +int usdt0(struct pt_regs *ctx) +{ + long tmp; + + if (my_pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; + + __sync_fetch_and_add(&usdt0_called, 1); + + usdt0_cookie = bpf_usdt_cookie(ctx); + usdt0_arg_cnt = bpf_usdt_arg_cnt(ctx); + /* should return -ENOENT for any arg_num */ + usdt0_arg_ret = bpf_usdt_arg(ctx, bpf_get_prandom_u32(), &tmp); + return 0; +} + +int usdt3_called; +u64 usdt3_cookie; +int usdt3_arg_cnt; +int usdt3_arg_rets[3]; +u64 usdt3_args[3]; + +SEC("usdt//proc/self/exe:test:usdt3") +int usdt3(struct pt_regs *ctx) +{ + long tmp; + + if (my_pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; + + __sync_fetch_and_add(&usdt3_called, 1); + + usdt3_cookie = bpf_usdt_cookie(ctx); + usdt3_arg_cnt = bpf_usdt_arg_cnt(ctx); + + usdt3_arg_rets[0] = bpf_usdt_arg(ctx, 0, &tmp); + usdt3_args[0] = (int)tmp; + + usdt3_arg_rets[1] = bpf_usdt_arg(ctx, 1, &tmp); + usdt3_args[1] = (long)tmp; + + usdt3_arg_rets[2] = bpf_usdt_arg(ctx, 2, &tmp); + usdt3_args[2] = (uintptr_t)tmp; + + return 0; +} + +int usdt12_called; +u64 usdt12_cookie; +int usdt12_arg_cnt; +u64 usdt12_args[12]; + +SEC("usdt//proc/self/exe:test:usdt12") +int BPF_USDT(usdt12, int a1, int a2, long a3, long a4, unsigned a5, + long a6, __u64 a7, uintptr_t a8, int a9, short a10, + short a11, signed char a12) +{ + if (my_pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; + + __sync_fetch_and_add(&usdt12_called, 1); + + usdt12_cookie = bpf_usdt_cookie(ctx); + usdt12_arg_cnt = bpf_usdt_arg_cnt(ctx); + + usdt12_args[0] = a1; + usdt12_args[1] = a2; + usdt12_args[2] = a3; + usdt12_args[3] = a4; + usdt12_args[4] = a5; + usdt12_args[5] = a6; + usdt12_args[6] = a7; + usdt12_args[7] = a8; + usdt12_args[8] = a9; + usdt12_args[9] = a10; + usdt12_args[10] = a11; + usdt12_args[11] = a12; + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_usdt_multispec.c b/tools/testing/selftests/bpf/progs/test_usdt_multispec.c new file mode 100644 index 000000000000..3a090681f981 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_usdt_multispec.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
*/ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/usdt.bpf.h> + +/* this file is linked together with test_usdt.c to validate that usdt.bpf.h + * can be included in multiple .bpf.c files forming single final BPF object + * file + */ + +extern int my_pid; + +int usdt_100_called; +int usdt_100_sum; + +SEC("usdt//proc/self/exe:test:usdt_100") +int BPF_USDT(usdt_100, int x) +{ + long tmp; + + if (my_pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; + + __sync_fetch_and_add(&usdt_100_called, 1); + __sync_fetch_and_add(&usdt_100_sum, x); + + bpf_printk("X is %d, sum is %d", x, usdt_100_sum); + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/sdt-config.h b/tools/testing/selftests/bpf/sdt-config.h new file mode 100644 index 000000000000..733045a52771 --- /dev/null +++ b/tools/testing/selftests/bpf/sdt-config.h @@ -0,0 +1,6 @@ +/* includes/sys/sdt-config.h. Generated from sdt-config.h.in by configure. + + This file just defines _SDT_ASM_SECTION_AUTOGROUP_SUPPORT to 0 or 1 to + indicate whether the assembler supports "?" in .pushsection directives. */ + +#define _SDT_ASM_SECTION_AUTOGROUP_SUPPORT 1 diff --git a/tools/testing/selftests/bpf/sdt.h b/tools/testing/selftests/bpf/sdt.h new file mode 100644 index 000000000000..ca0162b4dc57 --- /dev/null +++ b/tools/testing/selftests/bpf/sdt.h @@ -0,0 +1,513 @@ +/* <sys/sdt.h> - Systemtap static probe definition macros. + + This file is dedicated to the public domain, pursuant to CC0 + (https://creativecommons.org/publicdomain/zero/1.0/) +*/ + +#ifndef _SYS_SDT_H +#define _SYS_SDT_H 1 + +/* + This file defines a family of macros + + STAP_PROBEn(op1, ..., opn) + + that emit a nop into the instruction stream, and some data into an auxiliary + note section. The data in the note section describes the operands, in terms + of size and location. Each location is encoded as assembler operand string. + Consumer tools such as gdb or systemtap insert breakpoints on top of + the nop, and decode the location operand-strings, like an assembler, + to find the values being passed. + + The operand strings are selected by the compiler for each operand. + They are constrained by gcc inline-assembler codes. The default is: + + #define STAP_SDT_ARG_CONSTRAINT nor + + This is a good default if the operands tend to be integral and + moderate in number (smaller than number of registers). In other + cases, the compiler may report "'asm' requires impossible reload" or + similar.
In this case, consider simplifying the macro call (fewer + and simpler operands), reduce optimization, or override the default + constraints string via: + + #define STAP_SDT_ARG_CONSTRAINT g + #include <sys/sdt.h> + + See also: + https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation + https://gcc.gnu.org/onlinedocs/gcc/Constraints.html + */ + + + +#ifdef __ASSEMBLER__ +# define _SDT_PROBE(provider, name, n, arglist) \ + _SDT_ASM_BODY(provider, name, _SDT_ASM_SUBSTR_1, (_SDT_DEPAREN_##n arglist)) \ + _SDT_ASM_BASE +# define _SDT_ASM_1(x) x; +# define _SDT_ASM_2(a, b) a,b; +# define _SDT_ASM_3(a, b, c) a,b,c; +# define _SDT_ASM_5(a, b, c, d, e) a,b,c,d,e; +# define _SDT_ASM_STRING_1(x) .asciz #x; +# define _SDT_ASM_SUBSTR_1(x) .ascii #x; +# define _SDT_DEPAREN_0() /* empty */ +# define _SDT_DEPAREN_1(a) a +# define _SDT_DEPAREN_2(a,b) a b +# define _SDT_DEPAREN_3(a,b,c) a b c +# define _SDT_DEPAREN_4(a,b,c,d) a b c d +# define _SDT_DEPAREN_5(a,b,c,d,e) a b c d e +# define _SDT_DEPAREN_6(a,b,c,d,e,f) a b c d e f +# define _SDT_DEPAREN_7(a,b,c,d,e,f,g) a b c d e f g +# define _SDT_DEPAREN_8(a,b,c,d,e,f,g,h) a b c d e f g h +# define _SDT_DEPAREN_9(a,b,c,d,e,f,g,h,i) a b c d e f g h i +# define _SDT_DEPAREN_10(a,b,c,d,e,f,g,h,i,j) a b c d e f g h i j +# define _SDT_DEPAREN_11(a,b,c,d,e,f,g,h,i,j,k) a b c d e f g h i j k +# define _SDT_DEPAREN_12(a,b,c,d,e,f,g,h,i,j,k,l) a b c d e f g h i j k l +#else +#if defined _SDT_HAS_SEMAPHORES +#define _SDT_NOTE_SEMAPHORE_USE(provider, name) \ + __asm__ __volatile__ ("" :: "m" (provider##_##name##_semaphore)); +#else +#define _SDT_NOTE_SEMAPHORE_USE(provider, name) +#endif + +# define _SDT_PROBE(provider, name, n, arglist) \ + do { \ + _SDT_NOTE_SEMAPHORE_USE(provider, name); \ + __asm__ __volatile__ (_SDT_ASM_BODY(provider, name, _SDT_ASM_ARGS, (n)) \ + :: _SDT_ASM_OPERANDS_##n arglist); \ + __asm__ __volatile__ (_SDT_ASM_BASE); \ + } while (0) +# define _SDT_S(x) #x +# define _SDT_ASM_1(x) _SDT_S(x) "\n" +# define _SDT_ASM_2(a, b) _SDT_S(a) "," _SDT_S(b) "\n" +# define _SDT_ASM_3(a, b, c) _SDT_S(a) "," _SDT_S(b) "," \ + _SDT_S(c) "\n" +# define _SDT_ASM_5(a, b, c, d, e) _SDT_S(a) "," _SDT_S(b) "," \ + _SDT_S(c) "," _SDT_S(d) "," \ + _SDT_S(e) "\n" +# define _SDT_ASM_ARGS(n) _SDT_ASM_TEMPLATE_##n +# define _SDT_ASM_STRING_1(x) _SDT_ASM_1(.asciz #x) +# define _SDT_ASM_SUBSTR_1(x) _SDT_ASM_1(.ascii #x) + +# define _SDT_ARGFMT(no) _SDT_ASM_1(_SDT_SIGN %n[_SDT_S##no]) \ + _SDT_ASM_1(_SDT_SIZE %n[_SDT_S##no]) \ + _SDT_ASM_1(_SDT_TYPE %n[_SDT_S##no]) \ + _SDT_ASM_SUBSTR(_SDT_ARGTMPL(_SDT_A##no)) + + +# ifndef STAP_SDT_ARG_CONSTRAINT +# if defined __powerpc__ +# define STAP_SDT_ARG_CONSTRAINT nZr +# elif defined __arm__ +# define STAP_SDT_ARG_CONSTRAINT g +# else +# define STAP_SDT_ARG_CONSTRAINT nor +# endif +# endif + +# define _SDT_STRINGIFY(x) #x +# define _SDT_ARG_CONSTRAINT_STRING(x) _SDT_STRINGIFY(x) +/* _SDT_S encodes the size and type as 0xSSTT which is decoded by the assembler + macros _SDT_SIZE and _SDT_TYPE */ +# define _SDT_ARG(n, x) \ + [_SDT_S##n] "n" ((_SDT_ARGSIGNED (x) ?
(int)-1 : 1) * (-(((int) _SDT_ARGSIZE (x)) << 8) + (-(0x7f & __builtin_classify_type (x))))), \ + [_SDT_A##n] _SDT_ARG_CONSTRAINT_STRING (STAP_SDT_ARG_CONSTRAINT) (_SDT_ARGVAL (x)) +#endif +#define _SDT_ASM_STRING(x) _SDT_ASM_STRING_1(x) +#define _SDT_ASM_SUBSTR(x) _SDT_ASM_SUBSTR_1(x) + +#define _SDT_ARGARRAY(x) (__builtin_classify_type (x) == 14 \ + || __builtin_classify_type (x) == 5) + +#ifdef __cplusplus +# define _SDT_ARGSIGNED(x) (!_SDT_ARGARRAY (x) \ + && __sdt_type<__typeof (x)>::__sdt_signed) +# define _SDT_ARGSIZE(x) (_SDT_ARGARRAY (x) \ + ? sizeof (void *) : sizeof (x)) +# define _SDT_ARGVAL(x) (x) + +# include <cstddef> + +template<typename __sdt_T> +struct __sdt_type +{ + static const bool __sdt_signed = false; +}; + +#define __SDT_ALWAYS_SIGNED(T) \ +template<> struct __sdt_type<T> { static const bool __sdt_signed = true; }; +#define __SDT_COND_SIGNED(T,CT) \ +template<> struct __sdt_type<T> { static const bool __sdt_signed = ((CT)(-1) < 1); }; +__SDT_ALWAYS_SIGNED(signed char) +__SDT_ALWAYS_SIGNED(short) +__SDT_ALWAYS_SIGNED(int) +__SDT_ALWAYS_SIGNED(long) +__SDT_ALWAYS_SIGNED(long long) +__SDT_ALWAYS_SIGNED(volatile signed char) +__SDT_ALWAYS_SIGNED(volatile short) +__SDT_ALWAYS_SIGNED(volatile int) +__SDT_ALWAYS_SIGNED(volatile long) +__SDT_ALWAYS_SIGNED(volatile long long) +__SDT_ALWAYS_SIGNED(const signed char) +__SDT_ALWAYS_SIGNED(const short) +__SDT_ALWAYS_SIGNED(const int) +__SDT_ALWAYS_SIGNED(const long) +__SDT_ALWAYS_SIGNED(const long long) +__SDT_ALWAYS_SIGNED(const volatile signed char) +__SDT_ALWAYS_SIGNED(const volatile short) +__SDT_ALWAYS_SIGNED(const volatile int) +__SDT_ALWAYS_SIGNED(const volatile long) +__SDT_ALWAYS_SIGNED(const volatile long long) +__SDT_COND_SIGNED(char, char) +__SDT_COND_SIGNED(wchar_t, wchar_t) +__SDT_COND_SIGNED(volatile char, char) +__SDT_COND_SIGNED(volatile wchar_t, wchar_t) +__SDT_COND_SIGNED(const char, char) +__SDT_COND_SIGNED(const wchar_t, wchar_t) +__SDT_COND_SIGNED(const volatile char, char) +__SDT_COND_SIGNED(const volatile wchar_t, wchar_t) +#if defined (__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)) +/* __SDT_COND_SIGNED(char16_t) */ +/* __SDT_COND_SIGNED(char32_t) */ +#endif + +template<typename __sdt_E> +struct __sdt_type<__sdt_E[]> : public __sdt_type<__sdt_E *> {}; + +template<typename __sdt_E, size_t __sdt_N> +struct __sdt_type<__sdt_E[__sdt_N]> : public __sdt_type<__sdt_E *> {}; + +#elif !defined(__ASSEMBLER__) +__extension__ extern unsigned long long __sdt_unsp; +# define _SDT_ARGINTTYPE(x) \ + __typeof (__builtin_choose_expr (((__builtin_classify_type (x) \ + + 3) & -4) == 4, (x), 0U)) +# define _SDT_ARGSIGNED(x) \ + (!__extension__ \ + (__builtin_constant_p ((((unsigned long long) \ + (_SDT_ARGINTTYPE (x)) __sdt_unsp) \ + & ((unsigned long long)1 << (sizeof (unsigned long long) \ + * __CHAR_BIT__ - 1))) == 0) \ + || (_SDT_ARGINTTYPE (x)) -1 > (_SDT_ARGINTTYPE (x)) 0)) +# define _SDT_ARGSIZE(x) \ + (_SDT_ARGARRAY (x) ? sizeof (void *) : sizeof (x)) +# define _SDT_ARGVAL(x) (x) +#endif + +#if defined __powerpc__ || defined __powerpc64__ +# define _SDT_ARGTMPL(id) %I[id]%[id] +#elif defined __i386__ +# define _SDT_ARGTMPL(id) %k[id] /* gcc.gnu.org/PR80115 sourceware.org/PR24541 */ +#else +# define _SDT_ARGTMPL(id) %[id] +#endif + +/* NB: gdb PR24541 highlighted an unspecified corner of the sdt.h + operand note format. + + The named register may be a longer or shorter (!) alias for the + storage where the value in question is found. For example, on + i386, 64-bit value may be put in register pairs, and the register + name stored would identify just one of them.
Previously, gcc was + asked to emit the %w[id] (16-bit alias of some registers holding + operands), even when a wider 32-bit value was used. + + Bottom line: the byte-width given before the @ sign governs. If + there is a mismatch between that width and that of the named + register, then a sys/sdt.h note consumer may need to employ + architecture-specific heuristics to figure out where the compiler + has actually put the complete value. +*/ + +#ifdef __LP64__ +# define _SDT_ASM_ADDR .8byte +#else +# define _SDT_ASM_ADDR .4byte +#endif + +/* The ia64 and s390 nop instructions take an argument. */ +#if defined(__ia64__) || defined(__s390__) || defined(__s390x__) +#define _SDT_NOP nop 0 +#else +#define _SDT_NOP nop +#endif + +#define _SDT_NOTE_NAME "stapsdt" +#define _SDT_NOTE_TYPE 3 + +/* If the assembler supports the necessary feature, then we can play + nice with code in COMDAT sections, which comes up in C++ code. + Without that assembler support, some combinations of probe placements + in certain kinds of C++ code may produce link-time errors. */ +#include "sdt-config.h" +#if _SDT_ASM_SECTION_AUTOGROUP_SUPPORT +# define _SDT_ASM_AUTOGROUP "?" +#else +# define _SDT_ASM_AUTOGROUP "" +#endif + +#define _SDT_DEF_MACROS \ + _SDT_ASM_1(.altmacro) \ + _SDT_ASM_1(.macro _SDT_SIGN x) \ + _SDT_ASM_3(.pushsection .note.stapsdt,"","note") \ + _SDT_ASM_1(.iflt \\x) \ + _SDT_ASM_1(.ascii "-") \ + _SDT_ASM_1(.endif) \ + _SDT_ASM_1(.popsection) \ + _SDT_ASM_1(.endm) \ + _SDT_ASM_1(.macro _SDT_SIZE_ x) \ + _SDT_ASM_3(.pushsection .note.stapsdt,"","note") \ + _SDT_ASM_1(.ascii "\x") \ + _SDT_ASM_1(.popsection) \ + _SDT_ASM_1(.endm) \ + _SDT_ASM_1(.macro _SDT_SIZE x) \ + _SDT_ASM_1(_SDT_SIZE_ %%((-(-\\x*((-\\x>0)-(-\\x<0))))>>8)) \ + _SDT_ASM_1(.endm) \ + _SDT_ASM_1(.macro _SDT_TYPE_ x) \ + _SDT_ASM_3(.pushsection .note.stapsdt,"","note") \ + _SDT_ASM_2(.ifc 8,\\x) \ + _SDT_ASM_1(.ascii "f") \ + _SDT_ASM_1(.endif) \ + _SDT_ASM_1(.ascii "@") \ + _SDT_ASM_1(.popsection) \ + _SDT_ASM_1(.endm) \ + _SDT_ASM_1(.macro _SDT_TYPE x) \ + _SDT_ASM_1(_SDT_TYPE_ %%((\\x)&(0xff))) \ + _SDT_ASM_1(.endm) + +#define _SDT_UNDEF_MACROS \ + _SDT_ASM_1(.purgem _SDT_SIGN) \ + _SDT_ASM_1(.purgem _SDT_SIZE_) \ + _SDT_ASM_1(.purgem _SDT_SIZE) \ + _SDT_ASM_1(.purgem _SDT_TYPE_) \ + _SDT_ASM_1(.purgem _SDT_TYPE) + +#define _SDT_ASM_BODY(provider, name, pack_args, args, ...) 
\ + _SDT_DEF_MACROS \ + _SDT_ASM_1(990: _SDT_NOP) \ + _SDT_ASM_3( .pushsection .note.stapsdt,_SDT_ASM_AUTOGROUP,"note") \ + _SDT_ASM_1( .balign 4) \ + _SDT_ASM_3( .4byte 992f-991f, 994f-993f, _SDT_NOTE_TYPE) \ + _SDT_ASM_1(991: .asciz _SDT_NOTE_NAME) \ + _SDT_ASM_1(992: .balign 4) \ + _SDT_ASM_1(993: _SDT_ASM_ADDR 990b) \ + _SDT_ASM_1( _SDT_ASM_ADDR _.stapsdt.base) \ + _SDT_SEMAPHORE(provider,name) \ + _SDT_ASM_STRING(provider) \ + _SDT_ASM_STRING(name) \ + pack_args args \ + _SDT_ASM_SUBSTR(\x00) \ + _SDT_UNDEF_MACROS \ + _SDT_ASM_1(994: .balign 4) \ + _SDT_ASM_1( .popsection) + +#define _SDT_ASM_BASE \ + _SDT_ASM_1(.ifndef _.stapsdt.base) \ + _SDT_ASM_5( .pushsection .stapsdt.base,"aG","progbits", \ + .stapsdt.base,comdat) \ + _SDT_ASM_1( .weak _.stapsdt.base) \ + _SDT_ASM_1( .hidden _.stapsdt.base) \ + _SDT_ASM_1( _.stapsdt.base: .space 1) \ + _SDT_ASM_2( .size _.stapsdt.base, 1) \ + _SDT_ASM_1( .popsection) \ + _SDT_ASM_1(.endif) + +#if defined _SDT_HAS_SEMAPHORES +#define _SDT_SEMAPHORE(p,n) \ + _SDT_ASM_1( _SDT_ASM_ADDR p##_##n##_semaphore) +#else +#define _SDT_SEMAPHORE(p,n) _SDT_ASM_1( _SDT_ASM_ADDR 0) +#endif + +#define _SDT_ASM_BLANK _SDT_ASM_SUBSTR(\x20) +#define _SDT_ASM_TEMPLATE_0 /* no arguments */ +#define _SDT_ASM_TEMPLATE_1 _SDT_ARGFMT(1) +#define _SDT_ASM_TEMPLATE_2 _SDT_ASM_TEMPLATE_1 _SDT_ASM_BLANK _SDT_ARGFMT(2) +#define _SDT_ASM_TEMPLATE_3 _SDT_ASM_TEMPLATE_2 _SDT_ASM_BLANK _SDT_ARGFMT(3) +#define _SDT_ASM_TEMPLATE_4 _SDT_ASM_TEMPLATE_3 _SDT_ASM_BLANK _SDT_ARGFMT(4) +#define _SDT_ASM_TEMPLATE_5 _SDT_ASM_TEMPLATE_4 _SDT_ASM_BLANK _SDT_ARGFMT(5) +#define _SDT_ASM_TEMPLATE_6 _SDT_ASM_TEMPLATE_5 _SDT_ASM_BLANK _SDT_ARGFMT(6) +#define _SDT_ASM_TEMPLATE_7 _SDT_ASM_TEMPLATE_6 _SDT_ASM_BLANK _SDT_ARGFMT(7) +#define _SDT_ASM_TEMPLATE_8 _SDT_ASM_TEMPLATE_7 _SDT_ASM_BLANK _SDT_ARGFMT(8) +#define _SDT_ASM_TEMPLATE_9 _SDT_ASM_TEMPLATE_8 _SDT_ASM_BLANK _SDT_ARGFMT(9) +#define _SDT_ASM_TEMPLATE_10 _SDT_ASM_TEMPLATE_9 _SDT_ASM_BLANK _SDT_ARGFMT(10) +#define _SDT_ASM_TEMPLATE_11 _SDT_ASM_TEMPLATE_10 _SDT_ASM_BLANK _SDT_ARGFMT(11) +#define _SDT_ASM_TEMPLATE_12 _SDT_ASM_TEMPLATE_11 _SDT_ASM_BLANK _SDT_ARGFMT(12) +#define _SDT_ASM_OPERANDS_0() [__sdt_dummy] "g" (0) +#define _SDT_ASM_OPERANDS_1(arg1) _SDT_ARG(1, arg1) +#define _SDT_ASM_OPERANDS_2(arg1, arg2) \ + _SDT_ASM_OPERANDS_1(arg1), _SDT_ARG(2, arg2) +#define _SDT_ASM_OPERANDS_3(arg1, arg2, arg3) \ + _SDT_ASM_OPERANDS_2(arg1, arg2), _SDT_ARG(3, arg3) +#define _SDT_ASM_OPERANDS_4(arg1, arg2, arg3, arg4) \ + _SDT_ASM_OPERANDS_3(arg1, arg2, arg3), _SDT_ARG(4, arg4) +#define _SDT_ASM_OPERANDS_5(arg1, arg2, arg3, arg4, arg5) \ + _SDT_ASM_OPERANDS_4(arg1, arg2, arg3, arg4), _SDT_ARG(5, arg5) +#define _SDT_ASM_OPERANDS_6(arg1, arg2, arg3, arg4, arg5, arg6) \ + _SDT_ASM_OPERANDS_5(arg1, arg2, arg3, arg4, arg5), _SDT_ARG(6, arg6) +#define _SDT_ASM_OPERANDS_7(arg1, arg2, arg3, arg4, arg5, arg6, arg7) \ + _SDT_ASM_OPERANDS_6(arg1, arg2, arg3, arg4, arg5, arg6), _SDT_ARG(7, arg7) +#define _SDT_ASM_OPERANDS_8(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8) \ + _SDT_ASM_OPERANDS_7(arg1, arg2, arg3, arg4, arg5, arg6, arg7), \ + _SDT_ARG(8, arg8) +#define _SDT_ASM_OPERANDS_9(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9) \ + _SDT_ASM_OPERANDS_8(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8), \ + _SDT_ARG(9, arg9) +#define _SDT_ASM_OPERANDS_10(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10) \ + _SDT_ASM_OPERANDS_9(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9), \ + _SDT_ARG(10, arg10) +#define 
_SDT_ASM_OPERANDS_11(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11) \ + _SDT_ASM_OPERANDS_10(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10), \ + _SDT_ARG(11, arg11) +#define _SDT_ASM_OPERANDS_12(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12) \ + _SDT_ASM_OPERANDS_11(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11), \ + _SDT_ARG(12, arg12) + +/* These macros can be used in C, C++, or assembly code. + In assembly code the arguments should use normal assembly operand syntax. */ + +#define STAP_PROBE(provider, name) \ + _SDT_PROBE(provider, name, 0, ()) +#define STAP_PROBE1(provider, name, arg1) \ + _SDT_PROBE(provider, name, 1, (arg1)) +#define STAP_PROBE2(provider, name, arg1, arg2) \ + _SDT_PROBE(provider, name, 2, (arg1, arg2)) +#define STAP_PROBE3(provider, name, arg1, arg2, arg3) \ + _SDT_PROBE(provider, name, 3, (arg1, arg2, arg3)) +#define STAP_PROBE4(provider, name, arg1, arg2, arg3, arg4) \ + _SDT_PROBE(provider, name, 4, (arg1, arg2, arg3, arg4)) +#define STAP_PROBE5(provider, name, arg1, arg2, arg3, arg4, arg5) \ + _SDT_PROBE(provider, name, 5, (arg1, arg2, arg3, arg4, arg5)) +#define STAP_PROBE6(provider, name, arg1, arg2, arg3, arg4, arg5, arg6) \ + _SDT_PROBE(provider, name, 6, (arg1, arg2, arg3, arg4, arg5, arg6)) +#define STAP_PROBE7(provider, name, arg1, arg2, arg3, arg4, arg5, arg6, arg7) \ + _SDT_PROBE(provider, name, 7, (arg1, arg2, arg3, arg4, arg5, arg6, arg7)) +#define STAP_PROBE8(provider,name,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8) \ + _SDT_PROBE(provider, name, 8, (arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8)) +#define STAP_PROBE9(provider,name,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9)\ + _SDT_PROBE(provider, name, 9, (arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9)) +#define STAP_PROBE10(provider,name,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10) \ + _SDT_PROBE(provider, name, 10, \ + (arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10)) +#define STAP_PROBE11(provider,name,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11) \ + _SDT_PROBE(provider, name, 11, \ + (arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11)) +#define STAP_PROBE12(provider,name,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12) \ + _SDT_PROBE(provider, name, 12, \ + (arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12)) + +/* This STAP_PROBEV macro can be used in variadic scenarios, where the + number of probe arguments is not known until compile time. Since + variadic macro support may vary with compiler options, you must + pre-#define SDT_USE_VARIADIC to enable this type of probe. + + The trick to count __VA_ARGS__ was inspired by this post by + Laurent Deniau : + http://groups.google.com/group/comp.std.c/msg/346fc464319b1ee5 + + Note that our _SDT_NARG is called with an extra 0 arg that's not + counted, so we don't have to worry about the behavior of macros + called without any arguments. */ + +#define _SDT_NARG(...) __SDT_NARG(__VA_ARGS__, 12,11,10,9,8,7,6,5,4,3,2,1,0) +#define __SDT_NARG(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,_10,_11,_12, N, ...) N +#ifdef SDT_USE_VARIADIC +#define _SDT_PROBE_N(provider, name, N, ...) \ + _SDT_PROBE(provider, name, N, (__VA_ARGS__)) +#define STAP_PROBEV(provider, name, ...) \ + _SDT_PROBE_N(provider, name, _SDT_NARG(0, ##__VA_ARGS__), ##__VA_ARGS__) +#endif + +/* These macros are for use in asm statements. You must compile + with -std=gnu99 or -std=c99 to use the STAP_PROBE_ASM macro. 
+ + The STAP_PROBE_ASM macro generates a quoted string to be used in the + template portion of the asm statement, concatenated with strings that + contain the actual assembly code around the probe site. + + For example: + + asm ("before\n" + STAP_PROBE_ASM(provider, fooprobe, %eax 4(%esi)) + "after"); + + emits the assembly code for "before\nafter", with a probe in between. + The probe arguments are the %eax register, and the value of the memory + word located 4 bytes past the address in the %esi register. Note that + because this is a simple asm, not a GNU C extended asm statement, these + % characters do not need to be doubled to generate literal %reg names. + + In a GNU C extended asm statement, the probe arguments can be specified + using the macro STAP_PROBE_ASM_TEMPLATE(n) for n arguments. The paired + macro STAP_PROBE_ASM_OPERANDS gives the C values of these probe arguments, + and appears in the input operand list of the asm statement. For example: + + asm ("someinsn %0,%1\n" // %0 is output operand, %1 is input operand + STAP_PROBE_ASM(provider, fooprobe, STAP_PROBE_ASM_TEMPLATE(3)) + "otherinsn %[namedarg]" + : "r" (outvar) + : "g" (some_value), [namedarg] "i" (1234), + STAP_PROBE_ASM_OPERANDS(3, some_value, some_ptr->field, 1234)); + + This is just like writing: + + STAP_PROBE3(provider, fooprobe, some_value, some_ptr->field, 1234)); + + but the probe site is right between "someinsn" and "otherinsn". + + The probe arguments in STAP_PROBE_ASM can be given as assembly + operands instead, even inside a GNU C extended asm statement. + Note that these can use operand templates like %0 or %[name], + and likewise they must write %%reg for a literal operand of %reg. */ + +#define _SDT_ASM_BODY_1(p,n,...) _SDT_ASM_BODY(p,n,_SDT_ASM_SUBSTR,(__VA_ARGS__)) +#define _SDT_ASM_BODY_2(p,n,...) _SDT_ASM_BODY(p,n,/*_SDT_ASM_STRING */,__VA_ARGS__) +#define _SDT_ASM_BODY_N2(p,n,no,...) _SDT_ASM_BODY_ ## no(p,n,__VA_ARGS__) +#define _SDT_ASM_BODY_N1(p,n,no,...) _SDT_ASM_BODY_N2(p,n,no,__VA_ARGS__) +#define _SDT_ASM_BODY_N(p,n,...) _SDT_ASM_BODY_N1(p,n,_SDT_NARG(0, __VA_ARGS__),__VA_ARGS__) + +#if __STDC_VERSION__ >= 199901L +# define STAP_PROBE_ASM(provider, name, ...) \ + _SDT_ASM_BODY_N(provider, name, __VA_ARGS__) \ + _SDT_ASM_BASE +# define STAP_PROBE_ASM_OPERANDS(n, ...) _SDT_ASM_OPERANDS_##n(__VA_ARGS__) +#else +# define STAP_PROBE_ASM(provider, name, args) \ + _SDT_ASM_BODY(provider, name, /* _SDT_ASM_STRING */, (args)) \ + _SDT_ASM_BASE +#endif +#define STAP_PROBE_ASM_TEMPLATE(n) _SDT_ASM_TEMPLATE_##n,"use _SDT_ASM_TEMPLATE_" + + +/* DTrace compatible macro names. 
*/
+#define DTRACE_PROBE(provider,probe) \
+  STAP_PROBE(provider,probe)
+#define DTRACE_PROBE1(provider,probe,parm1) \
+  STAP_PROBE1(provider,probe,parm1)
+#define DTRACE_PROBE2(provider,probe,parm1,parm2) \
+  STAP_PROBE2(provider,probe,parm1,parm2)
+#define DTRACE_PROBE3(provider,probe,parm1,parm2,parm3) \
+  STAP_PROBE3(provider,probe,parm1,parm2,parm3)
+#define DTRACE_PROBE4(provider,probe,parm1,parm2,parm3,parm4) \
+  STAP_PROBE4(provider,probe,parm1,parm2,parm3,parm4)
+#define DTRACE_PROBE5(provider,probe,parm1,parm2,parm3,parm4,parm5) \
+  STAP_PROBE5(provider,probe,parm1,parm2,parm3,parm4,parm5)
+#define DTRACE_PROBE6(provider,probe,parm1,parm2,parm3,parm4,parm5,parm6) \
+  STAP_PROBE6(provider,probe,parm1,parm2,parm3,parm4,parm5,parm6)
+#define DTRACE_PROBE7(provider,probe,parm1,parm2,parm3,parm4,parm5,parm6,parm7) \
+  STAP_PROBE7(provider,probe,parm1,parm2,parm3,parm4,parm5,parm6,parm7)
+#define DTRACE_PROBE8(provider,probe,parm1,parm2,parm3,parm4,parm5,parm6,parm7,parm8) \
+  STAP_PROBE8(provider,probe,parm1,parm2,parm3,parm4,parm5,parm6,parm7,parm8)
+#define DTRACE_PROBE9(provider,probe,parm1,parm2,parm3,parm4,parm5,parm6,parm7,parm8,parm9) \
+  STAP_PROBE9(provider,probe,parm1,parm2,parm3,parm4,parm5,parm6,parm7,parm8,parm9)
+#define DTRACE_PROBE10(provider,probe,parm1,parm2,parm3,parm4,parm5,parm6,parm7,parm8,parm9,parm10) \
+  STAP_PROBE10(provider,probe,parm1,parm2,parm3,parm4,parm5,parm6,parm7,parm8,parm9,parm10)
+#define DTRACE_PROBE11(provider,probe,parm1,parm2,parm3,parm4,parm5,parm6,parm7,parm8,parm9,parm10,parm11) \
+  STAP_PROBE11(provider,probe,parm1,parm2,parm3,parm4,parm5,parm6,parm7,parm8,parm9,parm10,parm11)
+#define DTRACE_PROBE12(provider,probe,parm1,parm2,parm3,parm4,parm5,parm6,parm7,parm8,parm9,parm10,parm11,parm12) \
+  STAP_PROBE12(provider,probe,parm1,parm2,parm3,parm4,parm5,parm6,parm7,parm8,parm9,parm10,parm11,parm12)
+
+
+#endif /* sys/sdt.h */
--
cgit

From 00a0fa2d7d496824648b125256c5566f36b48dad Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Mon, 4 Apr 2022 16:42:02 -0700
Subject: selftests/bpf: Add urandom_read shared lib and USDTs

Extend the urandom_read helper binary to include USDTs in 4 combinations:
semaphore/semaphoreless (refcounted and non-refcounted) and based in the
executable or a shared library. We also extend urandom_read with the
ability to report its own PID to the parent process and to wait for the
parent process to ready itself for tracing urandom_read. We utilize
popen() and the underlying pipe properties for proper signaling.

Once urandom_read is ready, we add a few tests to validate that libbpf's
USDT attachment handles all the above combinations of semaphore (or lack
of it) and static or shared library USDTs. Also, we validate that libbpf
handles shared libraries both with a PID filter and without one (i.e., -1
for the PID argument).

Having the shared library case tested with and without a PID is important
because the internal logic differs on kernels that don't support BPF
cookies. On such older kernels, attaching to USDTs in shared libraries
without specifying a concrete PID doesn't work in principle, because it's
impossible to determine the shared library's load address to derive
absolute IPs for uprobe attachments. Without absolute IPs, it's
impossible to perform a correct lookup of the USDT spec based on the
uprobe's absolute IP (the only kind available from BPF at runtime). This
is not a problem on newer kernels with BPF cookie support, as we don't
need the IP-to-spec-ID lookup because the BPF cookie value *is* the spec
ID.
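For reference, the popen()-based handshake described above boils down to
roughly the following on the parent side (a minimal sketch, not the exact
test code; the real helpers are urand_spawn() and urand_trigger() in the
diff below):

  #include <stdio.h>

  static int trace_urandom_read(void)
  {
      FILE *f;
      int pid;

      /* the child prints its PID in a loop until we close the pipe */
      f = popen("./urandom_read 1 report-pid", "r");
      if (!f)
          return -1;
      if (fscanf(f, "%d", &pid) != 1) {
          pclose(f);
          return -1;
      }
      /* ... attach USDT programs to 'pid' here ... */
      /* pclose() closes our end of the pipe (raising SIGPIPE in the
       * child, which unblocks it), waits for the child to exit, and
       * returns its exit code
       */
      return pclose(f);
  }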
So having those two situations as separate subtests is good because libbpf CI is able to test latest selftests against old kernels (e.g., 4.9 and 5.5), so we'll be able to disable PID-less shared lib attachment for old kernels, but will still leave PID-specific one enabled to validate this legacy logic is working correctly. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Dave Marchevsky Link: https://lore.kernel.org/bpf/20220404234202.331384-8-andrii@kernel.org --- tools/testing/selftests/bpf/Makefile | 11 ++- tools/testing/selftests/bpf/prog_tests/usdt.c | 108 +++++++++++++++++++++ .../selftests/bpf/progs/test_urandom_usdt.c | 70 +++++++++++++ .../selftests/bpf/progs/test_usdt_multispec.c | 2 - tools/testing/selftests/bpf/urandom_read.c | 63 +++++++++++- tools/testing/selftests/bpf/urandom_read_aux.c | 9 ++ tools/testing/selftests/bpf/urandom_read_lib1.c | 13 +++ tools/testing/selftests/bpf/urandom_read_lib2.c | 8 ++ 8 files changed, 275 insertions(+), 9 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/test_urandom_usdt.c create mode 100644 tools/testing/selftests/bpf/urandom_read_aux.c create mode 100644 tools/testing/selftests/bpf/urandom_read_lib1.c create mode 100644 tools/testing/selftests/bpf/urandom_read_lib2.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 0f8c55dfd844..bafdc5373a13 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -168,9 +168,15 @@ $(OUTPUT)/%:%.c $(call msg,BINARY,,$@) $(Q)$(LINK.c) $^ $(LDLIBS) -o $@ -$(OUTPUT)/urandom_read: urandom_read.c +$(OUTPUT)/liburandom_read.so: urandom_read_lib1.c urandom_read_lib2.c + $(call msg,LIB,,$@) + $(Q)$(CC) $(CFLAGS) -fPIC $(LDFLAGS) $^ $(LDLIBS) --shared -o $@ + +$(OUTPUT)/urandom_read: urandom_read.c urandom_read_aux.c $(OUTPUT)/liburandom_read.so $(call msg,BINARY,,$@) - $(Q)$(CC) $(CFLAGS) $(LDFLAGS) $< $(LDLIBS) -Wl,--build-id=sha1 -o $@ + $(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.c,$^) \ + liburandom_read.so $(LDLIBS) \ + -Wl,-rpath=. 
-Wl,--build-id=sha1 -o $@ $(OUTPUT)/bpf_testmod.ko: $(VMLINUX_BTF) $(wildcard bpf_testmod/Makefile bpf_testmod/*.[ch]) $(call msg,MOD,,$@) @@ -493,6 +499,7 @@ TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c trace_helpers.c \ btf_helpers.c flow_dissector_load.h \ cap_helpers.c TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read $(OUTPUT)/bpf_testmod.ko \ + $(OUTPUT)/liburandom_read.so \ ima_setup.sh \ $(wildcard progs/btf_dump_test_case_*.c) TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE diff --git a/tools/testing/selftests/bpf/prog_tests/usdt.c b/tools/testing/selftests/bpf/prog_tests/usdt.c index 0677c9bfd40b..a71f51bdc08d 100644 --- a/tools/testing/selftests/bpf/prog_tests/usdt.c +++ b/tools/testing/selftests/bpf/prog_tests/usdt.c @@ -6,6 +6,7 @@ #include "../sdt.h" #include "test_usdt.skel.h" +#include "test_urandom_usdt.skel.h" int lets_test_this(int); @@ -304,10 +305,117 @@ cleanup: test_usdt__destroy(skel); } +static FILE *urand_spawn(int *pid) +{ + FILE *f; + + /* urandom_read's stdout is wired into f */ + f = popen("./urandom_read 1 report-pid", "r"); + if (!f) + return NULL; + + if (fscanf(f, "%d", pid) != 1) { + pclose(f); + return NULL; + } + + return f; +} + +static int urand_trigger(FILE **urand_pipe) +{ + int exit_code; + + /* pclose() waits for child process to exit and returns their exit code */ + exit_code = pclose(*urand_pipe); + *urand_pipe = NULL; + + return exit_code; +} + +static void subtest_urandom_usdt(bool auto_attach) +{ + struct test_urandom_usdt *skel; + struct test_urandom_usdt__bss *bss; + struct bpf_link *l; + FILE *urand_pipe = NULL; + int err, urand_pid = 0; + + skel = test_urandom_usdt__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + urand_pipe = urand_spawn(&urand_pid); + if (!ASSERT_OK_PTR(urand_pipe, "urand_spawn")) + goto cleanup; + + bss = skel->bss; + bss->urand_pid = urand_pid; + + if (auto_attach) { + err = test_urandom_usdt__attach(skel); + if (!ASSERT_OK(err, "skel_auto_attach")) + goto cleanup; + } else { + l = bpf_program__attach_usdt(skel->progs.urand_read_without_sema, + urand_pid, "./urandom_read", + "urand", "read_without_sema", NULL); + if (!ASSERT_OK_PTR(l, "urand_without_sema_attach")) + goto cleanup; + skel->links.urand_read_without_sema = l; + + l = bpf_program__attach_usdt(skel->progs.urand_read_with_sema, + urand_pid, "./urandom_read", + "urand", "read_with_sema", NULL); + if (!ASSERT_OK_PTR(l, "urand_with_sema_attach")) + goto cleanup; + skel->links.urand_read_with_sema = l; + + l = bpf_program__attach_usdt(skel->progs.urandlib_read_without_sema, + urand_pid, "./liburandom_read.so", + "urandlib", "read_without_sema", NULL); + if (!ASSERT_OK_PTR(l, "urandlib_without_sema_attach")) + goto cleanup; + skel->links.urandlib_read_without_sema = l; + + l = bpf_program__attach_usdt(skel->progs.urandlib_read_with_sema, + urand_pid, "./liburandom_read.so", + "urandlib", "read_with_sema", NULL); + if (!ASSERT_OK_PTR(l, "urandlib_with_sema_attach")) + goto cleanup; + skel->links.urandlib_read_with_sema = l; + + } + + /* trigger urandom_read USDTs */ + ASSERT_OK(urand_trigger(&urand_pipe), "urand_exit_code"); + + ASSERT_EQ(bss->urand_read_without_sema_call_cnt, 1, "urand_wo_sema_cnt"); + ASSERT_EQ(bss->urand_read_without_sema_buf_sz_sum, 256, "urand_wo_sema_sum"); + + ASSERT_EQ(bss->urand_read_with_sema_call_cnt, 1, "urand_w_sema_cnt"); + ASSERT_EQ(bss->urand_read_with_sema_buf_sz_sum, 256, "urand_w_sema_sum"); + + ASSERT_EQ(bss->urandlib_read_without_sema_call_cnt, 1, "urandlib_wo_sema_cnt"); + 
ASSERT_EQ(bss->urandlib_read_without_sema_buf_sz_sum, 256, "urandlib_wo_sema_sum"); + + ASSERT_EQ(bss->urandlib_read_with_sema_call_cnt, 1, "urandlib_w_sema_cnt"); + ASSERT_EQ(bss->urandlib_read_with_sema_buf_sz_sum, 256, "urandlib_w_sema_sum"); + +cleanup: + if (urand_pipe) + pclose(urand_pipe); + test_urandom_usdt__destroy(skel); +} + void test_usdt(void) { if (test__start_subtest("basic")) subtest_basic_usdt(); if (test__start_subtest("multispec")) subtest_multispec_usdt(); + if (test__start_subtest("urand_auto_attach")) + subtest_urandom_usdt(true /* auto_attach */); + if (test__start_subtest("urand_pid_attach")) + subtest_urandom_usdt(false /* auto_attach */); } diff --git a/tools/testing/selftests/bpf/progs/test_urandom_usdt.c b/tools/testing/selftests/bpf/progs/test_urandom_usdt.c new file mode 100644 index 000000000000..3539b02bd5f7 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_urandom_usdt.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ + +#include "vmlinux.h" +#include +#include + +int urand_pid; + +int urand_read_without_sema_call_cnt; +int urand_read_without_sema_buf_sz_sum; + +SEC("usdt/./urandom_read:urand:read_without_sema") +int BPF_USDT(urand_read_without_sema, int iter_num, int iter_cnt, int buf_sz) +{ + if (urand_pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; + + __sync_fetch_and_add(&urand_read_without_sema_call_cnt, 1); + __sync_fetch_and_add(&urand_read_without_sema_buf_sz_sum, buf_sz); + + return 0; +} + +int urand_read_with_sema_call_cnt; +int urand_read_with_sema_buf_sz_sum; + +SEC("usdt/./urandom_read:urand:read_with_sema") +int BPF_USDT(urand_read_with_sema, int iter_num, int iter_cnt, int buf_sz) +{ + if (urand_pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; + + __sync_fetch_and_add(&urand_read_with_sema_call_cnt, 1); + __sync_fetch_and_add(&urand_read_with_sema_buf_sz_sum, buf_sz); + + return 0; +} + +int urandlib_read_without_sema_call_cnt; +int urandlib_read_without_sema_buf_sz_sum; + +SEC("usdt/./liburandom_read.so:urandlib:read_without_sema") +int BPF_USDT(urandlib_read_without_sema, int iter_num, int iter_cnt, int buf_sz) +{ + if (urand_pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; + + __sync_fetch_and_add(&urandlib_read_without_sema_call_cnt, 1); + __sync_fetch_and_add(&urandlib_read_without_sema_buf_sz_sum, buf_sz); + + return 0; +} + +int urandlib_read_with_sema_call_cnt; +int urandlib_read_with_sema_buf_sz_sum; + +SEC("usdt/./liburandom_read.so:urandlib:read_with_sema") +int BPF_USDT(urandlib_read_with_sema, int iter_num, int iter_cnt, int buf_sz) +{ + if (urand_pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; + + __sync_fetch_and_add(&urandlib_read_with_sema_call_cnt, 1); + __sync_fetch_and_add(&urandlib_read_with_sema_buf_sz_sum, buf_sz); + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_usdt_multispec.c b/tools/testing/selftests/bpf/progs/test_usdt_multispec.c index 3a090681f981..aa6de32b50d1 100644 --- a/tools/testing/selftests/bpf/progs/test_usdt_multispec.c +++ b/tools/testing/selftests/bpf/progs/test_usdt_multispec.c @@ -26,8 +26,6 @@ int BPF_USDT(usdt_100, int x) __sync_fetch_and_add(&usdt_100_called, 1); __sync_fetch_and_add(&usdt_100_sum, x); - bpf_printk("X is %d, sum is %d", x, usdt_100_sum); - return 0; } diff --git a/tools/testing/selftests/bpf/urandom_read.c b/tools/testing/selftests/bpf/urandom_read.c index db781052758d..e92644d0fa75 100644 --- 
a/tools/testing/selftests/bpf/urandom_read.c +++ b/tools/testing/selftests/bpf/urandom_read.c @@ -1,32 +1,85 @@ +#include #include #include +#include #include #include #include #include +#include + +#define _SDT_HAS_SEMAPHORES 1 +#include "sdt.h" + +#define SEC(name) __attribute__((section(name), used)) #define BUF_SIZE 256 +/* defined in urandom_read_aux.c */ +void urand_read_without_sema(int iter_num, int iter_cnt, int read_sz); +/* these are coming from urandom_read_lib{1,2}.c */ +void urandlib_read_with_sema(int iter_num, int iter_cnt, int read_sz); +void urandlib_read_without_sema(int iter_num, int iter_cnt, int read_sz); + +unsigned short urand_read_with_sema_semaphore SEC(".probes"); + static __attribute__((noinline)) void urandom_read(int fd, int count) { - char buf[BUF_SIZE]; - int i; + char buf[BUF_SIZE]; + int i; + + for (i = 0; i < count; ++i) { + read(fd, buf, BUF_SIZE); + + /* trigger USDTs defined in executable itself */ + urand_read_without_sema(i, count, BUF_SIZE); + STAP_PROBE3(urand, read_with_sema, i, count, BUF_SIZE); - for (i = 0; i < count; ++i) - read(fd, buf, BUF_SIZE); + /* trigger USDTs defined in shared lib */ + urandlib_read_without_sema(i, count, BUF_SIZE); + urandlib_read_with_sema(i, count, BUF_SIZE); + } +} + +static volatile bool parent_ready; + +static void handle_sigpipe(int sig) +{ + parent_ready = true; } int main(int argc, char *argv[]) { int fd = open("/dev/urandom", O_RDONLY); int count = 4; + bool report_pid = false; if (fd < 0) return 1; - if (argc == 2) + if (argc >= 2) count = atoi(argv[1]); + if (argc >= 3) { + report_pid = true; + /* install SIGPIPE handler to catch when parent closes their + * end of the pipe (on the other side of our stdout) + */ + signal(SIGPIPE, handle_sigpipe); + } + + /* report PID and wait for parent process to send us "signal" by + * closing stdout + */ + if (report_pid) { + while (!parent_ready) { + fprintf(stdout, "%d\n", getpid()); + fflush(stdout); + } + /* at this point stdout is closed, parent process knows our + * PID and is ready to trace us + */ + } urandom_read(fd, count); diff --git a/tools/testing/selftests/bpf/urandom_read_aux.c b/tools/testing/selftests/bpf/urandom_read_aux.c new file mode 100644 index 000000000000..6132edcfea74 --- /dev/null +++ b/tools/testing/selftests/bpf/urandom_read_aux.c @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ +#include "sdt.h" + +void urand_read_without_sema(int iter_num, int iter_cnt, int read_sz) +{ + /* semaphore-less USDT */ + STAP_PROBE3(urand, read_without_sema, iter_num, iter_cnt, read_sz); +} diff --git a/tools/testing/selftests/bpf/urandom_read_lib1.c b/tools/testing/selftests/bpf/urandom_read_lib1.c new file mode 100644 index 000000000000..86186e24b740 --- /dev/null +++ b/tools/testing/selftests/bpf/urandom_read_lib1.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
*/
+#define _SDT_HAS_SEMAPHORES 1
+#include "sdt.h"
+
+#define SEC(name) __attribute__((section(name), used))
+
+unsigned short urandlib_read_with_sema_semaphore SEC(".probes");
+
+void urandlib_read_with_sema(int iter_num, int iter_cnt, int read_sz)
+{
+	STAP_PROBE3(urandlib, read_with_sema, iter_num, iter_cnt, read_sz);
+}
diff --git a/tools/testing/selftests/bpf/urandom_read_lib2.c b/tools/testing/selftests/bpf/urandom_read_lib2.c
new file mode 100644
index 000000000000..9d401ad9838f
--- /dev/null
+++ b/tools/testing/selftests/bpf/urandom_read_lib2.c
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+#include "sdt.h"
+
+void urandlib_read_without_sema(int iter_num, int iter_cnt, int read_sz)
+{
+	STAP_PROBE3(urandlib, read_without_sema, iter_num, iter_cnt, read_sz);
+}
--
cgit

From 042152c27c3bc3e20882f75c289ced32331f4010 Mon Sep 17 00:00:00 2001
From: Xu Kuohai
Date: Sat, 2 Apr 2022 03:39:42 -0400
Subject: bpf, arm64: Sign return address for JITed code

Sign the return address for JITed code when the kernel is built with
pointer authentication enabled:

1. Sign LR with the paciasp instruction before LR is pushed to the
   stack. Since paciasp acts like a landing pad for function entry,
   there is no need to insert a bti instruction before paciasp.

2. Authenticate LR with the autiasp instruction after LR is popped
   from the stack.

For a BPF tail call, the stack frame constructed by the caller is reused
by the callee. That is, the stack frame is constructed by the caller and
destructed by the callee. Thus LR is signed and pushed to the stack in
the caller's prologue, and popped from the stack and authenticated in
the callee's epilogue.

For a BPF2BPF call, the caller and callee construct their own stack
frames, and sign and authenticate their own LRs.
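Schematically, the emission order then has the following shape (an
illustrative sketch only; the authoritative sequence is in
build_prologue()/build_epilogue() in the diff below):

  /* prologue: sign LR before it is saved to the stack */
  if (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL))
      emit(A64_PACIASP, ctx);        /* doubles as a BTI landing pad */
  else if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL))
      emit(A64_BTI_C, ctx);
  emit(A64_PUSH(A64_FP, A64_LR, A64_SP), ctx);

  /* epilogue: authenticate LR after it is restored, just before ret */
  if (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL))
      emit(A64_AUTIASP, ctx);
  emit(A64_RET(A64_LR), ctx);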
Signed-off-by: Xu Kuohai Signed-off-by: Daniel Borkmann Link: https://events.static.linuxfound.org/sites/events/files/slides/slides_23.pdf Link: https://lore.kernel.org/bpf/20220402073942.3782529-1-xukuohai@huawei.com --- arch/arm64/net/bpf_jit.h | 3 +++ arch/arm64/net/bpf_jit_comp.c | 12 ++++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/arch/arm64/net/bpf_jit.h b/arch/arm64/net/bpf_jit.h index 3920213244f0..194c95ccc1cf 100644 --- a/arch/arm64/net/bpf_jit.h +++ b/arch/arm64/net/bpf_jit.h @@ -263,6 +263,9 @@ /* HINTs */ #define A64_HINT(x) aarch64_insn_gen_hint(x) +#define A64_PACIASP A64_HINT(AARCH64_INSN_HINT_PACIASP) +#define A64_AUTIASP A64_HINT(AARCH64_INSN_HINT_AUTIASP) + /* BTI */ #define A64_BTI_C A64_HINT(AARCH64_INSN_HINT_BTIC) #define A64_BTI_J A64_HINT(AARCH64_INSN_HINT_BTIJ) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 093fa9ea1083..8ab4035dea27 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -236,7 +236,8 @@ static bool is_lsi_offset(int offset, int scale) } /* Tail call offset to jump into */ -#if IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) +#if IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) || \ + IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL) #define PROLOGUE_OFFSET 9 #else #define PROLOGUE_OFFSET 8 @@ -278,8 +279,11 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf) * */ + /* Sign lr */ + if (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL)) + emit(A64_PACIASP, ctx); /* BTI landing pad */ - if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL)) + else if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL)) emit(A64_BTI_C, ctx); /* Save FP and LR registers to stay align with ARM64 AAPCS */ @@ -580,6 +584,10 @@ static void build_epilogue(struct jit_ctx *ctx) /* Set return value */ emit(A64_MOV(1, A64_R(0), r0), ctx); + /* Authenticate lr */ + if (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL)) + emit(A64_AUTIASP, ctx); + emit(A64_RET(A64_LR), ctx); } -- cgit From 2d0df01974ce2b59b6f7d5bd3ea58d74f12ddf85 Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Tue, 5 Apr 2022 22:57:11 +0800 Subject: selftests/bpf: Fix file descriptor leak in load_kallsyms() Currently, if sym_cnt > 0, it just returns and does not close file, fix it. Signed-off-by: Yuntao Wang Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220405145711.49543-1-ytcoode@gmail.com --- tools/testing/selftests/bpf/trace_helpers.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index 3d6217e3aff7..9c4be2cdb21a 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -25,15 +25,12 @@ static int ksym_cmp(const void *p1, const void *p2) int load_kallsyms(void) { - FILE *f = fopen("/proc/kallsyms", "r"); + FILE *f; char func[256], buf[256]; char symbol; void *addr; int i = 0; - if (!f) - return -ENOENT; - /* * This is called/used from multiplace places, * load symbols just once. 
@@ -41,6 +38,10 @@ int load_kallsyms(void) if (sym_cnt) return 0; + f = fopen("/proc/kallsyms", "r"); + if (!f) + return -ENOENT; + while (fgets(buf, sizeof(buf), f)) { if (sscanf(buf, "%p %c %s", &addr, &symbol, func) != 3) break; -- cgit From 1963c740dc2b52f45ba3d3748a50a0ab35db098a Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Sat, 2 Apr 2022 16:19:14 +0200 Subject: net: netfilter: Reports ct direction in CT lookup helpers for XDP and TC-BPF Report connection tracking tuple direction in bpf_skb_ct_lookup/bpf_xdp_ct_lookup helpers. Direction will be used to implement snat/dnat through xdp ebpf program. Signed-off-by: Lorenzo Bianconi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/aa1aaac89191cfc64078ecef36c0a48c302321b9.1648908601.git.lorenzo@kernel.org --- net/netfilter/nf_conntrack_bpf.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index fe98673dd5ac..bc4d5cd63a94 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -38,6 +38,7 @@ * @l4proto - Layer 4 protocol * Values: * IPPROTO_TCP, IPPROTO_UDP + * @dir: - connection tracking tuple direction. * @reserved - Reserved member, will be reused for more options in future * Values: * 0 @@ -46,7 +47,8 @@ struct bpf_ct_opts { s32 netns_id; s32 error; u8 l4proto; - u8 reserved[3]; + u8 dir; + u8 reserved[2]; }; enum { @@ -56,10 +58,11 @@ enum { static struct nf_conn *__bpf_nf_ct_lookup(struct net *net, struct bpf_sock_tuple *bpf_tuple, u32 tuple_len, u8 protonum, - s32 netns_id) + s32 netns_id, u8 *dir) { struct nf_conntrack_tuple_hash *hash; struct nf_conntrack_tuple tuple; + struct nf_conn *ct; if (unlikely(protonum != IPPROTO_TCP && protonum != IPPROTO_UDP)) return ERR_PTR(-EPROTO); @@ -99,7 +102,12 @@ static struct nf_conn *__bpf_nf_ct_lookup(struct net *net, put_net(net); if (!hash) return ERR_PTR(-ENOENT); - return nf_ct_tuplehash_to_ctrack(hash); + + ct = nf_ct_tuplehash_to_ctrack(hash); + if (dir) + *dir = NF_CT_DIRECTION(hash); + + return ct; } __diag_push(); @@ -135,13 +143,13 @@ bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, if (!opts) return NULL; if (!bpf_tuple || opts->reserved[0] || opts->reserved[1] || - opts->reserved[2] || opts__sz != NF_BPF_CT_OPTS_SZ) { + opts__sz != NF_BPF_CT_OPTS_SZ) { opts->error = -EINVAL; return NULL; } caller_net = dev_net(ctx->rxq->dev); nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts->l4proto, - opts->netns_id); + opts->netns_id, &opts->dir); if (IS_ERR(nfct)) { opts->error = PTR_ERR(nfct); return NULL; @@ -178,13 +186,13 @@ bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple, if (!opts) return NULL; if (!bpf_tuple || opts->reserved[0] || opts->reserved[1] || - opts->reserved[2] || opts__sz != NF_BPF_CT_OPTS_SZ) { + opts__sz != NF_BPF_CT_OPTS_SZ) { opts->error = -EINVAL; return NULL; } caller_net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk); nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts->l4proto, - opts->netns_id); + opts->netns_id, &opts->dir); if (IS_ERR(nfct)) { opts->error = PTR_ERR(nfct); return NULL; -- cgit From 958ddfd75d83d83e713027677c8786781e1f4576 Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Wed, 6 Apr 2022 08:36:22 +0800 Subject: selftests/bpf: Fix issues in parse_num_list() The function does not check that parsing_end is false after parsing argument. 
Thus, if the final part of the argument is something like '4-', which is invalid, parse_num_list() will discard it instead of returning -EINVAL. Before: $ ./test_progs -n 2,4- #2 atomic_bounds:OK Summary: 1/0 PASSED, 0 SKIPPED, 0 FAILED After: $ ./test_progs -n 2,4- Failed to parse test numbers. Signed-off-by: Yuntao Wang Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220406003622.73539-1-ytcoode@gmail.com --- tools/testing/selftests/bpf/testing_helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c index 795b6798ccee..87867f7a78c3 100644 --- a/tools/testing/selftests/bpf/testing_helpers.c +++ b/tools/testing/selftests/bpf/testing_helpers.c @@ -60,7 +60,7 @@ int parse_num_list(const char *s, bool **num_set, int *num_set_len) set[i] = true; } - if (!set) + if (!set || parsing_end) return -EINVAL; *num_set = set; -- cgit From a8d600f6bcd453f1807703b5a016212f5484ffa1 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 6 Apr 2022 09:08:35 +0100 Subject: libbpf: Fix spelling mistake "libaries" -> "libraries" There is a spelling mistake in a pr_warn message. Fix it. Signed-off-by: Colin Ian King Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220406080835.14879-1-colin.i.king@gmail.com --- tools/lib/bpf/usdt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c index 1bce2eab5e89..c5acf2824fcc 100644 --- a/tools/lib/bpf/usdt.c +++ b/tools/lib/bpf/usdt.c @@ -687,7 +687,7 @@ static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char * * to shared libraries with no PID filter. */ if (pid < 0) { - pr_warn("usdt: attaching to shared libaries without specific PID is not supported on current kernel\n"); + pr_warn("usdt: attaching to shared libraries without specific PID is not supported on current kernel\n"); err = -ENOTSUP; goto err_out; } -- cgit From ebaf24c589d7c714b763a80856d1a6df3ba25b84 Mon Sep 17 00:00:00 2001 From: Artem Savkov Date: Wed, 6 Apr 2022 10:54:08 +0200 Subject: selftests/bpf: Use bpf_num_possible_cpus() in per-cpu map allocations bpf_map_value_size() uses num_possible_cpus() to determine map size, but some of the tests only allocate enough memory for online cpus. This results in out-of-bound writes in userspace during bpf(BPF_MAP_LOOKUP_ELEM) syscalls in cases when number of online cpus is lower than the number of possible cpus. Fix by switching from get_nprocs_conf() to bpf_num_possible_cpus() when determining the number of processors in these tests (test_progs/netcnt and test_cgroup_storage). 
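The underlying contract is that a per-CPU map lookup fills in one value
slot per *possible* CPU, so the user-space buffer must be sized
accordingly. A sketch using libbpf's public helper (the map fd, key, and
value type are placeholders):

  #include <errno.h>
  #include <stdlib.h>
  #include <bpf/bpf.h>
  #include <bpf/libbpf.h>

  static int lookup_percpu_u64(int map_fd, __u32 key, __u64 **out)
  {
      int n = libbpf_num_possible_cpus();  /* not get_nprocs_conf()! */
      __u64 *vals;
      int err;

      if (n < 0)
          return n;
      vals = calloc(n, sizeof(*vals));     /* one slot per possible CPU */
      if (!vals)
          return -ENOMEM;
      /* the kernel copies n entries; a shorter buffer is overflowed */
      err = bpf_map_lookup_elem(map_fd, &key, vals);
      if (err) {
          free(vals);
          return err;
      }
      *out = vals;
      return n;
  }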
Signed-off-by: Artem Savkov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220406085408.339336-1-asavkov@redhat.com --- tools/testing/selftests/bpf/prog_tests/netcnt.c | 2 +- tools/testing/selftests/bpf/test_cgroup_storage.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/netcnt.c b/tools/testing/selftests/bpf/prog_tests/netcnt.c index 954964f0ac3d..d3915c58d0e1 100644 --- a/tools/testing/selftests/bpf/prog_tests/netcnt.c +++ b/tools/testing/selftests/bpf/prog_tests/netcnt.c @@ -25,7 +25,7 @@ void serial_test_netcnt(void) if (!ASSERT_OK_PTR(skel, "netcnt_prog__open_and_load")) return; - nproc = get_nprocs_conf(); + nproc = bpf_num_possible_cpus(); percpu_netcnt = malloc(sizeof(*percpu_netcnt) * nproc); if (!ASSERT_OK_PTR(percpu_netcnt, "malloc(percpu_netcnt)")) goto err; diff --git a/tools/testing/selftests/bpf/test_cgroup_storage.c b/tools/testing/selftests/bpf/test_cgroup_storage.c index d6a1be4d8020..2ffa08198d1c 100644 --- a/tools/testing/selftests/bpf/test_cgroup_storage.c +++ b/tools/testing/selftests/bpf/test_cgroup_storage.c @@ -7,6 +7,7 @@ #include #include "bpf_rlimit.h" +#include "bpf_util.h" #include "cgroup_helpers.h" #include "testing_helpers.h" @@ -44,7 +45,7 @@ int main(int argc, char **argv) unsigned long long *percpu_value; int cpu, nproc; - nproc = get_nprocs_conf(); + nproc = bpf_num_possible_cpus(); percpu_value = malloc(sizeof(*percpu_value) * nproc); if (!percpu_value) { printf("Not enough memory for per-cpu area (%d cpus)\n", nproc); -- cgit From be77354a3d7ebd4897ee18eca26dca6df9224c76 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 19 Mar 2022 13:38:23 +0530 Subject: bpf: Do write access check for kfunc and global func When passing pointer to some map value to kfunc or global func, in verifier we are passing meta as NULL to various functions, which uses meta->raw_mode to check whether memory is being written to. Since some kfunc or global funcs may also write to memory pointers they receive as arguments, we must check for write access to memory. E.g. in some case map may be read only and this will be missed by current checks. However meta->raw_mode allows for uninitialized memory (e.g. on stack), since there is not enough info available through BTF, we must perform one call for read access (raw_mode = false), and one for write access (raw_mode = true). Fixes: e5069b9c23b3 ("bpf: Support pointers in global func args") Fixes: d583691c47dc ("bpf: Introduce mem, size argument pair support for kfunc") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220319080827.73251-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 44 +++++++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d175b70067b3..e9807e6e1090 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4919,8 +4919,7 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, * out. Only upper bounds can be learned because retval is an * int type and negative retvals are allowed. */ - if (meta) - meta->msize_max_value = reg->umax_value; + meta->msize_max_value = reg->umax_value; /* The register is SCALAR_VALUE; the access check * happens using its boundaries. 
@@ -4963,24 +4962,33 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size) { + bool may_be_null = type_may_be_null(reg->type); + struct bpf_reg_state saved_reg; + struct bpf_call_arg_meta meta; + int err; + if (register_is_null(reg)) return 0; - if (type_may_be_null(reg->type)) { - /* Assuming that the register contains a value check if the memory - * access is safe. Temporarily save and restore the register's state as - * the conversion shouldn't be visible to a caller. - */ - const struct bpf_reg_state saved_reg = *reg; - int rv; - + memset(&meta, 0, sizeof(meta)); + /* Assuming that the register contains a value check if the memory + * access is safe. Temporarily save and restore the register's state as + * the conversion shouldn't be visible to a caller. + */ + if (may_be_null) { + saved_reg = *reg; mark_ptr_not_null_reg(reg); - rv = check_helper_mem_access(env, regno, mem_size, true, NULL); - *reg = saved_reg; - return rv; } - return check_helper_mem_access(env, regno, mem_size, true, NULL); + err = check_helper_mem_access(env, regno, mem_size, true, &meta); + /* Check access for BPF_WRITE */ + meta.raw_mode = true; + err = err ?: check_helper_mem_access(env, regno, mem_size, true, &meta); + + if (may_be_null) + *reg = saved_reg; + + return err; } int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, @@ -4989,16 +4997,22 @@ int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1]; bool may_be_null = type_may_be_null(mem_reg->type); struct bpf_reg_state saved_reg; + struct bpf_call_arg_meta meta; int err; WARN_ON_ONCE(regno < BPF_REG_2 || regno > BPF_REG_5); + memset(&meta, 0, sizeof(meta)); + if (may_be_null) { saved_reg = *mem_reg; mark_ptr_not_null_reg(mem_reg); } - err = check_mem_size_reg(env, reg, regno, true, NULL); + err = check_mem_size_reg(env, reg, regno, true, &meta); + /* Check access for BPF_WRITE */ + meta.raw_mode = true; + err = err ?: check_mem_size_reg(env, reg, regno, true, &meta); if (may_be_null) *mem_reg = saved_reg; -- cgit From 97e6d7dab1ca4648821c790a2b7913d6d5d549db Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 19 Mar 2022 13:38:24 +0530 Subject: bpf: Check PTR_TO_MEM | MEM_RDONLY in check_helper_mem_access The commit being fixed was aiming to disallow users from incorrectly obtaining writable pointer to memory that is only meant to be read. This is enforced now using a MEM_RDONLY flag. For instance, in case of global percpu variables, when the BTF type is not struct (e.g. bpf_prog_active), the verifier marks register type as PTR_TO_MEM | MEM_RDONLY from bpf_this_cpu_ptr or bpf_per_cpu_ptr helpers. However, when passing such pointer to kfunc, global funcs, or BPF helpers, in check_helper_mem_access, there is no expectation MEM_RDONLY flag will be set, hence it is checked as pointer to writable memory. Later, verifier sets up argument type of global func as PTR_TO_MEM | PTR_MAYBE_NULL, so user can use a global func to get around the limitations imposed by this flag. This check will also cover global non-percpu variables that may be introduced in kernel BTF in future. Also, we update the log message for PTR_TO_BUF case to be similar to PTR_TO_MEM case, so that the reason for error is clear to user. 
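Concretely, the kind of program that must now be rejected looks like
this (a sketch; the function name write_it is made up, and the pattern
mirrors the selftest added later in this series):

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>

  extern const int bpf_prog_active __ksym; /* int-typed global var */

  __noinline int write_it(int *p)
  {
      return p ? (*p = 42) : 0;    /* global func writes through arg */
  }

  SEC("raw_tp/sys_enter")
  int handler(const void *ctx)
  {
      int *active;

      /* bpf_this_cpu_ptr() on a non-struct ksym yields
       * PTR_TO_MEM | MEM_RDONLY, so passing it to a writing
       * global func must fail verification
       */
      active = bpf_this_cpu_ptr(&bpf_prog_active);
      return write_it(active);
  }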
Fixes: 34d3a78c681e ("bpf: Make per_cpu_ptr return rdonly PTR_TO_MEM.")
Reviewed-by: Hao Luo
Signed-off-by: Kumar Kartikeya Dwivedi
Link: https://lore.kernel.org/r/20220319080827.73251-3-memxor@gmail.com
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/verifier.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e9807e6e1090..d953e62b5268 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4871,13 +4871,23 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 		return check_map_access(env, regno, reg->off, access_size,
 					zero_size_allowed);
 	case PTR_TO_MEM:
+		if (type_is_rdonly_mem(reg->type)) {
+			if (meta && meta->raw_mode) {
+				verbose(env, "R%d cannot write into %s\n", regno,
+					reg_type_str(env, reg->type));
+				return -EACCES;
+			}
+		}
 		return check_mem_region_access(env, regno, reg->off,
 					       access_size, reg->mem_size,
 					       zero_size_allowed);
 	case PTR_TO_BUF:
 		if (type_is_rdonly_mem(reg->type)) {
-			if (meta && meta->raw_mode)
+			if (meta && meta->raw_mode) {
+				verbose(env, "R%d cannot write into %s\n", regno,
+					reg_type_str(env, reg->type));
 				return -EACCES;
+			}
 
 			max_access = &env->prog->aux->max_rdonly_access;
 		} else {
--
cgit

From 7b3552d3f9f6897851fc453b5131a967167e43c2 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi
Date: Sat, 19 Mar 2022 13:38:25 +0530
Subject: bpf: Reject writes for PTR_TO_MAP_KEY in check_helper_mem_access

It is not permitted to write to PTR_TO_MAP_KEY, but the current code in
check_helper_mem_access would allow it. Reject this case as well, since
helpers taking ARG_PTR_TO_UNINIT_MEM also take PTR_TO_MAP_KEY.

Fixes: 69c087ba6225 ("bpf: Add bpf_for_each_map_elem() helper")
Signed-off-by: Kumar Kartikeya Dwivedi
Link: https://lore.kernel.org/r/20220319080827.73251-4-memxor@gmail.com
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/verifier.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d953e62b5268..9c1a02b82ecd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4861,6 +4861,11 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 		return check_packet_access(env, regno, reg->off, access_size,
 					   zero_size_allowed);
 	case PTR_TO_MAP_KEY:
+		if (meta && meta->raw_mode) {
+			verbose(env, "R%d cannot write into %s\n", regno,
+				reg_type_str(env, reg->type));
+			return -EACCES;
+		}
 		return check_mem_region_access(env, regno, reg->off, access_size,
 					       reg->map_ptr->key_size, false);
 	case PTR_TO_MAP_VALUE:
--
cgit

From 7cb29b1c99f4244c3f1de04e88eb9aed842a0cba Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi
Date: Sat, 19 Mar 2022 13:38:26 +0530
Subject: selftests/bpf: Test passing rdonly mem to global func

Add two test cases. The first passes a read-only map value pointer to a
global func, which should be rejected. The same code path checks this
for kfuncs, so that case is covered as well. The second exercises the
previously missing check for PTR_TO_MEM's MEM_RDONLY flag and tries to
write to a read-only memory pointer. Without the prior patches, both of
these tests fail.
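For context, one way such a read-only map value pointer can arise is a
map created with BPF_F_RDONLY_PROG (a user-space sketch; the map name
and sizes are arbitrary). Constant global variables, which libbpf places
into a read-only .rodata map, work the same way:

  #include <bpf/bpf.h>

  /* values of a BPF_F_RDONLY_PROG map may not be written by BPF
   * programs, so the verifier hands out read-only value pointers
   */
  LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_RDONLY_PROG);
  int map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "ro_map",
                              sizeof(__u32), sizeof(__u64), 1, &opts);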
Reviewed-by: Hao Luo Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220319080827.73251-5-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/ksyms_btf.c | 17 ++++++++++++----- .../selftests/bpf/prog_tests/test_global_funcs.c | 1 + tools/testing/selftests/bpf/progs/test_global_func17.c | 16 ++++++++++++++++ .../selftests/bpf/progs/test_ksyms_btf_write_check.c | 18 +++++++++++++++++- 4 files changed, 46 insertions(+), 6 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/test_global_func17.c diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c index f6933b06daf8..1d7a2f1e0731 100644 --- a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c +++ b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c @@ -138,12 +138,16 @@ cleanup: test_ksyms_weak_lskel__destroy(skel); } -static void test_write_check(void) +static void test_write_check(bool test_handler1) { struct test_ksyms_btf_write_check *skel; - skel = test_ksyms_btf_write_check__open_and_load(); - ASSERT_ERR_PTR(skel, "unexpected load of a prog writing to ksym memory\n"); + skel = test_ksyms_btf_write_check__open(); + if (!ASSERT_OK_PTR(skel, "test_ksyms_btf_write_check__open")) + return; + bpf_program__set_autoload(test_handler1 ? skel->progs.handler2 : skel->progs.handler1, false); + ASSERT_ERR(test_ksyms_btf_write_check__load(skel), + "unexpected load of a prog writing to ksym memory\n"); test_ksyms_btf_write_check__destroy(skel); } @@ -179,6 +183,9 @@ void test_ksyms_btf(void) if (test__start_subtest("weak_ksyms_lskel")) test_weak_syms_lskel(); - if (test__start_subtest("write_check")) - test_write_check(); + if (test__start_subtest("write_check1")) + test_write_check(true); + + if (test__start_subtest("write_check2")) + test_write_check(false); } diff --git a/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c b/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c index 509e21d5cb9d..b90ee47d3111 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c +++ b/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c @@ -81,6 +81,7 @@ void test_test_global_funcs(void) { "test_global_func14.o", "reference type('FWD S') size cannot be determined" }, { "test_global_func15.o", "At program exit the register R0 has value" }, { "test_global_func16.o", "invalid indirect read from stack" }, + { "test_global_func17.o", "Caller passes invalid args into func#1" }, }; libbpf_print_fn_t old_print_fn = NULL; int err, i, duration = 0; diff --git a/tools/testing/selftests/bpf/progs/test_global_func17.c b/tools/testing/selftests/bpf/progs/test_global_func17.c new file mode 100644 index 000000000000..2b8b9b8ba018 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_global_func17.c @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include + +__noinline int foo(int *p) +{ + return p ? (*p = 42) : 0; +} + +const volatile int i; + +SEC("tc") +int test_cls(struct __sk_buff *skb) +{ + return foo((int *)&i); +} diff --git a/tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c b/tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c index 2180c41cd890..a72a5bf3812a 100644 --- a/tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c +++ b/tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c @@ -8,7 +8,7 @@ extern const int bpf_prog_active __ksym; /* int type global var. 
*/ SEC("raw_tp/sys_enter") -int handler(const void *ctx) +int handler1(const void *ctx) { int *active; __u32 cpu; @@ -26,4 +26,20 @@ int handler(const void *ctx) return 0; } +__noinline int write_active(int *p) +{ + return p ? (*p = 42) : 0; +} + +SEC("raw_tp/sys_enter") +int handler2(const void *ctx) +{ + int *active; + __u32 cpu; + + active = bpf_this_cpu_ptr(&bpf_prog_active); + write_active(active); + return 0; +} + char _license[] SEC("license") = "GPL"; -- cgit From 9fc4476a08b6dc61e406c2c0c4e0690512f60e82 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 19 Mar 2022 13:38:27 +0530 Subject: selftests/bpf: Test for writes to map key from BPF helpers When invoking bpf_for_each_map_elem callback, we are passed a PTR_TO_MAP_KEY, previously writes to this through helper may be allowed, but the fix in previous patches is meant to prevent that case. The test case tries to pass it as writable memory to helper, and fails test if it succeeds to pass the verifier. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220319080827.73251-6-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/for_each.c | 12 ++++++++++ .../bpf/progs/for_each_map_elem_write_key.c | 27 ++++++++++++++++++++++ 2 files changed, 39 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/for_each_map_elem_write_key.c diff --git a/tools/testing/selftests/bpf/prog_tests/for_each.c b/tools/testing/selftests/bpf/prog_tests/for_each.c index 044df13ee069..754e80937e5d 100644 --- a/tools/testing/selftests/bpf/prog_tests/for_each.c +++ b/tools/testing/selftests/bpf/prog_tests/for_each.c @@ -4,6 +4,7 @@ #include #include "for_each_hash_map_elem.skel.h" #include "for_each_array_map_elem.skel.h" +#include "for_each_map_elem_write_key.skel.h" static unsigned int duration; @@ -129,10 +130,21 @@ out: for_each_array_map_elem__destroy(skel); } +static void test_write_map_key(void) +{ + struct for_each_map_elem_write_key *skel; + + skel = for_each_map_elem_write_key__open_and_load(); + if (!ASSERT_ERR_PTR(skel, "for_each_map_elem_write_key__open_and_load")) + for_each_map_elem_write_key__destroy(skel); +} + void test_for_each(void) { if (test__start_subtest("hash_map")) test_hash_map(); if (test__start_subtest("array_map")) test_array_map(); + if (test__start_subtest("write_map_key")) + test_write_map_key(); } diff --git a/tools/testing/selftests/bpf/progs/for_each_map_elem_write_key.c b/tools/testing/selftests/bpf/progs/for_each_map_elem_write_key.c new file mode 100644 index 000000000000..8e545865ea33 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/for_each_map_elem_write_key.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u64); +} array_map SEC(".maps"); + +static __u64 +check_array_elem(struct bpf_map *map, __u32 *key, __u64 *val, + void *data) +{ + bpf_get_current_comm(key, sizeof(*key)); + return 0; +} + +SEC("raw_tp/sys_enter") +int test_map_key_write(const void *ctx) +{ + bpf_for_each_map_elem(&array_map, check_array_elem, NULL, 0); + return 0; +} + +char _license[] SEC("license") = "GPL"; -- cgit From a1c9d61b19cbc0b9618c0a0400c304ecb63221d5 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Wed, 6 Apr 2022 12:43:49 +0100 Subject: libbpf: Improve library identification for uprobe binary path resolution In the process of doing path resolution for uprobe attach, libraries are identified by matching a 
".so" substring in the binary_path. This matches a lot of patterns that do not conform to library.so[.version] format, so instead match a ".so" _suffix_, and if that fails match a ".so." substring for the versioned library case. Suggested-by: Andrii Nakryiko Signed-off-by: Alan Maguire Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1649245431-29956-2-git-send-email-alan.maguire@oracle.com --- tools/lib/bpf/libbpf.c | 2 +- tools/lib/bpf/libbpf_internal.h | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 1111e9d16e01..c92226a150d0 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -10766,7 +10766,7 @@ static int resolve_full_path(const char *file, char *result, size_t result_sz) const char *search_paths[3] = {}; int i; - if (strstr(file, ".so")) { + if (str_has_sfx(file, ".so") || strstr(file, ".so.")) { search_paths[0] = getenv("LD_LIBRARY_PATH"); search_paths[1] = "/usr/lib64:/usr/lib"; search_paths[2] = arch_specific_lib_paths(); diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index dd0d4ccfa649..080272421f6c 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -103,6 +103,17 @@ #define str_has_pfx(str, pfx) \ (strncmp(str, pfx, __builtin_constant_p(pfx) ? sizeof(pfx) - 1 : strlen(pfx)) == 0) +/* suffix check */ +static inline bool str_has_sfx(const char *str, const char *sfx) +{ + size_t str_len = strlen(str); + size_t sfx_len = strlen(sfx); + + if (sfx_len <= str_len) + return strcmp(str + str_len - sfx_len, sfx); + return false; +} + /* Symbol versioning is different between static and shared library. * Properly versioned symbols are needed for shared library, but * only the symbol of the new version is needed for static library. -- cgit From 90db26e6be01cea519d380c59db3491e75b96b7f Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Wed, 6 Apr 2022 12:43:50 +0100 Subject: libbpf: Improve string parsing for uprobe auto-attach For uprobe auto-attach, the parsing can be simplified for the SEC() name to a single sscanf(); the return value of the sscanf can then be used to distinguish between sections that simply specify "u[ret]probe" (and thus cannot auto-attach), those that specify "u[ret]probe/binary_path:function+offset" etc. 
Suggested-by: Andrii Nakryiko Signed-off-by: Alan Maguire Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1649245431-29956-3-git-send-email-alan.maguire@oracle.com --- tools/lib/bpf/libbpf.c | 81 ++++++++++++++++++++------------------------------ 1 file changed, 33 insertions(+), 48 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index c92226a150d0..016ecdd1c3e1 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -10913,60 +10913,45 @@ err_out: static int attach_uprobe(const struct bpf_program *prog, long cookie, struct bpf_link **link) { DECLARE_LIBBPF_OPTS(bpf_uprobe_opts, opts); - char *func, *probe_name, *func_end; - char *func_name, binary_path[512]; - unsigned long long raw_offset; - size_t offset = 0; - int n; + char *probe_type = NULL, *binary_path = NULL, *func_name = NULL; + int n, ret = -EINVAL; + long offset = 0; *link = NULL; - opts.retprobe = str_has_pfx(prog->sec_name, "uretprobe"); - if (opts.retprobe) - probe_name = prog->sec_name + sizeof("uretprobe") - 1; - else - probe_name = prog->sec_name + sizeof("uprobe") - 1; - if (probe_name[0] == '/') - probe_name++; - - /* handle SEC("u[ret]probe") - format is valid, but auto-attach is impossible. */ - if (strlen(probe_name) == 0) - return 0; - - snprintf(binary_path, sizeof(binary_path), "%s", probe_name); - /* ':' should be prior to function+offset */ - func_name = strrchr(binary_path, ':'); - if (!func_name) { - pr_warn("section '%s' missing ':function[+offset]' specification\n", + n = sscanf(prog->sec_name, "%m[^/]/%m[^:]:%m[a-zA-Z0-9_.]+%li", + &probe_type, &binary_path, &func_name, &offset); + switch (n) { + case 1: + /* handle SEC("u[ret]probe") - format is valid, but auto-attach is impossible. */ + ret = 0; + break; + case 2: + pr_warn("prog '%s': section '%s' missing ':function[+offset]' specification\n", + prog->name, prog->sec_name); + break; + case 3: + case 4: + opts.retprobe = strcmp(probe_type, "uretprobe") == 0; + if (opts.retprobe && offset != 0) { + pr_warn("prog '%s': uretprobes do not support offset specification\n", + prog->name); + break; + } + opts.func_name = func_name; + *link = bpf_program__attach_uprobe_opts(prog, -1, binary_path, offset, &opts); + ret = libbpf_get_error(*link); + break; + default: + pr_warn("prog '%s': invalid format of section definition '%s'\n", prog->name, prog->sec_name); - return -EINVAL; - } - func_name[0] = '\0'; - func_name++; - n = sscanf(func_name, "%m[a-zA-Z0-9_.]+%li", &func, &offset); - if (n < 1) { - pr_warn("uprobe name '%s' is invalid\n", func_name); - return -EINVAL; - } - if (opts.retprobe && offset != 0) { - free(func); - pr_warn("uretprobes do not support offset specification\n"); - return -EINVAL; - } - - /* Is func a raw address? 
 */
-	errno = 0;
-	raw_offset = strtoull(func, &func_end, 0);
-	if (!errno && !*func_end) {
-		free(func);
-		func = NULL;
-		offset = (size_t)raw_offset;
+		break;
 	}
-	opts.func_name = func;
+	free(probe_type);
+	free(binary_path);
+	free(func_name);
-	*link = bpf_program__attach_uprobe_opts(prog, -1, binary_path, offset, &opts);
-	free(func);
-	return libbpf_get_error(*link);
+	return ret;
 }

 struct bpf_link *bpf_program__attach_uprobe(const struct bpf_program *prog,
-- cgit

From 1717e248014c33a81d7a1cdc048c6b3bed7bb3f8 Mon Sep 17 00:00:00 2001
From: Alan Maguire
Date: Wed, 6 Apr 2022 12:43:51 +0100
Subject: selftests/bpf: Uprobe tests should verify param/return values

uprobe/uretprobe tests don't do any validation of arguments/return values, and without this we can't be sure we are attached to the right function, or that we are indeed attached to a uprobe or uretprobe. To fix this, record the argument and return value for the auto-attached functions and ensure these match expectations. We also need to filter by pid to ensure we do not pick up stray malloc()s, since auto-attach traces libc system-wide.

Suggested-by: Andrii Nakryiko
Signed-off-by: Alan Maguire
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/1649245431-29956-4-git-send-email-alan.maguire@oracle.com
---
 .../selftests/bpf/prog_tests/uprobe_autoattach.c | 25 +++++++++----
 .../selftests/bpf/progs/test_uprobe_autoattach.c | 43 ++++++++++++++++------
 2 files changed, 50 insertions(+), 18 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_autoattach.c b/tools/testing/selftests/bpf/prog_tests/uprobe_autoattach.c
index 03b15d632be4..d6003dc8cc99 100644
--- a/tools/testing/selftests/bpf/prog_tests/uprobe_autoattach.c
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_autoattach.c
@@ -5,14 +5,17 @@
 #include "test_uprobe_autoattach.skel.h"

 /* uprobe attach point */
-static void autoattach_trigger_func(void)
+static noinline int autoattach_trigger_func(int arg)
 {
 	asm volatile ("");
+	return arg + 1;
 }

 void test_uprobe_autoattach(void)
 {
 	struct test_uprobe_autoattach *skel;
+	int trigger_val = 100, trigger_ret;
+	size_t malloc_sz = 1;
 	char *mem;

 	skel = test_uprobe_autoattach__open_and_load();
@@ -22,17 +25,25 @@ void test_uprobe_autoattach(void)
 	if (!ASSERT_OK(test_uprobe_autoattach__attach(skel), "skel_attach"))
 		goto cleanup;

+	skel->bss->test_pid = getpid();
+
 	/* trigger & validate uprobe & uretprobe */
-	autoattach_trigger_func();
+	trigger_ret = autoattach_trigger_func(trigger_val);
+
+	skel->bss->test_pid = getpid();

 	/* trigger & validate shared library u[ret]probes attached by name */
-	mem = malloc(1);
+	mem = malloc(malloc_sz);
 	free(mem);

-	ASSERT_EQ(skel->bss->uprobe_byname_res, 1, "check_uprobe_byname_res");
-	ASSERT_EQ(skel->bss->uretprobe_byname_res, 2, "check_uretprobe_byname_res");
-	ASSERT_EQ(skel->bss->uprobe_byname2_res, 3, "check_uprobe_byname2_res");
-	ASSERT_EQ(skel->bss->uretprobe_byname2_res, 4, "check_uretprobe_byname2_res");
+	ASSERT_EQ(skel->bss->uprobe_byname_parm1, trigger_val, "check_uprobe_byname_parm1");
+	ASSERT_EQ(skel->bss->uprobe_byname_ran, 1, "check_uprobe_byname_ran");
+	ASSERT_EQ(skel->bss->uretprobe_byname_rc, trigger_ret, "check_uretprobe_byname_rc");
+	ASSERT_EQ(skel->bss->uretprobe_byname_ran, 2, "check_uretprobe_byname_ran");
+	ASSERT_EQ(skel->bss->uprobe_byname2_parm1, malloc_sz, "check_uprobe_byname2_parm1");
+	ASSERT_EQ(skel->bss->uprobe_byname2_ran, 3, "check_uprobe_byname2_ran");
+	ASSERT_EQ(skel->bss->uretprobe_byname2_rc, mem, "check_uretprobe_byname2_rc");
+	ASSERT_EQ(skel->bss->uretprobe_byname2_ran, 4, "check_uretprobe_byname2_ran");

 cleanup:
 	test_uprobe_autoattach__destroy(skel);
 }
diff --git a/tools/testing/selftests/bpf/progs/test_uprobe_autoattach.c b/tools/testing/selftests/bpf/progs/test_uprobe_autoattach.c
index b442fb5ef54b..ab75522e2eeb 100644
--- a/tools/testing/selftests/bpf/progs/test_uprobe_autoattach.c
+++ b/tools/testing/selftests/bpf/progs/test_uprobe_autoattach.c
@@ -1,15 +1,22 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2022, Oracle and/or its affiliates. */
-#include <linux/ptrace.h>
-#include <linux/bpf.h>
+#include "vmlinux.h"
+
+#include <bpf/bpf_core_read.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>

-int uprobe_byname_res = 0;
-int uretprobe_byname_res = 0;
-int uprobe_byname2_res = 0;
-int uretprobe_byname2_res = 0;
+int uprobe_byname_parm1 = 0;
+int uprobe_byname_ran = 0;
+int uretprobe_byname_rc = 0;
+int uretprobe_byname_ran = 0;
+size_t uprobe_byname2_parm1 = 0;
+int uprobe_byname2_ran = 0;
+char *uretprobe_byname2_rc = NULL;
+int uretprobe_byname2_ran = 0;
+
+int test_pid;

 /* This program cannot auto-attach, but that should not stop other
  * programs from attaching.
@@ -23,14 +30,16 @@ int handle_uprobe_noautoattach(struct pt_regs *ctx)
 SEC("uprobe//proc/self/exe:autoattach_trigger_func")
 int handle_uprobe_byname(struct pt_regs *ctx)
 {
-	uprobe_byname_res = 1;
+	uprobe_byname_parm1 = PT_REGS_PARM1_CORE(ctx);
+	uprobe_byname_ran = 1;
 	return 0;
 }

 SEC("uretprobe//proc/self/exe:autoattach_trigger_func")
 int handle_uretprobe_byname(struct pt_regs *ctx)
 {
-	uretprobe_byname_res = 2;
+	uretprobe_byname_rc = PT_REGS_RC_CORE(ctx);
+	uretprobe_byname_ran = 2;
 	return 0;
 }

@@ -38,14 +47,26 @@ int handle_uretprobe_byname(struct pt_regs *ctx)
 SEC("uprobe/libc.so.6:malloc")
 int handle_uprobe_byname2(struct pt_regs *ctx)
 {
-	uprobe_byname2_res = 3;
+	int pid = bpf_get_current_pid_tgid() >> 32;
+
+	/* ignore irrelevant invocations */
+	if (test_pid != pid)
+		return 0;
+	uprobe_byname2_parm1 = PT_REGS_PARM1_CORE(ctx);
+	uprobe_byname2_ran = 3;
 	return 0;
 }

-SEC("uretprobe/libc.so.6:free")
+SEC("uretprobe/libc.so.6:malloc")
 int handle_uretprobe_byname2(struct pt_regs *ctx)
 {
-	uretprobe_byname2_res = 4;
+	int pid = bpf_get_current_pid_tgid() >> 32;
+
+	/* ignore irrelevant invocations */
+	if (test_pid != pid)
+		return 0;
+	uretprobe_byname2_rc = (char *)PT_REGS_RC_CORE(ctx);
+	uretprobe_byname2_ran = 4;
 	return 0;
 }
-- cgit

From e58c5c9717460851047f63b8615ea0760a6f3a2e Mon Sep 17 00:00:00 2001
From: Haowen Bai
Date: Thu, 7 Apr 2022 10:38:17 +0800
Subject: libbpf: Potential NULL dereference in usdt_manager_attach_usdt()

link could be NULL, yet bpf_link__destroy(&link->link) would still dereference it, leading to a NULL pointer access.
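[ Note: a minimal sketch of the bug pattern with made-up types, not libbpf
  code. The destroy helper, like bpf_link__destroy() here, dereferences its
  argument unconditionally, so the error path must guard the call:

  #include <stdio.h>
  #include <stdlib.h>

  struct link { int fd; };

  /* stand-in for bpf_link__destroy(): dereferences unconditionally */
  static void link_destroy(struct link *l)
  {
          printf("destroying link fd=%d\n", l->fd);
          free(l);
  }

  int main(void)
  {
          struct link *link = NULL;

          /* assume an earlier allocation failed, so link is still NULL */
          if (link)
                  link_destroy(link);
          return 0;
  }
]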
Signed-off-by: Haowen Bai
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/1649299098-2069-1-git-send-email-baihaowen@meizu.com
---
 tools/lib/bpf/usdt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c
index c5acf2824fcc..bb1e88613343 100644
--- a/tools/lib/bpf/usdt.c
+++ b/tools/lib/bpf/usdt.c
@@ -1071,8 +1071,8 @@ struct bpf_link *usdt_manager_attach_usdt(struct usdt_manager *man, const struct
 	return &link->link;

 err_out:
-	bpf_link__destroy(&link->link);
-
+	if (link)
+		bpf_link__destroy(&link->link);
 	free(targets);
 	hashmap__free(specs_hash);
 	if (elf)
-- cgit

From ded6dffaed5edc68f1e64b523353da14db673460 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Thu, 7 Apr 2022 13:38:42 -0700
Subject: libbpf: Use #ifdef instead of #if to avoid compiler warning

As reported by Naresh: perf build errors on i386 [1] on Linux next-20220407 [2]

  usdt.c:1181:5: error: "__x86_64__" is not defined, evaluates to 0 [-Werror=undef]
   1181 | #if __x86_64__
        |     ^~~~~~~~~~
  usdt.c:1196:5: error: "__x86_64__" is not defined, evaluates to 0 [-Werror=undef]
   1196 | #if __x86_64__
        |     ^~~~~~~~~~
  cc1: all warnings being treated as errors

Use #ifdef instead of #if to avoid this.

Fixes: 4c59e584d158 ("libbpf: Add x86-specific USDT arg spec parsing logic")
Reported-by: Naresh Kamboju
Signed-off-by: Andrii Nakryiko
Signed-off-by: Daniel Borkmann
Link: https://lore.kernel.org/bpf/20220407203842.3019904-1-andrii@kernel.org
---
 tools/lib/bpf/usdt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c
index bb1e88613343..b699e720136a 100644
--- a/tools/lib/bpf/usdt.c
+++ b/tools/lib/bpf/usdt.c
@@ -1178,7 +1178,7 @@ static int calc_pt_regs_off(const char *reg_name)
 		const char *names[4];
 		size_t pt_regs_off;
 	} reg_map[] = {
-#if __x86_64__
+#ifdef __x86_64__
 #define reg_off(reg64, reg32) offsetof(struct pt_regs, reg64)
 #else
 #define reg_off(reg64, reg32) offsetof(struct pt_regs, reg32)
@@ -1193,7 +1193,7 @@ static int calc_pt_regs_off(const char *reg_name)
 		{ {"rbp", "ebp", "bp", "bpl"}, reg_off(rbp, ebp) },
 		{ {"rsp", "esp", "sp", "spl"}, reg_off(rsp, esp) },
 #undef reg_off
-#if __x86_64__
+#ifdef __x86_64__
 		{ {"r8", "r8d", "r8w", "r8b"}, offsetof(struct pt_regs, r8) },
 		{ {"r9", "r9d", "r9w", "r9b"}, offsetof(struct pt_regs, r9) },
 		{ {"r10", "r10d", "r10w", "r10b"}, offsetof(struct pt_regs, r10) },
-- cgit

From e1b6df598aa86e351437135c76ed38c9a5e3d397 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich
Date: Thu, 7 Apr 2022 23:44:09 +0200
Subject: libbpf: Minor style improvements in USDT code

Fix several typos and references to non-existent headers. Also use __BYTE_ORDER__ instead of __BYTE_ORDER for consistency with the rest of the bpf code - see commit 45f2bebc8079 ("libbpf: Fix endianness detection in BPF_CORE_READ_BITFIELD_PROBED()") for the rationale.

Signed-off-by: Ilya Leoshkevich
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20220407214411.257260-2-iii@linux.ibm.com
---
 tools/lib/bpf/usdt.bpf.h |  4 ++--
 tools/lib/bpf/usdt.c     | 12 ++++++------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/tools/lib/bpf/usdt.bpf.h b/tools/lib/bpf/usdt.bpf.h
index 60237acf6b02..420d743734e1 100644
--- a/tools/lib/bpf/usdt.bpf.h
+++ b/tools/lib/bpf/usdt.bpf.h
@@ -166,7 +166,7 @@ int bpf_usdt_arg(struct pt_regs *ctx, __u64 arg_num, long *res)
 	case BPF_USDT_ARG_REG_DEREF:
 		/* Arg is in memory addressed by register, plus some offset
 		 * (e.g., "-4@-1204(%rbp)" in USDT arg spec).
Register is - * identified lik with BPF_USDT_ARG_REG case, and the offset + * identified like with BPF_USDT_ARG_REG case, and the offset * is in arg_spec->val_off. We first fetch register contents * from pt_regs, then do another user-space probe read to * fetch argument value itself. @@ -198,7 +198,7 @@ int bpf_usdt_arg(struct pt_regs *ctx, __u64 arg_num, long *res) /* Retrieve user-specified cookie value provided during attach as * bpf_usdt_opts.usdt_cookie. This serves the same purpose as BPF cookie * returned by bpf_get_attach_cookie(). Libbpf's support for USDT is itself - * utilizaing BPF cookies internally, so user can't use BPF cookie directly + * utilizing BPF cookies internally, so user can't use BPF cookie directly * for USDT programs and has to use bpf_usdt_cookie() API instead. */ static inline __noinline diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c index b699e720136a..3dcb79f1e3a7 100644 --- a/tools/lib/bpf/usdt.c +++ b/tools/lib/bpf/usdt.c @@ -108,7 +108,7 @@ * code through spec map. This allows BPF applications to quickly fetch the * actual value at runtime using a simple BPF-side code. * - * With basics out of the way, let's go over less immeditately obvious aspects + * With basics out of the way, let's go over less immediately obvious aspects * of supporting USDTs. * * First, there is no special USDT BPF program type. It is actually just @@ -189,14 +189,14 @@ #define USDT_NOTE_TYPE 3 #define USDT_NOTE_NAME "stapsdt" -/* should match exactly enum __bpf_usdt_arg_type from bpf_usdt.bpf.h */ +/* should match exactly enum __bpf_usdt_arg_type from usdt.bpf.h */ enum usdt_arg_type { USDT_ARG_CONST, USDT_ARG_REG, USDT_ARG_REG_DEREF, }; -/* should match exactly struct __bpf_usdt_arg_spec from bpf_usdt.bpf.h */ +/* should match exactly struct __bpf_usdt_arg_spec from usdt.bpf.h */ struct usdt_arg_spec { __u64 val_off; enum usdt_arg_type arg_type; @@ -328,9 +328,9 @@ static int sanity_check_usdt_elf(Elf *elf, const char *path) return -EBADF; } -#if __BYTE_ORDER == __LITTLE_ENDIAN +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ endianness = ELFDATA2LSB; -#elif __BYTE_ORDER == __BIG_ENDIAN +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ endianness = ELFDATA2MSB; #else # error "Unrecognized __BYTE_ORDER__" @@ -843,7 +843,7 @@ static int bpf_link_usdt_detach(struct bpf_link *link) sizeof(*new_free_ids)); /* If we couldn't resize free_spec_ids, we'll just leak * a bunch of free IDs; this is very unlikely to happen and if - * system is so exausted on memory, it's the least of user's + * system is so exhausted on memory, it's the least of user's * concerns, probably. * So just do our best here to return those IDs to usdt_manager. */ -- cgit From 6f403d9d530635f533577d37929c61474d6c5d7f Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 7 Apr 2022 23:44:10 +0200 Subject: libbpf: Make BPF-side of USDT support work on big-endian machines BPF_USDT_ARG_REG_DEREF handling always reads 8 bytes, regardless of the actual argument size. On little-endian the relevant argument bits end up in the lower bits of val, and later on the code that handles all the argument types expects them to be there. On big-endian they end up in the upper bits of val, breaking that expectation. Fix by right-shifting val on big-endian. 
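[ Note: to see why the shift is needed, a minimal user-space sketch (not BPF
  code) that emulates the fixed 8-byte probe read of a 4-byte argument; on
  big-endian the meaningful bytes land in the high half of val, and the
  precomputed arg_bitshift moves them down:

  #include <stdio.h>
  #include <string.h>

  int main(void)
  {
          int arg = -4;                   /* 4-byte argument in target memory */
          unsigned long long val = 0;
          int arg_bitshift = 64 - 4 * 8;  /* as computed per argument size */

          /* emulate reading 8 bytes starting at the 4-byte argument */
          memcpy(&val, &arg, sizeof(arg));

  #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
          val >>= arg_bitshift;           /* meaningful bits are high on BE */
  #endif
          printf("low 32 bits: 0x%llx\n", val & 0xffffffffULL);
          return 0;
  }
]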
Signed-off-by: Ilya Leoshkevich Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220407214411.257260-3-iii@linux.ibm.com --- tools/lib/bpf/usdt.bpf.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/lib/bpf/usdt.bpf.h b/tools/lib/bpf/usdt.bpf.h index 420d743734e1..881a2422a8ef 100644 --- a/tools/lib/bpf/usdt.bpf.h +++ b/tools/lib/bpf/usdt.bpf.h @@ -177,6 +177,9 @@ int bpf_usdt_arg(struct pt_regs *ctx, __u64 arg_num, long *res) err = bpf_probe_read_user(&val, sizeof(val), (void *)val + arg_spec->val_off); if (err) return err; +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + val >>= arg_spec->arg_bitshift; +#endif break; default: return -EINVAL; -- cgit From bd022685bd441056365e9a44a6bf940f45054250 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 7 Apr 2022 23:44:11 +0200 Subject: libbpf: Add s390-specific USDT arg spec parsing logic The logic is superficially similar to that of x86, but the small differences (no need for register table and dynamic allocation of register names, no $ sign before constants) make maintaining a common implementation too burdensome. Therefore simply add a s390x-specific version of parse_usdt_arg(). Note that while bcc supports index registers, this patch does not. This should not be a problem in most cases, since s390 uses a default value "nor" for STAP_SDT_ARG_CONSTRAINT. Signed-off-by: Ilya Leoshkevich Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220407214411.257260-4-iii@linux.ibm.com --- tools/lib/bpf/usdt.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c index 3dcb79f1e3a7..30c495a6554c 100644 --- a/tools/lib/bpf/usdt.c +++ b/tools/lib/bpf/usdt.c @@ -1269,6 +1269,61 @@ static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec return len; } +#elif defined(__s390x__) + +/* Do not support __s390__ for now, since user_pt_regs is broken with -m31. 
 */
+
+static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg)
+{
+	unsigned int reg;
+	int arg_sz, len;
+	long off;
+
+	if (sscanf(arg_str, " %d @ %ld ( %%r%u ) %n", &arg_sz, &off, &reg, &len) == 3) {
+		/* Memory dereference case, e.g., -2@-28(%r15) */
+		arg->arg_type = USDT_ARG_REG_DEREF;
+		arg->val_off = off;
+		if (reg > 15) {
+			pr_warn("usdt: unrecognized register '%%r%u'\n", reg);
+			return -EINVAL;
+		}
+		arg->reg_off = offsetof(user_pt_regs, gprs[reg]);
+	} else if (sscanf(arg_str, " %d @ %%r%u %n", &arg_sz, &reg, &len) == 2) {
+		/* Register read case, e.g., -8@%r0 */
+		arg->arg_type = USDT_ARG_REG;
+		arg->val_off = 0;
+		if (reg > 15) {
+			pr_warn("usdt: unrecognized register '%%r%u'\n", reg);
+			return -EINVAL;
+		}
+		arg->reg_off = offsetof(user_pt_regs, gprs[reg]);
+	} else if (sscanf(arg_str, " %d @ %ld %n", &arg_sz, &off, &len) == 2) {
+		/* Constant value case, e.g., 4@71 */
+		arg->arg_type = USDT_ARG_CONST;
+		arg->val_off = off;
+		arg->reg_off = 0;
+	} else {
+		pr_warn("usdt: unrecognized arg #%d spec '%s'\n", arg_num, arg_str);
+		return -EINVAL;
+	}
+
+	arg->arg_signed = arg_sz < 0;
+	if (arg_sz < 0)
+		arg_sz = -arg_sz;
+
+	switch (arg_sz) {
+	case 1: case 2: case 4: case 8:
+		arg->arg_bitshift = 64 - arg_sz * 8;
+		break;
+	default:
+		pr_warn("usdt: unsupported arg #%d (spec '%s') size: %d\n",
+			arg_num, arg_str, arg_sz);
+		return -EINVAL;
+	}
+
+	return len;
+}
+
 #else

 static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg)
-- cgit

From 3c0dfe6e4c43ea0cf252ff4cb7a332423866d488 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Thu, 7 Apr 2022 16:04:45 -0700
Subject: libbpf: Use strlcpy() in path resolution fallback logic

Coverity static analyzer complains that strcpy() can cause buffer overflow. Use libbpf_strlcpy() instead to be 100% sure this doesn't happen.

Signed-off-by: Andrii Nakryiko
Signed-off-by: Alexei Starovoitov
Link: https://lore.kernel.org/bpf/20220407230446.3980075-1-andrii@kernel.org
---
 tools/lib/bpf/usdt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c
index 30c495a6554c..acf2d99a9e77 100644
--- a/tools/lib/bpf/usdt.c
+++ b/tools/lib/bpf/usdt.c
@@ -456,7 +456,7 @@ static int parse_lib_segs(int pid, const char *lib_path, struct elf_seg **segs,
 	if (!realpath(lib_path, path)) {
 		pr_warn("usdt: failed to get absolute path of '%s' (err %d), using path as is...\n",
 			lib_path, -errno);
-		strcpy(path, lib_path);
+		libbpf_strlcpy(path, lib_path, sizeof(path));
 	}

 proceed:
-- cgit

From 3a06ec0a996dc8c4bc518f0b6bedc3587dd15169 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Thu, 7 Apr 2022 16:04:46 -0700
Subject: libbpf: Allow WEAK and GLOBAL bindings during BTF fixup

During BTF fixup for global variables, a global variable can be weak and will then have STB_WEAK binding in ELF. Support such global variables in addition to non-weak ones. This is not a problem when using BPF static linking, as the BPF static linker "fixes up" BTF during generation so that libbpf doesn't have to do it anymore during bpf_object__open(); that, along with the currently rather rare use of __weak variables and maps, led to this not being noticed for a while.
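[ Note: a minimal sketch of the relaxed binding check; the helper name is
  made up, but the accept-GLOBAL-or-WEAK logic mirrors the hunk below:

  #include <elf.h>
  #include <stdbool.h>
  #include <stdio.h>

  static bool var_sym_matches(const Elf64_Sym *sym)
  {
          if (ELF64_ST_TYPE(sym->st_info) != STT_OBJECT)
                  return false;
          /* accept both strong and weak global variables */
          return ELF64_ST_BIND(sym->st_info) == STB_GLOBAL ||
                 ELF64_ST_BIND(sym->st_info) == STB_WEAK;
  }

  int main(void)
  {
          Elf64_Sym weak_obj = {
                  .st_info = ELF64_ST_INFO(STB_WEAK, STT_OBJECT),
          };

          printf("weak object accepted: %d\n", var_sym_matches(&weak_obj));
          return 0;
  }
]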
Reported-by: Hengqi Chen
Signed-off-by: Andrii Nakryiko
Signed-off-by: Alexei Starovoitov
Link: https://lore.kernel.org/bpf/20220407230446.3980075-2-andrii@kernel.org
---
 tools/lib/bpf/libbpf.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 016ecdd1c3e1..9deb1fc67f19 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1401,8 +1401,11 @@ static int find_elf_var_offset(const struct bpf_object *obj, const char *name, _
 	for (si = 0; si < symbols->d_size / sizeof(Elf64_Sym); si++) {
 		Elf64_Sym *sym = elf_sym_by_idx(obj, si);

-		if (ELF64_ST_BIND(sym->st_info) != STB_GLOBAL ||
-		    ELF64_ST_TYPE(sym->st_info) != STT_OBJECT)
+		if (ELF64_ST_TYPE(sym->st_info) != STT_OBJECT)
+			continue;
+
+		if (ELF64_ST_BIND(sym->st_info) != STB_GLOBAL &&
+		    ELF64_ST_BIND(sym->st_info) != STB_WEAK)
 			continue;

 		sname = elf_sym_str(obj, sym->st_name);
-- cgit

From 587323cf6a6a6da074f0518444ee7da10f517f45 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi
Date: Tue, 5 Apr 2022 16:15:14 +0200
Subject: samples, bpf: Move routes monitor in xdp_router_ipv4 to a dedicated thread

In order not to miss any netlink message from the kernel, move the routes monitor to a dedicated thread.

Fixes: 85bf1f51691c ("samples: bpf: Convert xdp_router_ipv4 to XDP samples helper")
Signed-off-by: Lorenzo Bianconi
Signed-off-by: Daniel Borkmann
Link: https://lore.kernel.org/bpf/e364b817c69ded73be24b677ab47a157f7c21b64.1649167911.git.lorenzo@kernel.org
---
 samples/bpf/Makefile               |  2 +-
 samples/bpf/xdp_router_ipv4_user.c | 86 ++++++++++++++++++++++++--------------
 2 files changed, 55 insertions(+), 33 deletions(-)

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 342a41a10356..8fff5ad3444b 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -219,7 +219,7 @@ TPROGLDLIBS_xdp_redirect += -lm
 TPROGLDLIBS_xdp_redirect_cpu += -lm
 TPROGLDLIBS_xdp_redirect_map += -lm
 TPROGLDLIBS_xdp_redirect_map_multi += -lm
-TPROGLDLIBS_xdp_router_ipv4 += -lm
+TPROGLDLIBS_xdp_router_ipv4 += -lm -pthread
 TPROGLDLIBS_tracex4 += -lrt
 TPROGLDLIBS_trace_output += -lrt
 TPROGLDLIBS_map_perf_test += -lrt
diff --git a/samples/bpf/xdp_router_ipv4_user.c b/samples/bpf/xdp_router_ipv4_user.c
index 7828784612ec..f32bbd5c32bf 100644
--- a/samples/bpf/xdp_router_ipv4_user.c
+++ b/samples/bpf/xdp_router_ipv4_user.c
@@ -25,6 +25,7 @@
 #include
 #include
 #include
+#include <pthread.h>
 #include "xdp_sample_user.h"
 #include "xdp_router_ipv4.skel.h"
@@ -38,6 +39,9 @@
 static int arp_table_map_fd;
 static int exact_match_map_fd;
 static int tx_port_map_fd;
+static bool routes_thread_exit;
+static int interval = 5;
+
 static int mask = SAMPLE_RX_CNT | SAMPLE_REDIRECT_ERR_MAP_CNT |
 		  SAMPLE_DEVMAP_XMIT_CNT_MULTI | SAMPLE_EXCEPTION_CNT;
@@ -445,7 +449,7 @@ cleanup:
 /* Function to keep track and update changes in route and arp table
  * Give regular statistics of packets forwarded
  */
-static void monitor_route(void *ctx)
+static void *monitor_routes_thread(void *arg)
 {
 	struct pollfd fds_route, fds_arp;
 	struct sockaddr_nl la, lr;
@@ -455,7 +459,7 @@
 	sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
 	if (sock < 0) {
 		fprintf(stderr, "open netlink socket: %s\n", strerror(errno));
-		return;
+		return NULL;
 	}

 	fcntl(sock, F_SETFL, O_NONBLOCK);
@@ -465,7 +469,7 @@
 	if (bind(sock, (struct sockaddr *)&lr, sizeof(lr)) < 0) {
 		fprintf(stderr, "bind netlink socket: %s\n", strerror(errno));
 		close(sock);
-		return;
+		return NULL;
 	}

 	fds_route.fd = sock;
@@ -475,7 +479,7 @@
 	if (sock_arp < 0) {
 		fprintf(stderr, "open netlink socket: %s\n", strerror(errno));
 		close(sock);
-		return;
+		return NULL;
 	}

 	fcntl(sock_arp, F_SETFL, O_NONBLOCK);
@@ -490,35 +494,51 @@
 	fds_arp.fd = sock_arp;
 	fds_arp.events = POLL_IN;

-	memset(buf, 0, sizeof(buf));
-	if (poll(&fds_route, 1, 3) == POLL_IN) {
-		nll = recv_msg(lr, sock);
-		if (nll < 0) {
-			fprintf(stderr, "recv from netlink: %s\n",
-				strerror(nll));
-			goto cleanup;
-		}
+	/* dump route and arp tables */
+	if (get_arp_table(AF_INET) < 0) {
+		fprintf(stderr, "Failed reading arp table\n");
+		goto cleanup;
+	}

-		nh = (struct nlmsghdr *)buf;
-		read_route(nh, nll);
+	if (get_route_table(AF_INET) < 0) {
+		fprintf(stderr, "Failed reading route table\n");
+		goto cleanup;
 	}

-	memset(buf, 0, sizeof(buf));
-	if (poll(&fds_arp, 1, 3) == POLL_IN) {
-		nll = recv_msg(la, sock_arp);
-		if (nll < 0) {
-			fprintf(stderr, "recv from netlink: %s\n",
-				strerror(nll));
-			goto cleanup;
+	while (!routes_thread_exit) {
+		memset(buf, 0, sizeof(buf));
+		if (poll(&fds_route, 1, 3) == POLL_IN) {
+			nll = recv_msg(lr, sock);
+			if (nll < 0) {
+				fprintf(stderr, "recv from netlink: %s\n",
+					strerror(nll));
+				goto cleanup;
+			}
+
+			nh = (struct nlmsghdr *)buf;
+			read_route(nh, nll);
 		}

-		nh = (struct nlmsghdr *)buf;
-		read_arp(nh, nll);
+		memset(buf, 0, sizeof(buf));
+		if (poll(&fds_arp, 1, 3) == POLL_IN) {
+			nll = recv_msg(la, sock_arp);
+			if (nll < 0) {
+				fprintf(stderr, "recv from netlink: %s\n",
+					strerror(nll));
+				goto cleanup;
+			}
+
+			nh = (struct nlmsghdr *)buf;
+			read_arp(nh, nll);
+		}
+
+		sleep(interval);
 	}

 cleanup:
 	close(sock_arp);
 	close(sock);
+	return NULL;
 }

 static void usage(char *argv[], const struct option *long_options,
@@ -531,10 +551,11 @@ static void usage(char *argv[], const struct option *long_options,
 int main(int argc, char **argv)
 {
 	bool error = true, generic = false, force = false;
-	int opt, interval = 5, ret = EXIT_FAIL_BPF;
+	int opt, ret = EXIT_FAIL_BPF;
 	struct xdp_router_ipv4 *skel;
 	int i, total_ifindex = argc - 1;
 	char **ifname_list = argv + 1;
+	pthread_t routes_thread;
 	int longindex = 0;

 	if (libbpf_set_strict_mode(LIBBPF_STRICT_ALL) < 0) {
@@ -653,24 +674,25 @@ int main(int argc, char **argv)
 		goto end_destroy;
 	}

-	if (get_route_table(AF_INET) < 0) {
-		fprintf(stderr, "Failed reading routing table\n");
+	ret = pthread_create(&routes_thread, NULL, monitor_routes_thread, NULL);
+	if (ret) {
+		fprintf(stderr, "Failed creating routes_thread: %s\n", strerror(ret));
+		ret = EXIT_FAIL;
 		goto end_destroy;
 	}

-	if (get_arp_table(AF_INET) < 0) {
-		fprintf(stderr, "Failed reading arptable\n");
-		goto end_destroy;
-	}
+	ret = sample_run(interval, NULL, NULL);
+	routes_thread_exit = true;

-	ret = sample_run(interval, monitor_route, NULL);
 	if (ret < 0) {
 		fprintf(stderr, "Failed during sample run: %s\n", strerror(-ret));
 		ret = EXIT_FAIL;
-		goto end_destroy;
+		goto end_thread_wait;
 	}
 	ret = EXIT_OK;
+end_thread_wait:
+	pthread_join(routes_thread, NULL);
 end_destroy:
 	xdp_router_ipv4__destroy(skel);
 end:
-- cgit

From e89d57d938c8fa80c457982154ed6110804814fe Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Fri, 8 Apr 2022 11:14:23 -0700
Subject: libbpf: Don't error out on CO-RE relos for overridden weak subprogs

During BPF static linking, all the ELF relocations and .BTF.ext information (including CO-RE relocations) are preserved for __weak subprograms that were logically overridden either by a previous weak subprogram instance or by a corresponding "strong" (non-weak)
subprogram. This is just how native user-space linkers work, nothing new. But libbpf was over-zealous when processing CO-RE relocations: it errored out whenever it encountered a CO-RE relocation belonging to such an eliminated weak subprogram. Instead of erroring out on this expected situation, log a debug-level message and skip the relocation.

Fixes: db2b8b06423c ("libbpf: Support CO-RE relocations for multi-prog sections")
Signed-off-by: Andrii Nakryiko
Signed-off-by: Daniel Borkmann
Link: https://lore.kernel.org/bpf/20220408181425.2287230-2-andrii@kernel.org
---
 tools/lib/bpf/libbpf.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 9deb1fc67f19..465b7c0996f1 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -5687,10 +5687,17 @@ bpf_object__relocate_core(struct bpf_object *obj, const char *targ_btf_path)
 		insn_idx = rec->insn_off / BPF_INSN_SZ;
 		prog = find_prog_by_sec_insn(obj, sec_idx, insn_idx);
 		if (!prog) {
-			pr_warn("sec '%s': failed to find program at insn #%d for CO-RE offset relocation #%d\n",
-				sec_name, insn_idx, i);
-			err = -EINVAL;
-			goto out;
+			/* When __weak subprog is "overridden" by another instance
+			 * of the subprog from a different object file, linker still
+			 * appends all the .BTF.ext info that used to belong to that
+			 * eliminated subprogram.
+			 * This is similar to what x86-64 linker does for relocations.
+			 * So just ignore such relocations just like we ignore
+			 * subprog instructions when discovering subprograms.
+			 */
+			pr_debug("sec '%s': skipping CO-RE relocation #%d for insn #%d belonging to eliminated weak subprogram\n",
+				 sec_name, i, insn_idx);
+			continue;
 		}
 		/* no need to apply CO-RE relocation if the program is
 		 * not going to be loaded
-- cgit

From 2fa5b0f290e19bb34393e1983be511aab18b683e Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Fri, 8 Apr 2022 11:14:24 -0700
Subject: libbpf: Use weak hidden modifier for USDT BPF-side API functions

Use __weak __hidden for bpf_usdt_xxx() APIs instead of the much more confusing `static inline __noinline`. This was previously impossible due to libbpf erroring out on CO-RE relocations pointing to eliminated weak subprogs. Now that the previous patch has fixed this issue, switch back to __weak __hidden as it's a more direct way of specifying the desired behavior.

Signed-off-by: Andrii Nakryiko
Signed-off-by: Daniel Borkmann
Link: https://lore.kernel.org/bpf/20220408181425.2287230-3-andrii@kernel.org
---
 tools/lib/bpf/usdt.bpf.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/lib/bpf/usdt.bpf.h b/tools/lib/bpf/usdt.bpf.h
index 881a2422a8ef..4181fddb3687 100644
--- a/tools/lib/bpf/usdt.bpf.h
+++ b/tools/lib/bpf/usdt.bpf.h
@@ -103,7 +103,7 @@ int __bpf_usdt_spec_id(struct pt_regs *ctx)
 }

 /* Return number of USDT arguments defined for currently traced USDT. */
-static inline __noinline
+__weak __hidden
 int bpf_usdt_arg_cnt(struct pt_regs *ctx)
 {
 	struct __bpf_usdt_spec *spec;
@@ -124,7 +124,7 @@ int bpf_usdt_arg_cnt(struct pt_regs *ctx)
  * Returns 0 on success; negative error, otherwise.
  * On error *res is guaranteed to be set to zero.
  */
-static inline __noinline
+__weak __hidden
 int bpf_usdt_arg(struct pt_regs *ctx, __u64 arg_num, long *res)
 {
 	struct __bpf_usdt_spec *spec;
@@ -204,7 +204,7 @@
  * utilizing BPF cookies internally, so user can't use BPF cookie directly
  * for USDT programs and has to use bpf_usdt_cookie() API instead.
 */
-static inline __noinline
+__weak __hidden
 long bpf_usdt_cookie(struct pt_regs *ctx)
 {
 	struct __bpf_usdt_spec *spec;
-- cgit

From 8555defe48610a7efe9f2d72689e2d4f006898d7 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Fri, 8 Apr 2022 11:14:25 -0700
Subject: selftests/bpf: Add CO-RE relos into linked_funcs selftests

Add CO-RE relocations into __weak subprogs for the multi-file linked_funcs selftest to make sure libbpf handles such a combination well.

Signed-off-by: Andrii Nakryiko
Signed-off-by: Daniel Borkmann
Link: https://lore.kernel.org/bpf/20220408181425.2287230-4-andrii@kernel.org
---
 tools/testing/selftests/bpf/progs/linked_funcs1.c | 8 ++++++++
 tools/testing/selftests/bpf/progs/linked_funcs2.c | 8 ++++++++
 2 files changed, 16 insertions(+)

diff --git a/tools/testing/selftests/bpf/progs/linked_funcs1.c b/tools/testing/selftests/bpf/progs/linked_funcs1.c
index b964ec1390c2..963b393c37e8 100644
--- a/tools/testing/selftests/bpf/progs/linked_funcs1.c
+++ b/tools/testing/selftests/bpf/progs/linked_funcs1.c
@@ -4,6 +4,7 @@
 #include "vmlinux.h"
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>

 /* weak and shared between two files */
 const volatile int my_tid __weak;
@@ -44,6 +45,13 @@ void set_output_ctx1(__u64 *ctx)
 /* this weak instance should win because it's the first one */
 __weak int set_output_weak(int x)
 {
+	static volatile int whatever;
+
+	/* make sure we use CO-RE relocations in a weak function, this used to
+	 * cause problems for BPF static linker
+	 */
+	whatever = bpf_core_type_size(struct task_struct);
+
 	output_weak1 = x;
 	return x;
 }
diff --git a/tools/testing/selftests/bpf/progs/linked_funcs2.c b/tools/testing/selftests/bpf/progs/linked_funcs2.c
index 575e958e60b7..db195872f4eb 100644
--- a/tools/testing/selftests/bpf/progs/linked_funcs2.c
+++ b/tools/testing/selftests/bpf/progs/linked_funcs2.c
@@ -4,6 +4,7 @@
 #include "vmlinux.h"
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>

 /* weak and shared between both files */
 const volatile int my_tid __weak;
@@ -44,6 +45,13 @@ void set_output_ctx2(__u64 *ctx)
 /* this weak instance should lose, because it will be processed second */
 __weak int set_output_weak(int x)
 {
+	static volatile int whatever;
+
+	/* make sure we use CO-RE relocations in a weak function, this used to
+	 * cause problems for BPF static linker
+	 */
+	whatever = 2 * bpf_core_type_size(struct task_struct);
+
 	output_weak2 = x;
 	return 2 * x;
 }
-- cgit

From 658d87687cd5a6c8762d1de8abee1e6792d8d71e Mon Sep 17 00:00:00 2001
From: Yuntao Wang
Date: Fri, 8 Apr 2022 12:14:52 +0800
Subject: selftests/bpf: Fix return value checks in perf_event_stackmap test

The bpf_get_stackid() function may also return 0 on success as per UAPI BPF helper documentation. Therefore, correct the checks from 'val > 0' to 'val >= 0' so that they cover all possible success return values.
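[ Note: a toy user-space sketch of the return-value convention, with a
  made-up stand-in for the helper: negative means error, and id 0 is the
  first valid success value, which a 'val > 0' check silently misses:

  #include <stdio.h>

  /* toy stand-in for bpf_get_stackid(): hands out ids starting at 0,
   * negative values being reserved for errors
   */
  static long toy_get_stackid(void)
  {
          static long next_id;

          return next_id++;
  }

  int main(void)
  {
          long val = toy_get_stackid();   /* first call returns id 0 */

          printf("val > 0:  %s\n", val > 0 ? "success" : "missed id 0");
          printf("val >= 0: %s\n", val >= 0 ? "success" : "error");
          return 0;
  }
]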
Signed-off-by: Yuntao Wang Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220408041452.933944-1-ytcoode@gmail.com --- tools/testing/selftests/bpf/progs/perf_event_stackmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/perf_event_stackmap.c b/tools/testing/selftests/bpf/progs/perf_event_stackmap.c index b3fcb5274ee0..f793280a3238 100644 --- a/tools/testing/selftests/bpf/progs/perf_event_stackmap.c +++ b/tools/testing/selftests/bpf/progs/perf_event_stackmap.c @@ -35,10 +35,10 @@ int oncpu(void *ctx) long val; val = bpf_get_stackid(ctx, &stackmap, 0); - if (val > 0) + if (val >= 0) stackid_kernel = 2; val = bpf_get_stackid(ctx, &stackmap, BPF_F_USER_STACK); - if (val > 0) + if (val >= 0) stackid_user = 2; trace = bpf_map_lookup_elem(&stackdata_map, &key); -- cgit From b45043192b3e481304062938a6561da2ceea46a6 Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Thu, 7 Apr 2022 21:04:23 +0800 Subject: bpf: Fix excessive memory allocation in stack_map_alloc() The 'n_buckets * (value_size + sizeof(struct stack_map_bucket))' part of the allocated memory for 'smap' is never used after the memlock accounting was removed, thus get rid of it. [ Note, Daniel: Commit b936ca643ade ("bpf: rework memlock-based memory accounting for maps") moved `cost += n_buckets * (value_size + sizeof(struct stack_map_bucket))` up and therefore before the bpf_map_area_alloc() allocation, sigh. In a later step commit c85d69135a91 ("bpf: move memory size checks to bpf_map_charge_init()"), and the overflow checks of `cost >= U32_MAX - PAGE_SIZE` moved into bpf_map_charge_init(). And then 370868107bf6 ("bpf: Eliminate rlimit-based memory accounting for stackmap maps") finally removed the bpf_map_charge_init(). Anyway, the original code did the allocation same way as /after/ this fix. ] Fixes: b936ca643ade ("bpf: rework memlock-based memory accounting for maps") Signed-off-by: Yuntao Wang Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220407130423.798386-1-ytcoode@gmail.com --- kernel/bpf/stackmap.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 6131b4a19572..1dd5266fbebb 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -100,7 +100,6 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) return ERR_PTR(-E2BIG); cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); - cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr)); if (!smap) return ERR_PTR(-ENOMEM); -- cgit
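[ Note: a small sketch of the corrected up-front cost with illustrative
  sizes; only the bucket-pointer array plus the map struct is charged
  eagerly, since the buckets themselves are allocated on demand:

  #include <stdio.h>

  struct stack_map_bucket {
          unsigned int size;
          char data[];
  };

  int main(void)
  {
          size_t n_buckets = 1024;  /* illustrative */
          size_t smap_sz = 256;     /* stand-in for sizeof(*smap) */

          /* per-bucket value storage is no longer part of the up-front cost */
          size_t cost = n_buckets * sizeof(struct stack_map_bucket *) + smap_sz;

          printf("up-front allocation: %zu bytes\n", cost);
          return 0;
  }
]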