diff options
200 files changed, 7503 insertions, 3323 deletions
diff --git a/Documentation/networking/kapi.rst b/Documentation/networking/kapi.rst index f03ae64be8bc..d198fa5eaacd 100644 --- a/Documentation/networking/kapi.rst +++ b/Documentation/networking/kapi.rst @@ -134,6 +134,15 @@ PHY Support .. kernel-doc:: drivers/net/phy/phy.c :internal: +.. kernel-doc:: drivers/net/phy/phy-core.c + :export: + +.. kernel-doc:: drivers/net/phy/phy-c45.c + :export: + +.. kernel-doc:: include/linux/phy.h + :internal: + .. kernel-doc:: drivers/net/phy/phy_device.c :export: @@ -1080,13 +1080,15 @@ ifdef CONFIG_STACK_VALIDATION endif endif +ifdef CONFIG_BPF ifdef CONFIG_DEBUG_INFO_BTF ifeq ($(has_libelf),1) resolve_btfids_target := tools/bpf/resolve_btfids FORCE else ERROR_RESOLVE_BTFIDS := 1 endif -endif +endif # CONFIG_DEBUG_INFO_BTF +endif # CONFIG_BPF PHONY += prepare0 diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index be4b8532dd3c..0a4182792876 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -50,7 +50,6 @@ struct bpf_jit { int r14_thunk_ip; /* Address of expoline thunk for 'br %r14' */ int tail_call_start; /* Tail call start offset */ int excnt; /* Number of exception table entries */ - int labels[1]; /* Labels for local jumps */ }; #define SEEN_MEM BIT(0) /* use mem[] for temporary storage */ @@ -229,18 +228,18 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) REG_SET_SEEN(b3); \ }) -#define EMIT6_PCREL_LABEL(op1, op2, b1, b2, label, mask) \ +#define EMIT6_PCREL_RIEB(op1, op2, b1, b2, mask, target) \ ({ \ - int rel = (jit->labels[label] - jit->prg) >> 1; \ + unsigned int rel = (int)((target) - jit->prg) / 2; \ _EMIT6((op1) | reg(b1, b2) << 16 | (rel & 0xffff), \ (op2) | (mask) << 12); \ REG_SET_SEEN(b1); \ REG_SET_SEEN(b2); \ }) -#define EMIT6_PCREL_IMM_LABEL(op1, op2, b1, imm, label, mask) \ +#define EMIT6_PCREL_RIEC(op1, op2, b1, imm, mask, target) \ ({ \ - int rel = (jit->labels[label] - jit->prg) >> 1; \ + unsigned int rel = (int)((target) - jit->prg) / 2; \ 
_EMIT6((op1) | (reg_high(b1) | (mask)) << 16 | \ (rel & 0xffff), (op2) | ((imm) & 0xff) << 8); \ REG_SET_SEEN(b1); \ @@ -1282,7 +1281,9 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT4(0xb9040000, BPF_REG_0, REG_2); break; } - case BPF_JMP | BPF_TAIL_CALL: + case BPF_JMP | BPF_TAIL_CALL: { + int patch_1_clrj, patch_2_clij, patch_3_brc; + /* * Implicit input: * B1: pointer to ctx @@ -1300,16 +1301,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT6_DISP_LH(0xe3000000, 0x0016, REG_W1, REG_0, BPF_REG_2, offsetof(struct bpf_array, map.max_entries)); /* if ((u32)%b3 >= (u32)%w1) goto out; */ - if (!is_first_pass(jit) && can_use_rel(jit, jit->labels[0])) { - /* clrj %b3,%w1,0xa,label0 */ - EMIT6_PCREL_LABEL(0xec000000, 0x0077, BPF_REG_3, - REG_W1, 0, 0xa); - } else { - /* clr %b3,%w1 */ - EMIT2(0x1500, BPF_REG_3, REG_W1); - /* brcl 0xa,label0 */ - EMIT6_PCREL_RILC(0xc0040000, 0xa, jit->labels[0]); - } + /* clrj %b3,%w1,0xa,out */ + patch_1_clrj = jit->prg; + EMIT6_PCREL_RIEB(0xec000000, 0x0077, BPF_REG_3, REG_W1, 0xa, + jit->prg); /* * if (tail_call_cnt++ > MAX_TAIL_CALL_CNT) @@ -1324,16 +1319,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT4_IMM(0xa7080000, REG_W0, 1); /* laal %w1,%w0,off(%r15) */ EMIT6_DISP_LH(0xeb000000, 0x00fa, REG_W1, REG_W0, REG_15, off); - if (!is_first_pass(jit) && can_use_rel(jit, jit->labels[0])) { - /* clij %w1,MAX_TAIL_CALL_CNT,0x2,label0 */ - EMIT6_PCREL_IMM_LABEL(0xec000000, 0x007f, REG_W1, - MAX_TAIL_CALL_CNT, 0, 0x2); - } else { - /* clfi %w1,MAX_TAIL_CALL_CNT */ - EMIT6_IMM(0xc20f0000, REG_W1, MAX_TAIL_CALL_CNT); - /* brcl 0x2,label0 */ - EMIT6_PCREL_RILC(0xc0040000, 0x2, jit->labels[0]); - } + /* clij %w1,MAX_TAIL_CALL_CNT,0x2,out */ + patch_2_clij = jit->prg; + EMIT6_PCREL_RIEC(0xec000000, 0x007f, REG_W1, MAX_TAIL_CALL_CNT, + 2, jit->prg); /* * prog = array->ptrs[index]; @@ -1348,13 +1337,9 @@ static noinline int 
bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, /* ltg %r1,prog(%b2,%r1) */ EMIT6_DISP_LH(0xe3000000, 0x0002, REG_1, BPF_REG_2, REG_1, offsetof(struct bpf_array, ptrs)); - if (!is_first_pass(jit) && can_use_rel(jit, jit->labels[0])) { - /* brc 0x8,label0 */ - EMIT4_PCREL_RIC(0xa7040000, 0x8, jit->labels[0]); - } else { - /* brcl 0x8,label0 */ - EMIT6_PCREL_RILC(0xc0040000, 0x8, jit->labels[0]); - } + /* brc 0x8,out */ + patch_3_brc = jit->prg; + EMIT4_PCREL_RIC(0xa7040000, 8, jit->prg); /* * Restore registers before calling function @@ -1371,8 +1356,16 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, /* bc 0xf,tail_call_start(%r1) */ _EMIT4(0x47f01000 + jit->tail_call_start); /* out: */ - jit->labels[0] = jit->prg; + if (jit->prg_buf) { + *(u16 *)(jit->prg_buf + patch_1_clrj + 2) = + (jit->prg - patch_1_clrj) >> 1; + *(u16 *)(jit->prg_buf + patch_2_clij + 2) = + (jit->prg - patch_2_clij) >> 1; + *(u16 *)(jit->prg_buf + patch_3_brc + 2) = + (jit->prg - patch_3_brc) >> 1; + } break; + } case BPF_JMP | BPF_EXIT: /* return b0 */ last = (i == fp->len - 1) ? 
1 : 0; if (last) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index e7752b4038ff..e491c3d9f227 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -314,19 +314,19 @@ static inline void mds_idle_clear_cpu_buffers(void) * lfence * jmp spec_trap * do_rop: - * mov %rax,(%rsp) for x86_64 + * mov %rcx,(%rsp) for x86_64 * mov %edx,(%esp) for x86_32 * retq * * Without retpolines configured: * - * jmp *%rax for x86_64 + * jmp *%rcx for x86_64 * jmp *%edx for x86_32 */ #ifdef CONFIG_RETPOLINE # ifdef CONFIG_X86_64 -# define RETPOLINE_RAX_BPF_JIT_SIZE 17 -# define RETPOLINE_RAX_BPF_JIT() \ +# define RETPOLINE_RCX_BPF_JIT_SIZE 17 +# define RETPOLINE_RCX_BPF_JIT() \ do { \ EMIT1_off32(0xE8, 7); /* callq do_rop */ \ /* spec_trap: */ \ @@ -334,7 +334,7 @@ do { \ EMIT3(0x0F, 0xAE, 0xE8); /* lfence */ \ EMIT2(0xEB, 0xF9); /* jmp spec_trap */ \ /* do_rop: */ \ - EMIT4(0x48, 0x89, 0x04, 0x24); /* mov %rax,(%rsp) */ \ + EMIT4(0x48, 0x89, 0x0C, 0x24); /* mov %rcx,(%rsp) */ \ EMIT1(0xC3); /* retq */ \ } while (0) # else /* !CONFIG_X86_64 */ @@ -352,9 +352,9 @@ do { \ # endif #else /* !CONFIG_RETPOLINE */ # ifdef CONFIG_X86_64 -# define RETPOLINE_RAX_BPF_JIT_SIZE 2 -# define RETPOLINE_RAX_BPF_JIT() \ - EMIT2(0xFF, 0xE0); /* jmp *%rax */ +# define RETPOLINE_RCX_BPF_JIT_SIZE 2 +# define RETPOLINE_RCX_BPF_JIT() \ + EMIT2(0xFF, 0xE1); /* jmp *%rcx */ # else /* !CONFIG_X86_64 */ # define RETPOLINE_EDX_BPF_JIT() \ EMIT2(0xFF, 0xE2) /* jmp *%edx */ diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 7d9ea7b41c71..26f43279b78b 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -221,14 +221,48 @@ struct jit_context { /* Number of bytes emit_patch() needs to generate instructions */ #define X86_PATCH_SIZE 5 +/* Number of bytes that will be skipped on tailcall */ +#define X86_TAIL_CALL_OFFSET 11 -#define PROLOGUE_SIZE 25 +static void push_callee_regs(u8 
**pprog, bool *callee_regs_used) +{ + u8 *prog = *pprog; + int cnt = 0; + + if (callee_regs_used[0]) + EMIT1(0x53); /* push rbx */ + if (callee_regs_used[1]) + EMIT2(0x41, 0x55); /* push r13 */ + if (callee_regs_used[2]) + EMIT2(0x41, 0x56); /* push r14 */ + if (callee_regs_used[3]) + EMIT2(0x41, 0x57); /* push r15 */ + *pprog = prog; +} + +static void pop_callee_regs(u8 **pprog, bool *callee_regs_used) +{ + u8 *prog = *pprog; + int cnt = 0; + + if (callee_regs_used[3]) + EMIT2(0x41, 0x5F); /* pop r15 */ + if (callee_regs_used[2]) + EMIT2(0x41, 0x5E); /* pop r14 */ + if (callee_regs_used[1]) + EMIT2(0x41, 0x5D); /* pop r13 */ + if (callee_regs_used[0]) + EMIT1(0x5B); /* pop rbx */ + *pprog = prog; +} /* - * Emit x86-64 prologue code for BPF program and check its size. - * bpf_tail_call helper will skip it while jumping into another program + * Emit x86-64 prologue code for BPF program. + * bpf_tail_call helper will skip the first X86_TAIL_CALL_OFFSET bytes + * while jumping to another program */ -static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) +static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf, + bool tail_call_reachable, bool is_subprog) { u8 *prog = *pprog; int cnt = X86_PATCH_SIZE; @@ -238,19 +272,18 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) */ memcpy(prog, ideal_nops[NOP_ATOMIC5], cnt); prog += cnt; + if (!ebpf_from_cbpf) { + if (tail_call_reachable && !is_subprog) + EMIT2(0x31, 0xC0); /* xor eax, eax */ + else + EMIT2(0x66, 0x90); /* nop2 */ + } EMIT1(0x55); /* push rbp */ EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */ /* sub rsp, rounded_stack_depth */ EMIT3_off32(0x48, 0x81, 0xEC, round_up(stack_depth, 8)); - EMIT1(0x53); /* push rbx */ - EMIT2(0x41, 0x55); /* push r13 */ - EMIT2(0x41, 0x56); /* push r14 */ - EMIT2(0x41, 0x57); /* push r15 */ - if (!ebpf_from_cbpf) { - /* zero init tail_call_cnt */ - EMIT2(0x6a, 0x00); - BUILD_BUG_ON(cnt != PROLOGUE_SIZE); - } + if 
(tail_call_reachable) + EMIT1(0x50); /* push rax */ *pprog = prog; } @@ -314,13 +347,14 @@ static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, mutex_lock(&text_mutex); if (memcmp(ip, old_insn, X86_PATCH_SIZE)) goto out; + ret = 1; if (memcmp(ip, new_insn, X86_PATCH_SIZE)) { if (text_live) text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL); else memcpy(ip, new_insn, X86_PATCH_SIZE); + ret = 0; } - ret = 0; out: mutex_unlock(&text_mutex); return ret; @@ -337,6 +371,22 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, return __bpf_arch_text_poke(ip, t, old_addr, new_addr, true); } +static int get_pop_bytes(bool *callee_regs_used) +{ + int bytes = 0; + + if (callee_regs_used[3]) + bytes += 2; + if (callee_regs_used[2]) + bytes += 2; + if (callee_regs_used[1]) + bytes += 2; + if (callee_regs_used[0]) + bytes += 1; + + return bytes; +} + /* * Generate the following code: * @@ -351,12 +401,26 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, * goto *(prog->bpf_func + prologue_size); * out: */ -static void emit_bpf_tail_call_indirect(u8 **pprog) +static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used, + u32 stack_depth) { + int tcc_off = -4 - round_up(stack_depth, 8); u8 *prog = *pprog; - int label1, label2, label3; + int pop_bytes = 0; + int off1 = 49; + int off2 = 38; + int off3 = 16; int cnt = 0; + /* count the additional bytes used for popping callee regs from stack + * that need to be taken into account for each of the offsets that + * are used for bailing out of the tail call + */ + pop_bytes = get_pop_bytes(callee_regs_used); + off1 += pop_bytes; + off2 += pop_bytes; + off3 += pop_bytes; + /* * rdi - pointer to ctx * rsi - pointer to bpf_array @@ -370,72 +434,106 @@ static void emit_bpf_tail_call_indirect(u8 **pprog) EMIT2(0x89, 0xD2); /* mov edx, edx */ EMIT3(0x39, 0x56, /* cmp dword ptr [rsi + 16], edx */ offsetof(struct bpf_array, map.max_entries)); -#define OFFSET1 (41 + RETPOLINE_RAX_BPF_JIT_SIZE) /* 
Number of bytes to jump */ +#define OFFSET1 (off1 + RETPOLINE_RCX_BPF_JIT_SIZE) /* Number of bytes to jump */ EMIT2(X86_JBE, OFFSET1); /* jbe out */ - label1 = cnt; /* * if (tail_call_cnt > MAX_TAIL_CALL_CNT) * goto out; */ - EMIT2_off32(0x8B, 0x85, -36 - MAX_BPF_STACK); /* mov eax, dword ptr [rbp - 548] */ + EMIT2_off32(0x8B, 0x85, tcc_off); /* mov eax, dword ptr [rbp - tcc_off] */ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ -#define OFFSET2 (30 + RETPOLINE_RAX_BPF_JIT_SIZE) +#define OFFSET2 (off2 + RETPOLINE_RCX_BPF_JIT_SIZE) EMIT2(X86_JA, OFFSET2); /* ja out */ - label2 = cnt; EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ - EMIT2_off32(0x89, 0x85, -36 - MAX_BPF_STACK); /* mov dword ptr [rbp -548], eax */ + EMIT2_off32(0x89, 0x85, tcc_off); /* mov dword ptr [rbp - tcc_off], eax */ /* prog = array->ptrs[index]; */ - EMIT4_off32(0x48, 0x8B, 0x84, 0xD6, /* mov rax, [rsi + rdx * 8 + offsetof(...)] */ + EMIT4_off32(0x48, 0x8B, 0x8C, 0xD6, /* mov rcx, [rsi + rdx * 8 + offsetof(...)] */ offsetof(struct bpf_array, ptrs)); /* * if (prog == NULL) * goto out; */ - EMIT3(0x48, 0x85, 0xC0); /* test rax,rax */ -#define OFFSET3 (8 + RETPOLINE_RAX_BPF_JIT_SIZE) + EMIT3(0x48, 0x85, 0xC9); /* test rcx,rcx */ +#define OFFSET3 (off3 + RETPOLINE_RCX_BPF_JIT_SIZE) EMIT2(X86_JE, OFFSET3); /* je out */ - label3 = cnt; - /* goto *(prog->bpf_func + prologue_size); */ - EMIT4(0x48, 0x8B, 0x40, /* mov rax, qword ptr [rax + 32] */ - offsetof(struct bpf_prog, bpf_func)); - EMIT4(0x48, 0x83, 0xC0, PROLOGUE_SIZE); /* add rax, prologue_size */ + *pprog = prog; + pop_callee_regs(pprog, callee_regs_used); + prog = *pprog; + + EMIT1(0x58); /* pop rax */ + EMIT3_off32(0x48, 0x81, 0xC4, /* add rsp, sd */ + round_up(stack_depth, 8)); + /* goto *(prog->bpf_func + X86_TAIL_CALL_OFFSET); */ + EMIT4(0x48, 0x8B, 0x49, /* mov rcx, qword ptr [rcx + 32] */ + offsetof(struct bpf_prog, bpf_func)); + EMIT4(0x48, 0x83, 0xC1, /* add rcx, X86_TAIL_CALL_OFFSET */ + X86_TAIL_CALL_OFFSET); /* - 
* Wow we're ready to jump into next BPF program + * Now we're ready to jump into next BPF program * rdi == ctx (1st arg) - * rax == prog->bpf_func + prologue_size + * rcx == prog->bpf_func + X86_TAIL_CALL_OFFSET */ - RETPOLINE_RAX_BPF_JIT(); + RETPOLINE_RCX_BPF_JIT(); /* out: */ - BUILD_BUG_ON(cnt - label1 != OFFSET1); - BUILD_BUG_ON(cnt - label2 != OFFSET2); - BUILD_BUG_ON(cnt - label3 != OFFSET3); *pprog = prog; } static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke, - u8 **pprog, int addr, u8 *image) + u8 **pprog, int addr, u8 *image, + bool *callee_regs_used, u32 stack_depth) { + int tcc_off = -4 - round_up(stack_depth, 8); u8 *prog = *pprog; + int pop_bytes = 0; + int off1 = 27; + int poke_off; int cnt = 0; + /* count the additional bytes used for popping callee regs to stack + * that need to be taken into account for jump offset that is used for + * bailing out from of the tail call when limit is reached + */ + pop_bytes = get_pop_bytes(callee_regs_used); + off1 += pop_bytes; + + /* + * total bytes for: + * - nop5/ jmpq $off + * - pop callee regs + * - sub rsp, $val + * - pop rax + */ + poke_off = X86_PATCH_SIZE + pop_bytes + 7 + 1; + /* * if (tail_call_cnt > MAX_TAIL_CALL_CNT) * goto out; */ - EMIT2_off32(0x8B, 0x85, -36 - MAX_BPF_STACK); /* mov eax, dword ptr [rbp - 548] */ + EMIT2_off32(0x8B, 0x85, tcc_off); /* mov eax, dword ptr [rbp - tcc_off] */ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ - EMIT2(X86_JA, 14); /* ja out */ + EMIT2(X86_JA, off1); /* ja out */ EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ - EMIT2_off32(0x89, 0x85, -36 - MAX_BPF_STACK); /* mov dword ptr [rbp -548], eax */ + EMIT2_off32(0x89, 0x85, tcc_off); /* mov dword ptr [rbp - tcc_off], eax */ + + poke->tailcall_bypass = image + (addr - poke_off - X86_PATCH_SIZE); + poke->adj_off = X86_TAIL_CALL_OFFSET; + poke->tailcall_target = image + (addr - X86_PATCH_SIZE); + poke->bypass_addr = (u8 *)poke->tailcall_target + X86_PATCH_SIZE; - poke->ip = 
image + (addr - X86_PATCH_SIZE); - poke->adj_off = PROLOGUE_SIZE; + emit_jump(&prog, (u8 *)poke->tailcall_target + X86_PATCH_SIZE, + poke->tailcall_bypass); + + *pprog = prog; + pop_callee_regs(pprog, callee_regs_used); + prog = *pprog; + EMIT1(0x58); /* pop rax */ + EMIT3_off32(0x48, 0x81, 0xC4, round_up(stack_depth, 8)); memcpy(prog, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE); prog += X86_PATCH_SIZE; @@ -453,7 +551,7 @@ static void bpf_tail_call_direct_fixup(struct bpf_prog *prog) for (i = 0; i < prog->aux->size_poke_tab; i++) { poke = &prog->aux->poke_tab[i]; - WARN_ON_ONCE(READ_ONCE(poke->ip_stable)); + WARN_ON_ONCE(READ_ONCE(poke->tailcall_target_stable)); if (poke->reason != BPF_POKE_REASON_TAIL_CALL) continue; @@ -464,18 +562,25 @@ static void bpf_tail_call_direct_fixup(struct bpf_prog *prog) if (target) { /* Plain memcpy is used when image is not live yet * and still not locked as read-only. Once poke - * location is active (poke->ip_stable), any parallel - * bpf_arch_text_poke() might occur still on the - * read-write image until we finally locked it as - * read-only. Both modifications on the given image - * are under text_mutex to avoid interference. + * location is active (poke->tailcall_target_stable), + * any parallel bpf_arch_text_poke() might occur + * still on the read-write image until we finally + * locked it as read-only. Both modifications on + * the given image are under text_mutex to avoid + * interference. 
*/ - ret = __bpf_arch_text_poke(poke->ip, BPF_MOD_JUMP, NULL, + ret = __bpf_arch_text_poke(poke->tailcall_target, + BPF_MOD_JUMP, NULL, (u8 *)target->bpf_func + poke->adj_off, false); BUG_ON(ret < 0); + ret = __bpf_arch_text_poke(poke->tailcall_bypass, + BPF_MOD_JUMP, + (u8 *)poke->tailcall_target + + X86_PATCH_SIZE, NULL, false); + BUG_ON(ret < 0); } - WRITE_ONCE(poke->ip_stable, true); + WRITE_ONCE(poke->tailcall_target_stable, true); mutex_unlock(&array->aux->poke_mutex); } } @@ -652,19 +757,49 @@ static bool ex_handler_bpf(const struct exception_table_entry *x, return true; } +static void detect_reg_usage(struct bpf_insn *insn, int insn_cnt, + bool *regs_used, bool *tail_call_seen) +{ + int i; + + for (i = 1; i <= insn_cnt; i++, insn++) { + if (insn->code == (BPF_JMP | BPF_TAIL_CALL)) + *tail_call_seen = true; + if (insn->dst_reg == BPF_REG_6 || insn->src_reg == BPF_REG_6) + regs_used[0] = true; + if (insn->dst_reg == BPF_REG_7 || insn->src_reg == BPF_REG_7) + regs_used[1] = true; + if (insn->dst_reg == BPF_REG_8 || insn->src_reg == BPF_REG_8) + regs_used[2] = true; + if (insn->dst_reg == BPF_REG_9 || insn->src_reg == BPF_REG_9) + regs_used[3] = true; + } +} + static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int oldproglen, struct jit_context *ctx) { + bool tail_call_reachable = bpf_prog->aux->tail_call_reachable; struct bpf_insn *insn = bpf_prog->insnsi; + bool callee_regs_used[4] = {}; int insn_cnt = bpf_prog->len; + bool tail_call_seen = false; bool seen_exit = false; u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; int i, cnt = 0, excnt = 0; int proglen = 0; u8 *prog = temp; + detect_reg_usage(insn, insn_cnt, callee_regs_used, + &tail_call_seen); + + /* tail call's presence in current prog implies it is reachable */ + tail_call_reachable |= tail_call_seen; + emit_prologue(&prog, bpf_prog->aux->stack_depth, - bpf_prog_was_classic(bpf_prog)); + bpf_prog_was_classic(bpf_prog), tail_call_reachable, + bpf_prog->aux->func_idx != 0); + 
push_callee_regs(&prog, callee_regs_used); addrs[0] = prog - temp; for (i = 1; i <= insn_cnt; i++, insn++) { @@ -1102,16 +1237,27 @@ xadd: if (is_imm8(insn->off)) /* call */ case BPF_JMP | BPF_CALL: func = (u8 *) __bpf_call_base + imm32; - if (!imm32 || emit_call(&prog, func, image + addrs[i - 1])) - return -EINVAL; + if (tail_call_reachable) { + EMIT3_off32(0x48, 0x8B, 0x85, + -(bpf_prog->aux->stack_depth + 8)); + if (!imm32 || emit_call(&prog, func, image + addrs[i - 1] + 7)) + return -EINVAL; + } else { + if (!imm32 || emit_call(&prog, func, image + addrs[i - 1])) + return -EINVAL; + } break; case BPF_JMP | BPF_TAIL_CALL: if (imm32) emit_bpf_tail_call_direct(&bpf_prog->aux->poke_tab[imm32 - 1], - &prog, addrs[i], image); + &prog, addrs[i], image, + callee_regs_used, + bpf_prog->aux->stack_depth); else - emit_bpf_tail_call_indirect(&prog); + emit_bpf_tail_call_indirect(&prog, + callee_regs_used, + bpf_prog->aux->stack_depth); break; /* cond jump */ @@ -1294,12 +1440,9 @@ emit_jmp: seen_exit = true; /* Update cleanup_addr */ ctx->cleanup_addr = proglen; - if (!bpf_prog_was_classic(bpf_prog)) - EMIT1(0x5B); /* get rid of tail_call_cnt */ - EMIT2(0x41, 0x5F); /* pop r15 */ - EMIT2(0x41, 0x5E); /* pop r14 */ - EMIT2(0x41, 0x5D); /* pop r13 */ - EMIT1(0x5B); /* pop rbx */ + pop_callee_regs(&prog, callee_regs_used); + if (tail_call_reachable) + EMIT1(0x59); /* pop rcx, get rid of tail_call_cnt */ EMIT1(0xC9); /* leave */ EMIT1(0xC3); /* ret */ break; diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c index 19403e88daa3..e86925134009 100644 --- a/drivers/net/can/flexcan.c +++ b/drivers/net/can/flexcan.c @@ -9,7 +9,7 @@ // // Based on code originally by Andrey Volkov <[email protected]> -#include <linux/netdevice.h> +#include <linux/bitfield.h> #include <linux/can.h> #include <linux/can/dev.h> #include <linux/can/error.h> @@ -21,12 +21,14 @@ #include <linux/io.h> #include <linux/mfd/syscon.h> #include <linux/module.h> +#include <linux/netdevice.h> 
#include <linux/of.h> #include <linux/of_device.h> +#include <linux/pinctrl/consumer.h> #include <linux/platform_device.h> #include <linux/pm_runtime.h> -#include <linux/regulator/consumer.h> #include <linux/regmap.h> +#include <linux/regulator/consumer.h> #define DRV_NAME "flexcan" @@ -52,6 +54,7 @@ #define FLEXCAN_MCR_IRMQ BIT(16) #define FLEXCAN_MCR_LPRIO_EN BIT(13) #define FLEXCAN_MCR_AEN BIT(12) +#define FLEXCAN_MCR_FDEN BIT(11) /* MCR_MAXMB: maximum used MBs is MAXMB + 1 */ #define FLEXCAN_MCR_MAXMB(x) ((x) & 0x7f) #define FLEXCAN_MCR_IDAM_A (0x0 << 8) @@ -91,6 +94,7 @@ #define FLEXCAN_CTRL2_MRP BIT(18) #define FLEXCAN_CTRL2_RRS BIT(17) #define FLEXCAN_CTRL2_EACEN BIT(16) +#define FLEXCAN_CTRL2_ISOCANFDEN BIT(12) /* FLEXCAN memory error control register (MECR) bits */ #define FLEXCAN_MECR_ECRWRDIS BIT(31) @@ -134,8 +138,35 @@ (FLEXCAN_ESR_ERR_BUS | FLEXCAN_ESR_ERR_STATE) #define FLEXCAN_ESR_ALL_INT \ (FLEXCAN_ESR_TWRN_INT | FLEXCAN_ESR_RWRN_INT | \ - FLEXCAN_ESR_BOFF_INT | FLEXCAN_ESR_ERR_INT | \ - FLEXCAN_ESR_WAK_INT) + FLEXCAN_ESR_BOFF_INT | FLEXCAN_ESR_ERR_INT) + +/* FLEXCAN Bit Timing register (CBT) bits */ +#define FLEXCAN_CBT_BTF BIT(31) +#define FLEXCAN_CBT_EPRESDIV_MASK GENMASK(30, 21) +#define FLEXCAN_CBT_ERJW_MASK GENMASK(20, 16) +#define FLEXCAN_CBT_EPROPSEG_MASK GENMASK(15, 10) +#define FLEXCAN_CBT_EPSEG1_MASK GENMASK(9, 5) +#define FLEXCAN_CBT_EPSEG2_MASK GENMASK(4, 0) + +/* FLEXCAN FD control register (FDCTRL) bits */ +#define FLEXCAN_FDCTRL_FDRATE BIT(31) +#define FLEXCAN_FDCTRL_MBDSR1 GENMASK(20, 19) +#define FLEXCAN_FDCTRL_MBDSR0 GENMASK(17, 16) +#define FLEXCAN_FDCTRL_MBDSR_8 0x0 +#define FLEXCAN_FDCTRL_MBDSR_12 0x1 +#define FLEXCAN_FDCTRL_MBDSR_32 0x2 +#define FLEXCAN_FDCTRL_MBDSR_64 0x3 +#define FLEXCAN_FDCTRL_TDCEN BIT(15) +#define FLEXCAN_FDCTRL_TDCFAIL BIT(14) +#define FLEXCAN_FDCTRL_TDCOFF GENMASK(12, 8) +#define FLEXCAN_FDCTRL_TDCVAL GENMASK(5, 0) + +/* FLEXCAN FD Bit Timing register (FDCBT) bits */ +#define 
FLEXCAN_FDCBT_FPRESDIV_MASK GENMASK(29, 20) +#define FLEXCAN_FDCBT_FRJW_MASK GENMASK(18, 16) +#define FLEXCAN_FDCBT_FPROPSEG_MASK GENMASK(14, 10) +#define FLEXCAN_FDCBT_FPSEG1_MASK GENMASK(7, 5) +#define FLEXCAN_FDCBT_FPSEG2_MASK GENMASK(2, 0) /* FLEXCAN interrupt flag register (IFLAG) bits */ /* Errata ERR005829 step7: Reserve first valid MB */ @@ -161,6 +192,9 @@ #define FLEXCAN_MB_CODE_TX_DATA (0xc << 24) #define FLEXCAN_MB_CODE_TX_TANSWER (0xe << 24) +#define FLEXCAN_MB_CNT_EDL BIT(31) +#define FLEXCAN_MB_CNT_BRS BIT(30) +#define FLEXCAN_MB_CNT_ESI BIT(29) #define FLEXCAN_MB_CNT_SRR BIT(22) #define FLEXCAN_MB_CNT_IDE BIT(21) #define FLEXCAN_MB_CNT_RTR BIT(20) @@ -172,26 +206,39 @@ /* FLEXCAN hardware feature flags * * Below is some version info we got: - * SOC Version IP-Version Glitch- [TR]WRN_INT IRQ Err Memory err RTR re- - * Filter? connected? Passive detection ception in MB - * MX25 FlexCAN2 03.00.00.00 no no no no no - * MX28 FlexCAN2 03.00.04.00 yes yes no no no - * MX35 FlexCAN2 03.00.00.00 no no no no no - * MX53 FlexCAN2 03.00.00.00 yes no no no no - * MX6s FlexCAN3 10.00.12.00 yes yes no no yes - * VF610 FlexCAN3 ? no yes no yes yes? - * LS1021A FlexCAN2 03.00.04.00 no yes no no yes + * SOC Version IP-Version Glitch- [TR]WRN_INT IRQ Err Memory err RTR rece- FD Mode + * Filter? connected? Passive detection ption in MB Supported? + * MX25 FlexCAN2 03.00.00.00 no no no no no no + * MX28 FlexCAN2 03.00.04.00 yes yes no no no no + * MX35 FlexCAN2 03.00.00.00 no no no no no no + * MX53 FlexCAN2 03.00.00.00 yes no no no no no + * MX6s FlexCAN3 10.00.12.00 yes yes no no yes no + * MX8QM FlexCAN3 03.00.23.00 yes yes no no yes yes + * VF610 FlexCAN3 ? no yes no yes yes? no + * LS1021A FlexCAN2 03.00.04.00 no yes no no yes no + * LX2160A FlexCAN3 03.00.23.00 no yes no no yes yes * * Some SOCs do not have the RX_WARN & TX_WARN interrupt line connected. 
*/ -#define FLEXCAN_QUIRK_BROKEN_WERR_STATE BIT(1) /* [TR]WRN_INT not connected */ -#define FLEXCAN_QUIRK_DISABLE_RXFG BIT(2) /* Disable RX FIFO Global mask */ -#define FLEXCAN_QUIRK_ENABLE_EACEN_RRS BIT(3) /* Enable EACEN and RRS bit in ctrl2 */ -#define FLEXCAN_QUIRK_DISABLE_MECR BIT(4) /* Disable Memory error detection */ -#define FLEXCAN_QUIRK_USE_OFF_TIMESTAMP BIT(5) /* Use timestamp based offloading */ -#define FLEXCAN_QUIRK_BROKEN_PERR_STATE BIT(6) /* No interrupt for error passive */ -#define FLEXCAN_QUIRK_DEFAULT_BIG_ENDIAN BIT(7) /* default to BE register access */ -#define FLEXCAN_QUIRK_SETUP_STOP_MODE BIT(8) /* Setup stop mode to support wakeup */ + +/* [TR]WRN_INT not connected */ +#define FLEXCAN_QUIRK_BROKEN_WERR_STATE BIT(1) + /* Disable RX FIFO Global mask */ +#define FLEXCAN_QUIRK_DISABLE_RXFG BIT(2) +/* Enable EACEN and RRS bit in ctrl2 */ +#define FLEXCAN_QUIRK_ENABLE_EACEN_RRS BIT(3) +/* Disable non-correctable errors interrupt and freeze mode */ +#define FLEXCAN_QUIRK_DISABLE_MECR BIT(4) +/* Use timestamp based offloading */ +#define FLEXCAN_QUIRK_USE_OFF_TIMESTAMP BIT(5) +/* No interrupt for error passive */ +#define FLEXCAN_QUIRK_BROKEN_PERR_STATE BIT(6) +/* default to BE register access */ +#define FLEXCAN_QUIRK_DEFAULT_BIG_ENDIAN BIT(7) +/* Setup stop mode to support wakeup */ +#define FLEXCAN_QUIRK_SETUP_STOP_MODE BIT(8) +/* Support CAN-FD mode */ +#define FLEXCAN_QUIRK_SUPPORT_FD BIT(9) /* Structure of the message buffer */ struct flexcan_mb { @@ -203,12 +250,12 @@ struct flexcan_mb { /* Structure of the hardware registers */ struct flexcan_regs { u32 mcr; /* 0x00 */ - u32 ctrl; /* 0x04 */ + u32 ctrl; /* 0x04 - Not affected by Soft Reset */ u32 timer; /* 0x08 */ - u32 _reserved1; /* 0x0c */ - u32 rxgmask; /* 0x10 */ - u32 rx14mask; /* 0x14 */ - u32 rx15mask; /* 0x18 */ + u32 tcr; /* 0x0c */ + u32 rxgmask; /* 0x10 - Not affected by Soft Reset */ + u32 rx14mask; /* 0x14 - Not affected by Soft Reset */ + u32 rx15mask; /* 0x18 - Not affected 
by Soft Reset */ u32 ecr; /* 0x1c */ u32 esr; /* 0x20 */ u32 imask2; /* 0x24 */ @@ -217,16 +264,20 @@ struct flexcan_regs { u32 iflag1; /* 0x30 */ union { /* 0x34 */ u32 gfwr_mx28; /* MX28, MX53 */ - u32 ctrl2; /* MX6, VF610 */ + u32 ctrl2; /* MX6, VF610 - Not affected by Soft Reset */ }; u32 esr2; /* 0x38 */ u32 imeur; /* 0x3c */ u32 lrfr; /* 0x40 */ u32 crcr; /* 0x44 */ u32 rxfgmask; /* 0x48 */ - u32 rxfir; /* 0x4c */ - u32 _reserved3[12]; /* 0x50 */ - u8 mb[2][512]; /* 0x80 */ + u32 rxfir; /* 0x4c - Not affected by Soft Reset */ + u32 cbt; /* 0x50 - Not affected by Soft Reset */ + u32 _reserved2; /* 0x54 */ + u32 dbg1; /* 0x58 */ + u32 dbg2; /* 0x5c */ + u32 _reserved3[8]; /* 0x60 */ + u8 mb[2][512]; /* 0x80 - Not affected by Soft Reset */ /* FIFO-mode: * MB * 0x080...0x08f 0 RX message buffer @@ -238,7 +289,7 @@ struct flexcan_regs { * (mx6, vf610) */ u32 _reserved4[256]; /* 0x480 */ - u32 rximr[64]; /* 0x880 */ + u32 rximr[64]; /* 0x880 - Not affected by Soft Reset */ u32 _reserved5[24]; /* 0x980 */ u32 gfwr_mx6; /* 0x9e0 - MX6 */ u32 _reserved6[63]; /* 0x9e4 */ @@ -250,8 +301,14 @@ struct flexcan_regs { u32 rerrdr; /* 0xaf4 */ u32 rerrsynr; /* 0xaf8 */ u32 errsr; /* 0xafc */ + u32 _reserved7[64]; /* 0xb00 */ + u32 fdctrl; /* 0xc00 - Not affected by Soft Reset */ + u32 fdcbt; /* 0xc04 - Not affected by Soft Reset */ + u32 fdcrc; /* 0xc08 */ }; +static_assert(sizeof(struct flexcan_regs) == 0x4 + 0xc08); + struct flexcan_devtype_data { u32 quirks; /* quirks needed for different IP cores */ }; @@ -313,6 +370,12 @@ static const struct flexcan_devtype_data fsl_imx6q_devtype_data = { FLEXCAN_QUIRK_SETUP_STOP_MODE, }; +static const struct flexcan_devtype_data fsl_imx8qm_devtype_data = { + .quirks = FLEXCAN_QUIRK_DISABLE_RXFG | FLEXCAN_QUIRK_ENABLE_EACEN_RRS | + FLEXCAN_QUIRK_USE_OFF_TIMESTAMP | FLEXCAN_QUIRK_BROKEN_PERR_STATE | + FLEXCAN_QUIRK_SUPPORT_FD, +}; + static const struct flexcan_devtype_data fsl_vf610_devtype_data = { .quirks = FLEXCAN_QUIRK_DISABLE_RXFG | 
FLEXCAN_QUIRK_ENABLE_EACEN_RRS | FLEXCAN_QUIRK_DISABLE_MECR | FLEXCAN_QUIRK_USE_OFF_TIMESTAMP | @@ -325,6 +388,12 @@ static const struct flexcan_devtype_data fsl_ls1021a_r2_devtype_data = { FLEXCAN_QUIRK_USE_OFF_TIMESTAMP, }; +static const struct flexcan_devtype_data fsl_lx2160a_r1_devtype_data = { + .quirks = FLEXCAN_QUIRK_DISABLE_RXFG | FLEXCAN_QUIRK_ENABLE_EACEN_RRS | + FLEXCAN_QUIRK_DISABLE_MECR | FLEXCAN_QUIRK_BROKEN_PERR_STATE | + FLEXCAN_QUIRK_USE_OFF_TIMESTAMP | FLEXCAN_QUIRK_SUPPORT_FD, +}; + static const struct can_bittiming_const flexcan_bittiming_const = { .name = DRV_NAME, .tseg1_min = 4, @@ -337,6 +406,30 @@ static const struct can_bittiming_const flexcan_bittiming_const = { .brp_inc = 1, }; +static const struct can_bittiming_const flexcan_fd_bittiming_const = { + .name = DRV_NAME, + .tseg1_min = 2, + .tseg1_max = 96, + .tseg2_min = 2, + .tseg2_max = 32, + .sjw_max = 16, + .brp_min = 1, + .brp_max = 1024, + .brp_inc = 1, +}; + +static const struct can_bittiming_const flexcan_fd_data_bittiming_const = { + .name = DRV_NAME, + .tseg1_min = 2, + .tseg1_max = 39, + .tseg2_min = 2, + .tseg2_max = 8, + .sjw_max = 4, + .brp_min = 1, + .brp_max = 1024, + .brp_inc = 1, +}; + /* FlexCAN module is essentially modelled as a little-endian IP in most * SoCs, i.e the registers as well as the message buffer areas are * implemented in a little-endian fashion. 
@@ -457,7 +550,6 @@ static inline int flexcan_exit_stop_mode(struct flexcan_priv *priv) regmap_update_bits(priv->stm.gpr, priv->stm.req_gpr, 1 << priv->stm.req_bit, 0); - reg_mcr = priv->read(&regs->mcr); reg_mcr &= ~FLEXCAN_MCR_SLF_WAK; priv->write(reg_mcr, &regs->mcr); @@ -628,10 +720,10 @@ static int flexcan_get_berr_counter(const struct net_device *dev, static netdev_tx_t flexcan_start_xmit(struct sk_buff *skb, struct net_device *dev) { const struct flexcan_priv *priv = netdev_priv(dev); - struct can_frame *cf = (struct can_frame *)skb->data; + struct canfd_frame *cfd = (struct canfd_frame *)skb->data; u32 can_id; u32 data; - u32 ctrl = FLEXCAN_MB_CODE_TX_DATA | (cf->can_dlc << 16); + u32 ctrl = FLEXCAN_MB_CODE_TX_DATA | ((can_len2dlc(cfd->len)) << 16); int i; if (can_dropped_invalid_skb(dev, skb)) @@ -639,18 +731,25 @@ static netdev_tx_t flexcan_start_xmit(struct sk_buff *skb, struct net_device *de netif_stop_queue(dev); - if (cf->can_id & CAN_EFF_FLAG) { - can_id = cf->can_id & CAN_EFF_MASK; + if (cfd->can_id & CAN_EFF_FLAG) { + can_id = cfd->can_id & CAN_EFF_MASK; ctrl |= FLEXCAN_MB_CNT_IDE | FLEXCAN_MB_CNT_SRR; } else { - can_id = (cf->can_id & CAN_SFF_MASK) << 18; + can_id = (cfd->can_id & CAN_SFF_MASK) << 18; } - if (cf->can_id & CAN_RTR_FLAG) + if (cfd->can_id & CAN_RTR_FLAG) ctrl |= FLEXCAN_MB_CNT_RTR; - for (i = 0; i < cf->can_dlc; i += sizeof(u32)) { - data = be32_to_cpup((__be32 *)&cf->data[i]); + if (can_is_canfd_skb(skb)) { + ctrl |= FLEXCAN_MB_CNT_EDL; + + if (cfd->flags & CANFD_BRS) + ctrl |= FLEXCAN_MB_CNT_BRS; + } + + for (i = 0; i < cfd->len; i += sizeof(u32)) { + data = be32_to_cpup((__be32 *)&cfd->data[i]); priv->write(data, &priv->tx_mb->data[i / sizeof(u32)]); } @@ -822,7 +921,7 @@ static struct sk_buff *flexcan_mailbox_read(struct can_rx_offload *offload, struct flexcan_regs __iomem *regs = priv->regs; struct flexcan_mb __iomem *mb; struct sk_buff *skb; - struct can_frame *cf; + struct canfd_frame *cfd; u32 reg_ctrl, reg_id, reg_iflag1; int i; 
@@ -859,8 +958,11 @@ static struct sk_buff *flexcan_mailbox_read(struct can_rx_offload *offload, reg_ctrl = priv->read(&mb->can_ctrl); } - skb = alloc_can_skb(offload->dev, &cf); - if (!skb) { + if (reg_ctrl & FLEXCAN_MB_CNT_EDL) + skb = alloc_canfd_skb(offload->dev, &cfd); + else + skb = alloc_can_skb(offload->dev, (struct can_frame **)&cfd); + if (unlikely(!skb)) { skb = ERR_PTR(-ENOMEM); goto mark_as_read; } @@ -870,17 +972,28 @@ static struct sk_buff *flexcan_mailbox_read(struct can_rx_offload *offload, reg_id = priv->read(&mb->can_id); if (reg_ctrl & FLEXCAN_MB_CNT_IDE) - cf->can_id = ((reg_id >> 0) & CAN_EFF_MASK) | CAN_EFF_FLAG; + cfd->can_id = ((reg_id >> 0) & CAN_EFF_MASK) | CAN_EFF_FLAG; else - cf->can_id = (reg_id >> 18) & CAN_SFF_MASK; + cfd->can_id = (reg_id >> 18) & CAN_SFF_MASK; + + if (reg_ctrl & FLEXCAN_MB_CNT_EDL) { + cfd->len = can_dlc2len(get_canfd_dlc((reg_ctrl >> 16) & 0xf)); + + if (reg_ctrl & FLEXCAN_MB_CNT_BRS) + cfd->flags |= CANFD_BRS; + } else { + cfd->len = get_can_dlc((reg_ctrl >> 16) & 0xf); + + if (reg_ctrl & FLEXCAN_MB_CNT_RTR) + cfd->can_id |= CAN_RTR_FLAG; + } - if (reg_ctrl & FLEXCAN_MB_CNT_RTR) - cf->can_id |= CAN_RTR_FLAG; - cf->can_dlc = get_can_dlc((reg_ctrl >> 16) & 0xf); + if (reg_ctrl & FLEXCAN_MB_CNT_ESI) + cfd->flags |= CANFD_ESI; - for (i = 0; i < cf->can_dlc; i += sizeof(u32)) { + for (i = 0; i < cfd->len; i += sizeof(u32)) { __be32 data = cpu_to_be32(priv->read(&mb->data[i / sizeof(u32)])); - *(__be32 *)(cf->data + i) = data; + *(__be32 *)(cfd->data + i) = data; } mark_as_read: @@ -961,10 +1074,10 @@ static irqreturn_t flexcan_irq(int irq, void *dev_id) reg_esr = priv->read(&regs->esr); - /* ACK all bus error and state change IRQ sources */ - if (reg_esr & FLEXCAN_ESR_ALL_INT) { + /* ACK all bus error, state change and wake IRQ sources */ + if (reg_esr & (FLEXCAN_ESR_ALL_INT | FLEXCAN_ESR_WAK_INT)) { handled = IRQ_HANDLED; - priv->write(reg_esr & FLEXCAN_ESR_ALL_INT, &regs->esr); + priv->write(reg_esr & (FLEXCAN_ESR_ALL_INT 
| FLEXCAN_ESR_WAK_INT), &regs->esr); } /* state change interrupt or broken error state quirk fix is enabled */ @@ -1019,7 +1132,7 @@ static irqreturn_t flexcan_irq(int irq, void *dev_id) return handled; } -static void flexcan_set_bittiming(struct net_device *dev) +static void flexcan_set_bittiming_ctrl(const struct net_device *dev) { const struct flexcan_priv *priv = netdev_priv(dev); const struct can_bittiming *bt = &priv->can.bittiming; @@ -1031,10 +1144,7 @@ static void flexcan_set_bittiming(struct net_device *dev) FLEXCAN_CTRL_RJW(0x3) | FLEXCAN_CTRL_PSEG1(0x7) | FLEXCAN_CTRL_PSEG2(0x7) | - FLEXCAN_CTRL_PROPSEG(0x7) | - FLEXCAN_CTRL_LPB | - FLEXCAN_CTRL_SMP | - FLEXCAN_CTRL_LOM); + FLEXCAN_CTRL_PROPSEG(0x7)); reg |= FLEXCAN_CTRL_PRESDIV(bt->brp - 1) | FLEXCAN_CTRL_PSEG1(bt->phase_seg1 - 1) | @@ -1042,6 +1152,130 @@ static void flexcan_set_bittiming(struct net_device *dev) FLEXCAN_CTRL_RJW(bt->sjw - 1) | FLEXCAN_CTRL_PROPSEG(bt->prop_seg - 1); + netdev_dbg(dev, "writing ctrl=0x%08x\n", reg); + priv->write(reg, &regs->ctrl); + + /* print chip status */ + netdev_dbg(dev, "%s: mcr=0x%08x ctrl=0x%08x\n", __func__, + priv->read(&regs->mcr), priv->read(&regs->ctrl)); +} + +static void flexcan_set_bittiming_cbt(const struct net_device *dev) +{ + struct flexcan_priv *priv = netdev_priv(dev); + struct can_bittiming *bt = &priv->can.bittiming; + struct can_bittiming *dbt = &priv->can.data_bittiming; + struct flexcan_regs __iomem *regs = priv->regs; + u32 reg_cbt, reg_fdctrl; + + /* CBT */ + /* CBT[EPSEG1] is 5 bit long and CBT[EPROPSEG] is 6 bit + * long. The can_calc_bittiming() tries to divide the tseg1 + * equally between phase_seg1 and prop_seg, which may not fit + * in CBT register. Therefore, if phase_seg1 is more than + * possible value, increase prop_seg and decrease phase_seg1. 
+ */ + if (bt->phase_seg1 > 0x20) { + bt->prop_seg += (bt->phase_seg1 - 0x20); + bt->phase_seg1 = 0x20; + } + + reg_cbt = FLEXCAN_CBT_BTF | + FIELD_PREP(FLEXCAN_CBT_EPRESDIV_MASK, bt->brp - 1) | + FIELD_PREP(FLEXCAN_CBT_ERJW_MASK, bt->sjw - 1) | + FIELD_PREP(FLEXCAN_CBT_EPROPSEG_MASK, bt->prop_seg - 1) | + FIELD_PREP(FLEXCAN_CBT_EPSEG1_MASK, bt->phase_seg1 - 1) | + FIELD_PREP(FLEXCAN_CBT_EPSEG2_MASK, bt->phase_seg2 - 1); + + netdev_dbg(dev, "writing cbt=0x%08x\n", reg_cbt); + priv->write(reg_cbt, &regs->cbt); + + if (priv->can.ctrlmode & CAN_CTRLMODE_FD) { + u32 reg_fdcbt, reg_ctrl2; + + if (bt->brp != dbt->brp) + netdev_warn(dev, "Data brp=%d and brp=%d don't match, this may result in a phase error. Consider using different bitrate and/or data bitrate.\n", + dbt->brp, bt->brp); + + /* FDCBT */ + /* FDCBT[FPSEG1] is 3 bit long and FDCBT[FPROPSEG] is + * 5 bit long. The can_calc_bittiming tries to divide + * the tseg1 equally between phase_seg1 and prop_seg, + * which may not fit in FDCBT register. 
Therefore, if + * phase_seg1 is more than possible value, increase + * prop_seg and decrease phase_seg1 + */ + if (dbt->phase_seg1 > 0x8) { + dbt->prop_seg += (dbt->phase_seg1 - 0x8); + dbt->phase_seg1 = 0x8; + } + + reg_fdcbt = priv->read(&regs->fdcbt); + reg_fdcbt &= ~(FIELD_PREP(FLEXCAN_FDCBT_FPRESDIV_MASK, 0x3ff) | + FIELD_PREP(FLEXCAN_FDCBT_FRJW_MASK, 0x7) | + FIELD_PREP(FLEXCAN_FDCBT_FPROPSEG_MASK, 0x1f) | + FIELD_PREP(FLEXCAN_FDCBT_FPSEG1_MASK, 0x7) | + FIELD_PREP(FLEXCAN_FDCBT_FPSEG2_MASK, 0x7)); + + reg_fdcbt |= FIELD_PREP(FLEXCAN_FDCBT_FPRESDIV_MASK, dbt->brp - 1) | + FIELD_PREP(FLEXCAN_FDCBT_FRJW_MASK, dbt->sjw - 1) | + FIELD_PREP(FLEXCAN_FDCBT_FPROPSEG_MASK, dbt->prop_seg) | + FIELD_PREP(FLEXCAN_FDCBT_FPSEG1_MASK, dbt->phase_seg1 - 1) | + FIELD_PREP(FLEXCAN_FDCBT_FPSEG2_MASK, dbt->phase_seg2 - 1); + + netdev_dbg(dev, "writing fdcbt=0x%08x\n", reg_fdcbt); + priv->write(reg_fdcbt, &regs->fdcbt); + + /* CTRL2 */ + reg_ctrl2 = priv->read(&regs->ctrl2); + reg_ctrl2 &= ~FLEXCAN_CTRL2_ISOCANFDEN; + if (!(priv->can.ctrlmode & CAN_CTRLMODE_FD_NON_ISO)) + reg_ctrl2 |= FLEXCAN_CTRL2_ISOCANFDEN; + + netdev_dbg(dev, "writing ctrl2=0x%08x\n", reg_ctrl2); + priv->write(reg_ctrl2, &regs->ctrl2); + } + + /* FDCTRL */ + reg_fdctrl = priv->read(&regs->fdctrl); + reg_fdctrl &= ~(FLEXCAN_FDCTRL_FDRATE | + FIELD_PREP(FLEXCAN_FDCTRL_TDCOFF, 0x1f)); + + if (priv->can.ctrlmode & CAN_CTRLMODE_FD) { + reg_fdctrl |= FLEXCAN_FDCTRL_FDRATE; + + if (priv->can.ctrlmode & CAN_CTRLMODE_LOOPBACK) { + /* TDC must be disabled for Loop Back mode */ + reg_fdctrl &= ~FLEXCAN_FDCTRL_TDCEN; + } else { + reg_fdctrl |= FLEXCAN_FDCTRL_TDCEN | + FIELD_PREP(FLEXCAN_FDCTRL_TDCOFF, + ((dbt->phase_seg1 - 1) + + dbt->prop_seg + 2) * + ((dbt->brp - 1 ) + 1)); + } + } + + netdev_dbg(dev, "writing fdctrl=0x%08x\n", reg_fdctrl); + priv->write(reg_fdctrl, &regs->fdctrl); + + netdev_dbg(dev, "%s: mcr=0x%08x ctrl=0x%08x ctrl2=0x%08x fdctrl=0x%08x cbt=0x%08x fdcbt=0x%08x\n", + __func__, + priv->read(&regs->mcr), 
priv->read(&regs->ctrl), + priv->read(&regs->ctrl2), priv->read(&regs->fdctrl), + priv->read(&regs->cbt), priv->read(&regs->fdcbt)); +} + +static void flexcan_set_bittiming(struct net_device *dev) +{ + const struct flexcan_priv *priv = netdev_priv(dev); + struct flexcan_regs __iomem *regs = priv->regs; + u32 reg; + + reg = priv->read(&regs->ctrl); + reg &= ~(FLEXCAN_CTRL_LPB | FLEXCAN_CTRL_SMP | + FLEXCAN_CTRL_LOM); + if (priv->can.ctrlmode & CAN_CTRLMODE_LOOPBACK) reg |= FLEXCAN_CTRL_LPB; if (priv->can.ctrlmode & CAN_CTRLMODE_LISTENONLY) @@ -1052,9 +1286,10 @@ static void flexcan_set_bittiming(struct net_device *dev) netdev_dbg(dev, "writing ctrl=0x%08x\n", reg); priv->write(reg, &regs->ctrl); - /* print chip status */ - netdev_dbg(dev, "%s: mcr=0x%08x ctrl=0x%08x\n", __func__, - priv->read(&regs->mcr), priv->read(&regs->ctrl)); + if (priv->can.ctrlmode_supported & CAN_CTRLMODE_FD) + return flexcan_set_bittiming_cbt(dev); + else + return flexcan_set_bittiming_ctrl(dev); } /* flexcan_chip_start @@ -1127,6 +1362,12 @@ static int flexcan_chip_start(struct net_device *dev) else reg_mcr |= FLEXCAN_MCR_SRX_DIS; + /* MCR - CAN-FD */ + if (priv->can.ctrlmode & CAN_CTRLMODE_FD) + reg_mcr |= FLEXCAN_MCR_FDEN; + else + reg_mcr &= ~FLEXCAN_MCR_FDEN; + netdev_dbg(dev, "%s: writing mcr=0x%08x", __func__, reg_mcr); priv->write(reg_mcr, &regs->mcr); @@ -1169,6 +1410,32 @@ static int flexcan_chip_start(struct net_device *dev) priv->write(reg_ctrl2, &regs->ctrl2); } + if (priv->can.ctrlmode_supported & CAN_CTRLMODE_FD) { + u32 reg_fdctrl; + + reg_fdctrl = priv->read(&regs->fdctrl); + reg_fdctrl &= ~(FIELD_PREP(FLEXCAN_FDCTRL_MBDSR1, 0x3) | + FIELD_PREP(FLEXCAN_FDCTRL_MBDSR0, 0x3)); + + if (priv->can.ctrlmode & CAN_CTRLMODE_FD) { + reg_fdctrl |= + FIELD_PREP(FLEXCAN_FDCTRL_MBDSR1, + FLEXCAN_FDCTRL_MBDSR_64) | + FIELD_PREP(FLEXCAN_FDCTRL_MBDSR0, + FLEXCAN_FDCTRL_MBDSR_64); + } else { + reg_fdctrl |= + FIELD_PREP(FLEXCAN_FDCTRL_MBDSR1, + FLEXCAN_FDCTRL_MBDSR_8) | + FIELD_PREP(FLEXCAN_FDCTRL_MBDSR0, + 
FLEXCAN_FDCTRL_MBDSR_8); + } + + netdev_dbg(dev, "%s: writing fdctrl=0x%08x", + __func__, reg_fdctrl); + priv->write(reg_fdctrl, &regs->fdctrl); + } + if (priv->devtype_data->quirks & FLEXCAN_QUIRK_USE_OFF_TIMESTAMP) { for (i = priv->offload.mb_first; i <= priv->offload.mb_last; i++) { mb = flexcan_get_mb(priv, i); @@ -1204,28 +1471,43 @@ static int flexcan_chip_start(struct net_device *dev) for (i = 0; i < priv->mb_count; i++) priv->write(0, &regs->rximr[i]); - /* On Vybrid, disable memory error detection interrupts - * and freeze mode. - * This also works around errata e5295 which generates - * false positive memory errors and put the device in - * freeze mode. + /* On Vybrid, disable non-correctable errors interrupt and + * freeze mode. It still can correct the correctable errors + * when HW supports ECC. + * + * This also works around errata e5295 which generates false + * positive memory errors and put the device in freeze mode. */ if (priv->devtype_data->quirks & FLEXCAN_QUIRK_DISABLE_MECR) { /* Follow the protocol as described in "Detection * and Correction of Memory Errors" to write to - * MECR register + * MECR register (step 1 - 5) + * + * 1. By default, CTRL2[ECRWRE] = 0, MECR[ECRWRDIS] = 1 + * 2. set CTRL2[ECRWRE] */ reg_ctrl2 = priv->read(&regs->ctrl2); reg_ctrl2 |= FLEXCAN_CTRL2_ECRWRE; priv->write(reg_ctrl2, &regs->ctrl2); + /* 3. clear MECR[ECRWRDIS] */ reg_mecr = priv->read(&regs->mecr); reg_mecr &= ~FLEXCAN_MECR_ECRWRDIS; priv->write(reg_mecr, &regs->mecr); - reg_mecr |= FLEXCAN_MECR_ECCDIS; + + /* 4. all writes to MECR must keep MECR[ECRWRDIS] cleared */ reg_mecr &= ~(FLEXCAN_MECR_NCEFAFRZ | FLEXCAN_MECR_HANCEI_MSK | FLEXCAN_MECR_FANCEI_MSK); priv->write(reg_mecr, &regs->mecr); + + /* 5. 
after configuration done, lock MECR by either + * setting MECR[ECRWRDIS] or clearing CTRL2[ECRWRE] + */ + reg_mecr |= FLEXCAN_MECR_ECRWRDIS; + priv->write(reg_mecr, &regs->mecr); + + reg_ctrl2 &= ~FLEXCAN_CTRL2_ECRWRE; + priv->write(reg_ctrl2, &regs->ctrl2); } err = flexcan_transceiver_enable(priv); @@ -1260,18 +1542,23 @@ static int flexcan_chip_start(struct net_device *dev) return err; } -/* flexcan_chip_stop +/* __flexcan_chip_stop * - * this functions is entered with clocks enabled + * this function is entered with clocks enabled */ -static void flexcan_chip_stop(struct net_device *dev) +static int __flexcan_chip_stop(struct net_device *dev, bool disable_on_error) { struct flexcan_priv *priv = netdev_priv(dev); struct flexcan_regs __iomem *regs = priv->regs; + int err; /* freeze + disable module */ - flexcan_chip_freeze(priv); - flexcan_chip_disable(priv); + err = flexcan_chip_freeze(priv); + if (err && !disable_on_error) + return err; + err = flexcan_chip_disable(priv); + if (err && !disable_on_error) + goto out_chip_unfreeze; /* Disable all interrupts */ priv->write(0, &regs->imask2); @@ -1281,6 +1568,23 @@ static void flexcan_chip_stop(struct net_device *dev) flexcan_transceiver_disable(priv); priv->can.state = CAN_STATE_STOPPED; + + return 0; + + out_chip_unfreeze: + flexcan_chip_unfreeze(priv); + + return err; +} + +static inline int flexcan_chip_stop_disable_on_error(struct net_device *dev) +{ + return __flexcan_chip_stop(dev, true); +} + +static inline int flexcan_chip_stop(struct net_device *dev) +{ + return __flexcan_chip_stop(dev, false); } static int flexcan_open(struct net_device *dev) @@ -1288,6 +1592,12 @@ static int flexcan_open(struct net_device *dev) struct flexcan_priv *priv = netdev_priv(dev); int err; + if ((priv->can.ctrlmode & CAN_CTRLMODE_3_SAMPLES) && + (priv->can.ctrlmode & CAN_CTRLMODE_FD)) { + netdev_err(dev, "Three Samples mode and CAN-FD mode can't be used together\n"); + return -EINVAL; + } + err = pm_runtime_get_sync(priv->dev); if (err < 
0) return err; @@ -1300,7 +1610,10 @@ static int flexcan_open(struct net_device *dev) if (err) goto out_close; - priv->mb_size = sizeof(struct flexcan_mb) + CAN_MAX_DLEN; + if (priv->can.ctrlmode & CAN_CTRLMODE_FD) + priv->mb_size = sizeof(struct flexcan_mb) + CANFD_MAX_DLEN; + else + priv->mb_size = sizeof(struct flexcan_mb) + CAN_MAX_DLEN; priv->mb_count = (sizeof(priv->regs->mb[0]) / priv->mb_size) + (sizeof(priv->regs->mb[1]) / priv->mb_size); @@ -1362,7 +1675,7 @@ static int flexcan_close(struct net_device *dev) netif_stop_queue(dev); can_rx_offload_disable(&priv->offload); - flexcan_chip_stop(dev); + flexcan_chip_stop_disable_on_error(dev); can_rx_offload_del(&priv->offload); free_irq(dev->irq, dev); @@ -1531,6 +1844,7 @@ out_put_node: } static const struct of_device_id flexcan_of_match[] = { + { .compatible = "fsl,imx8qm-flexcan", .data = &fsl_imx8qm_devtype_data, }, { .compatible = "fsl,imx6q-flexcan", .data = &fsl_imx6q_devtype_data, }, { .compatible = "fsl,imx28-flexcan", .data = &fsl_imx28_devtype_data, }, { .compatible = "fsl,imx53-flexcan", .data = &fsl_imx25_devtype_data, }, @@ -1539,6 +1853,7 @@ static const struct of_device_id flexcan_of_match[] = { { .compatible = "fsl,p1010-flexcan", .data = &fsl_p1010_devtype_data, }, { .compatible = "fsl,vf610-flexcan", .data = &fsl_vf610_devtype_data, }, { .compatible = "fsl,ls1021ar2-flexcan", .data = &fsl_ls1021a_r2_devtype_data, }, + { .compatible = "fsl,lx2160ar1-flexcan", .data = &fsl_lx2160a_r1_devtype_data, }, { /* sentinel */ }, }; MODULE_DEVICE_TABLE(of, flexcan_of_match); @@ -1562,11 +1877,13 @@ static int flexcan_probe(struct platform_device *pdev) u8 clk_src = 1; u32 clock_freq = 0; - reg_xceiver = devm_regulator_get(&pdev->dev, "xceiver"); + reg_xceiver = devm_regulator_get_optional(&pdev->dev, "xceiver"); if (PTR_ERR(reg_xceiver) == -EPROBE_DEFER) return -EPROBE_DEFER; - else if (IS_ERR(reg_xceiver)) + else if (PTR_ERR(reg_xceiver) == -ENODEV) reg_xceiver = NULL; + else if (IS_ERR(reg_xceiver)) + 
return PTR_ERR(reg_xceiver); if (pdev->dev.of_node) { of_property_read_u32(pdev->dev.of_node, @@ -1608,6 +1925,12 @@ static int flexcan_probe(struct platform_device *pdev) return -ENODEV; } + if ((devtype_data->quirks & FLEXCAN_QUIRK_SUPPORT_FD) && + !(devtype_data->quirks & FLEXCAN_QUIRK_USE_OFF_TIMESTAMP)) { + dev_err(&pdev->dev, "CAN-FD mode doesn't work with FIFO mode!\n"); + return -EINVAL; + } + dev = alloc_candev(sizeof(struct flexcan_priv), 1); if (!dev) return -ENOMEM; @@ -1632,7 +1955,6 @@ static int flexcan_probe(struct platform_device *pdev) priv->dev = &pdev->dev; priv->can.clock.freq = clock_freq; - priv->can.bittiming_const = &flexcan_bittiming_const; priv->can.do_set_mode = flexcan_set_mode; priv->can.do_get_berr_counter = flexcan_get_berr_counter; priv->can.ctrlmode_supported = CAN_CTRLMODE_LOOPBACK | @@ -1645,6 +1967,16 @@ static int flexcan_probe(struct platform_device *pdev) priv->devtype_data = devtype_data; priv->reg_xceiver = reg_xceiver; + if (priv->devtype_data->quirks & FLEXCAN_QUIRK_SUPPORT_FD) { + priv->can.ctrlmode_supported |= CAN_CTRLMODE_FD | + CAN_CTRLMODE_FD_NON_ISO; + priv->can.bittiming_const = &flexcan_fd_bittiming_const; + priv->can.data_bittiming_const = + &flexcan_fd_data_bittiming_const; + } else { + priv->can.bittiming_const = &flexcan_bittiming_const; + } + pm_runtime_get_noresume(&pdev->dev); pm_runtime_set_active(&pdev->dev); pm_runtime_enable(&pdev->dev); @@ -1655,6 +1987,7 @@ static int flexcan_probe(struct platform_device *pdev) goto failed_register; } + of_can_transceiver(dev); devm_can_led_init(dev); if (priv->devtype_data->quirks & FLEXCAN_QUIRK_SETUP_STOP_MODE) { @@ -1685,7 +2018,7 @@ static int __maybe_unused flexcan_suspend(struct device *device) { struct net_device *dev = dev_get_drvdata(device); struct flexcan_priv *priv = netdev_priv(dev); - int err = 0; + int err; if (netif_running(dev)) { /* if wakeup is enabled, enter stop mode @@ -1697,25 +2030,27 @@ static int __maybe_unused flexcan_suspend(struct device 
*device) if (err) return err; } else { - err = flexcan_chip_disable(priv); + err = flexcan_chip_stop(dev); if (err) return err; - err = pm_runtime_force_suspend(device); + err = pinctrl_pm_select_sleep_state(device); + if (err) + return err; } netif_stop_queue(dev); netif_device_detach(dev); } priv->can.state = CAN_STATE_SLEEPING; - return err; + return 0; } static int __maybe_unused flexcan_resume(struct device *device) { struct net_device *dev = dev_get_drvdata(device); struct flexcan_priv *priv = netdev_priv(dev); - int err = 0; + int err; priv->can.state = CAN_STATE_ERROR_ACTIVE; if (netif_running(dev)) { @@ -1727,15 +2062,17 @@ static int __maybe_unused flexcan_resume(struct device *device) if (err) return err; } else { - err = pm_runtime_force_resume(device); + err = pinctrl_pm_select_default_state(device); if (err) return err; - err = flexcan_chip_enable(priv); + err = flexcan_chip_start(dev); + if (err) + return err; } } - return err; + return 0; } static int __maybe_unused flexcan_runtime_suspend(struct device *device) @@ -1761,8 +2098,16 @@ static int __maybe_unused flexcan_noirq_suspend(struct device *device) struct net_device *dev = dev_get_drvdata(device); struct flexcan_priv *priv = netdev_priv(dev); - if (netif_running(dev) && device_may_wakeup(device)) - flexcan_enable_wakeup_irq(priv, true); + if (netif_running(dev)) { + int err; + + if (device_may_wakeup(device)) + flexcan_enable_wakeup_irq(priv, true); + + err = pm_runtime_force_suspend(device); + if (err) + return err; + } return 0; } @@ -1772,8 +2117,16 @@ static int __maybe_unused flexcan_noirq_resume(struct device *device) struct net_device *dev = dev_get_drvdata(device); struct flexcan_priv *priv = netdev_priv(dev); - if (netif_running(dev) && device_may_wakeup(device)) - flexcan_enable_wakeup_irq(priv, false); + if (netif_running(dev)) { + int err; + + err = pm_runtime_force_resume(device); + if (err) + return err; + + if (device_may_wakeup(device)) + flexcan_enable_wakeup_irq(priv, false); 
+ } return 0; } diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 6a5796c32721..73507cff3bc4 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1377,23 +1377,6 @@ EXPORT_SYMBOL(b53_phylink_mac_link_up); int b53_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering) { struct b53_device *dev = ds->priv; - u16 pvid, new_pvid; - - b53_read16(dev, B53_VLAN_PAGE, B53_VLAN_PORT_DEF_TAG(port), &pvid); - if (!vlan_filtering) { - /* Filtering is currently enabled, use the default PVID since - * the bridge does not expect tagging anymore - */ - dev->ports[port].pvid = pvid; - new_pvid = b53_default_pvid(dev); - } else { - /* Filtering is currently disabled, restore the previous PVID */ - new_pvid = dev->ports[port].pvid; - } - - if (pvid != new_pvid) - b53_write16(dev, B53_VLAN_PAGE, B53_VLAN_PORT_DEF_TAG(port), - new_pvid); b53_enable_vlan(dev, dev->vlan_enabled, vlan_filtering); @@ -2619,6 +2602,8 @@ struct b53_device *b53_switch_alloc(struct device *base, dev->priv = priv; dev->ops = ops; ds->ops = &b53_switch_ops; + ds->configure_vlan_while_not_filtering = true; + dev->vlan_enabled = ds->configure_vlan_while_not_filtering; mutex_init(&dev->reg_mutex); mutex_init(&dev->stats_mutex); diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index c55c0a9f1b47..24893b592216 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -91,7 +91,6 @@ enum { struct b53_port { u16 vlan_ctl_mask; struct ethtool_eee eee; - u16 pvid; }; struct b53_vlan { diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 723820603107..0b5b2b33b3b6 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -457,6 +457,7 @@ static void bcm_sf2_identify_ports(struct bcm_sf2_priv *priv, { struct device_node *port; unsigned int port_num; + struct property *prop; phy_interface_t mode; int err; @@ -483,6 +484,16 @@ static void 
bcm_sf2_identify_ports(struct bcm_sf2_priv *priv, if (of_property_read_bool(port, "brcm,use-bcm-hdr")) priv->brcm_tag_mask |= 1 << port_num; + + /* Ensure that port 5 is not picked up as a DSA CPU port + * flavour but a regular port instead. We should be using + * devlink to be able to set the port flavour. + */ + if (port_num == 5 && priv->type == BCM7278_DEVICE_ID) { + prop = of_find_property(port, "ethernet", NULL); + if (prop) + of_remove_property(port, prop); + } } } @@ -527,7 +538,7 @@ static int bcm_sf2_mdio_register(struct dsa_switch *ds) * driver. */ if (of_machine_is_compatible("brcm,bcm7445d0")) - priv->indir_phy_mask |= (1 << BRCM_PSEUDO_PHY_ADDR); + priv->indir_phy_mask |= (1 << BRCM_PSEUDO_PHY_ADDR) | (1 << 0); else priv->indir_phy_mask = 0; diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index 5f395d4119ac..a56fc50f5be4 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -669,8 +669,11 @@ static bool felix_txtstamp(struct dsa_switch *ds, int port, struct ocelot *ocelot = ds->priv; struct ocelot_port *ocelot_port = ocelot->ports[port]; - if (!ocelot_port_add_txtstamp_skb(ocelot_port, clone)) + if (ocelot->ptp && (skb_shinfo(clone)->tx_flags & SKBTX_HW_TSTAMP) && + ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) { + ocelot_port_add_txtstamp_skb(ocelot, port, clone); return true; + } return false; } diff --git a/drivers/net/ethernet/allwinner/sun4i-emac.c b/drivers/net/ethernet/allwinner/sun4i-emac.c index b3b8a8010142..862ea44beea7 100644 --- a/drivers/net/ethernet/allwinner/sun4i-emac.c +++ b/drivers/net/ethernet/allwinner/sun4i-emac.c @@ -640,13 +640,11 @@ static irqreturn_t emac_interrupt(int irq, void *dev_id) struct net_device *dev = dev_id; struct emac_board_info *db = netdev_priv(dev); int int_status; - unsigned long flags; unsigned int reg_val; /* A real interrupt coming */ - /* holders of db->lock must always block IRQs */ - spin_lock_irqsave(&db->lock, flags); + 
spin_lock(&db->lock); /* Disable all interrupts */ writel(0, db->membase + EMAC_INT_CTL_REG); @@ -680,7 +678,7 @@ static irqreturn_t emac_interrupt(int irq, void *dev_id) reg_val |= (0xf << 0) | (0x01 << 8); writel(reg_val, db->membase + EMAC_INT_CTL_REG); } - spin_unlock_irqrestore(&db->lock, flags); + spin_unlock(&db->lock); return IRQ_HANDLED; } diff --git a/drivers/net/ethernet/freescale/dpaa2/Kconfig b/drivers/net/ethernet/freescale/dpaa2/Kconfig index feea797cde02..cfd369cf4c8c 100644 --- a/drivers/net/ethernet/freescale/dpaa2/Kconfig +++ b/drivers/net/ethernet/freescale/dpaa2/Kconfig @@ -3,6 +3,7 @@ config FSL_DPAA2_ETH tristate "Freescale DPAA2 Ethernet" depends on FSL_MC_BUS && FSL_MC_DPIO select PHYLINK + select PCS_LYNX help This is the DPAA2 Ethernet driver supporting Freescale SoCs with DPAA2 (DataPath Acceleration Architecture v2). diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c index 3ee236c5fc37..6ff64dd1cf27 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c @@ -15,6 +15,18 @@ static int phy_mode(enum dpmac_eth_if eth_if, phy_interface_t *if_mode) case DPMAC_ETH_IF_RGMII: *if_mode = PHY_INTERFACE_MODE_RGMII; break; + case DPMAC_ETH_IF_USXGMII: + *if_mode = PHY_INTERFACE_MODE_USXGMII; + break; + case DPMAC_ETH_IF_QSGMII: + *if_mode = PHY_INTERFACE_MODE_QSGMII; + break; + case DPMAC_ETH_IF_SGMII: + *if_mode = PHY_INTERFACE_MODE_SGMII; + break; + case DPMAC_ETH_IF_XFI: + *if_mode = PHY_INTERFACE_MODE_10GBASER; + break; default: return -EINVAL; } @@ -67,6 +79,10 @@ static bool dpaa2_mac_phy_mode_mismatch(struct dpaa2_mac *mac, phy_interface_t interface) { switch (interface) { + case PHY_INTERFACE_MODE_10GBASER: + case PHY_INTERFACE_MODE_USXGMII: + case PHY_INTERFACE_MODE_QSGMII: + case PHY_INTERFACE_MODE_SGMII: case PHY_INTERFACE_MODE_RGMII: case PHY_INTERFACE_MODE_RGMII_ID: case PHY_INTERFACE_MODE_RGMII_RXID: @@ -95,6 
+111,17 @@ static void dpaa2_mac_validate(struct phylink_config *config, phylink_set(mask, Asym_Pause); switch (state->interface) { + case PHY_INTERFACE_MODE_NA: + case PHY_INTERFACE_MODE_10GBASER: + case PHY_INTERFACE_MODE_USXGMII: + phylink_set(mask, 10000baseT_Full); + if (state->interface == PHY_INTERFACE_MODE_10GBASER) + break; + phylink_set(mask, 5000baseT_Full); + phylink_set(mask, 2500baseT_Full); + fallthrough; + case PHY_INTERFACE_MODE_SGMII: + case PHY_INTERFACE_MODE_QSGMII: case PHY_INTERFACE_MODE_RGMII: case PHY_INTERFACE_MODE_RGMII_ID: case PHY_INTERFACE_MODE_RGMII_RXID: @@ -227,6 +254,52 @@ out: return fixed; } +static int dpaa2_pcs_create(struct dpaa2_mac *mac, + struct device_node *dpmac_node, int id) +{ + struct mdio_device *mdiodev; + struct device_node *node; + + node = of_parse_phandle(dpmac_node, "pcs-handle", 0); + if (!node) { + /* do not error out on old DTS files */ + netdev_warn(mac->net_dev, "pcs-handle node not found\n"); + return 0; + } + + if (!of_device_is_available(node) || + !of_device_is_available(node->parent)) { + netdev_err(mac->net_dev, "pcs-handle node not available\n"); + return -ENODEV; + } + + mdiodev = of_mdio_find_device(node); + of_node_put(node); + if (!mdiodev) + return -EPROBE_DEFER; + + mac->pcs = lynx_pcs_create(mdiodev); + if (!mac->pcs) { + netdev_err(mac->net_dev, "lynx_pcs_create() failed\n"); + put_device(&mdiodev->dev); + return -ENOMEM; + } + + return 0; +} + +static void dpaa2_pcs_destroy(struct dpaa2_mac *mac) +{ + struct lynx_pcs *pcs = mac->pcs; + struct device *dev = &pcs->mdio->dev; + + if (pcs) { + lynx_pcs_destroy(pcs); + put_device(dev); + mac->pcs = NULL; + } +} + int dpaa2_mac_connect(struct dpaa2_mac *mac) { struct fsl_mc_device *dpmac_dev = mac->mc_dev; @@ -278,6 +351,13 @@ int dpaa2_mac_connect(struct dpaa2_mac *mac) goto err_put_node; } + if (attr.link_type == DPMAC_LINK_TYPE_PHY && + attr.eth_if != DPMAC_ETH_IF_RGMII) { + err = dpaa2_pcs_create(mac, dpmac_node, attr.id); + if (err) + goto 
err_put_node; + } + mac->phylink_config.dev = &net_dev->dev; mac->phylink_config.type = PHYLINK_NETDEV; @@ -286,10 +366,13 @@ int dpaa2_mac_connect(struct dpaa2_mac *mac) &dpaa2_mac_phylink_ops); if (IS_ERR(phylink)) { err = PTR_ERR(phylink); - goto err_put_node; + goto err_pcs_destroy; } mac->phylink = phylink; + if (mac->pcs) + phylink_set_pcs(mac->phylink, &mac->pcs->pcs); + err = phylink_of_phy_connect(mac->phylink, dpmac_node, 0); if (err) { netdev_err(net_dev, "phylink_of_phy_connect() = %d\n", err); @@ -302,6 +385,8 @@ int dpaa2_mac_connect(struct dpaa2_mac *mac) err_phylink_destroy: phylink_destroy(mac->phylink); +err_pcs_destroy: + dpaa2_pcs_destroy(mac); err_put_node: of_node_put(dpmac_node); err_close_dpmac: @@ -316,6 +401,8 @@ void dpaa2_mac_disconnect(struct dpaa2_mac *mac) phylink_disconnect_phy(mac->phylink); phylink_destroy(mac->phylink); + dpaa2_pcs_destroy(mac); + dpmac_close(mac->mc_io, 0, mac->mc_dev->mc_handle); } diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.h b/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.h index 2130d9c7d40e..955a52856210 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.h +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.h @@ -7,6 +7,7 @@ #include <linux/of_mdio.h> #include <linux/of_net.h> #include <linux/phylink.h> +#include <linux/pcs-lynx.h> #include "dpmac.h" #include "dpmac-cmd.h" @@ -21,6 +22,7 @@ struct dpaa2_mac { struct phylink *phylink; phy_interface_t if_mode; enum dpmac_link_type if_link_type; + struct lynx_pcs *pcs; }; bool dpaa2_mac_is_type_fixed(struct fsl_mc_device *dpmac_dev, diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c index 26f6f068b01d..c643c5ab60df 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c @@ -8,7 +8,7 @@ #include "hclge_tm.h" #include "hnae3.h" -static struct hclge_dbg_reg_type_info 
hclge_dbg_reg_info[] = { +static const struct hclge_dbg_reg_type_info hclge_dbg_reg_info[] = { { .reg_type = "bios common", .dfx_msg = &hclge_dbg_bios_common_reg[0], .reg_msg = { .msg_num = ARRAY_SIZE(hclge_dbg_bios_common_reg), @@ -115,14 +115,14 @@ static int hclge_dbg_cmd_send(struct hclge_dev *hdev, } static void hclge_dbg_dump_reg_common(struct hclge_dev *hdev, - struct hclge_dbg_reg_type_info *reg_info, + const struct hclge_dbg_reg_type_info *reg_info, const char *cmd_buf) { #define IDX_OFFSET 1 const char *s = &cmd_buf[strlen(reg_info->reg_type) + IDX_OFFSET]; - struct hclge_dbg_dfx_message *dfx_message = reg_info->dfx_msg; - struct hclge_dbg_reg_common_msg *reg_msg = &reg_info->reg_msg; + const struct hclge_dbg_dfx_message *dfx_message = reg_info->dfx_msg; + const struct hclge_dbg_reg_common_msg *reg_msg = &reg_info->reg_msg; struct hclge_desc *desc_src; struct hclge_desc *desc; int entries_per_desc; @@ -399,7 +399,7 @@ err_dcb_cmd_send: static void hclge_dbg_dump_reg_cmd(struct hclge_dev *hdev, const char *cmd_buf) { - struct hclge_dbg_reg_type_info *reg_info; + const struct hclge_dbg_reg_type_info *reg_info; bool has_dump = false; int i; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.h index 38b79321c4c4..a9066e6ff697 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.h @@ -81,13 +81,13 @@ struct hclge_dbg_dfx_message { #define HCLGE_DBG_MAC_REG_TYPE_LEN 32 struct hclge_dbg_reg_type_info { const char *reg_type; - struct hclge_dbg_dfx_message *dfx_msg; + const struct hclge_dbg_dfx_message *dfx_msg; struct hclge_dbg_reg_common_msg reg_msg; }; #pragma pack() -static struct hclge_dbg_dfx_message hclge_dbg_bios_common_reg[] = { +static const struct hclge_dbg_dfx_message hclge_dbg_bios_common_reg[] = { {false, "Reserved"}, {true, "BP_CPU_STATE"}, {true, "DFX_MSIX_INFO_NIC_0"}, @@ -103,7 +103,7 @@ 
static struct hclge_dbg_dfx_message hclge_dbg_bios_common_reg[] = { {false, "Reserved"}, }; -static struct hclge_dbg_dfx_message hclge_dbg_ssu_reg_0[] = { +static const struct hclge_dbg_dfx_message hclge_dbg_ssu_reg_0[] = { {false, "Reserved"}, {true, "SSU_ETS_PORT_STATUS"}, {true, "SSU_ETS_TCG_STATUS"}, @@ -175,7 +175,7 @@ static struct hclge_dbg_dfx_message hclge_dbg_ssu_reg_0[] = { {false, "Reserved"}, }; -static struct hclge_dbg_dfx_message hclge_dbg_ssu_reg_1[] = { +static const struct hclge_dbg_dfx_message hclge_dbg_ssu_reg_1[] = { {true, "prt_id"}, {true, "PACKET_TC_CURR_BUFFER_CNT_0"}, {true, "PACKET_TC_CURR_BUFFER_CNT_1"}, @@ -282,7 +282,7 @@ static struct hclge_dbg_dfx_message hclge_dbg_ssu_reg_1[] = { {false, "Reserved"}, }; -static struct hclge_dbg_dfx_message hclge_dbg_ssu_reg_2[] = { +static const struct hclge_dbg_dfx_message hclge_dbg_ssu_reg_2[] = { {true, "OQ_INDEX"}, {true, "QUEUE_CNT"}, {false, "Reserved"}, @@ -291,7 +291,7 @@ static struct hclge_dbg_dfx_message hclge_dbg_ssu_reg_2[] = { {false, "Reserved"}, }; -static struct hclge_dbg_dfx_message hclge_dbg_igu_egu_reg[] = { +static const struct hclge_dbg_dfx_message hclge_dbg_igu_egu_reg[] = { {true, "prt_id"}, {true, "IGU_RX_ERR_PKT"}, {true, "IGU_RX_NO_SOF_PKT"}, @@ -356,7 +356,7 @@ static struct hclge_dbg_dfx_message hclge_dbg_igu_egu_reg[] = { {false, "Reserved"}, }; -static struct hclge_dbg_dfx_message hclge_dbg_rpu_reg_0[] = { +static const struct hclge_dbg_dfx_message hclge_dbg_rpu_reg_0[] = { {true, "tc_queue_num"}, {true, "FSM_DFX_ST0"}, {true, "FSM_DFX_ST1"}, @@ -365,7 +365,7 @@ static struct hclge_dbg_dfx_message hclge_dbg_rpu_reg_0[] = { {true, "BUF_WAIT_TIMEOUT_QID"}, }; -static struct hclge_dbg_dfx_message hclge_dbg_rpu_reg_1[] = { +static const struct hclge_dbg_dfx_message hclge_dbg_rpu_reg_1[] = { {false, "Reserved"}, {true, "FIFO_DFX_ST0"}, {true, "FIFO_DFX_ST1"}, @@ -381,7 +381,7 @@ static struct hclge_dbg_dfx_message hclge_dbg_rpu_reg_1[] = { {false, "Reserved"}, }; -static 
struct hclge_dbg_dfx_message hclge_dbg_ncsi_reg[] = { +static const struct hclge_dbg_dfx_message hclge_dbg_ncsi_reg[] = { {false, "Reserved"}, {true, "NCSI_EGU_TX_FIFO_STS"}, {true, "NCSI_PAUSE_STATUS"}, @@ -453,7 +453,7 @@ static struct hclge_dbg_dfx_message hclge_dbg_ncsi_reg[] = { {true, "NCSI_MAC_RX_PAUSE_FRAMES"}, }; -static struct hclge_dbg_dfx_message hclge_dbg_rtc_reg[] = { +static const struct hclge_dbg_dfx_message hclge_dbg_rtc_reg[] = { {false, "Reserved"}, {true, "LGE_IGU_AFIFO_DFX_0"}, {true, "LGE_IGU_AFIFO_DFX_1"}, @@ -483,7 +483,7 @@ static struct hclge_dbg_dfx_message hclge_dbg_rtc_reg[] = { {false, "Reserved"}, }; -static struct hclge_dbg_dfx_message hclge_dbg_ppp_reg[] = { +static const struct hclge_dbg_dfx_message hclge_dbg_ppp_reg[] = { {false, "Reserved"}, {true, "DROP_FROM_PRT_PKT_CNT"}, {true, "DROP_FROM_HOST_PKT_CNT"}, @@ -639,7 +639,7 @@ static struct hclge_dbg_dfx_message hclge_dbg_ppp_reg[] = { {false, "Reserved"}, }; -static struct hclge_dbg_dfx_message hclge_dbg_rcb_reg[] = { +static const struct hclge_dbg_dfx_message hclge_dbg_rcb_reg[] = { {false, "Reserved"}, {true, "FSM_DFX_ST0"}, {true, "FSM_DFX_ST1"}, @@ -711,7 +711,7 @@ static struct hclge_dbg_dfx_message hclge_dbg_rcb_reg[] = { {false, "Reserved"}, }; -static struct hclge_dbg_dfx_message hclge_dbg_tqp_reg[] = { +static const struct hclge_dbg_dfx_message hclge_dbg_tqp_reg[] = { {true, "q_num"}, {true, "RCB_CFG_RX_RING_TAIL"}, {true, "RCB_CFG_RX_RING_HEAD"}, diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.c b/drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.c index e0eb294779ec..5a6bbee819cd 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.c @@ -784,7 +784,7 @@ static void free_cmdq(struct hinic_cmdq *cmdq) * init_cmdqs_ctxt - write the cmdq ctxt to HW after init all cmdq * @hwdev: the NIC HW device * @cmdqs: cmdqs to write the ctxts for - * &db_area: db_area for all the cmdqs + * @db_area: db_area for 
all the cmdqs * * Return 0 - Success, negative - Failure **/ diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c index 239685152f6e..0c74f6674634 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c @@ -437,6 +437,8 @@ static int get_base_qpn(struct hinic_hwdev *hwdev, u16 *base_qpn) /** * hinic_hwdev_ifup - Preparing the HW for passing IO * @hwdev: the NIC HW device + * @sq_depth: the send queue depth + * @rq_depth: the receive queue depth * * Return 0 - Success, negative - Failure **/ @@ -582,6 +584,7 @@ void hinic_hwdev_cb_unregister(struct hinic_hwdev *hwdev, /** * nic_mgmt_msg_handler - nic mgmt event handler * @handle: private data for the handler + * @cmd: message command * @buf_in: input buffer * @in_size: input size * @buf_out: output buffer @@ -909,6 +912,7 @@ int hinic_set_interrupt_cfg(struct hinic_hwdev *hwdev, /** * hinic_init_hwdev - Initialize the NIC HW * @pdev: the NIC pci device + * @devlink: the pointer to hinic devlink * * Return initialized NIC HW device * @@ -1121,7 +1125,7 @@ int hinic_hwdev_msix_cnt_set(struct hinic_hwdev *hwdev, u16 msix_index) * @msix_index: msix_index * @pending_limit: the maximum pending interrupt events (unit 8) * @coalesc_timer: coalesc period for interrupt (unit 8 us) - * @lli_timer: replenishing period for low latency credit (unit 8 us) + * @lli_timer_cfg: replenishing period for low latency credit (unit 8 us) * @lli_credit_limit: maximum credits for low latency msix messages (unit 8) * @resend_timer: maximum wait for resending msix (unit coalesc period) * diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_eqs.c b/drivers/net/ethernet/huawei/hinic/hinic_hw_eqs.c index f108b0c9228e..19942fef99d9 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_hw_eqs.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_eqs.c @@ -188,6 +188,7 @@ static u8 eq_cons_idx_checksum_set(u32 val) /** * 
eq_update_ci - update the HW cons idx of event queue * @eq: the event queue to update the cons idx for + * @arm_state: the arm bit value of eq's interrupt **/ static void eq_update_ci(struct hinic_eq *eq, u32 arm_state) { @@ -368,7 +369,7 @@ static void eq_irq_work(struct work_struct *work) /** * ceq_tasklet - the tasklet of the EQ that received the event - * @ceq_data: the eq + * @t: the tasklet struct pointer **/ static void ceq_tasklet(struct tasklet_struct *t) { diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_if.c b/drivers/net/ethernet/huawei/hinic/hinic_hw_if.c index bc8925c0c982..efbaed389440 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_hw_if.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_if.c @@ -230,6 +230,7 @@ static int wait_hwif_ready(struct hinic_hwif *hwif) * @hwif: the HW interface of a pci function device * @attr0: the first attribute that was read from the hw * @attr1: the second attribute that was read from the hw + * @attr2: the third attribute that was read from the hw **/ static void set_hwif_attr(struct hinic_hwif *hwif, u32 attr0, u32 attr1, u32 attr2) diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c b/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c index 2ebae6cb5db5..819fa13034c0 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c @@ -238,6 +238,7 @@ static int send_msg_to_mgmt(struct hinic_pf_to_mgmt *pf_to_mgmt, * @out_size: response length * @direction: the direction of the original message * @resp_msg_id: msg id to response for + * @timeout: time-out period of waiting for response * * Return 0 - Success, negative - Failure **/ diff --git a/drivers/net/ethernet/huawei/hinic/hinic_main.c b/drivers/net/ethernet/huawei/hinic/hinic_main.c index 19d01def891f..350225bbe0be 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_main.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_main.c @@ -990,7 +990,7 @@ static void 
hinic_refresh_nic_cfg(struct hinic_dev *nic_dev) * @handle: nic device for the handler * @buf_in: input buffer * @in_size: input size - * @buf_in: output buffer + * @buf_out: output buffer * @out_size: returned output size * * Return 0 - Success, negative - Failure diff --git a/drivers/net/ethernet/marvell/octeontx2/af/Makefile b/drivers/net/ethernet/marvell/octeontx2/af/Makefile index 0bc2410c8949..2f7a861d0c7b 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/Makefile +++ b/drivers/net/ethernet/marvell/octeontx2/af/Makefile @@ -3,9 +3,10 @@ # Makefile for Marvell's OcteonTX2 RVU Admin Function driver # +ccflags-y += -I$(src) obj-$(CONFIG_OCTEONTX2_MBOX) += octeontx2_mbox.o obj-$(CONFIG_OCTEONTX2_AF) += octeontx2_af.o -octeontx2_mbox-y := mbox.o +octeontx2_mbox-y := mbox.o rvu_trace.o octeontx2_af-y := cgx.o rvu.o rvu_cgx.o rvu_npa.o rvu_nix.o \ rvu_reg.o rvu_npc.o rvu_debugfs.o ptp.o diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.c b/drivers/net/ethernet/marvell/octeontx2/af/mbox.c index 387e33fa417a..4b4cf7dac77f 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.c @@ -14,6 +14,7 @@ #include "rvu_reg.h" #include "mbox.h" +#include "rvu_trace.h" static const u16 msgs_offset = ALIGN(sizeof(struct mbox_hdr), MBOX_MSG_ALIGN); @@ -199,6 +200,9 @@ void otx2_mbox_msg_send(struct otx2_mbox *mbox, int devid) */ tx_hdr->num_msgs = mdev->num_msgs; rx_hdr->num_msgs = 0; + + trace_otx2_msg_send(mbox->pdev, tx_hdr->num_msgs, tx_hdr->msg_size); + spin_unlock(&mdev->mbox_lock); /* The interrupt should be fired after num_msgs is written @@ -295,10 +299,15 @@ int otx2_mbox_check_rsp_msgs(struct otx2_mbox *mbox, int devid) struct mbox_msghdr *preq = mdev->mbase + ireq; struct mbox_msghdr *prsp = mdev->mbase + irsp; - if (preq->id != prsp->id) + if (preq->id != prsp->id) { + trace_otx2_msg_check(mbox->pdev, preq->id, + prsp->id, prsp->rc); goto exit; + } if (prsp->rc) { rc = prsp->rc; + 
trace_otx2_msg_check(mbox->pdev, preq->id, + prsp->id, prsp->rc); goto exit; } diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h index 4aaef0a2b51c..aa3bda3f34be 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h @@ -625,6 +625,7 @@ struct nix_rss_flowkey_cfg { #define NIX_FLOW_KEY_TYPE_INNR_UDP BIT(15) #define NIX_FLOW_KEY_TYPE_INNR_SCTP BIT(16) #define NIX_FLOW_KEY_TYPE_INNR_ETH_DMAC BIT(17) +#define NIX_FLOW_KEY_TYPE_VLAN BIT(20) u32 flowkey_cfg; /* Flowkey types selected */ u8 group; /* RSS context or group */ }; diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c index c3ef73ae782c..e1f918960730 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c @@ -20,6 +20,8 @@ #include "rvu_reg.h" #include "ptp.h" +#include "rvu_trace.h" + #define DRV_NAME "octeontx2-af" #define DRV_STRING "Marvell OcteonTX2 RVU Admin Function Driver" @@ -1549,6 +1551,7 @@ static int rvu_process_mbox_msg(struct otx2_mbox *mbox, int devid, if (rsp && err) \ rsp->hdr.rc = err; \ \ + trace_otx2_msg_process(mbox->pdev, _id, err); \ return rsp ? 
err : -ENOMEM; \ } MBOX_MESSAGES @@ -1881,6 +1884,8 @@ static irqreturn_t rvu_mbox_intr_handler(int irq, void *rvu_irq) intr = rvu_read64(rvu, BLKADDR_RVUM, RVU_AF_PFAF_MBOX_INT); /* Clear interrupts */ rvu_write64(rvu, BLKADDR_RVUM, RVU_AF_PFAF_MBOX_INT, intr); + if (intr) + trace_otx2_msg_interrupt(rvu->pdev, "PF(s) to AF", intr); /* Sync with mbox memory region */ rmb(); @@ -1898,6 +1903,8 @@ static irqreturn_t rvu_mbox_intr_handler(int irq, void *rvu_irq) intr = rvupf_read64(rvu, RVU_PF_VFPF_MBOX_INTX(0)); rvupf_write64(rvu, RVU_PF_VFPF_MBOX_INTX(0), intr); + if (intr) + trace_otx2_msg_interrupt(rvu->pdev, "VF(s) to AF", intr); rvu_queue_work(&rvu->afvf_wq_info, 0, vfs, intr); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c index fe3389c144b5..fa9152ff5e2a 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c @@ -15,6 +15,7 @@ #include "rvu.h" #include "cgx.h" #include "rvu_reg.h" +#include "rvu_trace.h" struct cgx_evq_entry { struct list_head evq_node; @@ -34,6 +35,7 @@ static struct _req_type __maybe_unused \ return NULL; \ req->hdr.sig = OTX2_MBOX_REQ_SIG; \ req->hdr.id = _id; \ + trace_otx2_msg_alloc(rvu->pdev, _id, sizeof(*req)); \ return req; \ } diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c index 08181fc5f5d4..4bdc4baa3c59 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c @@ -2509,6 +2509,14 @@ static int set_flowkey_fields(struct nix_rx_flowkey_alg *alg, u32 flow_cfg) field->ltype_match = NPC_LT_LE_GTPU; field->ltype_mask = 0xF; break; + case NIX_FLOW_KEY_TYPE_VLAN: + field->lid = NPC_LID_LB; + field->hdr_offset = 2; /* Skip TPID (2-bytes) */ + field->bytesm1 = 1; /* 2 Bytes (Actually 12 bits) */ + field->ltype_match = NPC_LT_LB_CTAG; + field->ltype_mask = 0xF; + 
field->fn_mask = 1; /* Mask out the first nibble */ + break; } field->ena = 1; diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_trace.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_trace.c new file mode 100644 index 000000000000..56f90cf9c4c0 --- /dev/null +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_trace.c @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Marvell OcteonTx2 RVU Admin Function driver tracepoints + * + * Copyright (C) 2020 Marvell International Ltd. + */ + +#define CREATE_TRACE_POINTS +#include "rvu_trace.h" + +EXPORT_TRACEPOINT_SYMBOL(otx2_msg_alloc); +EXPORT_TRACEPOINT_SYMBOL(otx2_msg_interrupt); +EXPORT_TRACEPOINT_SYMBOL(otx2_msg_process); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_trace.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu_trace.h new file mode 100644 index 000000000000..e6609068e81b --- /dev/null +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_trace.h @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Marvell OcteonTx2 RVU Admin Function driver tracepoints + * + * Copyright (C) 2020 Marvell International Ltd. 
+ */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rvu + +#if !defined(__RVU_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define __RVU_TRACE_H + +#include <linux/types.h> +#include <linux/tracepoint.h> +#include <linux/pci.h> + +TRACE_EVENT(otx2_msg_alloc, + TP_PROTO(const struct pci_dev *pdev, u16 id, u64 size), + TP_ARGS(pdev, id, size), + TP_STRUCT__entry(__string(dev, pci_name(pdev)) + __field(u16, id) + __field(u64, size) + ), + TP_fast_assign(__assign_str(dev, pci_name(pdev)) + __entry->id = id; + __entry->size = size; + ), + TP_printk("[%s] msg:(0x%x) size:%lld\n", __get_str(dev), + __entry->id, __entry->size) +); + +TRACE_EVENT(otx2_msg_send, + TP_PROTO(const struct pci_dev *pdev, u16 num_msgs, u64 msg_size), + TP_ARGS(pdev, num_msgs, msg_size), + TP_STRUCT__entry(__string(dev, pci_name(pdev)) + __field(u16, num_msgs) + __field(u64, msg_size) + ), + TP_fast_assign(__assign_str(dev, pci_name(pdev)) + __entry->num_msgs = num_msgs; + __entry->msg_size = msg_size; + ), + TP_printk("[%s] sent %d msg(s) of size:%lld\n", __get_str(dev), + __entry->num_msgs, __entry->msg_size) +); + +TRACE_EVENT(otx2_msg_check, + TP_PROTO(const struct pci_dev *pdev, u16 reqid, u16 rspid, int rc), + TP_ARGS(pdev, reqid, rspid, rc), + TP_STRUCT__entry(__string(dev, pci_name(pdev)) + __field(u16, reqid) + __field(u16, rspid) + __field(int, rc) + ), + TP_fast_assign(__assign_str(dev, pci_name(pdev)) + __entry->reqid = reqid; + __entry->rspid = rspid; + __entry->rc = rc; + ), + TP_printk("[%s] req->id:0x%x rsp->id:0x%x resp_code:%d\n", + __get_str(dev), __entry->reqid, + __entry->rspid, __entry->rc) +); + +TRACE_EVENT(otx2_msg_interrupt, + TP_PROTO(const struct pci_dev *pdev, const char *msg, u64 intr), + TP_ARGS(pdev, msg, intr), + TP_STRUCT__entry(__string(dev, pci_name(pdev)) + __string(str, msg) + __field(u64, intr) + ), + TP_fast_assign(__assign_str(dev, pci_name(pdev)) + __assign_str(str, msg) + __entry->intr = intr; + ), + TP_printk("[%s] mbox interrupt %s (0x%llx)\n", 
__get_str(dev), + __get_str(str), __entry->intr) +); + +TRACE_EVENT(otx2_msg_process, + TP_PROTO(const struct pci_dev *pdev, u16 id, int err), + TP_ARGS(pdev, id, err), + TP_STRUCT__entry(__string(dev, pci_name(pdev)) + __field(u16, id) + __field(int, err) + ), + TP_fast_assign(__assign_str(dev, pci_name(pdev)) + __entry->id = id; + __entry->err = err; + ), + TP_printk("[%s] msg:(0x%x) error:%d\n", __get_str(dev), + __entry->id, __entry->err) +); + +#endif /* __RVU_TRACE_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . + +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE rvu_trace + +#include <trace/define_trace.h> diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c index 820fc660de66..d2581090f9a4 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c @@ -355,7 +355,7 @@ int otx2_rss_init(struct otx2_nic *pfvf) rss->flowkey_cfg = rss->enable ? 
rss->flowkey_cfg : NIX_FLOW_KEY_TYPE_IPV4 | NIX_FLOW_KEY_TYPE_IPV6 | NIX_FLOW_KEY_TYPE_TCP | NIX_FLOW_KEY_TYPE_UDP | - NIX_FLOW_KEY_TYPE_SCTP; + NIX_FLOW_KEY_TYPE_SCTP | NIX_FLOW_KEY_TYPE_VLAN; ret = otx2_set_flowkey_cfg(pfvf); if (ret) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h index ac47762cce9b..d6253f2a414d 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h @@ -20,6 +20,7 @@ #include <mbox.h> #include "otx2_reg.h" #include "otx2_txrx.h" +#include <rvu_trace.h> /* PCI device IDs */ #define PCI_DEVID_OCTEONTX2_RVU_PF 0xA063 @@ -523,6 +524,7 @@ static struct _req_type __maybe_unused \ return NULL; \ req->hdr.sig = OTX2_MBOX_REQ_SIG; \ req->hdr.id = _id; \ + trace_otx2_msg_alloc(mbox->mbox.pdev, _id, sizeof(*req)); \ return req; \ } diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c index 0341d9694e8b..662fb80dbb9d 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c @@ -428,6 +428,8 @@ static int otx2_get_rss_hash_opts(struct otx2_nic *pfvf, /* Mimimum is IPv4 and IPv6, SIP/DIP */ nfc->data = RXH_IP_SRC | RXH_IP_DST; + if (rss->flowkey_cfg & NIX_FLOW_KEY_TYPE_VLAN) + nfc->data |= RXH_VLAN; switch (nfc->flow_type) { case TCP_V4_FLOW: @@ -477,6 +479,11 @@ static int otx2_set_rss_hash_opts(struct otx2_nic *pfvf, if (!(nfc->data & RXH_IP_SRC) || !(nfc->data & RXH_IP_DST)) return -EINVAL; + if (nfc->data & RXH_VLAN) + rss_cfg |= NIX_FLOW_KEY_TYPE_VLAN; + else + rss_cfg &= ~NIX_FLOW_KEY_TYPE_VLAN; + switch (nfc->flow_type) { case TCP_V4_FLOW: case TCP_V6_FLOW: diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c index aac2845c1fb1..265e4d1b4e64 100644 --- 
a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c @@ -22,6 +22,7 @@ #include "otx2_txrx.h" #include "otx2_struct.h" #include "otx2_ptp.h" +#include <rvu_trace.h> #define DRV_NAME "octeontx2-nicpf" #define DRV_STRING "Marvell OcteonTX2 NIC Physical Function Driver" @@ -558,6 +559,8 @@ static irqreturn_t otx2_pfvf_mbox_intr_handler(int irq, void *pf_irq) otx2_queue_work(mbox, pf->mbox_pfvf_wq, 0, vfs, intr, TYPE_PFVF); + trace_otx2_msg_interrupt(mbox->mbox.pdev, "VF(s) to PF", intr); + return IRQ_HANDLED; } @@ -940,6 +943,9 @@ static irqreturn_t otx2_pfaf_mbox_intr_handler(int irq, void *pf_irq) otx2_write64(pf, RVU_PF_INT, BIT_ULL(0)); mbox = &pf->mbox; + + trace_otx2_msg_interrupt(mbox->mbox.pdev, "AF to PF", BIT_ULL(0)); + otx2_queue_work(mbox, pf->mbox_wq, 0, 1, 1, TYPE_PFAF); return IRQ_HANDLED; diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c index 70e0d4ca6688..32daa3e0f296 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c @@ -187,6 +187,8 @@ static irqreturn_t otx2vf_vfaf_mbox_intr_handler(int irq, void *vf_irq) mdev = &mbox->dev[0]; otx2_sync_mbox_bbuf(mbox, 0); + trace_otx2_msg_interrupt(mbox->pdev, "PF to VF", BIT_ULL(0)); + hdr = (struct mbox_hdr *)(mdev->mbase + mbox->rx_start); if (hdr->num_msgs) { vf->mbox.num_msgs = hdr->num_msgs; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index e9010ceb5ea9..5368e06cd71c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -221,6 +221,7 @@ enum mlx5e_priv_flag { MLX5E_PFLAG_RX_STRIDING_RQ, MLX5E_PFLAG_RX_NO_CSUM_COMPLETE, MLX5E_PFLAG_XDP_TX_MPWQE, + MLX5E_PFLAG_SKB_TX_MPWQE, MLX5E_NUM_PFLAGS, /* Keep last */ }; @@ -305,6 +306,7 @@ struct mlx5e_sq_dma { enum { MLX5E_SQ_STATE_ENABLED, + 
MLX5E_SQ_STATE_MPWQE, MLX5E_SQ_STATE_RECOVERING, MLX5E_SQ_STATE_IPSEC, MLX5E_SQ_STATE_AM, @@ -313,26 +315,40 @@ enum { MLX5E_SQ_STATE_PENDING_XSK_TX, }; +struct mlx5e_tx_mpwqe { + /* Current MPWQE session */ + struct mlx5e_tx_wqe *wqe; + u32 bytes_count; + u8 ds_count; + u8 pkt_count; + u8 inline_on; +}; + struct mlx5e_txqsq { /* data path */ /* dirtied @completion */ u16 cc; + u16 skb_fifo_cc; u32 dma_fifo_cc; struct dim dim; /* Adaptive Moderation */ /* dirtied @xmit */ u16 pc ____cacheline_aligned_in_smp; + u16 skb_fifo_pc; u32 dma_fifo_pc; + struct mlx5e_tx_mpwqe mpwqe; struct mlx5e_cq cq; /* read only */ struct mlx5_wq_cyc wq; u32 dma_fifo_mask; + u16 skb_fifo_mask; struct mlx5e_sq_stats *stats; struct { struct mlx5e_sq_dma *dma_fifo; + struct sk_buff **skb_fifo; struct mlx5e_tx_wqe_info *wqe_info; } db; void __iomem *uar_map; @@ -399,7 +415,7 @@ struct mlx5e_xdp_info { }; }; -struct mlx5e_xdp_xmit_data { +struct mlx5e_xmit_data { dma_addr_t dma_addr; void *data; u32 len; @@ -412,18 +428,10 @@ struct mlx5e_xdp_info_fifo { u32 mask; }; -struct mlx5e_xdp_mpwqe { - /* Current MPWQE session */ - struct mlx5e_tx_wqe *wqe; - u8 ds_count; - u8 pkt_count; - u8 inline_on; -}; - struct mlx5e_xdpsq; typedef int (*mlx5e_fp_xmit_xdp_frame_check)(struct mlx5e_xdpsq *); typedef bool (*mlx5e_fp_xmit_xdp_frame)(struct mlx5e_xdpsq *, - struct mlx5e_xdp_xmit_data *, + struct mlx5e_xmit_data *, struct mlx5e_xdp_info *, int); @@ -438,7 +446,7 @@ struct mlx5e_xdpsq { u32 xdpi_fifo_pc ____cacheline_aligned_in_smp; u16 pc; struct mlx5_wqe_ctrl_seg *doorbell_cseg; - struct mlx5e_xdp_mpwqe mpwqe; + struct mlx5e_tx_mpwqe mpwqe; struct mlx5e_cq cq; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h index 24336c60123a..07ee1d236ab3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h @@ -7,6 +7,21 @@ #include "en.h" #include <linux/indirect_call_wrapper.h> 
+#define MLX5E_TX_WQE_EMPTY_DS_COUNT (sizeof(struct mlx5e_tx_wqe) / MLX5_SEND_WQE_DS) + +/* The mult of MLX5_SEND_WQE_MAX_WQEBBS * MLX5_SEND_WQEBB_NUM_DS + * (16 * 4 == 64) does not fit in the 6-bit DS field of Ctrl Segment. + * We use a bound lower than MLX5_SEND_WQE_MAX_WQEBBS to let a + * full-session WQE be cache-aligned. + */ +#if L1_CACHE_BYTES < 128 +#define MLX5E_TX_MPW_MAX_WQEBBS (MLX5_SEND_WQE_MAX_WQEBBS - 1) +#else +#define MLX5E_TX_MPW_MAX_WQEBBS (MLX5_SEND_WQE_MAX_WQEBBS - 2) +#endif + +#define MLX5E_TX_MPW_MAX_NUM_DS (MLX5E_TX_MPW_MAX_WQEBBS * MLX5_SEND_WQEBB_NUM_DS) + #define INL_HDR_START_SZ (sizeof(((struct mlx5_wqe_eth_seg *)NULL)->inline_hdr.start)) enum mlx5e_icosq_wqe_type { @@ -46,8 +61,6 @@ void mlx5e_free_rx_in_progress_descs(struct mlx5e_rq *rq); u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); netdev_tx_t mlx5e_xmit(struct sk_buff *skb, struct net_device *dev); -void mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb, - struct mlx5e_tx_wqe *wqe, u16 pi, bool xmit_more); bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget); void mlx5e_free_txqsq_descs(struct mlx5e_txqsq *sq); @@ -110,6 +123,7 @@ struct mlx5e_tx_wqe_info { u32 num_bytes; u8 num_wqebbs; u8 num_dma; + u8 num_fifo_pkts; #ifdef CONFIG_MLX5_EN_TLS struct page *resync_dump_frag_page; #endif @@ -194,23 +208,6 @@ static inline u16 mlx5e_icosq_get_next_pi(struct mlx5e_icosq *sq, u16 size) } static inline void -mlx5e_fill_sq_frag_edge(struct mlx5e_txqsq *sq, struct mlx5_wq_cyc *wq, - u16 pi, u16 nnops) -{ - struct mlx5e_tx_wqe_info *edge_wi, *wi = &sq->db.wqe_info[pi]; - - edge_wi = wi + nnops; - - /* fill sq frag edge with nops to avoid wqe wrapping two pages */ - for (; wi < edge_wi; wi++) { - memset(wi, 0, sizeof(*wi)); - wi->num_wqebbs = 1; - mlx5e_post_nop(wq, sq->sqn, &sq->pc); - } - sq->stats->nop += nnops; -} - -static inline void mlx5e_notify_hw(struct mlx5_wq_cyc *wq, u16 pc, void __iomem *uar_map, struct 
mlx5_wqe_ctrl_seg *ctrl) { @@ -228,29 +225,6 @@ mlx5e_notify_hw(struct mlx5_wq_cyc *wq, u16 pc, void __iomem *uar_map, mlx5_write64((__be32 *)ctrl, uar_map); } -static inline bool mlx5e_transport_inline_tx_wqe(struct mlx5_wqe_ctrl_seg *cseg) -{ - return cseg && !!cseg->tis_tir_num; -} - -static inline u8 -mlx5e_tx_wqe_inline_mode(struct mlx5e_txqsq *sq, struct mlx5_wqe_ctrl_seg *cseg, - struct sk_buff *skb) -{ - u8 mode; - - if (mlx5e_transport_inline_tx_wqe(cseg)) - return MLX5_INLINE_MODE_TCP_UDP; - - mode = sq->min_inline_mode; - - if (skb_vlan_tag_present(skb) && - test_bit(MLX5E_SQ_STATE_VLAN_NEED_L2_INLINE, &sq->state)) - mode = max_t(u8, MLX5_INLINE_MODE_L2, mode); - - return mode; -} - static inline void mlx5e_cq_arm(struct mlx5e_cq *cq) { struct mlx5_core_cq *mcq; @@ -276,6 +250,23 @@ mlx5e_dma_push(struct mlx5e_txqsq *sq, dma_addr_t addr, u32 size, dma->type = map_type; } +static inline struct sk_buff **mlx5e_skb_fifo_get(struct mlx5e_txqsq *sq, u16 i) +{ + return &sq->db.skb_fifo[i & sq->skb_fifo_mask]; +} + +static inline void mlx5e_skb_fifo_push(struct mlx5e_txqsq *sq, struct sk_buff *skb) +{ + struct sk_buff **skb_item = mlx5e_skb_fifo_get(sq, sq->skb_fifo_pc++); + + *skb_item = skb; +} + +static inline struct sk_buff *mlx5e_skb_fifo_pop(struct mlx5e_txqsq *sq) +{ + return *mlx5e_skb_fifo_get(sq, sq->skb_fifo_cc++); +} + static inline void mlx5e_tx_dma_unmap(struct device *pdev, struct mlx5e_sq_dma *dma) { @@ -291,6 +282,14 @@ mlx5e_tx_dma_unmap(struct device *pdev, struct mlx5e_sq_dma *dma) } } +void mlx5e_sq_xmit_simple(struct mlx5e_txqsq *sq, struct sk_buff *skb, bool xmit_more); +void mlx5e_tx_mpwqe_ensure_complete(struct mlx5e_txqsq *sq); + +static inline bool mlx5e_tx_mpwqe_is_full(struct mlx5e_tx_mpwqe *session) +{ + return session->ds_count == MLX5E_TX_MPW_MAX_NUM_DS; +} + static inline void mlx5e_rqwq_reset(struct mlx5e_rq *rq) { if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) { diff --git 
a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index 562bc465f82b..ae90d533a350 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -59,7 +59,7 @@ static inline bool mlx5e_xmit_xdp_buff(struct mlx5e_xdpsq *sq, struct mlx5e_rq *rq, struct mlx5e_dma_info *di, struct xdp_buff *xdp) { - struct mlx5e_xdp_xmit_data xdptxd; + struct mlx5e_xmit_data xdptxd; struct mlx5e_xdp_info xdpi; struct xdp_frame *xdpf; dma_addr_t dma_addr; @@ -194,18 +194,22 @@ static u16 mlx5e_xdpsq_get_next_pi(struct mlx5e_xdpsq *sq, u16 size) static void mlx5e_xdp_mpwqe_session_start(struct mlx5e_xdpsq *sq) { - struct mlx5e_xdp_mpwqe *session = &sq->mpwqe; + struct mlx5e_tx_mpwqe *session = &sq->mpwqe; struct mlx5e_xdpsq_stats *stats = sq->stats; + struct mlx5e_tx_wqe *wqe; u16 pi; - pi = mlx5e_xdpsq_get_next_pi(sq, MLX5_SEND_WQE_MAX_WQEBBS); - session->wqe = MLX5E_TX_FETCH_WQE(sq, pi); + pi = mlx5e_xdpsq_get_next_pi(sq, MLX5E_TX_MPW_MAX_WQEBBS); + wqe = MLX5E_TX_FETCH_WQE(sq, pi); + net_prefetchw(wqe->data); - net_prefetchw(session->wqe->data); - session->ds_count = MLX5E_XDP_TX_EMPTY_DS_COUNT; - session->pkt_count = 0; - - mlx5e_xdp_update_inline_state(sq); + *session = (struct mlx5e_tx_mpwqe) { + .wqe = wqe, + .bytes_count = 0, + .ds_count = MLX5E_TX_WQE_EMPTY_DS_COUNT, + .pkt_count = 0, + .inline_on = mlx5e_xdp_get_inline_state(sq, session->inline_on), + }; stats->mpwqe++; } @@ -213,7 +217,7 @@ static void mlx5e_xdp_mpwqe_session_start(struct mlx5e_xdpsq *sq) void mlx5e_xdp_mpwqe_complete(struct mlx5e_xdpsq *sq) { struct mlx5_wq_cyc *wq = &sq->wq; - struct mlx5e_xdp_mpwqe *session = &sq->mpwqe; + struct mlx5e_tx_mpwqe *session = &sq->mpwqe; struct mlx5_wqe_ctrl_seg *cseg = &session->wqe->ctrl; u16 ds_count = session->ds_count; u16 pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); @@ -258,10 +262,10 @@ INDIRECT_CALLABLE_SCOPE int mlx5e_xmit_xdp_frame_check_mpwqe(struct mlx5e_xdpsq } 
INDIRECT_CALLABLE_SCOPE bool -mlx5e_xmit_xdp_frame_mpwqe(struct mlx5e_xdpsq *sq, struct mlx5e_xdp_xmit_data *xdptxd, +mlx5e_xmit_xdp_frame_mpwqe(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptxd, struct mlx5e_xdp_info *xdpi, int check_result) { - struct mlx5e_xdp_mpwqe *session = &sq->mpwqe; + struct mlx5e_tx_mpwqe *session = &sq->mpwqe; struct mlx5e_xdpsq_stats *stats = sq->stats; if (unlikely(xdptxd->len > sq->hw_mtu)) { @@ -284,8 +288,7 @@ mlx5e_xmit_xdp_frame_mpwqe(struct mlx5e_xdpsq *sq, struct mlx5e_xdp_xmit_data *x mlx5e_xdp_mpwqe_add_dseg(sq, xdptxd, stats); - if (unlikely(mlx5e_xdp_no_room_for_inline_pkt(session) || - session->ds_count == MLX5E_XDP_MPW_MAX_NUM_DS)) + if (unlikely(mlx5e_xdp_mpqwe_is_full(session))) mlx5e_xdp_mpwqe_complete(sq); mlx5e_xdpi_fifo_push(&sq->db.xdpi_fifo, xdpi); @@ -306,7 +309,7 @@ INDIRECT_CALLABLE_SCOPE int mlx5e_xmit_xdp_frame_check(struct mlx5e_xdpsq *sq) } INDIRECT_CALLABLE_SCOPE bool -mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xdp_xmit_data *xdptxd, +mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptxd, struct mlx5e_xdp_info *xdpi, int check_result) { struct mlx5_wq_cyc *wq = &sq->wq; @@ -503,7 +506,7 @@ int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, for (i = 0; i < n; i++) { struct xdp_frame *xdpf = frames[i]; - struct mlx5e_xdp_xmit_data xdptxd; + struct mlx5e_xmit_data xdptxd; struct mlx5e_xdp_info xdpi; bool ret; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h index e806c13d491f..d487e5e37162 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h @@ -38,27 +38,12 @@ #include "en/txrx.h" #define MLX5E_XDP_MIN_INLINE (ETH_HLEN + VLAN_HLEN) -#define MLX5E_XDP_TX_EMPTY_DS_COUNT \ - (sizeof(struct mlx5e_tx_wqe) / MLX5_SEND_WQE_DS) -#define MLX5E_XDP_TX_DS_COUNT (MLX5E_XDP_TX_EMPTY_DS_COUNT + 1 /* SG DS */) - -#define 
MLX5E_XDP_INLINE_WQE_SZ_THRSD (256 - sizeof(struct mlx5_wqe_inline_seg)) -#define MLX5E_XDP_INLINE_WQE_MAX_DS_CNT \ - DIV_ROUND_UP(MLX5E_XDP_INLINE_WQE_SZ_THRSD, MLX5_SEND_WQE_DS) - -/* The mult of MLX5_SEND_WQE_MAX_WQEBBS * MLX5_SEND_WQEBB_NUM_DS - * (16 * 4 == 64) does not fit in the 6-bit DS field of Ctrl Segment. - * We use a bound lower that MLX5_SEND_WQE_MAX_WQEBBS to let a - * full-session WQE be cache-aligned. - */ -#if L1_CACHE_BYTES < 128 -#define MLX5E_XDP_MPW_MAX_WQEBBS (MLX5_SEND_WQE_MAX_WQEBBS - 1) -#else -#define MLX5E_XDP_MPW_MAX_WQEBBS (MLX5_SEND_WQE_MAX_WQEBBS - 2) -#endif +#define MLX5E_XDP_TX_DS_COUNT (MLX5E_TX_WQE_EMPTY_DS_COUNT + 1 /* SG DS */) -#define MLX5E_XDP_MPW_MAX_NUM_DS \ - (MLX5E_XDP_MPW_MAX_WQEBBS * MLX5_SEND_WQEBB_NUM_DS) +#define MLX5E_XDP_INLINE_WQE_MAX_DS_CNT 16 +#define MLX5E_XDP_INLINE_WQE_SZ_THRSD \ + (MLX5E_XDP_INLINE_WQE_MAX_DS_CNT * MLX5_SEND_WQE_DS - \ + sizeof(struct mlx5_wqe_inline_seg)) struct mlx5e_xsk_param; int mlx5e_xdp_max_mtu(struct mlx5e_params *params, struct mlx5e_xsk_param *xsk); @@ -73,11 +58,11 @@ int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags); INDIRECT_CALLABLE_DECLARE(bool mlx5e_xmit_xdp_frame_mpwqe(struct mlx5e_xdpsq *sq, - struct mlx5e_xdp_xmit_data *xdptxd, + struct mlx5e_xmit_data *xdptxd, struct mlx5e_xdp_info *xdpi, int check_result)); INDIRECT_CALLABLE_DECLARE(bool mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, - struct mlx5e_xdp_xmit_data *xdptxd, + struct mlx5e_xmit_data *xdptxd, struct mlx5e_xdp_info *xdpi, int check_result)); INDIRECT_CALLABLE_DECLARE(int mlx5e_xmit_xdp_frame_check_mpwqe(struct mlx5e_xdpsq *sq)); @@ -122,30 +107,28 @@ static inline void mlx5e_xmit_xdp_doorbell(struct mlx5e_xdpsq *sq) /* Enable inline WQEs to shift some load from a congested HCA (HW) to * a less congested cpu (SW). 
*/ -static inline void mlx5e_xdp_update_inline_state(struct mlx5e_xdpsq *sq) +static inline bool mlx5e_xdp_get_inline_state(struct mlx5e_xdpsq *sq, bool cur) { u16 outstanding = sq->xdpi_fifo_pc - sq->xdpi_fifo_cc; - struct mlx5e_xdp_mpwqe *session = &sq->mpwqe; #define MLX5E_XDP_INLINE_WATERMARK_LOW 10 #define MLX5E_XDP_INLINE_WATERMARK_HIGH 128 - if (session->inline_on) { - if (outstanding <= MLX5E_XDP_INLINE_WATERMARK_LOW) - session->inline_on = 0; - return; - } + if (cur && outstanding <= MLX5E_XDP_INLINE_WATERMARK_LOW) + return false; + + if (!cur && outstanding >= MLX5E_XDP_INLINE_WATERMARK_HIGH) + return true; - /* inline is false */ - if (outstanding >= MLX5E_XDP_INLINE_WATERMARK_HIGH) - session->inline_on = 1; + return cur; } -static inline bool -mlx5e_xdp_no_room_for_inline_pkt(struct mlx5e_xdp_mpwqe *session) +static inline bool mlx5e_xdp_mpqwe_is_full(struct mlx5e_tx_mpwqe *session) { - return session->inline_on && - session->ds_count + MLX5E_XDP_INLINE_WQE_MAX_DS_CNT > MLX5E_XDP_MPW_MAX_NUM_DS; + if (session->inline_on) + return session->ds_count + MLX5E_XDP_INLINE_WQE_MAX_DS_CNT > + MLX5E_TX_MPW_MAX_NUM_DS; + return mlx5e_tx_mpwqe_is_full(session); } struct mlx5e_xdp_wqe_info { @@ -155,15 +138,16 @@ struct mlx5e_xdp_wqe_info { static inline void mlx5e_xdp_mpwqe_add_dseg(struct mlx5e_xdpsq *sq, - struct mlx5e_xdp_xmit_data *xdptxd, + struct mlx5e_xmit_data *xdptxd, struct mlx5e_xdpsq_stats *stats) { - struct mlx5e_xdp_mpwqe *session = &sq->mpwqe; + struct mlx5e_tx_mpwqe *session = &sq->mpwqe; struct mlx5_wqe_data_seg *dseg = (struct mlx5_wqe_data_seg *)session->wqe + session->ds_count; u32 dma_len = xdptxd->len; session->pkt_count++; + session->bytes_count += dma_len; if (session->inline_on && dma_len <= MLX5E_XDP_INLINE_WQE_SZ_THRSD) { struct mlx5_wqe_inline_seg *inline_dseg = diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c index aa91cbdfe969..fb671a457129 100644 --- 
a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c @@ -67,8 +67,8 @@ static void mlx5e_xsk_tx_post_err(struct mlx5e_xdpsq *sq, bool mlx5e_xsk_tx(struct mlx5e_xdpsq *sq, unsigned int budget) { struct xsk_buff_pool *pool = sq->xsk_pool; + struct mlx5e_xmit_data xdptxd; struct mlx5e_xdp_info xdpi; - struct mlx5e_xdp_xmit_data xdptxd; bool work_done = true; bool flush = false; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h index 110476bdeffb..2ea1cdc1ca54 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h @@ -128,26 +128,38 @@ static inline bool mlx5e_accel_tx_begin(struct net_device *dev, return true; } -static inline bool mlx5e_accel_tx_finish(struct mlx5e_priv *priv, - struct mlx5e_txqsq *sq, - struct sk_buff *skb, - struct mlx5e_tx_wqe *wqe, - struct mlx5e_accel_tx_state *state) -{ -#ifdef CONFIG_MLX5_EN_TLS - mlx5e_tls_handle_tx_wqe(sq, &wqe->ctrl, &state->tls); -#endif +/* Part of the eseg touched by TX offloads */ +#define MLX5E_ACCEL_ESEG_LEN offsetof(struct mlx5_wqe_eth_seg, mss) +static inline bool mlx5e_accel_tx_eseg(struct mlx5e_priv *priv, + struct mlx5e_txqsq *sq, + struct sk_buff *skb, + struct mlx5_wqe_eth_seg *eseg) +{ #ifdef CONFIG_MLX5_EN_IPSEC if (test_bit(MLX5E_SQ_STATE_IPSEC, &sq->state)) { - if (unlikely(!mlx5e_ipsec_handle_tx_skb(priv, &wqe->eth, skb))) + if (unlikely(!mlx5e_ipsec_handle_tx_skb(priv, eseg, skb))) return false; } #endif +#if IS_ENABLED(CONFIG_GENEVE) + if (skb->encapsulation) + mlx5e_tx_tunnel_accel(skb, eseg); +#endif + return true; } +static inline void mlx5e_accel_tx_finish(struct mlx5e_txqsq *sq, + struct mlx5e_tx_wqe *wqe, + struct mlx5e_accel_tx_state *state) +{ +#ifdef CONFIG_MLX5_EN_TLS + mlx5e_tls_handle_tx_wqe(sq, &wqe->ctrl, &state->tls); +#endif +} + static inline int 
mlx5e_accel_init_rx(struct mlx5e_priv *priv) { return mlx5e_ktls_init_rx(priv); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c index f4861545b236..b140e13fdcc8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c @@ -345,9 +345,6 @@ void mlx5e_ktls_tx_handle_resync_dump_comp(struct mlx5e_txqsq *sq, struct mlx5e_sq_stats *stats; struct mlx5e_sq_dma *dma; - if (!wi->resync_dump_frag_page) - return; - dma = mlx5e_dma_get(sq, (*dma_fifo_cc)++); stats = sq->stats; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_txrx.h index ff4c740af10b..7521c9be735b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_txrx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_txrx.h @@ -29,12 +29,24 @@ void mlx5e_ktls_handle_get_psv_completion(struct mlx5e_icosq_wqe_info *wi, void mlx5e_ktls_tx_handle_resync_dump_comp(struct mlx5e_txqsq *sq, struct mlx5e_tx_wqe_info *wi, u32 *dma_fifo_cc); +static inline bool +mlx5e_ktls_tx_try_handle_resync_dump_comp(struct mlx5e_txqsq *sq, + struct mlx5e_tx_wqe_info *wi, + u32 *dma_fifo_cc) +{ + if (unlikely(wi->resync_dump_frag_page)) { + mlx5e_ktls_tx_handle_resync_dump_comp(sq, wi, dma_fifo_cc); + return true; + } + return false; +} #else -static inline void -mlx5e_ktls_tx_handle_resync_dump_comp(struct mlx5e_txqsq *sq, - struct mlx5e_tx_wqe_info *wi, - u32 *dma_fifo_cc) +static inline bool +mlx5e_ktls_tx_try_handle_resync_dump_comp(struct mlx5e_txqsq *sq, + struct mlx5e_tx_wqe_info *wi, + u32 *dma_fifo_cc) { + return false; } #endif /* CONFIG_MLX5_EN_TLS */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c index b0c31d49ff8d..6982b193ee8a 100644 --- 
a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c @@ -189,12 +189,10 @@ static bool mlx5e_tls_handle_ooo(struct mlx5e_tls_offload_context_tx *context, struct mlx5e_tls *tls) { u32 tcp_seq = ntohl(tcp_hdr(skb)->seq); - struct mlx5e_tx_wqe *wqe; struct sync_info info; struct sk_buff *nskb; int linear_len = 0; int headln; - u16 pi; int i; sq->stats->tls_ooo++; @@ -246,9 +244,7 @@ static bool mlx5e_tls_handle_ooo(struct mlx5e_tls_offload_context_tx *context, sq->stats->tls_resync_bytes += nskb->len; mlx5e_tls_complete_sync_skb(skb, nskb, tcp_seq, headln, cpu_to_be64(info.rcd_sn)); - pi = mlx5_wq_cyc_ctr2ix(&sq->wq, sq->pc); - wqe = MLX5E_TX_FETCH_WQE(sq, pi); - mlx5e_sq_xmit(sq, nskb, wqe, pi, true); + mlx5e_sq_xmit_simple(sq, nskb, true); return true; @@ -274,6 +270,8 @@ bool mlx5e_tls_handle_tx_skb(struct net_device *netdev, struct mlx5e_txqsq *sq, if (!datalen) return true; + mlx5e_tx_mpwqe_ensure_complete(sq); + tls_ctx = tls_get_ctx(skb->sk); if (WARN_ON_ONCE(tls_ctx->netdev != netdev)) goto err_out; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 0dda80d8bdca..d25a56ec6876 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -1908,7 +1908,7 @@ static int set_pflag_rx_no_csum_complete(struct net_device *netdev, bool enable) return 0; } -static int set_pflag_xdp_tx_mpwqe(struct net_device *netdev, bool enable) +static int set_pflag_tx_mpwqe_common(struct net_device *netdev, u32 flag, bool enable) { struct mlx5e_priv *priv = netdev_priv(netdev); struct mlx5_core_dev *mdev = priv->mdev; @@ -1920,7 +1920,7 @@ static int set_pflag_xdp_tx_mpwqe(struct net_device *netdev, bool enable) new_channels.params = priv->channels.params; - MLX5E_SET_PFLAG(&new_channels.params, MLX5E_PFLAG_XDP_TX_MPWQE, enable); + MLX5E_SET_PFLAG(&new_channels.params, 
flag, enable); if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { priv->channels.params = new_channels.params; @@ -1931,6 +1931,16 @@ static int set_pflag_xdp_tx_mpwqe(struct net_device *netdev, bool enable) return err; } +static int set_pflag_xdp_tx_mpwqe(struct net_device *netdev, bool enable) +{ + return set_pflag_tx_mpwqe_common(netdev, MLX5E_PFLAG_XDP_TX_MPWQE, enable); +} + +static int set_pflag_skb_tx_mpwqe(struct net_device *netdev, bool enable) +{ + return set_pflag_tx_mpwqe_common(netdev, MLX5E_PFLAG_SKB_TX_MPWQE, enable); +} + static const struct pflag_desc mlx5e_priv_flags[MLX5E_NUM_PFLAGS] = { { "rx_cqe_moder", set_pflag_rx_cqe_based_moder }, { "tx_cqe_moder", set_pflag_tx_cqe_based_moder }, @@ -1938,6 +1948,7 @@ static const struct pflag_desc mlx5e_priv_flags[MLX5E_NUM_PFLAGS] = { { "rx_striding_rq", set_pflag_rx_striding_rq }, { "rx_no_csum_complete", set_pflag_rx_no_csum_complete }, { "xdp_tx_mpwqe", set_pflag_xdp_tx_mpwqe }, + { "skb_tx_mpwqe", set_pflag_skb_tx_mpwqe }, }; static int mlx5e_handle_pflag(struct net_device *netdev, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 4d30a81cbadc..961cdce37cc4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -1043,6 +1043,7 @@ static void mlx5e_free_icosq(struct mlx5e_icosq *sq) static void mlx5e_free_txqsq_db(struct mlx5e_txqsq *sq) { kvfree(sq->db.wqe_info); + kvfree(sq->db.skb_fifo); kvfree(sq->db.dma_fifo); } @@ -1054,15 +1055,19 @@ static int mlx5e_alloc_txqsq_db(struct mlx5e_txqsq *sq, int numa) sq->db.dma_fifo = kvzalloc_node(array_size(df_sz, sizeof(*sq->db.dma_fifo)), GFP_KERNEL, numa); + sq->db.skb_fifo = kvzalloc_node(array_size(df_sz, + sizeof(*sq->db.skb_fifo)), + GFP_KERNEL, numa); sq->db.wqe_info = kvzalloc_node(array_size(wq_sz, sizeof(*sq->db.wqe_info)), GFP_KERNEL, numa); - if (!sq->db.dma_fifo || !sq->db.wqe_info) { + if (!sq->db.dma_fifo || 
!sq->db.skb_fifo || !sq->db.wqe_info) { mlx5e_free_txqsq_db(sq); return -ENOMEM; } sq->dma_fifo_mask = df_sz - 1; + sq->skb_fifo_mask = df_sz - 1; return 0; } @@ -1073,6 +1078,12 @@ static int mlx5e_calc_sq_stop_room(struct mlx5e_txqsq *sq, u8 log_sq_size) sq->stop_room = mlx5e_tls_get_stop_room(sq); sq->stop_room += mlx5e_stop_room_for_wqe(MLX5_SEND_WQE_MAX_WQEBBS); + if (test_bit(MLX5E_SQ_STATE_MPWQE, &sq->state)) + /* A MPWQE can take up to the maximum-sized WQE + all the normal + * stop room can be taken if a new packet breaks the active + * MPWQE session and allocates its WQEs right away. + */ + sq->stop_room += mlx5e_stop_room_for_wqe(MLX5_SEND_WQE_MAX_WQEBBS); if (WARN_ON(sq->stop_room >= sq_size)) { netdev_err(sq->channel->netdev, "Stop room %hu is bigger than the SQ size %d\n", @@ -1114,6 +1125,8 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c, set_bit(MLX5E_SQ_STATE_IPSEC, &sq->state); if (mlx5_accel_is_tls_device(c->priv->mdev)) set_bit(MLX5E_SQ_STATE_TLS, &sq->state); + if (param->is_mpw) + set_bit(MLX5E_SQ_STATE_MPWQE, &sq->state); err = mlx5e_calc_sq_stop_room(sq, params->log_sq_size); if (err) return err; @@ -2162,6 +2175,7 @@ static void mlx5e_build_sq_param(struct mlx5e_priv *priv, mlx5e_build_sq_param_common(priv, param); MLX5_SET(wq, wq, log_wq_sz, params->log_sq_size); MLX5_SET(sqc, sqc, allow_swp, allow_swp); + param->is_mpw = MLX5E_GET_PFLAG(params, MLX5E_PFLAG_SKB_TX_MPWQE); mlx5e_build_tx_cq_param(priv, params, ¶m->cqp); } @@ -4703,6 +4717,8 @@ void mlx5e_build_nic_params(struct mlx5e_priv *priv, params->log_sq_size = is_kdump_kernel() ? 
MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE : MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE; + MLX5E_SET_PFLAG(params, MLX5E_PFLAG_SKB_TX_MPWQE, + MLX5_CAP_ETH(mdev, enhanced_multi_pkt_send_wqe)); /* XDP SQ */ MLX5E_SET_PFLAG(params, MLX5E_PFLAG_XDP_TX_MPWQE, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c index 0d34befc5761..78f6a6f0a7e0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c @@ -110,6 +110,8 @@ static const struct counter_desc sw_stats_desc[] = { { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_tso_inner_bytes) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_added_vlan_packets) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_nop) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_mpwqe_blks) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_mpwqe_pkts) }, #ifdef CONFIG_MLX5_EN_TLS { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_tls_encrypted_packets) }, @@ -365,6 +367,8 @@ static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(sw) s->tx_tso_inner_bytes += sq_stats->tso_inner_bytes; s->tx_added_vlan_packets += sq_stats->added_vlan_packets; s->tx_nop += sq_stats->nop; + s->tx_mpwqe_blks += sq_stats->mpwqe_blks; + s->tx_mpwqe_pkts += sq_stats->mpwqe_pkts; s->tx_queue_stopped += sq_stats->stopped; s->tx_queue_wake += sq_stats->wake; s->tx_queue_dropped += sq_stats->dropped; @@ -1568,6 +1572,8 @@ static const struct counter_desc sq_stats_desc[] = { { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, csum_partial_inner) }, { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, added_vlan_packets) }, { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, nop) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, mpwqe_blks) }, + { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, mpwqe_pkts) }, #ifdef CONFIG_MLX5_EN_TLS { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, tls_encrypted_packets) }, { MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, tls_encrypted_bytes) }, diff --git 
a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h index 771f30cb0700..162daaadb0d8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h @@ -121,6 +121,8 @@ struct mlx5e_sw_stats { u64 tx_tso_inner_bytes; u64 tx_added_vlan_packets; u64 tx_nop; + u64 tx_mpwqe_blks; + u64 tx_mpwqe_pkts; u64 rx_lro_packets; u64 rx_lro_bytes; u64 rx_mcast_packets; @@ -351,6 +353,8 @@ struct mlx5e_sq_stats { u64 csum_partial_inner; u64 added_vlan_packets; u64 nop; + u64 mpwqe_blks; + u64 mpwqe_pkts; #ifdef CONFIG_MLX5_EN_TLS u64 tls_encrypted_packets; u64 tls_encrypted_bytes; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index da596de3abba..13bd4f254ed7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -232,131 +232,180 @@ dma_unmap_wqe_err: return -ENOMEM; } +struct mlx5e_tx_attr { + u32 num_bytes; + u16 headlen; + u16 ihs; + __be16 mss; + u8 opcode; +}; + +struct mlx5e_tx_wqe_attr { + u16 ds_cnt; + u16 ds_cnt_inl; + u8 num_wqebbs; +}; + +static u8 +mlx5e_tx_wqe_inline_mode(struct mlx5e_txqsq *sq, struct sk_buff *skb, + struct mlx5e_accel_tx_state *accel) +{ + u8 mode; + +#ifdef CONFIG_MLX5_EN_TLS + if (accel && accel->tls.tls_tisn) + return MLX5_INLINE_MODE_TCP_UDP; +#endif + + mode = sq->min_inline_mode; + + if (skb_vlan_tag_present(skb) && + test_bit(MLX5E_SQ_STATE_VLAN_NEED_L2_INLINE, &sq->state)) + mode = max_t(u8, MLX5_INLINE_MODE_L2, mode); + + return mode; +} + +static void mlx5e_sq_xmit_prepare(struct mlx5e_txqsq *sq, struct sk_buff *skb, + struct mlx5e_accel_tx_state *accel, + struct mlx5e_tx_attr *attr) +{ + struct mlx5e_sq_stats *stats = sq->stats; + + if (skb_is_gso(skb)) { + u16 ihs = mlx5e_tx_get_gso_ihs(sq, skb); + + *attr = (struct mlx5e_tx_attr) { + .opcode = MLX5_OPCODE_LSO, + .mss = 
cpu_to_be16(skb_shinfo(skb)->gso_size), + .ihs = ihs, + .num_bytes = skb->len + (skb_shinfo(skb)->gso_segs - 1) * ihs, + .headlen = skb_headlen(skb) - ihs, + }; + + stats->packets += skb_shinfo(skb)->gso_segs; + } else { + u8 mode = mlx5e_tx_wqe_inline_mode(sq, skb, accel); + u16 ihs = mlx5e_calc_min_inline(mode, skb); + + *attr = (struct mlx5e_tx_attr) { + .opcode = MLX5_OPCODE_SEND, + .mss = cpu_to_be16(0), + .ihs = ihs, + .num_bytes = max_t(unsigned int, skb->len, ETH_ZLEN), + .headlen = skb_headlen(skb) - ihs, + }; + + stats->packets++; + } + + stats->bytes += attr->num_bytes; +} + +static void mlx5e_sq_calc_wqe_attr(struct sk_buff *skb, const struct mlx5e_tx_attr *attr, + struct mlx5e_tx_wqe_attr *wqe_attr) +{ + u16 ds_cnt = MLX5E_TX_WQE_EMPTY_DS_COUNT; + u16 ds_cnt_inl = 0; + + ds_cnt += !!attr->headlen + skb_shinfo(skb)->nr_frags; + + if (attr->ihs) { + u16 inl = attr->ihs - INL_HDR_START_SZ; + + if (skb_vlan_tag_present(skb)) + inl += VLAN_HLEN; + + ds_cnt_inl = DIV_ROUND_UP(inl, MLX5_SEND_WQE_DS); + ds_cnt += ds_cnt_inl; + } + + *wqe_attr = (struct mlx5e_tx_wqe_attr) { + .ds_cnt = ds_cnt, + .ds_cnt_inl = ds_cnt_inl, + .num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS), + }; +} + +static void mlx5e_tx_skb_update_hwts_flags(struct sk_buff *skb) +{ + if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) + skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; +} + +static void mlx5e_tx_check_stop(struct mlx5e_txqsq *sq) +{ + if (unlikely(!mlx5e_wqc_has_room_for(&sq->wq, sq->cc, sq->pc, sq->stop_room))) { + netif_tx_stop_queue(sq->txq); + sq->stats->stopped++; + } +} + static inline void mlx5e_txwqe_complete(struct mlx5e_txqsq *sq, struct sk_buff *skb, - u8 opcode, u16 ds_cnt, u8 num_wqebbs, u32 num_bytes, u8 num_dma, + const struct mlx5e_tx_attr *attr, + const struct mlx5e_tx_wqe_attr *wqe_attr, u8 num_dma, struct mlx5e_tx_wqe_info *wi, struct mlx5_wqe_ctrl_seg *cseg, bool xmit_more) { struct mlx5_wq_cyc *wq = &sq->wq; bool send_doorbell; - 
wi->num_bytes = num_bytes; - wi->num_dma = num_dma; - wi->num_wqebbs = num_wqebbs; - wi->skb = skb; + *wi = (struct mlx5e_tx_wqe_info) { + .skb = skb, + .num_bytes = attr->num_bytes, + .num_dma = num_dma, + .num_wqebbs = wqe_attr->num_wqebbs, + .num_fifo_pkts = 0, + }; - cseg->opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | opcode); - cseg->qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt); + cseg->opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | attr->opcode); + cseg->qpn_ds = cpu_to_be32((sq->sqn << 8) | wqe_attr->ds_cnt); - if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) - skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; + mlx5e_tx_skb_update_hwts_flags(skb); sq->pc += wi->num_wqebbs; - if (unlikely(!mlx5e_wqc_has_room_for(wq, sq->cc, sq->pc, sq->stop_room))) { - netif_tx_stop_queue(sq->txq); - sq->stats->stopped++; - } - send_doorbell = __netdev_tx_sent_queue(sq->txq, num_bytes, - xmit_more); + mlx5e_tx_check_stop(sq); + + send_doorbell = __netdev_tx_sent_queue(sq->txq, attr->num_bytes, xmit_more); if (send_doorbell) mlx5e_notify_hw(wq, sq->pc, sq->uar_map, cseg); } -void mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb, - struct mlx5e_tx_wqe *wqe, u16 pi, bool xmit_more) +static void +mlx5e_sq_xmit_wqe(struct mlx5e_txqsq *sq, struct sk_buff *skb, + const struct mlx5e_tx_attr *attr, const struct mlx5e_tx_wqe_attr *wqe_attr, + struct mlx5e_tx_wqe *wqe, u16 pi, bool xmit_more) { - struct mlx5_wq_cyc *wq = &sq->wq; struct mlx5_wqe_ctrl_seg *cseg; struct mlx5_wqe_eth_seg *eseg; struct mlx5_wqe_data_seg *dseg; struct mlx5e_tx_wqe_info *wi; struct mlx5e_sq_stats *stats = sq->stats; - u16 headlen, ihs, contig_wqebbs_room; - u16 ds_cnt, ds_cnt_inl = 0; - u8 num_wqebbs, opcode; - u32 num_bytes; int num_dma; - __be16 mss; - - /* Calc ihs and ds cnt, no writes to wqe yet */ - ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS; - if (skb_is_gso(skb)) { - opcode = MLX5_OPCODE_LSO; - mss = cpu_to_be16(skb_shinfo(skb)->gso_size); - ihs = mlx5e_tx_get_gso_ihs(sq, skb); - 
num_bytes = skb->len + (skb_shinfo(skb)->gso_segs - 1) * ihs; - stats->packets += skb_shinfo(skb)->gso_segs; - } else { - u8 mode = mlx5e_tx_wqe_inline_mode(sq, &wqe->ctrl, skb); - - opcode = MLX5_OPCODE_SEND; - mss = 0; - ihs = mlx5e_calc_min_inline(mode, skb); - num_bytes = max_t(unsigned int, skb->len, ETH_ZLEN); - stats->packets++; - } - stats->bytes += num_bytes; stats->xmit_more += xmit_more; - headlen = skb->len - ihs - skb->data_len; - ds_cnt += !!headlen; - ds_cnt += skb_shinfo(skb)->nr_frags; - - if (ihs) { - ihs += !!skb_vlan_tag_present(skb) * VLAN_HLEN; - - ds_cnt_inl = DIV_ROUND_UP(ihs - INL_HDR_START_SZ, MLX5_SEND_WQE_DS); - ds_cnt += ds_cnt_inl; - } - - num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS); - contig_wqebbs_room = mlx5_wq_cyc_get_contig_wqebbs(wq, pi); - if (unlikely(contig_wqebbs_room < num_wqebbs)) { -#ifdef CONFIG_MLX5_EN_IPSEC - struct mlx5_wqe_eth_seg cur_eth = wqe->eth; -#endif -#ifdef CONFIG_MLX5_EN_TLS - struct mlx5_wqe_ctrl_seg cur_ctrl = wqe->ctrl; -#endif - mlx5e_fill_sq_frag_edge(sq, wq, pi, contig_wqebbs_room); - pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); - wqe = MLX5E_TX_FETCH_WQE(sq, pi); -#ifdef CONFIG_MLX5_EN_IPSEC - wqe->eth = cur_eth; -#endif -#ifdef CONFIG_MLX5_EN_TLS - wqe->ctrl = cur_ctrl; -#endif - } - /* fill wqe */ wi = &sq->db.wqe_info[pi]; cseg = &wqe->ctrl; eseg = &wqe->eth; dseg = wqe->data; -#if IS_ENABLED(CONFIG_GENEVE) - if (skb->encapsulation) - mlx5e_tx_tunnel_accel(skb, eseg); -#endif - mlx5e_txwqe_build_eseg_csum(sq, skb, eseg); - - eseg->mss = mss; + eseg->mss = attr->mss; - if (ihs) { - eseg->inline_hdr.sz = cpu_to_be16(ihs); + if (attr->ihs) { if (skb_vlan_tag_present(skb)) { - ihs -= VLAN_HLEN; - mlx5e_insert_vlan(eseg->inline_hdr.start, skb, ihs); + eseg->inline_hdr.sz = cpu_to_be16(attr->ihs + VLAN_HLEN); + mlx5e_insert_vlan(eseg->inline_hdr.start, skb, attr->ihs); stats->added_vlan_packets++; } else { - memcpy(eseg->inline_hdr.start, skb->data, ihs); + eseg->inline_hdr.sz = 
cpu_to_be16(attr->ihs); + memcpy(eseg->inline_hdr.start, skb->data, attr->ihs); } - dseg += ds_cnt_inl; + dseg += wqe_attr->ds_cnt_inl; } else if (skb_vlan_tag_present(skb)) { eseg->insert.type = cpu_to_be16(MLX5_ETH_WQE_INSERT_VLAN); if (skb->vlan_proto == cpu_to_be16(ETH_P_8021AD)) @@ -365,12 +414,12 @@ void mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb, stats->added_vlan_packets++; } - num_dma = mlx5e_txwqe_build_dsegs(sq, skb, skb->data + ihs, headlen, dseg); + num_dma = mlx5e_txwqe_build_dsegs(sq, skb, skb->data + attr->ihs, + attr->headlen, dseg); if (unlikely(num_dma < 0)) goto err_drop; - mlx5e_txwqe_complete(sq, skb, opcode, ds_cnt, num_wqebbs, num_bytes, - num_dma, wi, cseg, xmit_more); + mlx5e_txwqe_complete(sq, skb, attr, wqe_attr, num_dma, wi, cseg, xmit_more); return; @@ -379,10 +428,172 @@ err_drop: dev_kfree_skb_any(skb); } +static bool mlx5e_tx_skb_supports_mpwqe(struct sk_buff *skb, struct mlx5e_tx_attr *attr) +{ + return !skb_is_nonlinear(skb) && !skb_vlan_tag_present(skb) && !attr->ihs; +} + +static bool mlx5e_tx_mpwqe_same_eseg(struct mlx5e_txqsq *sq, struct mlx5_wqe_eth_seg *eseg) +{ + struct mlx5e_tx_mpwqe *session = &sq->mpwqe; + + /* Assumes the session is already running and has at least one packet. 
*/ + return !memcmp(&session->wqe->eth, eseg, MLX5E_ACCEL_ESEG_LEN); +} + +static void mlx5e_tx_mpwqe_session_start(struct mlx5e_txqsq *sq, + struct mlx5_wqe_eth_seg *eseg) +{ + struct mlx5e_tx_mpwqe *session = &sq->mpwqe; + struct mlx5e_tx_wqe *wqe; + u16 pi; + + pi = mlx5e_txqsq_get_next_pi(sq, MLX5E_TX_MPW_MAX_WQEBBS); + wqe = MLX5E_TX_FETCH_WQE(sq, pi); + prefetchw(wqe->data); + + *session = (struct mlx5e_tx_mpwqe) { + .wqe = wqe, + .bytes_count = 0, + .ds_count = MLX5E_TX_WQE_EMPTY_DS_COUNT, + .pkt_count = 0, + .inline_on = 0, + }; + + memcpy(&session->wqe->eth, eseg, MLX5E_ACCEL_ESEG_LEN); + + sq->stats->mpwqe_blks++; +} + +static bool mlx5e_tx_mpwqe_session_is_active(struct mlx5e_txqsq *sq) +{ + return sq->mpwqe.wqe; +} + +static void mlx5e_tx_mpwqe_add_dseg(struct mlx5e_txqsq *sq, struct mlx5e_xmit_data *txd) +{ + struct mlx5e_tx_mpwqe *session = &sq->mpwqe; + struct mlx5_wqe_data_seg *dseg; + + dseg = (struct mlx5_wqe_data_seg *)session->wqe + session->ds_count; + + session->pkt_count++; + session->bytes_count += txd->len; + + dseg->addr = cpu_to_be64(txd->dma_addr); + dseg->byte_count = cpu_to_be32(txd->len); + dseg->lkey = sq->mkey_be; + session->ds_count++; + + sq->stats->mpwqe_pkts++; +} + +static struct mlx5_wqe_ctrl_seg *mlx5e_tx_mpwqe_session_complete(struct mlx5e_txqsq *sq) +{ + struct mlx5e_tx_mpwqe *session = &sq->mpwqe; + u8 ds_count = session->ds_count; + struct mlx5_wqe_ctrl_seg *cseg; + struct mlx5e_tx_wqe_info *wi; + u16 pi; + + cseg = &session->wqe->ctrl; + cseg->opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_ENHANCED_MPSW); + cseg->qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_count); + + pi = mlx5_wq_cyc_ctr2ix(&sq->wq, sq->pc); + wi = &sq->db.wqe_info[pi]; + *wi = (struct mlx5e_tx_wqe_info) { + .skb = NULL, + .num_bytes = session->bytes_count, + .num_wqebbs = DIV_ROUND_UP(ds_count, MLX5_SEND_WQEBB_NUM_DS), + .num_dma = session->pkt_count, + .num_fifo_pkts = session->pkt_count, + }; + + sq->pc += wi->num_wqebbs; + + session->wqe = 
NULL; + + mlx5e_tx_check_stop(sq); + + return cseg; +} + +static void +mlx5e_sq_xmit_mpwqe(struct mlx5e_txqsq *sq, struct sk_buff *skb, + struct mlx5_wqe_eth_seg *eseg, bool xmit_more) +{ + struct mlx5_wqe_ctrl_seg *cseg; + struct mlx5e_xmit_data txd; + + if (!mlx5e_tx_mpwqe_session_is_active(sq)) { + mlx5e_tx_mpwqe_session_start(sq, eseg); + } else if (!mlx5e_tx_mpwqe_same_eseg(sq, eseg)) { + mlx5e_tx_mpwqe_session_complete(sq); + mlx5e_tx_mpwqe_session_start(sq, eseg); + } + + sq->stats->xmit_more += xmit_more; + + txd.data = skb->data; + txd.len = skb->len; + + txd.dma_addr = dma_map_single(sq->pdev, txd.data, txd.len, DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(sq->pdev, txd.dma_addr))) + goto err_unmap; + mlx5e_dma_push(sq, txd.dma_addr, txd.len, MLX5E_DMA_MAP_SINGLE); + + mlx5e_skb_fifo_push(sq, skb); + + mlx5e_tx_mpwqe_add_dseg(sq, &txd); + + mlx5e_tx_skb_update_hwts_flags(skb); + + if (unlikely(mlx5e_tx_mpwqe_is_full(&sq->mpwqe))) { + /* Might stop the queue and affect the retval of __netdev_tx_sent_queue. */ + cseg = mlx5e_tx_mpwqe_session_complete(sq); + + if (__netdev_tx_sent_queue(sq->txq, txd.len, xmit_more)) + mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, cseg); + } else if (__netdev_tx_sent_queue(sq->txq, txd.len, xmit_more)) { + /* Might stop the queue, but we were asked to ring the doorbell anyway. */ + cseg = mlx5e_tx_mpwqe_session_complete(sq); + + mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, cseg); + } + + return; + +err_unmap: + mlx5e_dma_unmap_wqe_err(sq, 1); + sq->stats->dropped++; + dev_kfree_skb_any(skb); +} + +void mlx5e_tx_mpwqe_ensure_complete(struct mlx5e_txqsq *sq) +{ + /* Unlikely in non-MPWQE workloads; not important in MPWQE workloads. 
*/ + if (unlikely(mlx5e_tx_mpwqe_session_is_active(sq))) + mlx5e_tx_mpwqe_session_complete(sq); +} + +static bool mlx5e_txwqe_build_eseg(struct mlx5e_priv *priv, struct mlx5e_txqsq *sq, + struct sk_buff *skb, struct mlx5_wqe_eth_seg *eseg) +{ + if (unlikely(!mlx5e_accel_tx_eseg(priv, sq, skb, eseg))) + return false; + + mlx5e_txwqe_build_eseg_csum(sq, skb, eseg); + + return true; +} + netdev_tx_t mlx5e_xmit(struct sk_buff *skb, struct net_device *dev) { struct mlx5e_priv *priv = netdev_priv(dev); struct mlx5e_accel_tx_state accel = {}; + struct mlx5e_tx_wqe_attr wqe_attr; + struct mlx5e_tx_attr attr; struct mlx5e_tx_wqe *wqe; struct mlx5e_txqsq *sq; u16 pi; @@ -391,21 +602,91 @@ netdev_tx_t mlx5e_xmit(struct sk_buff *skb, struct net_device *dev) /* May send SKBs and WQEs. */ if (unlikely(!mlx5e_accel_tx_begin(dev, sq, skb, &accel))) - goto out; + return NETDEV_TX_OK; - pi = mlx5_wq_cyc_ctr2ix(&sq->wq, sq->pc); + mlx5e_sq_xmit_prepare(sq, skb, &accel, &attr); + + if (test_bit(MLX5E_SQ_STATE_MPWQE, &sq->state)) { + if (mlx5e_tx_skb_supports_mpwqe(skb, &attr)) { + struct mlx5_wqe_eth_seg eseg = {}; + + if (unlikely(!mlx5e_txwqe_build_eseg(priv, sq, skb, &eseg))) + return NETDEV_TX_OK; + + mlx5e_sq_xmit_mpwqe(sq, skb, &eseg, netdev_xmit_more()); + return NETDEV_TX_OK; + } + + mlx5e_tx_mpwqe_ensure_complete(sq); + } + + mlx5e_sq_calc_wqe_attr(skb, &attr, &wqe_attr); + pi = mlx5e_txqsq_get_next_pi(sq, wqe_attr.num_wqebbs); wqe = MLX5E_TX_FETCH_WQE(sq, pi); /* May update the WQE, but may not post other WQEs. 
*/ - if (unlikely(!mlx5e_accel_tx_finish(priv, sq, skb, wqe, &accel))) - goto out; + mlx5e_accel_tx_finish(sq, wqe, &accel); + if (unlikely(!mlx5e_txwqe_build_eseg(priv, sq, skb, &wqe->eth))) + return NETDEV_TX_OK; - mlx5e_sq_xmit(sq, skb, wqe, pi, netdev_xmit_more()); + mlx5e_sq_xmit_wqe(sq, skb, &attr, &wqe_attr, wqe, pi, netdev_xmit_more()); -out: return NETDEV_TX_OK; } +void mlx5e_sq_xmit_simple(struct mlx5e_txqsq *sq, struct sk_buff *skb, bool xmit_more) +{ + struct mlx5e_tx_wqe_attr wqe_attr; + struct mlx5e_tx_attr attr; + struct mlx5e_tx_wqe *wqe; + u16 pi; + + mlx5e_sq_xmit_prepare(sq, skb, NULL, &attr); + mlx5e_sq_calc_wqe_attr(skb, &attr, &wqe_attr); + pi = mlx5e_txqsq_get_next_pi(sq, wqe_attr.num_wqebbs); + wqe = MLX5E_TX_FETCH_WQE(sq, pi); + mlx5e_txwqe_build_eseg_csum(sq, skb, &wqe->eth); + mlx5e_sq_xmit_wqe(sq, skb, &attr, &wqe_attr, wqe, pi, xmit_more); +} + +static void mlx5e_tx_wi_dma_unmap(struct mlx5e_txqsq *sq, struct mlx5e_tx_wqe_info *wi, + u32 *dma_fifo_cc) +{ + int i; + + for (i = 0; i < wi->num_dma; i++) { + struct mlx5e_sq_dma *dma = mlx5e_dma_get(sq, (*dma_fifo_cc)++); + + mlx5e_tx_dma_unmap(sq->pdev, dma); + } +} + +static void mlx5e_consume_skb(struct mlx5e_txqsq *sq, struct sk_buff *skb, + struct mlx5_cqe64 *cqe, int napi_budget) +{ + if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) { + struct skb_shared_hwtstamps hwts = {}; + u64 ts = get_cqe_ts(cqe); + + hwts.hwtstamp = mlx5_timecounter_cyc2time(sq->clock, ts); + skb_tstamp_tx(skb, &hwts); + } + + napi_consume_skb(skb, napi_budget); +} + +static void mlx5e_tx_wi_consume_fifo_skbs(struct mlx5e_txqsq *sq, struct mlx5e_tx_wqe_info *wi, + struct mlx5_cqe64 *cqe, int napi_budget) +{ + int i; + + for (i = 0; i < wi->num_fifo_pkts; i++) { + struct sk_buff *skb = mlx5e_skb_fifo_pop(sq); + + mlx5e_consume_skb(sq, skb, cqe, napi_budget); + } +} + bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget) { struct mlx5e_sq_stats *stats; @@ -451,42 +732,33 @@ bool 
mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget) wqe_counter = be16_to_cpu(cqe->wqe_counter); do { - struct sk_buff *skb; - int j; - last_wqe = (sqcc == wqe_counter); ci = mlx5_wq_cyc_ctr2ix(&sq->wq, sqcc); wi = &sq->db.wqe_info[ci]; - skb = wi->skb; - if (unlikely(!skb)) { - mlx5e_ktls_tx_handle_resync_dump_comp(sq, wi, &dma_fifo_cc); - sqcc += wi->num_wqebbs; - continue; - } + sqcc += wi->num_wqebbs; - if (unlikely(skb_shinfo(skb)->tx_flags & - SKBTX_HW_TSTAMP)) { - struct skb_shared_hwtstamps hwts = {}; + if (likely(wi->skb)) { + mlx5e_tx_wi_dma_unmap(sq, wi, &dma_fifo_cc); + mlx5e_consume_skb(sq, wi->skb, cqe, napi_budget); - hwts.hwtstamp = - mlx5_timecounter_cyc2time(sq->clock, - get_cqe_ts(cqe)); - skb_tstamp_tx(skb, &hwts); + npkts++; + nbytes += wi->num_bytes; + continue; } - for (j = 0; j < wi->num_dma; j++) { - struct mlx5e_sq_dma *dma = - mlx5e_dma_get(sq, dma_fifo_cc++); + if (unlikely(mlx5e_ktls_tx_try_handle_resync_dump_comp(sq, wi, + &dma_fifo_cc))) + continue; - mlx5e_tx_dma_unmap(sq->pdev, dma); - } + if (wi->num_fifo_pkts) { + mlx5e_tx_wi_dma_unmap(sq, wi, &dma_fifo_cc); + mlx5e_tx_wi_consume_fifo_skbs(sq, wi, cqe, napi_budget); - npkts++; - nbytes += wi->num_bytes; - sqcc += wi->num_wqebbs; - napi_consume_skb(skb, napi_budget); + npkts += wi->num_fifo_pkts; + nbytes += wi->num_bytes; + } } while (!last_wqe); if (unlikely(get_cqe_opcode(cqe) == MLX5_CQE_REQ_ERR)) { @@ -525,13 +797,19 @@ bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget) return (i == MLX5E_TX_CQ_POLL_BUDGET); } +static void mlx5e_tx_wi_kfree_fifo_skbs(struct mlx5e_txqsq *sq, struct mlx5e_tx_wqe_info *wi) +{ + int i; + + for (i = 0; i < wi->num_fifo_pkts; i++) + dev_kfree_skb_any(mlx5e_skb_fifo_pop(sq)); +} + void mlx5e_free_txqsq_descs(struct mlx5e_txqsq *sq) { struct mlx5e_tx_wqe_info *wi; u32 dma_fifo_cc, nbytes = 0; u16 ci, sqcc, npkts = 0; - struct sk_buff *skb; - int i; sqcc = sq->cc; dma_fifo_cc = sq->dma_fifo_cc; @@ -539,25 +817,28 @@ void 
mlx5e_free_txqsq_descs(struct mlx5e_txqsq *sq) while (sqcc != sq->pc) { ci = mlx5_wq_cyc_ctr2ix(&sq->wq, sqcc); wi = &sq->db.wqe_info[ci]; - skb = wi->skb; - if (!skb) { - mlx5e_ktls_tx_handle_resync_dump_comp(sq, wi, &dma_fifo_cc); - sqcc += wi->num_wqebbs; + sqcc += wi->num_wqebbs; + + if (likely(wi->skb)) { + mlx5e_tx_wi_dma_unmap(sq, wi, &dma_fifo_cc); + dev_kfree_skb_any(wi->skb); + + npkts++; + nbytes += wi->num_bytes; continue; } - for (i = 0; i < wi->num_dma; i++) { - struct mlx5e_sq_dma *dma = - mlx5e_dma_get(sq, dma_fifo_cc++); + if (unlikely(mlx5e_ktls_tx_try_handle_resync_dump_comp(sq, wi, &dma_fifo_cc))) + continue; - mlx5e_tx_dma_unmap(sq->pdev, dma); - } + if (wi->num_fifo_pkts) { + mlx5e_tx_wi_dma_unmap(sq, wi, &dma_fifo_cc); + mlx5e_tx_wi_kfree_fifo_skbs(sq, wi); - dev_kfree_skb_any(skb); - npkts++; - nbytes += wi->num_bytes; - sqcc += wi->num_wqebbs; + npkts += wi->num_fifo_pkts; + nbytes += wi->num_bytes; + } } sq->dma_fifo_cc = dma_fifo_cc; @@ -576,9 +857,34 @@ mlx5i_txwqe_build_datagram(struct mlx5_av *av, u32 dqpn, u32 dqkey, dseg->av.key.qkey.qkey = cpu_to_be32(dqkey); } +static void mlx5i_sq_calc_wqe_attr(struct sk_buff *skb, + const struct mlx5e_tx_attr *attr, + struct mlx5e_tx_wqe_attr *wqe_attr) +{ + u16 ds_cnt = sizeof(struct mlx5i_tx_wqe) / MLX5_SEND_WQE_DS; + u16 ds_cnt_inl = 0; + + ds_cnt += !!attr->headlen + skb_shinfo(skb)->nr_frags; + + if (attr->ihs) { + u16 inl = attr->ihs - INL_HDR_START_SZ; + + ds_cnt_inl = DIV_ROUND_UP(inl, MLX5_SEND_WQE_DS); + ds_cnt += ds_cnt_inl; + } + + *wqe_attr = (struct mlx5e_tx_wqe_attr) { + .ds_cnt = ds_cnt, + .ds_cnt_inl = ds_cnt_inl, + .num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS), + }; +} + void mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb, struct mlx5_av *av, u32 dqpn, u32 dqkey, bool xmit_more) { + struct mlx5e_tx_wqe_attr wqe_attr; + struct mlx5e_tx_attr attr; struct mlx5i_tx_wqe *wqe; struct mlx5_wqe_datagram_seg *datagram; @@ -588,47 +894,17 @@ void 
mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb, struct mlx5e_tx_wqe_info *wi; struct mlx5e_sq_stats *stats = sq->stats; - u16 ds_cnt, ds_cnt_inl = 0; - u8 num_wqebbs, opcode; - u16 headlen, ihs, pi; - u32 num_bytes; int num_dma; - __be16 mss; + u16 pi; - /* Calc ihs and ds cnt, no writes to wqe yet */ - ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS; - if (skb_is_gso(skb)) { - opcode = MLX5_OPCODE_LSO; - mss = cpu_to_be16(skb_shinfo(skb)->gso_size); - ihs = mlx5e_tx_get_gso_ihs(sq, skb); - num_bytes = skb->len + (skb_shinfo(skb)->gso_segs - 1) * ihs; - stats->packets += skb_shinfo(skb)->gso_segs; - } else { - u8 mode = mlx5e_tx_wqe_inline_mode(sq, NULL, skb); + mlx5e_sq_xmit_prepare(sq, skb, NULL, &attr); + mlx5i_sq_calc_wqe_attr(skb, &attr, &wqe_attr); - opcode = MLX5_OPCODE_SEND; - mss = 0; - ihs = mlx5e_calc_min_inline(mode, skb); - num_bytes = max_t(unsigned int, skb->len, ETH_ZLEN); - stats->packets++; - } + pi = mlx5e_txqsq_get_next_pi(sq, wqe_attr.num_wqebbs); + wqe = MLX5I_SQ_FETCH_WQE(sq, pi); - stats->bytes += num_bytes; stats->xmit_more += xmit_more; - headlen = skb->len - ihs - skb->data_len; - ds_cnt += !!headlen; - ds_cnt += skb_shinfo(skb)->nr_frags; - - if (ihs) { - ds_cnt_inl = DIV_ROUND_UP(ihs - INL_HDR_START_SZ, MLX5_SEND_WQE_DS); - ds_cnt += ds_cnt_inl; - } - - num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS); - pi = mlx5e_txqsq_get_next_pi(sq, num_wqebbs); - wqe = MLX5I_SQ_FETCH_WQE(sq, pi); - /* fill wqe */ wi = &sq->db.wqe_info[pi]; cseg = &wqe->ctrl; @@ -640,20 +916,20 @@ void mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb, mlx5e_txwqe_build_eseg_csum(sq, skb, eseg); - eseg->mss = mss; + eseg->mss = attr.mss; - if (ihs) { - memcpy(eseg->inline_hdr.start, skb->data, ihs); - eseg->inline_hdr.sz = cpu_to_be16(ihs); - dseg += ds_cnt_inl; + if (attr.ihs) { + memcpy(eseg->inline_hdr.start, skb->data, attr.ihs); + eseg->inline_hdr.sz = cpu_to_be16(attr.ihs); + dseg += wqe_attr.ds_cnt_inl; } - num_dma = 
mlx5e_txwqe_build_dsegs(sq, skb, skb->data + ihs, headlen, dseg); + num_dma = mlx5e_txwqe_build_dsegs(sq, skb, skb->data + attr.ihs, + attr.headlen, dseg); if (unlikely(num_dma < 0)) goto err_drop; - mlx5e_txwqe_complete(sq, skb, opcode, ds_cnt, num_wqebbs, num_bytes, - num_dma, wi, cseg, xmit_more); + mlx5e_txwqe_complete(sq, skb, &attr, &wqe_attr, num_dma, wi, cseg, xmit_more); return; diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c index de93cc6ebc1a..7e236c9ee4b1 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.c +++ b/drivers/net/ethernet/microchip/lan743x_main.c @@ -3038,7 +3038,6 @@ static int lan743x_pm_suspend(struct device *dev) struct pci_dev *pdev = to_pci_dev(dev); struct net_device *netdev = pci_get_drvdata(pdev); struct lan743x_adapter *adapter = netdev_priv(netdev); - int ret; lan743x_pcidev_shutdown(pdev); @@ -3051,9 +3050,7 @@ static int lan743x_pm_suspend(struct device *dev) lan743x_pm_set_wol(adapter); /* Host sets PME_En, put D3hot */ - ret = pci_prepare_to_sleep(pdev); - - return 0; + return pci_prepare_to_sleep(pdev);; } static int lan743x_pm_resume(struct device *dev) diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 8518e1d60da4..0445c5ee5551 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -413,26 +413,20 @@ void ocelot_port_disable(struct ocelot *ocelot, int port) } EXPORT_SYMBOL(ocelot_port_disable); -int ocelot_port_add_txtstamp_skb(struct ocelot_port *ocelot_port, - struct sk_buff *skb) +void ocelot_port_add_txtstamp_skb(struct ocelot *ocelot, int port, + struct sk_buff *clone) { - struct skb_shared_info *shinfo = skb_shinfo(skb); - struct ocelot *ocelot = ocelot_port->ocelot; + struct ocelot_port *ocelot_port = ocelot->ports[port]; - if (ocelot->ptp && shinfo->tx_flags & SKBTX_HW_TSTAMP && - ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) { - spin_lock(&ocelot_port->ts_id_lock); + 
spin_lock(&ocelot_port->ts_id_lock); - shinfo->tx_flags |= SKBTX_IN_PROGRESS; - /* Store timestamp ID in cb[0] of sk_buff */ - skb->cb[0] = ocelot_port->ts_id; - ocelot_port->ts_id = (ocelot_port->ts_id + 1) % 4; - skb_queue_tail(&ocelot_port->tx_skbs, skb); + skb_shinfo(clone)->tx_flags |= SKBTX_IN_PROGRESS; + /* Store timestamp ID in cb[0] of sk_buff */ + clone->cb[0] = ocelot_port->ts_id; + ocelot_port->ts_id = (ocelot_port->ts_id + 1) % 4; + skb_queue_tail(&ocelot_port->tx_skbs, clone); - spin_unlock(&ocelot_port->ts_id_lock); - return 0; - } - return -ENODATA; + spin_unlock(&ocelot_port->ts_id_lock); } EXPORT_SYMBOL(ocelot_port_add_txtstamp_skb); @@ -511,9 +505,7 @@ void ocelot_get_txtstamp(struct ocelot *ocelot) /* Set the timestamp into the skb */ memset(&shhwtstamps, 0, sizeof(shhwtstamps)); shhwtstamps.hwtstamp = ktime_set(ts.tv_sec, ts.tv_nsec); - skb_tstamp_tx(skb_match, &shhwtstamps); - - dev_kfree_skb_any(skb_match); + skb_complete_tx_timestamp(skb_match, &shhwtstamps); /* Next ts */ ocelot_write(ocelot, SYS_PTP_NXT_PTP_NXT, SYS_PTP_NXT); diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index 8490e42e9e2d..028a0150f97d 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -330,7 +330,6 @@ static int ocelot_port_xmit(struct sk_buff *skb, struct net_device *dev) u8 grp = 0; /* Send everything on CPU group 0 */ unsigned int i, count, last; int port = priv->chip_port; - bool do_tstamp; val = ocelot_read(ocelot, QS_INJ_STATUS); if (!(val & QS_INJ_STATUS_FIFO_RDY(BIT(grp))) || @@ -345,7 +344,23 @@ static int ocelot_port_xmit(struct sk_buff *skb, struct net_device *dev) info.vid = skb_vlan_tag_get(skb); /* Check if timestamping is needed */ - do_tstamp = (ocelot_port_add_txtstamp_skb(ocelot_port, skb) == 0); + if (ocelot->ptp && (shinfo->tx_flags & SKBTX_HW_TSTAMP)) { + info.rew_op = ocelot_port->ptp_cmd; + + if (ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) { + 
struct sk_buff *clone; + + clone = skb_clone_sk(skb); + if (!clone) { + kfree_skb(skb); + return NETDEV_TX_OK; + } + + ocelot_port_add_txtstamp_skb(ocelot, port, clone); + + info.rew_op |= clone->cb[0] << 3; + } + } if (ocelot->ptp && shinfo->tx_flags & SKBTX_HW_TSTAMP) { info.rew_op = ocelot_port->ptp_cmd; @@ -383,8 +398,7 @@ static int ocelot_port_xmit(struct sk_buff *skb, struct net_device *dev) dev->stats.tx_packets++; dev->stats.tx_bytes += skb->len; - if (!do_tstamp) - dev_kfree_skb_any(skb); + kfree_skb(skb); return NETDEV_TX_OK; } diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c index ac02369174a9..53851853562c 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c @@ -111,7 +111,9 @@ static int nfp_map_ptrs_record(struct nfp_app_bpf *bpf, struct nfp_prog *nfp_prog, struct bpf_prog *prog) { - int i, cnt, err; + int i, cnt, err = 0; + + mutex_lock(&prog->aux->used_maps_mutex); /* Quickly count the maps we will have to remember */ cnt = 0; @@ -119,13 +121,15 @@ nfp_map_ptrs_record(struct nfp_app_bpf *bpf, struct nfp_prog *nfp_prog, if (bpf_map_offload_neutral(prog->aux->used_maps[i])) cnt++; if (!cnt) - return 0; + goto out; nfp_prog->map_records = kmalloc_array(cnt, sizeof(nfp_prog->map_records[0]), GFP_KERNEL); - if (!nfp_prog->map_records) - return -ENOMEM; + if (!nfp_prog->map_records) { + err = -ENOMEM; + goto out; + } for (i = 0; i < prog->aux->used_map_cnt; i++) if (bpf_map_offload_neutral(prog->aux->used_maps[i])) { @@ -133,12 +137,14 @@ nfp_map_ptrs_record(struct nfp_app_bpf *bpf, struct nfp_prog *nfp_prog, prog->aux->used_maps[i]); if (err) { nfp_map_ptrs_forget(bpf, nfp_prog); - return err; + goto out; } } WARN_ON(cnt != nfp_prog->map_records_cnt); - return 0; +out: + mutex_unlock(&prog->aux->used_maps_mutex); + return err; } static int diff --git a/drivers/net/ethernet/realtek/8139cp.c 
b/drivers/net/ethernet/realtek/8139cp.c index e291e6ac40cb..4e44313b7651 100644 --- a/drivers/net/ethernet/realtek/8139cp.c +++ b/drivers/net/ethernet/realtek/8139cp.c @@ -1239,7 +1239,7 @@ static void cp_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct cp_private *cp = netdev_priv(dev); unsigned long flags; - int rc, i; + int i; netdev_warn(dev, "Transmit timeout, status %2x %4x %4x %4x\n", cpr8(Cmd), cpr16(CpCmd), @@ -1260,7 +1260,7 @@ static void cp_tx_timeout(struct net_device *dev, unsigned int txqueue) cp_stop_hw(cp); cp_clean_rings(cp); - rc = cp_init_rings(cp); + cp_init_rings(cp); cp_start_hw(cp); __cp_set_rx_mode(dev); cpw16_f(IntrMask, cp_norx_intr_mask); diff --git a/drivers/net/mdio/mdio-ipq4019.c b/drivers/net/mdio/mdio-ipq4019.c index 1ce81ff2f41d..25c25ea6da66 100644 --- a/drivers/net/mdio/mdio-ipq4019.c +++ b/drivers/net/mdio/mdio-ipq4019.c @@ -12,6 +12,7 @@ #include <linux/phy.h> #include <linux/platform_device.h> +#define MDIO_MODE_REG 0x40 #define MDIO_ADDR_REG 0x44 #define MDIO_DATA_WRITE_REG 0x48 #define MDIO_DATA_READ_REG 0x4c @@ -20,9 +21,15 @@ #define MDIO_CMD_ACCESS_START BIT(8) #define MDIO_CMD_ACCESS_CODE_READ 0 #define MDIO_CMD_ACCESS_CODE_WRITE 1 +#define MDIO_CMD_ACCESS_CODE_C45_ADDR 0 +#define MDIO_CMD_ACCESS_CODE_C45_WRITE 1 +#define MDIO_CMD_ACCESS_CODE_C45_READ 2 -#define ipq4019_MDIO_TIMEOUT 10000 -#define ipq4019_MDIO_SLEEP 10 +/* 0 = Clause 22, 1 = Clause 45 */ +#define MDIO_MODE_C45 BIT(8) + +#define IPQ4019_MDIO_TIMEOUT 10000 +#define IPQ4019_MDIO_SLEEP 10 struct ipq4019_mdio_data { void __iomem *membase; @@ -35,25 +42,50 @@ static int ipq4019_mdio_wait_busy(struct mii_bus *bus) return readl_poll_timeout(priv->membase + MDIO_CMD_REG, busy, (busy & MDIO_CMD_ACCESS_BUSY) == 0, - ipq4019_MDIO_SLEEP, ipq4019_MDIO_TIMEOUT); + IPQ4019_MDIO_SLEEP, IPQ4019_MDIO_TIMEOUT); } static int ipq4019_mdio_read(struct mii_bus *bus, int mii_id, int regnum) { struct ipq4019_mdio_data *priv = bus->priv; + unsigned int data; 
unsigned int cmd; - /* Reject clause 45 */ - if (regnum & MII_ADDR_C45) - return -EOPNOTSUPP; - if (ipq4019_mdio_wait_busy(bus)) return -ETIMEDOUT; - /* issue the phy address and reg */ - writel((mii_id << 8) | regnum, priv->membase + MDIO_ADDR_REG); + /* Clause 45 support */ + if (regnum & MII_ADDR_C45) { + unsigned int mmd = (regnum >> 16) & 0x1F; + unsigned int reg = regnum & 0xFFFF; + + /* Enter Clause 45 mode */ + data = readl(priv->membase + MDIO_MODE_REG); + + data |= MDIO_MODE_C45; + + writel(data, priv->membase + MDIO_MODE_REG); + + /* issue the phy address and mmd */ + writel((mii_id << 8) | mmd, priv->membase + MDIO_ADDR_REG); + + /* issue reg */ + writel(reg, priv->membase + MDIO_DATA_WRITE_REG); + + cmd = MDIO_CMD_ACCESS_START | MDIO_CMD_ACCESS_CODE_C45_ADDR; + } else { + /* Enter Clause 22 mode */ + data = readl(priv->membase + MDIO_MODE_REG); - cmd = MDIO_CMD_ACCESS_START | MDIO_CMD_ACCESS_CODE_READ; + data &= ~MDIO_MODE_C45; + + writel(data, priv->membase + MDIO_MODE_REG); + + /* issue the phy address and reg */ + writel((mii_id << 8) | regnum, priv->membase + MDIO_ADDR_REG); + + cmd = MDIO_CMD_ACCESS_START | MDIO_CMD_ACCESS_CODE_READ; + } /* issue read command */ writel(cmd, priv->membase + MDIO_CMD_REG); @@ -62,6 +94,15 @@ static int ipq4019_mdio_read(struct mii_bus *bus, int mii_id, int regnum) if (ipq4019_mdio_wait_busy(bus)) return -ETIMEDOUT; + if (regnum & MII_ADDR_C45) { + cmd = MDIO_CMD_ACCESS_START | MDIO_CMD_ACCESS_CODE_C45_READ; + + writel(cmd, priv->membase + MDIO_CMD_REG); + + if (ipq4019_mdio_wait_busy(bus)) + return -ETIMEDOUT; + } + /* Read and return data */ return readl(priv->membase + MDIO_DATA_READ_REG); } @@ -70,23 +111,57 @@ static int ipq4019_mdio_write(struct mii_bus *bus, int mii_id, int regnum, u16 value) { struct ipq4019_mdio_data *priv = bus->priv; + unsigned int data; unsigned int cmd; - /* Reject clause 45 */ - if (regnum & MII_ADDR_C45) - return -EOPNOTSUPP; - if (ipq4019_mdio_wait_busy(bus)) return -ETIMEDOUT; - /* 
issue the phy address and reg */ - writel((mii_id << 8) | regnum, priv->membase + MDIO_ADDR_REG); + /* Clause 45 support */ + if (regnum & MII_ADDR_C45) { + unsigned int mmd = (regnum >> 16) & 0x1F; + unsigned int reg = regnum & 0xFFFF; + + /* Enter Clause 45 mode */ + data = readl(priv->membase + MDIO_MODE_REG); + + data |= MDIO_MODE_C45; + + writel(data, priv->membase + MDIO_MODE_REG); + + /* issue the phy address and mmd */ + writel((mii_id << 8) | mmd, priv->membase + MDIO_ADDR_REG); + + /* issue reg */ + writel(reg, priv->membase + MDIO_DATA_WRITE_REG); + + cmd = MDIO_CMD_ACCESS_START | MDIO_CMD_ACCESS_CODE_C45_ADDR; + + writel(cmd, priv->membase + MDIO_CMD_REG); + + if (ipq4019_mdio_wait_busy(bus)) + return -ETIMEDOUT; + } else { + /* Enter Clause 22 mode */ + data = readl(priv->membase + MDIO_MODE_REG); + + data &= ~MDIO_MODE_C45; + + writel(data, priv->membase + MDIO_MODE_REG); + + /* issue the phy address and reg */ + writel((mii_id << 8) | regnum, priv->membase + MDIO_ADDR_REG); + } /* issue write data */ writel(value, priv->membase + MDIO_DATA_WRITE_REG); - cmd = MDIO_CMD_ACCESS_START | MDIO_CMD_ACCESS_CODE_WRITE; /* issue write command */ + if (regnum & MII_ADDR_C45) + cmd = MDIO_CMD_ACCESS_START | MDIO_CMD_ACCESS_CODE_C45_WRITE; + else + cmd = MDIO_CMD_ACCESS_START | MDIO_CMD_ACCESS_CODE_WRITE; + writel(cmd, priv->membase + MDIO_CMD_REG); /* Wait write complete */ diff --git a/drivers/net/pcs/pcs-lynx.c b/drivers/net/pcs/pcs-lynx.c index c43d97682083..62bb9272dcb2 100644 --- a/drivers/net/pcs/pcs-lynx.c +++ b/drivers/net/pcs/pcs-lynx.c @@ -93,6 +93,9 @@ static void lynx_pcs_get_state(struct phylink_pcs *pcs, case PHY_INTERFACE_MODE_USXGMII: lynx_pcs_get_state_usxgmii(lynx->mdio, state); break; + case PHY_INTERFACE_MODE_10GBASER: + phylink_mii_c45_pcs_get_state(lynx->mdio, state); + break; default: break; } @@ -172,6 +175,9 @@ static int lynx_pcs_config(struct phylink_pcs *pcs, unsigned int mode, break; case PHY_INTERFACE_MODE_USXGMII: return 
lynx_pcs_config_usxgmii(lynx->mdio, mode, advertising); + case PHY_INTERFACE_MODE_10GBASER: + /* Nothing to do here for 10GBASER */ + break; default: return -EOPNOTSUPP; } diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c index de5b869139d7..8d333d3084ed 100644 --- a/drivers/net/phy/phy-core.c +++ b/drivers/net/phy/phy-core.c @@ -6,6 +6,11 @@ #include <linux/phy.h> #include <linux/of.h> +/** + * phy_speed_to_str - Return a string representing the PHY link speed + * + * @speed: Speed of the link + */ const char *phy_speed_to_str(int speed) { BUILD_BUG_ON_MSG(__ETHTOOL_LINK_MODE_MASK_NBITS != 92, @@ -52,6 +57,11 @@ const char *phy_speed_to_str(int speed) } EXPORT_SYMBOL_GPL(phy_speed_to_str); +/** + * phy_duplex_to_str - Return string describing the duplex + * + * @duplex: Duplex setting to describe + */ const char *phy_duplex_to_str(unsigned int duplex) { if (duplex == DUPLEX_HALF) @@ -252,6 +262,16 @@ static int __set_phy_supported(struct phy_device *phydev, u32 max_speed) return __set_linkmode_max_speed(max_speed, phydev->supported); } +/** + * phy_set_max_speed - Set the maximum speed the PHY should support + * + * @phydev: The phy_device struct + * @max_speed: Maximum speed + * + * The PHY might be more capable than the MAC. For example, a Fast Ethernet + * MAC is connected to a 1G PHY. This function allows the MAC to indicate its + * maximum speed, and so limit what the PHY will advertise. + */ int phy_set_max_speed(struct phy_device *phydev, u32 max_speed) { int err; @@ -308,6 +328,16 @@ void of_set_phy_eee_broken(struct phy_device *phydev) phydev->eee_broken_modes = broken; } +/** + * phy_resolve_aneg_pause - Determine pause autoneg results + * + * @phydev: The phy_device struct + * + * Once autoneg has completed, the local pause settings can be + * resolved. Determine if pause and asymmetric pause should be used + * by the MAC. 
+ */ + void phy_resolve_aneg_pause(struct phy_device *phydev) { if (phydev->duplex == DUPLEX_FULL) { @@ -321,7 +351,7 @@ void phy_resolve_aneg_pause(struct phy_device *phydev) EXPORT_SYMBOL_GPL(phy_resolve_aneg_pause); /** - * phy_resolve_aneg_linkmode - resolve the advertisements into phy settings + * phy_resolve_aneg_linkmode - resolve the advertisements into PHY settings * @phydev: The phy_device struct * * Resolve our and the link partner advertisements into their corresponding diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 8947d58f2a25..35525a671400 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -456,7 +456,16 @@ int phy_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) } EXPORT_SYMBOL(phy_do_ioctl); -/* same as phy_do_ioctl, but ensures that net_device is running */ +/** + * phy_do_ioctl_running - generic ndo_do_ioctl implementation but test first + * + * @dev: the net_device struct + * @ifr: &struct ifreq for socket ioctl's + * @cmd: ioctl cmd to execute + * + * Same as phy_do_ioctl, but ensures that net_device is running before + * handling the ioctl. 
+ */ int phy_do_ioctl_running(struct net_device *dev, struct ifreq *ifr, int cmd) { if (!netif_running(dev)) @@ -466,6 +475,12 @@ int phy_do_ioctl_running(struct net_device *dev, struct ifreq *ifr, int cmd) } EXPORT_SYMBOL(phy_do_ioctl_running); +/** + * phy_queue_state_machine - Trigger the state machine to run soon + * + * @phydev: the phy_device struct + * @jiffies: Run the state machine after these jiffies + */ void phy_queue_state_machine(struct phy_device *phydev, unsigned long jiffies) { mod_delayed_work(system_power_efficient_wq, &phydev->state_queue, @@ -473,6 +488,11 @@ void phy_queue_state_machine(struct phy_device *phydev, unsigned long jiffies) } EXPORT_SYMBOL(phy_queue_state_machine); +/** + * phy_trigger_machine - Trigger the state machine to run now + * + * @phydev: the phy_device struct + */ static void phy_trigger_machine(struct phy_device *phydev) { phy_queue_state_machine(phydev, 0); @@ -489,6 +509,12 @@ static void phy_abort_cable_test(struct phy_device *phydev) phydev_err(phydev, "Error while aborting cable test"); } +/** + * phy_ethtool_get_strings - Get the statistic counter names + * + * @phydev: the phy_device struct + * @data: Where to put the strings + */ int phy_ethtool_get_strings(struct phy_device *phydev, u8 *data) { if (!phydev->drv) @@ -502,6 +528,11 @@ int phy_ethtool_get_strings(struct phy_device *phydev, u8 *data) } EXPORT_SYMBOL(phy_ethtool_get_strings); +/** + * phy_ethtool_get_sset_count - Get the number of statistic counters + * + * @phydev: the phy_device struct + */ int phy_ethtool_get_sset_count(struct phy_device *phydev) { int ret; @@ -523,6 +554,13 @@ int phy_ethtool_get_sset_count(struct phy_device *phydev) } EXPORT_SYMBOL(phy_ethtool_get_sset_count); +/** + * phy_ethtool_get_stats - Get the statistic counters + * + * @phydev: the phy_device struct + * @stats: What counters to get + * @data: Where to store the counters + */ int phy_ethtool_get_stats(struct phy_device *phydev, struct ethtool_stats *stats, u64 *data) 
{ @@ -537,6 +575,12 @@ int phy_ethtool_get_stats(struct phy_device *phydev, } EXPORT_SYMBOL(phy_ethtool_get_stats); +/** + * phy_start_cable_test - Start a cable test + * + * @phydev: the phy_device struct + * @extack: extack for reporting useful error messages + */ int phy_start_cable_test(struct phy_device *phydev, struct netlink_ext_ack *extack) { @@ -600,6 +644,13 @@ out: } EXPORT_SYMBOL(phy_start_cable_test); +/** + * phy_start_cable_test_tdr - Start a raw TDR cable test + * + * @phydev: the phy_device struct + * @extack: extack for reporting useful error messages + * @config: Configuration of the test to run + */ int phy_start_cable_test_tdr(struct phy_device *phydev, struct netlink_ext_ack *extack, const struct phy_tdr_config *config) @@ -1363,6 +1414,12 @@ int phy_ethtool_set_eee(struct phy_device *phydev, struct ethtool_eee *data) } EXPORT_SYMBOL(phy_ethtool_set_eee); +/** + * phy_ethtool_set_wol - Configure Wake On LAN + * + * @phydev: target phy_device struct + * @wol: Configuration requested + */ int phy_ethtool_set_wol(struct phy_device *phydev, struct ethtool_wolinfo *wol) { if (phydev->drv && phydev->drv->set_wol) @@ -1372,6 +1429,12 @@ int phy_ethtool_set_wol(struct phy_device *phydev, struct ethtool_wolinfo *wol) } EXPORT_SYMBOL(phy_ethtool_set_wol); +/** + * phy_ethtool_get_wol - Get the current Wake On LAN configuration + * + * @phydev: target phy_device struct + * @wol: Store the current configuration here + */ void phy_ethtool_get_wol(struct phy_device *phydev, struct ethtool_wolinfo *wol) { if (phydev->drv && phydev->drv->get_wol) @@ -1405,6 +1468,10 @@ int phy_ethtool_set_link_ksettings(struct net_device *ndev, } EXPORT_SYMBOL(phy_ethtool_set_link_ksettings); +/** + * phy_ethtool_nway_reset - Restart auto negotiation + * @ndev: Network device to restart autoneg for + */ int phy_ethtool_nway_reset(struct net_device *ndev) { struct phy_device *phydev = ndev->phydev; diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c index 
cb32d7ef4938..4daf94bb56a5 100644 --- a/drivers/of/of_mdio.c +++ b/drivers/of/of_mdio.c @@ -338,6 +338,29 @@ unregister: EXPORT_SYMBOL(of_mdiobus_register); /** + * of_mdio_find_device - Given a device tree node, find the mdio_device + * @np: pointer to the mdio_device's device tree node + * + * If successful, returns a pointer to the mdio_device with the embedded + * struct device refcount incremented by one, or NULL on failure. + * The caller should call put_device() on the mdio_device after its use + */ +struct mdio_device *of_mdio_find_device(struct device_node *np) +{ + struct device *d; + + if (!np) + return NULL; + + d = bus_find_device_by_of_node(&mdio_bus_type, np); + if (!d) + return NULL; + + return to_mdio_device(d); +} +EXPORT_SYMBOL(of_mdio_find_device); + +/** * of_phy_find_device - Give a PHY node, find the phy_device * @phy_np: Pointer to the phy's device tree node * @@ -346,19 +369,16 @@ EXPORT_SYMBOL(of_mdiobus_register); */ struct phy_device *of_phy_find_device(struct device_node *phy_np) { - struct device *d; struct mdio_device *mdiodev; - if (!phy_np) + mdiodev = of_mdio_find_device(phy_np); + if (!mdiodev) return NULL; - d = bus_find_device_by_of_node(&mdio_bus_type, phy_np); - if (d) { - mdiodev = to_mdio_device(d); - if (mdiodev->flags & MDIO_DEVICE_FLAG_PHY) - return to_phy_device(d); - put_device(d); - } + if (mdiodev->flags & MDIO_DEVICE_FLAG_PHY) + return to_phy_device(&mdiodev->dev); + + put_device(&mdiodev->dev); return NULL; } diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h index 2c14012ca35d..f321eabefbe4 100644 --- a/drivers/s390/net/qeth_core.h +++ b/drivers/s390/net/qeth_core.h @@ -195,8 +195,8 @@ struct qeth_vnicc_info { #define QETH_IN_BUF_SIZE_DEFAULT 65536 #define QETH_IN_BUF_COUNT_DEFAULT 64 #define QETH_IN_BUF_COUNT_HSDEFAULT 128 -#define QETH_IN_BUF_COUNT_MIN 8 -#define QETH_IN_BUF_COUNT_MAX 128 +#define QETH_IN_BUF_COUNT_MIN 8U +#define QETH_IN_BUF_COUNT_MAX 128U #define 
QETH_MAX_BUFFER_ELEMENTS(card) ((card)->qdio.in_buf_size >> 12) #define QETH_IN_BUF_REQUEUE_THRESHOLD(card) \ ((card)->qdio.in_buf_pool.buf_count / 2) @@ -753,7 +753,7 @@ struct qeth_discipline { const struct device_type *devtype; int (*setup) (struct ccwgroup_device *); void (*remove) (struct ccwgroup_device *); - int (*set_online)(struct qeth_card *card); + int (*set_online)(struct qeth_card *card, bool carrier_ok); void (*set_offline)(struct qeth_card *card); int (*do_ioctl)(struct net_device *dev, struct ifreq *rq, int cmd); int (*control_event_handler)(struct qeth_card *card, @@ -814,12 +814,16 @@ struct qeth_card { struct workqueue_struct *event_wq; struct workqueue_struct *cmd_wq; wait_queue_head_t wait_q; + + struct mutex ip_lock; + /* protected by ip_lock: */ DECLARE_HASHTABLE(ip_htable, 4); + struct qeth_ipato ipato; + DECLARE_HASHTABLE(local_addrs4, 4); DECLARE_HASHTABLE(local_addrs6, 4); spinlock_t local_addrs4_lock; spinlock_t local_addrs6_lock; - struct mutex ip_lock; DECLARE_HASHTABLE(rx_mode_addrs, 4); struct work_struct rx_mode_work; struct work_struct kernel_thread_starter; @@ -827,7 +831,6 @@ struct qeth_card { unsigned long thread_start_mask; unsigned long thread_allowed_mask; unsigned long thread_running_mask; - struct qeth_ipato ipato; struct list_head cmd_waiter_list; /* QDIO buffer handling */ struct qeth_qdio_info qdio; @@ -1034,11 +1037,8 @@ struct net_device *qeth_clone_netdev(struct net_device *orig); struct qeth_card *qeth_get_card_by_busid(char *bus_id); void qeth_set_allowed_threads(struct qeth_card *, unsigned long , int); int qeth_threads_running(struct qeth_card *, unsigned long); -int qeth_core_hardsetup_card(struct qeth_card *card, bool *carrier_ok); -int qeth_stop_channel(struct qeth_channel *channel); int qeth_set_offline(struct qeth_card *card, bool resetting); -void qeth_print_status_message(struct qeth_card *); int qeth_send_ipa_cmd(struct qeth_card *, struct qeth_cmd_buffer *, int (*reply_cb) (struct qeth_card *, struct 
qeth_reply *, unsigned long), @@ -1062,12 +1062,7 @@ void qeth_notify_cmd(struct qeth_cmd_buffer *iob, int reason); void qeth_put_cmd(struct qeth_cmd_buffer *iob); int qeth_schedule_recovery(struct qeth_card *card); -void qeth_flush_local_addrs(struct qeth_card *card); int qeth_poll(struct napi_struct *napi, int budget); -void qeth_clear_ipacmd_list(struct qeth_card *); -int qeth_qdio_clear_card(struct qeth_card *, int); -void qeth_clear_working_pool_list(struct qeth_card *); -void qeth_drain_output_queues(struct qeth_card *card); void qeth_setadp_promisc_mode(struct qeth_card *card, bool enable); int qeth_setadpparms_change_macaddr(struct qeth_card *); void qeth_tx_timeout(struct net_device *, unsigned int txqueue); @@ -1091,7 +1086,6 @@ int qeth_do_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); void qeth_dbf_longtext(debug_info_t *id, int level, char *text, ...); int qeth_configure_cq(struct qeth_card *, enum qeth_cq); int qeth_hw_trap(struct qeth_card *, enum qeth_diags_trap_action); -void qeth_trace_features(struct qeth_card *); int qeth_setassparms_cb(struct qeth_card *, struct qeth_reply *, unsigned long); int qeth_setup_netdev(struct qeth_card *card); int qeth_set_features(struct net_device *, netdev_features_t); diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index 7cd0cbf8a4f0..fc2c3db9259f 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -201,7 +201,7 @@ int qeth_threads_running(struct qeth_card *card, unsigned long threads) } EXPORT_SYMBOL_GPL(qeth_threads_running); -void qeth_clear_working_pool_list(struct qeth_card *card) +static void qeth_clear_working_pool_list(struct qeth_card *card) { struct qeth_buffer_pool_entry *pool_entry, *tmp; struct qeth_qdio_q *queue = card->qdio.in_q; @@ -216,7 +216,6 @@ void qeth_clear_working_pool_list(struct qeth_card *card) for (i = 0; i < ARRAY_SIZE(queue->bufs); i++) queue->bufs[i].pool_entry = NULL; } 
-EXPORT_SYMBOL_GPL(qeth_clear_working_pool_list); static void qeth_free_pool_entry(struct qeth_buffer_pool_entry *entry) { @@ -658,12 +657,11 @@ static void qeth_flush_local_addrs6(struct qeth_card *card) spin_unlock_irq(&card->local_addrs6_lock); } -void qeth_flush_local_addrs(struct qeth_card *card) +static void qeth_flush_local_addrs(struct qeth_card *card) { qeth_flush_local_addrs4(card); qeth_flush_local_addrs6(card); } -EXPORT_SYMBOL_GPL(qeth_flush_local_addrs); static void qeth_add_local_addrs4(struct qeth_card *card, struct qeth_ipacmd_local_addrs4 *cmd) @@ -965,7 +963,7 @@ static struct qeth_ipa_cmd *qeth_check_ipa_data(struct qeth_card *card, } } -void qeth_clear_ipacmd_list(struct qeth_card *card) +static void qeth_clear_ipacmd_list(struct qeth_card *card) { struct qeth_cmd_buffer *iob; unsigned long flags; @@ -977,7 +975,6 @@ void qeth_clear_ipacmd_list(struct qeth_card *card) qeth_notify_cmd(iob, -ECANCELED); spin_unlock_irqrestore(&card->lock, flags); } -EXPORT_SYMBOL_GPL(qeth_clear_ipacmd_list); static int qeth_check_idx_response(struct qeth_card *card, unsigned char *buffer) @@ -1502,7 +1499,7 @@ static void qeth_drain_output_queue(struct qeth_qdio_out_q *q, bool free) } } -void qeth_drain_output_queues(struct qeth_card *card) +static void qeth_drain_output_queues(struct qeth_card *card) { int i; @@ -1513,7 +1510,6 @@ void qeth_drain_output_queues(struct qeth_card *card) qeth_drain_output_queue(card->qdio.out_qs[i], false); } } -EXPORT_SYMBOL_GPL(qeth_drain_output_queues); static int qeth_osa_set_output_queues(struct qeth_card *card, bool single) { @@ -1754,7 +1750,7 @@ static int qeth_halt_channel(struct qeth_card *card, return 0; } -int qeth_stop_channel(struct qeth_channel *channel) +static int qeth_stop_channel(struct qeth_channel *channel) { struct ccw_device *cdev = channel->ccwdev; int rc; @@ -1772,7 +1768,6 @@ int qeth_stop_channel(struct qeth_channel *channel) return rc; } -EXPORT_SYMBOL_GPL(qeth_stop_channel); static int 
qeth_start_channel(struct qeth_channel *channel) { @@ -1842,7 +1837,7 @@ static int qeth_clear_halt_card(struct qeth_card *card, int halt) return qeth_clear_channels(card); } -int qeth_qdio_clear_card(struct qeth_card *card, int use_halt) +static int qeth_qdio_clear_card(struct qeth_card *card, int use_halt) { int rc = 0; @@ -1870,7 +1865,6 @@ int qeth_qdio_clear_card(struct qeth_card *card, int use_halt) QETH_CARD_TEXT_(card, 3, "2err%d", rc); return rc; } -EXPORT_SYMBOL_GPL(qeth_qdio_clear_card); static enum qeth_discipline_id qeth_vm_detect_layer(struct qeth_card *card) { @@ -2867,7 +2861,7 @@ static int qeth_mpc_initialize(struct qeth_card *card) return 0; } -void qeth_print_status_message(struct qeth_card *card) +static void qeth_print_status_message(struct qeth_card *card) { switch (card->info.type) { case QETH_CARD_TYPE_OSD: @@ -2908,7 +2902,6 @@ void qeth_print_status_message(struct qeth_card *card) (card->info.mcl_level[0]) ? ")" : "", qeth_get_cardname_short(card)); } -EXPORT_SYMBOL_GPL(qeth_print_status_message); static void qeth_initialize_working_pool_list(struct qeth_card *card) { @@ -5124,7 +5117,7 @@ static void qeth_core_free_card(struct qeth_card *card) kfree(card); } -void qeth_trace_features(struct qeth_card *card) +static void qeth_trace_features(struct qeth_card *card) { QETH_CARD_TEXT(card, 2, "features"); QETH_CARD_HEX(card, 2, &card->options.ipa4, sizeof(card->options.ipa4)); @@ -5133,7 +5126,6 @@ void qeth_trace_features(struct qeth_card *card) QETH_CARD_HEX(card, 2, &card->info.diagass_support, sizeof(card->info.diagass_support)); } -EXPORT_SYMBOL_GPL(qeth_trace_features); static struct ccw_device_id qeth_ids[] = { {CCW_DEVICE_DEVTYPE(0x1731, 0x01, 0x1732, 0x01), @@ -5164,7 +5156,7 @@ static struct ccw_driver qeth_ccw_driver = { .remove = ccwgroup_remove_ccwdev, }; -int qeth_core_hardsetup_card(struct qeth_card *card, bool *carrier_ok) +static int qeth_hardsetup_card(struct qeth_card *card, bool *carrier_ok) { int retries = 3; int rc; @@ 
-5278,6 +5270,8 @@ retriable: QETH_CARD_TEXT_(card, 2, "8err%d", rc); } + qeth_trace_features(card); + if (!qeth_is_diagass_supported(card, QETH_DIAGS_CMD_TRAP) || (card->info.hwtrap && qeth_hw_trap(card, QETH_DIAGS_TRAP_ARM))) card->info.hwtrap = 0; @@ -5303,21 +5297,49 @@ out: CARD_DEVID(card), rc); return rc; } -EXPORT_SYMBOL_GPL(qeth_core_hardsetup_card); static int qeth_set_online(struct qeth_card *card) { + bool carrier_ok; int rc; mutex_lock(&card->discipline_mutex); mutex_lock(&card->conf_mutex); QETH_CARD_TEXT(card, 2, "setonlin"); - rc = card->discipline->set_online(card); + rc = qeth_hardsetup_card(card, &carrier_ok); + if (rc) { + QETH_CARD_TEXT_(card, 2, "2err%04x", rc); + rc = -ENODEV; + goto err_hardsetup; + } + + qeth_print_status_message(card); + + rc = card->discipline->set_online(card, carrier_ok); + if (rc) + goto err_online; + + /* let user_space know that device is online */ + kobject_uevent(&card->gdev->dev.kobj, KOBJ_CHANGE); mutex_unlock(&card->conf_mutex); mutex_unlock(&card->discipline_mutex); + return 0; +err_online: +err_hardsetup: + qeth_qdio_clear_card(card, 0); + qeth_clear_working_pool_list(card); + qeth_flush_local_addrs(card); + + qeth_stop_channel(&card->data); + qeth_stop_channel(&card->write); + qeth_stop_channel(&card->read); + qdio_free(CARD_DDEV(card)); + + mutex_unlock(&card->conf_mutex); + mutex_unlock(&card->discipline_mutex); return rc; } @@ -5334,6 +5356,9 @@ int qeth_set_offline(struct qeth_card *card, bool resetting) card->info.hwtrap = 1; } + /* cancel any stalled cmd that might block the rtnl: */ + qeth_clear_ipacmd_list(card); + rtnl_lock(); card->info.open_when_online = card->dev->flags & IFF_UP; dev_close(card->dev); @@ -5341,8 +5366,16 @@ int qeth_set_offline(struct qeth_card *card, bool resetting) netif_carrier_off(card->dev); rtnl_unlock(); + cancel_work_sync(&card->rx_mode_work); + card->discipline->set_offline(card); + qeth_qdio_clear_card(card, 0); + qeth_drain_output_queues(card); + 
qeth_clear_working_pool_list(card); + qeth_flush_local_addrs(card); + card->info.promisc_mode = 0; + rc = qeth_stop_channel(&card->data); rc2 = qeth_stop_channel(&card->write); rc3 = qeth_stop_channel(&card->read); diff --git a/drivers/s390/net/qeth_core_sys.c b/drivers/s390/net/qeth_core_sys.c index 8def82336f53..74c70364edc1 100644 --- a/drivers/s390/net/qeth_core_sys.c +++ b/drivers/s390/net/qeth_core_sys.c @@ -103,21 +103,21 @@ static ssize_t qeth_dev_portno_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct qeth_card *card = dev_get_drvdata(dev); - char *tmp; unsigned int portno, limit; int rc = 0; + rc = kstrtouint(buf, 16, &portno); + if (rc) + return rc; + if (portno > QETH_MAX_PORTNO) + return -EINVAL; + mutex_lock(&card->conf_mutex); if (card->state != CARD_STATE_DOWN) { rc = -EPERM; goto out; } - portno = simple_strtoul(buf, &tmp, 16); - if (portno > QETH_MAX_PORTNO) { - rc = -EINVAL; - goto out; - } limit = (card->ssqd.pcnt ? card->ssqd.pcnt - 1 : card->ssqd.pcnt); if (portno > limit) { rc = -EINVAL; @@ -248,19 +248,19 @@ static ssize_t qeth_dev_bufcnt_store(struct device *dev, { struct qeth_card *card = dev_get_drvdata(dev); unsigned int cnt; - char *tmp; int rc = 0; + rc = kstrtouint(buf, 10, &cnt); + if (rc) + return rc; + mutex_lock(&card->conf_mutex); if (card->state != CARD_STATE_DOWN) { rc = -EPERM; goto out; } - cnt = simple_strtoul(buf, &tmp, 10); - cnt = (cnt < QETH_IN_BUF_COUNT_MIN) ? QETH_IN_BUF_COUNT_MIN : - ((cnt > QETH_IN_BUF_COUNT_MAX) ? 
QETH_IN_BUF_COUNT_MAX : cnt); - + cnt = clamp(cnt, QETH_IN_BUF_COUNT_MIN, QETH_IN_BUF_COUNT_MAX); rc = qeth_resize_buffer_pool(card, cnt); out: @@ -341,18 +341,15 @@ static ssize_t qeth_dev_layer2_store(struct device *dev, { struct qeth_card *card = dev_get_drvdata(dev); struct net_device *ndev; - char *tmp; - int i, rc = 0; enum qeth_discipline_id newdis; + unsigned int input; + int rc; - mutex_lock(&card->discipline_mutex); - if (card->state != CARD_STATE_DOWN) { - rc = -EPERM; - goto out; - } + rc = kstrtouint(buf, 16, &input); + if (rc) + return rc; - i = simple_strtoul(buf, &tmp, 16); - switch (i) { + switch (input) { case 0: newdis = QETH_DISCIPLINE_LAYER3; break; @@ -360,7 +357,12 @@ static ssize_t qeth_dev_layer2_store(struct device *dev, newdis = QETH_DISCIPLINE_LAYER2; break; default: - rc = -EINVAL; + return -EINVAL; + } + + mutex_lock(&card->discipline_mutex); + if (card->state != CARD_STATE_DOWN) { + rc = -EPERM; goto out; } @@ -551,20 +553,21 @@ static DEVICE_ATTR(hw_trap, 0644, qeth_hw_trap_show, static ssize_t qeth_dev_blkt_store(struct qeth_card *card, const char *buf, size_t count, int *value, int max_value) { - char *tmp; - int i, rc = 0; + unsigned int input; + int rc; + + rc = kstrtouint(buf, 10, &input); + if (rc) + return rc; + + if (input > max_value) + return -EINVAL; mutex_lock(&card->conf_mutex); - if (card->state != CARD_STATE_DOWN) { + if (card->state != CARD_STATE_DOWN) rc = -EPERM; - goto out; - } - i = simple_strtoul(buf, &tmp, 10); - if (i <= max_value) - *value = i; else - rc = -EINVAL; -out: + *value = input; mutex_unlock(&card->conf_mutex); return rc ? 
rc : count; } diff --git a/drivers/s390/net/qeth_l2.h b/drivers/s390/net/qeth_l2.h index cc95675c8bc4..296d73d84326 100644 --- a/drivers/s390/net/qeth_l2.h +++ b/drivers/s390/net/qeth_l2.h @@ -31,4 +31,11 @@ struct qeth_mac { struct hlist_node hnode; }; +static inline bool qeth_bridgeport_is_in_use(struct qeth_card *card) +{ + return card->options.sbp.role || + card->options.sbp.reflect_promisc || + card->options.sbp.hostnotification; +} + #endif /* __QETH_L2_H__ */ diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index e12ac32b8b47..1852d0a3c10a 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ -28,17 +28,6 @@ #include "qeth_core.h" #include "qeth_l2.h" -static void qeth_bridgeport_query_support(struct qeth_card *card); -static void qeth_bridge_state_change(struct qeth_card *card, - struct qeth_ipa_cmd *cmd); -static void qeth_addr_change_event(struct qeth_card *card, - struct qeth_ipa_cmd *cmd); -static bool qeth_bridgeport_is_in_use(struct qeth_card *card); -static void qeth_l2_vnicc_set_defaults(struct qeth_card *card); -static void qeth_l2_vnicc_init(struct qeth_card *card); -static bool qeth_l2_vnicc_recover_timeout(struct qeth_card *card, u32 vnicc, - u32 *timeout); - static int qeth_l2_setdelmac_makerc(struct qeth_card *card, u16 retcode) { int rc; @@ -304,36 +293,6 @@ static void qeth_l2_dev2br_fdb_flush(struct qeth_card *card) card->dev, &info.info, NULL); } -static void qeth_l2_stop_card(struct qeth_card *card) -{ - struct qeth_priv *priv = netdev_priv(card->dev); - - QETH_CARD_TEXT(card, 2, "stopcard"); - - qeth_set_allowed_threads(card, 0, 1); - - cancel_work_sync(&card->rx_mode_work); - qeth_l2_drain_rx_mode_cache(card); - - if (card->state == CARD_STATE_SOFTSETUP) { - qeth_clear_ipacmd_list(card); - card->state = CARD_STATE_DOWN; - } - - qeth_qdio_clear_card(card, 0); - qeth_drain_output_queues(card); - qeth_clear_working_pool_list(card); - qeth_l2_set_pnso_mode(card, QETH_PNSO_NONE); 
- qeth_flush_local_addrs(card); - card->info.promisc_mode = 0; - - if (priv->brport_features & BR_LEARNING_SYNC) { - rtnl_lock(); - qeth_l2_dev2br_fdb_flush(card); - rtnl_unlock(); - } -} - static int qeth_l2_request_initial_mac(struct qeth_card *card) { int rc = 0; @@ -617,49 +576,6 @@ static u16 qeth_l2_select_queue(struct net_device *dev, struct sk_buff *skb, qeth_get_priority_queue(card, skb); } -static const struct device_type qeth_l2_devtype = { - .name = "qeth_layer2", - .groups = qeth_l2_attr_groups, -}; - -static int qeth_l2_probe_device(struct ccwgroup_device *gdev) -{ - struct qeth_card *card = dev_get_drvdata(&gdev->dev); - int rc; - - if (IS_OSN(card)) - dev_notice(&gdev->dev, "OSN support will be dropped in 2021\n"); - - qeth_l2_vnicc_set_defaults(card); - mutex_init(&card->sbp_lock); - - if (gdev->dev.type == &qeth_generic_devtype) { - rc = qeth_l2_create_device_attributes(&gdev->dev); - if (rc) - return rc; - } - - INIT_WORK(&card->rx_mode_work, qeth_l2_rx_mode_work); - return 0; -} - -static void qeth_l2_remove_device(struct ccwgroup_device *cgdev) -{ - struct qeth_card *card = dev_get_drvdata(&cgdev->dev); - - if (cgdev->dev.type == &qeth_generic_devtype) - qeth_l2_remove_device_attributes(&cgdev->dev); - qeth_set_allowed_threads(card, 0, 1); - wait_event(card->wait_q, qeth_threads_running(card, 0xffffffff) == 0); - - if (cgdev->state == CCWGROUP_ONLINE) - qeth_set_offline(card, false); - - cancel_work_sync(&card->close_dev_work); - if (card->dev->reg_state == NETREG_REGISTERED) - unregister_netdev(card->dev); -} - static void qeth_l2_set_rx_mode(struct net_device *dev) { struct qeth_card *card = dev->ml_priv; @@ -1140,134 +1056,6 @@ static void qeth_l2_enable_brport_features(struct qeth_card *card) } } -static int qeth_l2_set_online(struct qeth_card *card) -{ - struct ccwgroup_device *gdev = card->gdev; - struct net_device *dev = card->dev; - int rc = 0; - bool carrier_ok; - - rc = qeth_core_hardsetup_card(card, &carrier_ok); - if (rc) { - 
QETH_CARD_TEXT_(card, 2, "2err%04x", rc); - rc = -ENODEV; - goto out_remove; - } - - /* query before bridgeport_notification may be enabled */ - qeth_l2_detect_dev2br_support(card); - - mutex_lock(&card->sbp_lock); - qeth_bridgeport_query_support(card); - if (card->options.sbp.supported_funcs) { - qeth_l2_setup_bridgeport_attrs(card); - dev_info(&card->gdev->dev, - "The device represents a Bridge Capable Port\n"); - } - mutex_unlock(&card->sbp_lock); - - qeth_l2_register_dev_addr(card); - - /* for the rx_bcast characteristic, init VNICC after setmac */ - qeth_l2_vnicc_init(card); - - qeth_trace_features(card); - qeth_l2_trace_features(card); - - qeth_print_status_message(card); - - /* softsetup */ - QETH_CARD_TEXT(card, 2, "softsetp"); - - card->state = CARD_STATE_SOFTSETUP; - - qeth_set_allowed_threads(card, 0xffffffff, 0); - - if (dev->reg_state != NETREG_REGISTERED) { - rc = qeth_l2_setup_netdev(card); - if (rc) - goto out_remove; - - if (carrier_ok) - netif_carrier_on(dev); - } else { - rtnl_lock(); - if (carrier_ok) - netif_carrier_on(dev); - else - netif_carrier_off(dev); - - netif_device_attach(dev); - qeth_enable_hw_features(dev); - qeth_l2_enable_brport_features(card); - - if (card->info.open_when_online) { - card->info.open_when_online = 0; - dev_open(dev, NULL); - } - rtnl_unlock(); - } - /* let user_space know that device is online */ - kobject_uevent(&gdev->dev.kobj, KOBJ_CHANGE); - return 0; - -out_remove: - qeth_l2_stop_card(card); - qeth_stop_channel(&card->data); - qeth_stop_channel(&card->write); - qeth_stop_channel(&card->read); - qdio_free(CARD_DDEV(card)); - return rc; -} - -static void qeth_l2_set_offline(struct qeth_card *card) -{ - qeth_l2_stop_card(card); -} - -static int __init qeth_l2_init(void) -{ - pr_info("register layer 2 discipline\n"); - return 0; -} - -static void __exit qeth_l2_exit(void) -{ - pr_info("unregister layer 2 discipline\n"); -} - -/* Returns zero if the command is successfully "consumed" */ -static int 
qeth_l2_control_event(struct qeth_card *card, - struct qeth_ipa_cmd *cmd) -{ - switch (cmd->hdr.command) { - case IPA_CMD_SETBRIDGEPORT_OSA: - case IPA_CMD_SETBRIDGEPORT_IQD: - if (cmd->data.sbp.hdr.command_code == - IPA_SBP_BRIDGE_PORT_STATE_CHANGE) { - qeth_bridge_state_change(card, cmd); - return 0; - } else - return 1; - case IPA_CMD_ADDRESS_CHANGE_NOTIF: - qeth_addr_change_event(card, cmd); - return 0; - default: - return 1; - } -} - -struct qeth_discipline qeth_l2_discipline = { - .devtype = &qeth_l2_devtype, - .setup = qeth_l2_probe_device, - .remove = qeth_l2_remove_device, - .set_online = qeth_l2_set_online, - .set_offline = qeth_l2_set_offline, - .do_ioctl = NULL, - .control_event_handler = qeth_l2_control_event, -}; -EXPORT_SYMBOL_GPL(qeth_l2_discipline); - #ifdef CONFIG_QETH_OSN static void qeth_osn_assist_cb(struct qeth_card *card, struct qeth_cmd_buffer *iob, @@ -1987,12 +1775,6 @@ int qeth_bridgeport_an_set(struct qeth_card *card, int enable) return rc; } -static bool qeth_bridgeport_is_in_use(struct qeth_card *card) -{ - return (card->options.sbp.role || card->options.sbp.reflect_promisc || - card->options.sbp.hostnotification); -} - /* VNIC Characteristics support */ /* handle VNICC IPA command return codes; convert to error codes */ @@ -2138,6 +1920,19 @@ static int qeth_l2_vnicc_getset_timeout(struct qeth_card *card, u32 vnicc, return qeth_send_ipa_cmd(card, iob, qeth_l2_vnicc_request_cb, timeout); } +/* recover user timeout setting */ +static bool qeth_l2_vnicc_recover_timeout(struct qeth_card *card, u32 vnicc, + u32 *timeout) +{ + if (card->options.vnicc.sup_chars & vnicc && + card->options.vnicc.getset_timeout_sup & vnicc && + !qeth_l2_vnicc_getset_timeout(card, vnicc, IPA_VNICC_SET_TIMEOUT, + timeout)) + return false; + *timeout = QETH_VNICC_DEFAULT_TIMEOUT; + return true; +} + /* set current VNICC flag state; called from sysfs store function */ int qeth_l2_vnicc_set_state(struct qeth_card *card, u32 vnicc, bool state) { @@ -2308,19 +2103,6 
@@ bool qeth_bridgeport_allowed(struct qeth_card *card) !(priv->brport_features & BR_LEARNING_SYNC)); } -/* recover user timeout setting */ -static bool qeth_l2_vnicc_recover_timeout(struct qeth_card *card, u32 vnicc, - u32 *timeout) -{ - if (card->options.vnicc.sup_chars & vnicc && - card->options.vnicc.getset_timeout_sup & vnicc && - !qeth_l2_vnicc_getset_timeout(card, vnicc, IPA_VNICC_SET_TIMEOUT, - timeout)) - return false; - *timeout = QETH_VNICC_DEFAULT_TIMEOUT; - return true; -} - /* recover user characteristic setting */ static bool qeth_l2_vnicc_recover_char(struct qeth_card *card, u32 vnicc, bool enable) @@ -2409,6 +2191,174 @@ static void qeth_l2_vnicc_set_defaults(struct qeth_card *card) card->options.vnicc.wanted_chars = QETH_VNICC_DEFAULT; } +static const struct device_type qeth_l2_devtype = { + .name = "qeth_layer2", + .groups = qeth_l2_attr_groups, +}; + +static int qeth_l2_probe_device(struct ccwgroup_device *gdev) +{ + struct qeth_card *card = dev_get_drvdata(&gdev->dev); + int rc; + + if (IS_OSN(card)) + dev_notice(&gdev->dev, "OSN support will be dropped in 2021\n"); + + qeth_l2_vnicc_set_defaults(card); + mutex_init(&card->sbp_lock); + + if (gdev->dev.type == &qeth_generic_devtype) { + rc = qeth_l2_create_device_attributes(&gdev->dev); + if (rc) + return rc; + } + + INIT_WORK(&card->rx_mode_work, qeth_l2_rx_mode_work); + return 0; +} + +static void qeth_l2_remove_device(struct ccwgroup_device *gdev) +{ + struct qeth_card *card = dev_get_drvdata(&gdev->dev); + + if (gdev->dev.type == &qeth_generic_devtype) + qeth_l2_remove_device_attributes(&gdev->dev); + qeth_set_allowed_threads(card, 0, 1); + wait_event(card->wait_q, qeth_threads_running(card, 0xffffffff) == 0); + + if (gdev->state == CCWGROUP_ONLINE) + qeth_set_offline(card, false); + + cancel_work_sync(&card->close_dev_work); + if (card->dev->reg_state == NETREG_REGISTERED) + unregister_netdev(card->dev); +} + +static int qeth_l2_set_online(struct qeth_card *card, bool carrier_ok) +{ + 
struct net_device *dev = card->dev; + int rc = 0; + + /* query before bridgeport_notification may be enabled */ + qeth_l2_detect_dev2br_support(card); + + mutex_lock(&card->sbp_lock); + qeth_bridgeport_query_support(card); + if (card->options.sbp.supported_funcs) { + qeth_l2_setup_bridgeport_attrs(card); + dev_info(&card->gdev->dev, + "The device represents a Bridge Capable Port\n"); + } + mutex_unlock(&card->sbp_lock); + + qeth_l2_register_dev_addr(card); + + /* for the rx_bcast characteristic, init VNICC after setmac */ + qeth_l2_vnicc_init(card); + + qeth_l2_trace_features(card); + + /* softsetup */ + QETH_CARD_TEXT(card, 2, "softsetp"); + + card->state = CARD_STATE_SOFTSETUP; + + qeth_set_allowed_threads(card, 0xffffffff, 0); + + if (dev->reg_state != NETREG_REGISTERED) { + rc = qeth_l2_setup_netdev(card); + if (rc) + goto err_setup; + + if (carrier_ok) + netif_carrier_on(dev); + } else { + rtnl_lock(); + if (carrier_ok) + netif_carrier_on(dev); + else + netif_carrier_off(dev); + + netif_device_attach(dev); + qeth_enable_hw_features(dev); + qeth_l2_enable_brport_features(card); + + if (card->info.open_when_online) { + card->info.open_when_online = 0; + dev_open(dev, NULL); + } + rtnl_unlock(); + } + return 0; + +err_setup: + qeth_set_allowed_threads(card, 0, 1); + card->state = CARD_STATE_DOWN; + return rc; +} + +static void qeth_l2_set_offline(struct qeth_card *card) +{ + struct qeth_priv *priv = netdev_priv(card->dev); + + qeth_set_allowed_threads(card, 0, 1); + qeth_l2_drain_rx_mode_cache(card); + + if (card->state == CARD_STATE_SOFTSETUP) + card->state = CARD_STATE_DOWN; + + qeth_l2_set_pnso_mode(card, QETH_PNSO_NONE); + if (priv->brport_features & BR_LEARNING_SYNC) { + rtnl_lock(); + qeth_l2_dev2br_fdb_flush(card); + rtnl_unlock(); + } +} + +/* Returns zero if the command is successfully "consumed" */ +static int qeth_l2_control_event(struct qeth_card *card, + struct qeth_ipa_cmd *cmd) +{ + switch (cmd->hdr.command) { + case IPA_CMD_SETBRIDGEPORT_OSA: + 
case IPA_CMD_SETBRIDGEPORT_IQD: + if (cmd->data.sbp.hdr.command_code == + IPA_SBP_BRIDGE_PORT_STATE_CHANGE) { + qeth_bridge_state_change(card, cmd); + return 0; + } + + return 1; + case IPA_CMD_ADDRESS_CHANGE_NOTIF: + qeth_addr_change_event(card, cmd); + return 0; + default: + return 1; + } +} + +struct qeth_discipline qeth_l2_discipline = { + .devtype = &qeth_l2_devtype, + .setup = qeth_l2_probe_device, + .remove = qeth_l2_remove_device, + .set_online = qeth_l2_set_online, + .set_offline = qeth_l2_set_offline, + .do_ioctl = NULL, + .control_event_handler = qeth_l2_control_event, +}; +EXPORT_SYMBOL_GPL(qeth_l2_discipline); + +static int __init qeth_l2_init(void) +{ + pr_info("register layer 2 discipline\n"); + return 0; +} + +static void __exit qeth_l2_exit(void) +{ + pr_info("unregister layer 2 discipline\n"); +} + module_init(qeth_l2_init); module_exit(qeth_l2_exit); MODULE_AUTHOR("Frank Blaschka <[email protected]>"); diff --git a/drivers/s390/net/qeth_l3.h b/drivers/s390/net/qeth_l3.h index 6ccfe2121095..acd130cfbab3 100644 --- a/drivers/s390/net/qeth_l3.h +++ b/drivers/s390/net/qeth_l3.h @@ -96,7 +96,7 @@ struct qeth_ipato_entry { struct list_head entry; enum qeth_prot_versions proto; char addr[16]; - int mask_bits; + unsigned int mask_bits; }; extern const struct attribute_group *qeth_l3_attr_groups[]; @@ -110,7 +110,7 @@ int qeth_l3_setrouting_v6(struct qeth_card *); int qeth_l3_add_ipato_entry(struct qeth_card *, struct qeth_ipato_entry *); int qeth_l3_del_ipato_entry(struct qeth_card *card, enum qeth_prot_versions proto, u8 *addr, - int mask_bits); + unsigned int mask_bits); void qeth_l3_update_ipato(struct qeth_card *card); int qeth_l3_modify_hsuid(struct qeth_card *card, bool add); int qeth_l3_modify_rxip_vipa(struct qeth_card *card, bool add, const u8 *ip, diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index 33fdad1a6887..a6f8878b55c6 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ 
-105,11 +105,9 @@ static bool qeth_l3_is_addr_covered_by_ipato(struct qeth_card *card, (ipatoe->proto == QETH_PROT_IPV4) ? 4 : 16); if (addr->proto == QETH_PROT_IPV4) - rc = !memcmp(addr_bits, ipatoe_bits, - min(32, ipatoe->mask_bits)); + rc = !memcmp(addr_bits, ipatoe_bits, ipatoe->mask_bits); else - rc = !memcmp(addr_bits, ipatoe_bits, - min(128, ipatoe->mask_bits)); + rc = !memcmp(addr_bits, ipatoe_bits, ipatoe->mask_bits); if (rc) break; } @@ -536,7 +534,6 @@ int qeth_l3_add_ipato_entry(struct qeth_card *card, QETH_CARD_TEXT(card, 2, "addipato"); - mutex_lock(&card->conf_mutex); mutex_lock(&card->ip_lock); list_for_each_entry(ipatoe, &card->ipato.entries, entry) { @@ -556,21 +553,19 @@ int qeth_l3_add_ipato_entry(struct qeth_card *card, } mutex_unlock(&card->ip_lock); - mutex_unlock(&card->conf_mutex); return rc; } int qeth_l3_del_ipato_entry(struct qeth_card *card, enum qeth_prot_versions proto, u8 *addr, - int mask_bits) + unsigned int mask_bits) { struct qeth_ipato_entry *ipatoe, *tmp; int rc = -ENOENT; QETH_CARD_TEXT(card, 2, "delipato"); - mutex_lock(&card->conf_mutex); mutex_lock(&card->ip_lock); list_for_each_entry_safe(ipatoe, tmp, &card->ipato.entries, entry) { @@ -587,7 +582,6 @@ int qeth_l3_del_ipato_entry(struct qeth_card *card, } mutex_unlock(&card->ip_lock); - mutex_unlock(&card->conf_mutex); return rc; } @@ -597,7 +591,6 @@ int qeth_l3_modify_rxip_vipa(struct qeth_card *card, bool add, const u8 *ip, enum qeth_prot_versions proto) { struct qeth_ipaddr addr; - int rc; qeth_l3_init_ipaddr(&addr, type, proto); if (proto == QETH_PROT_IPV4) @@ -605,11 +598,7 @@ int qeth_l3_modify_rxip_vipa(struct qeth_card *card, bool add, const u8 *ip, else memcpy(&addr.u.a6.addr, ip, 16); - mutex_lock(&card->conf_mutex); - rc = qeth_l3_modify_ip(card, &addr, add); - mutex_unlock(&card->conf_mutex); - - return rc; + return qeth_l3_modify_ip(card, &addr, add); } int qeth_l3_modify_hsuid(struct qeth_card *card, bool add) @@ -1153,33 +1142,6 @@ static int 
qeth_l3_vlan_rx_kill_vid(struct net_device *dev, return 0; } -static void qeth_l3_stop_card(struct qeth_card *card) -{ - QETH_CARD_TEXT(card, 2, "stopcard"); - - qeth_set_allowed_threads(card, 0, 1); - - cancel_work_sync(&card->rx_mode_work); - qeth_l3_drain_rx_mode_cache(card); - - if (card->options.sniffer && - (card->info.promisc_mode == SET_PROMISC_MODE_ON)) - qeth_diags_trace(card, QETH_DIAGS_CMD_TRACE_DISABLE); - - if (card->state == CARD_STATE_SOFTSETUP) { - qeth_l3_clear_ip_htable(card, 1); - qeth_clear_ipacmd_list(card); - card->state = CARD_STATE_DOWN; - } - - qeth_qdio_clear_card(card, 0); - qeth_drain_output_queues(card); - qeth_clear_working_pool_list(card); - flush_workqueue(card->event_wq); - qeth_flush_local_addrs(card); - card->info.promisc_mode = 0; -} - static void qeth_l3_set_promisc_mode(struct qeth_card *card) { bool enable = card->dev->flags & IFF_PROMISC; @@ -1235,7 +1197,6 @@ static void qeth_l3_rx_mode_work(struct work_struct *work) kfree(addr); break; } - addr->ref_counter = 1; fallthrough; default: /* for next call to set_rx_mode(): */ @@ -2025,21 +1986,10 @@ static void qeth_l3_remove_device(struct ccwgroup_device *cgdev) qeth_l3_clear_ipato_list(card); } -static int qeth_l3_set_online(struct qeth_card *card) +static int qeth_l3_set_online(struct qeth_card *card, bool carrier_ok) { - struct ccwgroup_device *gdev = card->gdev; struct net_device *dev = card->dev; int rc = 0; - bool carrier_ok; - - rc = qeth_core_hardsetup_card(card, &carrier_ok); - if (rc) { - QETH_CARD_TEXT_(card, 2, "2err%04x", rc); - rc = -ENODEV; - goto out_remove; - } - - qeth_print_status_message(card); /* softsetup */ QETH_CARD_TEXT(card, 2, "softsetp"); @@ -2066,7 +2016,7 @@ static int qeth_l3_set_online(struct qeth_card *card) if (dev->reg_state != NETREG_REGISTERED) { rc = qeth_l3_setup_netdev(card); if (rc) - goto out_remove; + goto err_setup; if (carrier_ok) netif_carrier_on(dev); @@ -2086,22 +2036,28 @@ static int qeth_l3_set_online(struct qeth_card *card) } 
rtnl_unlock(); } - qeth_trace_features(card); - /* let user_space know that device is online */ - kobject_uevent(&gdev->dev.kobj, KOBJ_CHANGE); return 0; -out_remove: - qeth_l3_stop_card(card); - qeth_stop_channel(&card->data); - qeth_stop_channel(&card->write); - qeth_stop_channel(&card->read); - qdio_free(CARD_DDEV(card)); + +err_setup: + qeth_set_allowed_threads(card, 0, 1); + card->state = CARD_STATE_DOWN; + qeth_l3_clear_ip_htable(card, 1); return rc; } static void qeth_l3_set_offline(struct qeth_card *card) { - qeth_l3_stop_card(card); + qeth_set_allowed_threads(card, 0, 1); + qeth_l3_drain_rx_mode_cache(card); + + if (card->options.sniffer && + (card->info.promisc_mode == SET_PROMISC_MODE_ON)) + qeth_diags_trace(card, QETH_DIAGS_CMD_TRACE_DISABLE); + + if (card->state == CARD_STATE_SOFTSETUP) { + card->state = CARD_STATE_DOWN; + qeth_l3_clear_ip_htable(card, 1); + } } /* Returns zero if the command is successfully "consumed" */ diff --git a/drivers/s390/net/qeth_l3_sys.c b/drivers/s390/net/qeth_l3_sys.c index dd0b39082534..351763ae9b9c 100644 --- a/drivers/s390/net/qeth_l3_sys.c +++ b/drivers/s390/net/qeth_l3_sys.c @@ -301,19 +301,21 @@ static ssize_t qeth_l3_dev_ipato_enable_store(struct device *dev, goto out; } + mutex_lock(&card->ip_lock); if (sysfs_streq(buf, "toggle")) { enable = !card->ipato.enabled; } else if (kstrtobool(buf, &enable)) { rc = -EINVAL; - goto out; + goto unlock_ip; } if (card->ipato.enabled != enable) { card->ipato.enabled = enable; - mutex_lock(&card->ip_lock); qeth_l3_update_ipato(card); - mutex_unlock(&card->ip_lock); } + +unlock_ip: + mutex_unlock(&card->ip_lock); out: mutex_unlock(&card->conf_mutex); return rc ? 
rc : count; @@ -339,7 +341,7 @@ static ssize_t qeth_l3_dev_ipato_invert4_store(struct device *dev, bool invert; int rc = 0; - mutex_lock(&card->conf_mutex); + mutex_lock(&card->ip_lock); if (sysfs_streq(buf, "toggle")) { invert = !card->ipato.invert4; } else if (kstrtobool(buf, &invert)) { @@ -349,12 +351,11 @@ static ssize_t qeth_l3_dev_ipato_invert4_store(struct device *dev, if (card->ipato.invert4 != invert) { card->ipato.invert4 = invert; - mutex_lock(&card->ip_lock); qeth_l3_update_ipato(card); - mutex_unlock(&card->ip_lock); } + out: - mutex_unlock(&card->conf_mutex); + mutex_unlock(&card->ip_lock); return rc ? rc : count; } @@ -406,29 +407,29 @@ static ssize_t qeth_l3_dev_ipato_add4_show(struct device *dev, } static int qeth_l3_parse_ipatoe(const char *buf, enum qeth_prot_versions proto, - u8 *addr, int *mask_bits) + u8 *addr, unsigned int *mask_bits) { - const char *start, *end; - char *tmp; - char buffer[40] = {0, }; + char *sep; + int rc; - start = buf; - /* get address string */ - end = strchr(start, '/'); - if (!end || (end - start >= 40)) { + /* Expected input pattern: %addr/%mask */ + sep = strnchr(buf, 40, '/'); + if (!sep) return -EINVAL; - } - strncpy(buffer, start, end - start); - if (qeth_l3_string_to_ipaddr(buffer, proto, addr)) { - return -EINVAL; - } - start = end + 1; - *mask_bits = simple_strtoul(start, &tmp, 10); - if (!strlen(start) || - (tmp == start) || - (*mask_bits > ((proto == QETH_PROT_IPV4) ? 32 : 128))) { + + /* Terminate the %addr sub-string, and parse it: */ + *sep = '\0'; + rc = qeth_l3_string_to_ipaddr(buf, proto, addr); + if (rc) + return rc; + + rc = kstrtouint(sep + 1, 10, mask_bits); + if (rc) + return rc; + + if (*mask_bits > ((proto == QETH_PROT_IPV4) ? 
32 : 128)) return -EINVAL; - } + return 0; } @@ -436,8 +437,8 @@ static ssize_t qeth_l3_dev_ipato_add_store(const char *buf, size_t count, struct qeth_card *card, enum qeth_prot_versions proto) { struct qeth_ipato_entry *ipatoe; + unsigned int mask_bits; u8 addr[16]; - int mask_bits; int rc = 0; rc = qeth_l3_parse_ipatoe(buf, proto, addr, &mask_bits); @@ -474,8 +475,8 @@ static QETH_DEVICE_ATTR(ipato_add4, add4, 0644, static ssize_t qeth_l3_dev_ipato_del_store(const char *buf, size_t count, struct qeth_card *card, enum qeth_prot_versions proto) { + unsigned int mask_bits; u8 addr[16]; - int mask_bits; int rc = 0; rc = qeth_l3_parse_ipatoe(buf, proto, addr, &mask_bits); @@ -510,7 +511,7 @@ static ssize_t qeth_l3_dev_ipato_invert6_store(struct device *dev, bool invert; int rc = 0; - mutex_lock(&card->conf_mutex); + mutex_lock(&card->ip_lock); if (sysfs_streq(buf, "toggle")) { invert = !card->ipato.invert6; } else if (kstrtobool(buf, &invert)) { @@ -520,12 +521,11 @@ static ssize_t qeth_l3_dev_ipato_invert6_store(struct device *dev, if (card->ipato.invert6 != invert) { card->ipato.invert6 = invert; - mutex_lock(&card->ip_lock); qeth_l3_update_ipato(card); - mutex_unlock(&card->ip_lock); } + out: - mutex_unlock(&card->conf_mutex); + mutex_unlock(&card->ip_lock); return rc ? 
rc : count; } diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c6d9f2c444f4..fc5c901c7542 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -292,6 +292,7 @@ enum bpf_arg_type { ARG_PTR_TO_ALLOC_MEM, /* pointer to dynamically allocated memory */ ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */ ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ + __BPF_ARG_TYPE_MAX, }; /* type of values returned from helper functions */ @@ -326,12 +327,16 @@ struct bpf_func_proto { }; enum bpf_arg_type arg_type[5]; }; - int *btf_id; /* BTF ids of arguments */ - bool (*check_btf_id)(u32 btf_id, u32 arg); /* if the argument btf_id is - * valid. Often used if more - * than one btf id is permitted - * for this argument. - */ + union { + struct { + u32 *arg1_btf_id; + u32 *arg2_btf_id; + u32 *arg3_btf_id; + u32 *arg4_btf_id; + u32 *arg5_btf_id; + }; + u32 *arg_btf_id[5]; + }; int *ret_btf_id; /* return value btf_id */ bool (*allowed)(const struct bpf_prog *prog); }; @@ -697,16 +702,19 @@ enum bpf_jit_poke_reason { /* Descriptor of pokes pointing /into/ the JITed image. 
*/ struct bpf_jit_poke_descriptor { - void *ip; + void *tailcall_target; + void *tailcall_bypass; + void *bypass_addr; union { struct { struct bpf_map *map; u32 key; } tail_call; }; - bool ip_stable; + bool tailcall_target_stable; u8 adj_off; u16 reason; + u32 insn_idx; }; /* reg_type info for ctx arguments */ @@ -737,6 +745,7 @@ struct bpf_prog_aux { bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ bool func_proto_unreliable; bool sleepable; + bool tail_call_reachable; enum bpf_tramp_prog_type trampoline_prog_type; struct bpf_trampoline *trampoline; struct hlist_node tramp_hlist; @@ -751,6 +760,7 @@ struct bpf_prog_aux { struct bpf_ksym ksym; const struct bpf_prog_ops *ops; struct bpf_map **used_maps; + struct mutex used_maps_mutex; /* mutex for used_maps and used_map_cnt */ struct bpf_prog *prog; struct user_struct *user; u64 load_time; /* ns since boottime */ @@ -1380,8 +1390,6 @@ int btf_struct_access(struct bpf_verifier_log *log, u32 *next_btf_id); bool btf_struct_ids_match(struct bpf_verifier_log *log, int off, u32 id, u32 need_type_id); -int btf_resolve_helper_id(struct bpf_verifier_log *log, - const struct bpf_func_proto *fn, int); int btf_distill_func_proto(struct bpf_verifier_log *log, struct btf *btf, @@ -1900,6 +1908,6 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *addr1, void *addr2); struct btf_id_set; -bool btf_id_set_contains(struct btf_id_set *set, u32 id); +bool btf_id_set_contains(const struct btf_id_set *set, u32 id); #endif /* _LINUX_BPF_H */ diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 53c7bd568c5d..2bb48a2c4d08 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -358,6 +358,9 @@ struct bpf_subprog_info { u32 start; /* insn idx of function entry point */ u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ u16 stack_depth; /* max. 
stack depth used by this function */ + bool has_tail_call; + bool tail_call_reachable; + bool has_ld_abs; }; /* single container for all structs diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 210b086188a3..57890b357f85 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -76,6 +76,13 @@ extern u32 name[]; #define BTF_ID_LIST_GLOBAL(name) \ __BTF_ID_LIST(name, globl) +/* The BTF_ID_LIST_SINGLE macro defines a BTF_ID_LIST with + * a single entry. + */ +#define BTF_ID_LIST_SINGLE(name, prefix, typename) \ + BTF_ID_LIST(name) \ + BTF_ID(prefix, typename) + /* * The BTF_ID_UNUSED macro defines 4 zero bytes. * It's used when we want to define 'unused' entry @@ -140,6 +147,7 @@ extern struct btf_id_set name; #define BTF_ID(prefix, name) #define BTF_ID_UNUSED #define BTF_ID_LIST_GLOBAL(name) u32 name[1]; +#define BTF_ID_LIST_SINGLE(name, prefix, typename) static u32 name[1]; #define BTF_SET_START(name) static struct btf_id_set name = { 0 }; #define BTF_SET_START_GLOBAL(name) static struct btf_id_set name = { 0 }; #define BTF_SET_END(name) diff --git a/include/linux/filter.h b/include/linux/filter.h index 05b4052715b9..20fc24c9779a 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1287,6 +1287,8 @@ int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len); struct bpf_sk_lookup_kern { u16 family; u16 protocol; + __be16 sport; + u16 dport; struct { __be32 saddr; __be32 daddr; @@ -1295,8 +1297,6 @@ struct bpf_sk_lookup_kern { const struct in6_addr *saddr; const struct in6_addr *daddr; } v6; - __be16 sport; - u16 dport; struct sock *selected_sk; bool no_reuseport; }; diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 6479a38e52fa..556caed00258 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -19,7 +19,13 @@ struct br_ip { #if IS_ENABLED(CONFIG_IPV6) struct in6_addr ip6; #endif - } u; + } src; + union { + __be32 ip4; +#if IS_ENABLED(CONFIG_IPV6) + struct 
in6_addr ip6; +#endif + } dst; __be16 proto; __u16 vid; }; diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 3a88b699b758..dbd69b3d170b 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -306,7 +306,7 @@ static inline u32 linkmode_adv_to_mii_10gbt_adv_t(unsigned long *advertising) /** * mii_10gbt_stat_mod_linkmode_lpa_t * @advertising: target the linkmode advertisement settings - * @adv: value of the C45 10GBASE-T AN STATUS register + * @lpa: value of the C45 10GBASE-T AN STATUS register * * A small helper function that translates C45 10GBASE-T AN STATUS register bits * to linkmode advertisement settings. Other bits in advertising aren't changed. @@ -371,6 +371,7 @@ struct phy_device *mdiobus_get_phy(struct mii_bus *bus, int addr); /** * mdio_module_driver() - Helper macro for registering mdio drivers + * @_mdio_driver: driver to register * * Helper macro for MDIO drivers which do not do anything special in module * init/exit. Each module may only use this macro once, and calling it diff --git a/include/linux/of_mdio.h b/include/linux/of_mdio.h index 1efb88d9f892..cfe8c607a628 100644 --- a/include/linux/of_mdio.h +++ b/include/linux/of_mdio.h @@ -17,6 +17,7 @@ bool of_mdiobus_child_is_phy(struct device_node *child); int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np); int devm_of_mdiobus_register(struct device *dev, struct mii_bus *mdio, struct device_node *np); +struct mdio_device *of_mdio_find_device(struct device_node *np); struct phy_device *of_phy_find_device(struct device_node *phy_np); struct phy_device * of_phy_connect(struct net_device *dev, struct device_node *phy_np, @@ -74,6 +75,11 @@ static inline int of_mdiobus_register(struct mii_bus *mdio, struct device_node * return mdiobus_register(mdio); } +static inline struct mdio_device *of_mdio_find_device(struct device_node *np) +{ + return NULL; +} + static inline struct phy_device *of_phy_find_device(struct device_node *phy_np) { return NULL; diff --git 
a/include/linux/phy.h b/include/linux/phy.h index 3a09d2bf69ea..eb3cb1a98b45 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -82,7 +82,39 @@ extern const int phy_10gbit_features_array[1]; #define PHY_POLL_CABLE_TEST 0x00000004 #define MDIO_DEVICE_IS_PHY 0x80000000 -/* Interface Mode definitions */ +/** + * enum phy_interface_t - Interface Mode definitions + * + * @PHY_INTERFACE_MODE_NA: Not Applicable - don't touch + * @PHY_INTERFACE_MODE_INTERNAL: No interface, MAC and PHY combined + * @PHY_INTERFACE_MODE_MII: Media-independent interface + * @PHY_INTERFACE_MODE_GMII: Gigabit media-independent interface + * @PHY_INTERFACE_MODE_SGMII: Serial gigabit media-independent interface + * @PHY_INTERFACE_MODE_TBI: Ten Bit Interface + * @PHY_INTERFACE_MODE_REVMII: Reverse Media Independent Interface + * @PHY_INTERFACE_MODE_RMII: Reduced Media Independent Interface + * @PHY_INTERFACE_MODE_RGMII: Reduced gigabit media-independent interface + * @PHY_INTERFACE_MODE_RGMII_ID: RGMII with Internal RX+TX delay + * @PHY_INTERFACE_MODE_RGMII_RXID: RGMII with Internal RX delay + * @PHY_INTERFACE_MODE_RGMII_TXID: RGMII with Internal TX delay + * @PHY_INTERFACE_MODE_RTBI: Reduced TBI + * @PHY_INTERFACE_MODE_SMII: Serial MII + * @PHY_INTERFACE_MODE_XGMII: 10 gigabit media-independent interface + * @PHY_INTERFACE_MODE_XLGMII: 40 gigabit media-independent interface + * @PHY_INTERFACE_MODE_MOCA: Multimedia over Coax + * @PHY_INTERFACE_MODE_QSGMII: Quad SGMII + * @PHY_INTERFACE_MODE_TRGMII: Turbo RGMII + * @PHY_INTERFACE_MODE_1000BASEX: 1000 BaseX + * @PHY_INTERFACE_MODE_2500BASEX: 2500 BaseX + * @PHY_INTERFACE_MODE_RXAUI: Reduced XAUI + * @PHY_INTERFACE_MODE_XAUI: 10 Gigabit Attachment Unit Interface + * @PHY_INTERFACE_MODE_10GBASER: 10G BaseR + * @PHY_INTERFACE_MODE_USXGMII: Universal Serial 10GE MII + * @PHY_INTERFACE_MODE_10GKR: 10GBASE-KR - with Clause 73 AN + * @PHY_INTERFACE_MODE_MAX: Book keeping + * + * Describes the interface between the MAC and PHY.
+ */ typedef enum { PHY_INTERFACE_MODE_NA, PHY_INTERFACE_MODE_INTERNAL, @@ -116,8 +148,8 @@ typedef enum { } phy_interface_t; /** - * phy_supported_speeds - return all speeds currently supported by a phy device - * @phy: The phy device to return supported speeds of. + * phy_supported_speeds - return all speeds currently supported by a PHY device + * @phy: The PHY device to return supported speeds of. * @speeds: buffer to store supported speeds in. * @size: size of speeds buffer. * @@ -134,9 +166,9 @@ unsigned int phy_supported_speeds(struct phy_device *phy, * phy_modes - map phy_interface_t enum to device tree binding of phy-mode * @interface: enum phy_interface_t value * - * Description: maps 'enum phy_interface_t' defined in this file + * Description: maps enum &phy_interface_t defined in this file * into the device tree binding of 'phy-mode', so that Ethernet - * device driver can get phy interface from device tree. + * device driver can get PHY interface from device tree. */ static inline const char *phy_modes(phy_interface_t interface) { @@ -215,6 +247,14 @@ struct sfp_bus; struct sfp_upstream_ops; struct sk_buff; +/** + * struct mdio_bus_stats - Statistics counters for MDIO busses + * @transfers: Total number of transfers, i.e. 
@writes + @reads + * @errors: Number of MDIO transfers that returned an error + * @writes: Number of write transfers + * @reads: Number of read transfers + * @syncp: Synchronisation for incrementing statistics + */ struct mdio_bus_stats { u64_stats_t transfers; u64_stats_t errors; @@ -224,7 +264,15 @@ struct mdio_bus_stats { struct u64_stats_sync syncp; }; -/* Represents a shared structure between different phydev's in the same +/** + * struct phy_package_shared - Shared information in PHY packages + * @addr: Common PHY address used to combine PHYs in one package + * @refcnt: Number of PHYs connected to this shared data + * @flags: Initialization of PHY package + * @priv_size: Size of the shared private data @priv + * @priv: Driver private data shared across a PHY package + * + * Represents a shared structure between different phydev's in the same * package, for example a quad PHY. See phy_package_join() and * phy_package_leave(). */ @@ -247,7 +295,14 @@ struct phy_package_shared { #define PHY_SHARED_F_INIT_DONE 0 #define PHY_SHARED_F_PROBE_DONE 1 -/* +/** + * struct mii_bus - Represents an MDIO bus + * + * @owner: Who owns this device + * @name: User friendly name for this MDIO device, or driver name + * @id: Unique identifier for this bus, typical from bus hierarchy + * @priv: Driver private data + * * The Bus class for PHYs. 
Devices which provide access to * PHYs should register using this structure */ @@ -256,49 +311,58 @@ struct mii_bus { const char *name; char id[MII_BUS_ID_SIZE]; void *priv; + /** @read: Perform a read transfer on the bus */ int (*read)(struct mii_bus *bus, int addr, int regnum); + /** @write: Perform a write transfer on the bus */ int (*write)(struct mii_bus *bus, int addr, int regnum, u16 val); + /** @reset: Perform a reset of the bus */ int (*reset)(struct mii_bus *bus); + + /** @stats: Statistic counters per device on the bus */ struct mdio_bus_stats stats[PHY_MAX_ADDR]; - /* - * A lock to ensure that only one thing can read/write + /** + * @mdio_lock: A lock to ensure that only one thing can read/write * the MDIO bus at a time */ struct mutex mdio_lock; + /** @parent: Parent device of this bus */ struct device *parent; + /** @state: State of bus structure */ enum { MDIOBUS_ALLOCATED = 1, MDIOBUS_REGISTERED, MDIOBUS_UNREGISTERED, MDIOBUS_RELEASED, } state; + + /** @dev: Kernel device representation */ struct device dev; - /* list of all PHYs on bus */ + /** @mdio_map: list of all MDIO devices on bus */ struct mdio_device *mdio_map[PHY_MAX_ADDR]; - /* PHY addresses to be ignored when probing */ + /** @phy_mask: PHY addresses to be ignored when probing */ u32 phy_mask; - /* PHY addresses to ignore the TA/read failure */ + /** @phy_ignore_ta_mask: PHY addresses to ignore the TA/read failure */ u32 phy_ignore_ta_mask; - /* - * An array of interrupts, each PHY's interrupt at the index + /** + * @irq: An array of interrupts, each PHY's interrupt at the index * matching its address */ int irq[PHY_MAX_ADDR]; - /* GPIO reset pulse width in microseconds */ + /** @reset_delay_us: GPIO reset pulse width in microseconds */ int reset_delay_us; - /* GPIO reset deassert delay in microseconds */ + /** @reset_post_delay_us: GPIO reset deassert delay in microseconds */ int reset_post_delay_us; - /* RESET GPIO descriptor pointer */ + /** @reset_gpiod: Reset GPIO descriptor pointer 
*/ struct gpio_desc *reset_gpiod; - /* bus capabilities, used for probing */ + /** @probe_capabilities: bus capabilities, used for probing */ enum { MDIOBUS_NO_CAP = 0, MDIOBUS_C22, @@ -306,15 +370,22 @@ struct mii_bus { MDIOBUS_C22_C45, } probe_capabilities; - /* protect access to the shared element */ + /** @shared_lock: protect access to the shared element */ struct mutex shared_lock; - /* shared state across different PHYs */ + /** @shared: shared state across different PHYs */ struct phy_package_shared *shared[PHY_MAX_ADDR]; }; #define to_mii_bus(d) container_of(d, struct mii_bus, dev) -struct mii_bus *mdiobus_alloc_size(size_t); +struct mii_bus *mdiobus_alloc_size(size_t size); + +/** + * mdiobus_alloc - Allocate an MDIO bus structure + * + * The internal state of the MDIO bus will be set to MDIOBUS_ALLOCATED ready + * for the driver to register the bus. + */ static inline struct mii_bus *mdiobus_alloc(void) { return mdiobus_alloc_size(0); @@ -341,40 +412,41 @@ struct phy_device *mdiobus_scan(struct mii_bus *bus, int addr); #define PHY_INTERRUPT_DISABLED false #define PHY_INTERRUPT_ENABLED true -/* PHY state machine states: +/** + * enum phy_state - PHY state machine states: * - * DOWN: PHY device and driver are not ready for anything. probe + * @PHY_DOWN: PHY device and driver are not ready for anything. probe * should be called if and only if the PHY is in this state, * given that the PHY device exists. - * - PHY driver probe function will set the state to READY + * - PHY driver probe function will set the state to @PHY_READY * - * READY: PHY is ready to send and receive packets, but the + * @PHY_READY: PHY is ready to send and receive packets, but the * controller is not. By default, PHYs which do not implement * probe will be set to this state by phy_probe(). * - start will set the state to UP * - * UP: The PHY and attached device are ready to do work. + * @PHY_UP: The PHY and attached device are ready to do work. * Interrupts should be started here. 
- * - timer moves to NOLINK or RUNNING + * - timer moves to @PHY_NOLINK or @PHY_RUNNING * - * NOLINK: PHY is up, but not currently plugged in. - * - irq or timer will set RUNNING if link comes back - * - phy_stop moves to HALTED + * @PHY_NOLINK: PHY is up, but not currently plugged in. + * - irq or timer will set @PHY_RUNNING if link comes back + * - phy_stop moves to @PHY_HALTED * - * RUNNING: PHY is currently up, running, and possibly sending + * @PHY_RUNNING: PHY is currently up, running, and possibly sending * and/or receiving packets - * - irq or timer will set NOLINK if link goes down - * - phy_stop moves to HALTED + * - irq or timer will set @PHY_NOLINK if link goes down + * - phy_stop moves to @PHY_HALTED * - * CABLETEST: PHY is performing a cable test. Packet reception/sending + * @PHY_CABLETEST: PHY is performing a cable test. Packet reception/sending * is not expected to work, carrier will be indicated as down. PHY will be * poll once per second, or on interrupt for it current state. * Once complete, move to UP to restart the PHY. - * - phy_stop aborts the running test and moves to HALTED + * - phy_stop aborts the running test and moves to @PHY_HALTED * - * HALTED: PHY is up, but no polling or interrupts are done. Or + * @PHY_HALTED: PHY is up, but no polling or interrupts are done. Or * PHY is in an error state. - * - phy_start moves to UP + * - phy_start moves to @PHY_UP */ enum phy_state { PHY_DOWN = 0, @@ -403,34 +475,67 @@ struct phy_c45_device_ids { struct macsec_context; struct macsec_ops; -/* phy_device: An instance of a PHY +/** + * struct phy_device - An instance of a PHY * - * drv: Pointer to the driver for this PHY instance - * phy_id: UID for this device found during discovery - * c45_ids: 802.3-c45 Device Identifers if is_c45. - * is_c45: Set to true if this phy uses clause 45 addressing. - * is_internal: Set to true if this phy is internal to a MAC. - * is_pseudo_fixed_link: Set to true if this phy is an Ethernet switch, etc. 
- * is_gigabit_capable: Set to true if PHY supports 1000Mbps - * has_fixups: Set to true if this phy has fixups/quirks. - * suspended: Set to true if this phy has been suspended successfully. - * suspended_by_mdio_bus: Set to true if this phy was suspended by MDIO bus. - * sysfs_links: Internal boolean tracking sysfs symbolic links setup/removal. - * loopback_enabled: Set true if this phy has been loopbacked successfully. - * downshifted_rate: Set true if link speed has been downshifted. - * state: state of the PHY for management purposes - * dev_flags: Device-specific flags used by the PHY driver. - * irq: IRQ number of the PHY's interrupt (-1 if none) - * phy_timer: The timer for handling the state machine - * sfp_bus_attached: flag indicating whether the SFP bus has been attached - * sfp_bus: SFP bus attached to this PHY's fiber port - * attached_dev: The attached enet driver's device instance ptr - * adjust_link: Callback for the enet controller to respond to - * changes in the link state. - * macsec_ops: MACsec offloading ops. + * @mdio: MDIO bus this PHY is on + * @drv: Pointer to the driver for this PHY instance + * @phy_id: UID for this device found during discovery + * @c45_ids: 802.3-c45 Device Identifiers if is_c45. + * @is_c45: Set to true if this PHY uses clause 45 addressing. + * @is_internal: Set to true if this PHY is internal to a MAC. + * @is_pseudo_fixed_link: Set to true if this PHY is an Ethernet switch, etc. + * @is_gigabit_capable: Set to true if PHY supports 1000Mbps + * @has_fixups: Set to true if this PHY has fixups/quirks. + * @suspended: Set to true if this PHY has been suspended successfully. + * @suspended_by_mdio_bus: Set to true if this PHY was suspended by MDIO bus. + * @sysfs_links: Internal boolean tracking sysfs symbolic links setup/removal. + * @loopback_enabled: Set true if this PHY has been loopbacked successfully. + * @downshifted_rate: Set true if link speed has been downshifted. 
+ * @state: State of the PHY for management purposes + * @dev_flags: Device-specific flags used by the PHY driver. + * @irq: IRQ number of the PHY's interrupt (-1 if none) + * @phy_timer: The timer for handling the state machine + * @phylink: Pointer to phylink instance for this PHY + * @sfp_bus_attached: Flag indicating whether the SFP bus has been attached + * @sfp_bus: SFP bus attached to this PHY's fiber port + * @attached_dev: The attached enet driver's device instance ptr + * @adjust_link: Callback for the enet controller to respond to changes in the + * link state. + * @phy_link_change: Callback for phylink for notification of link change + * @macsec_ops: MACsec offloading ops. * - * speed, duplex, pause, supported, advertising, lp_advertising, - * and autoneg are used like in mii_if_info + * @speed: Current link speed + * @duplex: Current duplex + * @pause: Current pause + * @asym_pause: Current asymmetric pause + * @supported: Combined MAC/PHY supported linkmodes + * @advertising: Currently advertised linkmodes + * @adv_old: Saved advertised while power saving for WoL + * @lp_advertising: Current link partner advertised linkmodes + * @eee_broken_modes: Energy efficient ethernet modes which should be prohibited + * @autoneg: Flag autoneg being used + * @link: Current link state + * @autoneg_complete: Flag auto negotiation of the link has completed + * @mdix: Current crossover + * @mdix_ctrl: User setting of crossover + * @interrupts: Flag interrupts have been enabled + * @interface: enum phy_interface_t value + * @skb: Netlink message for cable diagnostics + * @nest: Netlink nest used for cable diagnostics + * @ehdr: Netlink header for cable diagnostics + * @phy_led_triggers: Array of LED triggers + * @phy_num_led_triggers: Number of triggers in @phy_led_triggers + * @led_link_trigger: LED trigger for link up/down + * @last_triggered: last LED trigger for link speed + * @master_slave_set: User requested master/slave configuration + * @master_slave_get: 
Current master/slave advertisement + * @master_slave_state: Current master/slave configuration + * @mii_ts: Pointer to time stamper callbacks + * @lock: Mutex for serialization access to PHY + * @state_queue: Work queue for state machine + * @shared: Pointer to private data shared by phys in one package + * @priv: Pointer to driver private data * * interrupts currently only supports enabled or disabled, * but could be changed in the future to support enabling @@ -550,9 +655,18 @@ struct phy_device { #define to_phy_device(d) container_of(to_mdio_device(d), \ struct phy_device, mdio) -/* A structure containing possible configuration parameters +/** + * struct phy_tdr_config - Configuration of a TDR raw test + * + * @first: Distance for first data collection point + * @last: Distance for last data collection point + * @step: Step between data collection points + * @pair: Bitmap of cable pairs to collect data for + * + * A structure containing possible configuration parameters * for a TDR cable test. The driver does not need to implement * all the parameters, but should report what is actually used. + * All distances are in centimeters. */ struct phy_tdr_config { u32 first; @@ -562,18 +676,20 @@ struct phy_tdr_config { }; #define PHY_PAIR_ALL -1 -/* struct phy_driver: Driver structure for a particular PHY type +/** + * struct phy_driver - Driver structure for a particular PHY type * - * driver_data: static driver data - * phy_id: The result of reading the UID registers of this PHY + * @mdiodrv: Data common to all MDIO devices + * @phy_id: The result of reading the UID registers of this PHY * type, and ANDing them with the phy_id_mask. 
This driver * only works for PHYs with IDs which match this field - * name: The friendly name of this PHY type - * phy_id_mask: Defines the important bits of the phy_id - * features: A mandatory list of features (speed, duplex, etc) + * @name: The friendly name of this PHY type + * @phy_id_mask: Defines the important bits of the phy_id + * @features: A mandatory list of features (speed, duplex, etc) * supported by this PHY - * flags: A bitfield defining certain other features this PHY + * @flags: A bitfield defining certain other features this PHY * supports (like interrupts) + * @driver_data: Static driver data * * All functions are optional. If config_aneg or read_status * are not implemented, the phy core uses the genphy versions. @@ -592,151 +708,178 @@ struct phy_driver { u32 flags; const void *driver_data; - /* - * Called to issue a PHY software reset + /** + * @soft_reset: Called to issue a PHY software reset */ int (*soft_reset)(struct phy_device *phydev); - /* - * Called to initialize the PHY, + /** + * @config_init: Called to initialize the PHY, * including after a reset */ int (*config_init)(struct phy_device *phydev); - /* - * Called during discovery. Used to set + /** + * @probe: Called during discovery. Used to set * up device-specific structures, if any */ int (*probe)(struct phy_device *phydev); - /* - * Probe the hardware to determine what abilities it has. - * Should only set phydev->supported. + /** + * @get_features: Probe the hardware to determine what + * abilities it has. Should only set phydev->supported. 
*/ int (*get_features)(struct phy_device *phydev); /* PHY Power Management */ + /** @suspend: Suspend the hardware, saving state if needed */ int (*suspend)(struct phy_device *phydev); + /** @resume: Resume the hardware, restoring state if needed */ int (*resume)(struct phy_device *phydev); - /* - * Configures the advertisement and resets + /** + * @config_aneg: Configures the advertisement and resets * autonegotiation if phydev->autoneg is on, * forces the speed to the current settings in phydev * if phydev->autoneg is off */ int (*config_aneg)(struct phy_device *phydev); - /* Determines the auto negotiation result */ + /** @aneg_done: Determines the auto negotiation result */ int (*aneg_done)(struct phy_device *phydev); - /* Determines the negotiated speed and duplex */ + /** @read_status: Determines the negotiated speed and duplex */ int (*read_status)(struct phy_device *phydev); - /* Clears any pending interrupts */ + /** @ack_interrupt: Clears any pending interrupts */ int (*ack_interrupt)(struct phy_device *phydev); - /* Enables or disables interrupts */ + /** @config_intr: Enables or disables interrupts */ int (*config_intr)(struct phy_device *phydev); - /* - * Checks if the PHY generated an interrupt. + /** + * @did_interrupt: Checks if the PHY generated an interrupt. * For multi-PHY devices with shared PHY interrupt pin * Set interrupt bits have to be cleared. */ int (*did_interrupt)(struct phy_device *phydev); - /* Override default interrupt handling */ + /** @handle_interrupt: Override default interrupt handling */ irqreturn_t (*handle_interrupt)(struct phy_device *phydev); - /* Clears up any memory if needed */ + /** @remove: Clears up any memory if needed */ void (*remove)(struct phy_device *phydev); - /* Returns true if this is a suitable driver for the given - * phydev. If NULL, matching is based on phy_id and - * phy_id_mask. + /** + * @match_phy_device: Returns true if this is a suitable + * driver for the given phydev. 
If NULL, matching is based on + * phy_id and phy_id_mask. */ int (*match_phy_device)(struct phy_device *phydev); - /* Some devices (e.g. qnap TS-119P II) require PHY register changes to - * enable Wake on LAN, so set_wol is provided to be called in the - * ethernet driver's set_wol function. */ + /** + * @set_wol: Some devices (e.g. qnap TS-119P II) require PHY + * register changes to enable Wake on LAN, so set_wol is + * provided to be called in the ethernet driver's set_wol + * function. + */ int (*set_wol)(struct phy_device *dev, struct ethtool_wolinfo *wol); - /* See set_wol, but for checking whether Wake on LAN is enabled. */ + /** + * @get_wol: See set_wol, but for checking whether Wake on LAN + * is enabled. + */ void (*get_wol)(struct phy_device *dev, struct ethtool_wolinfo *wol); - /* - * Called to inform a PHY device driver when the core is about to - * change the link state. This callback is supposed to be used as - * fixup hook for drivers that need to take action when the link - * state changes. Drivers are by no means allowed to mess with the + /** + * @link_change_notify: Called to inform a PHY device driver + * when the core is about to change the link state. This + * callback is supposed to be used as fixup hook for drivers + * that need to take action when the link state + * changes. Drivers are by no means allowed to mess with the * PHY device structure in their implementations. */ void (*link_change_notify)(struct phy_device *dev); - /* - * Phy specific driver override for reading a MMD register. - * This function is optional for PHY specific drivers. When - * not provided, the default MMD read function will be used - * by phy_read_mmd(), which will use either a direct read for - * Clause 45 PHYs or an indirect read for Clause 22 PHYs. - * devnum is the MMD device number within the PHY device, - * regnum is the register within the selected MMD device. + /** + * @read_mmd: PHY specific driver override for reading a MMD + * register. 
This function is optional for PHY specific + * drivers. When not provided, the default MMD read function + * will be used by phy_read_mmd(), which will use either a + * direct read for Clause 45 PHYs or an indirect read for + * Clause 22 PHYs. devnum is the MMD device number within the + * PHY device, regnum is the register within the selected MMD + * device. */ int (*read_mmd)(struct phy_device *dev, int devnum, u16 regnum); - /* - * Phy specific driver override for writing a MMD register. - * This function is optional for PHY specific drivers. When - * not provided, the default MMD write function will be used - * by phy_write_mmd(), which will use either a direct write for - * Clause 45 PHYs, or an indirect write for Clause 22 PHYs. - * devnum is the MMD device number within the PHY device, - * regnum is the register within the selected MMD device. - * val is the value to be written. + /** + * @write_mmd: PHY specific driver override for writing a MMD + * register. This function is optional for PHY specific + * drivers. When not provided, the default MMD write function + * will be used by phy_write_mmd(), which will use either a + * direct write for Clause 45 PHYs, or an indirect write for + * Clause 22 PHYs. devnum is the MMD device number within the + * PHY device, regnum is the register within the selected MMD + * device. val is the value to be written. 
*/ int (*write_mmd)(struct phy_device *dev, int devnum, u16 regnum, u16 val); + /** @read_page: Return the current PHY register page number */ int (*read_page)(struct phy_device *dev); + /** @write_page: Set the current PHY register page number */ int (*write_page)(struct phy_device *dev, int page); - /* Get the size and type of the eeprom contained within a plug-in - * module */ + /** + * @module_info: Get the size and type of the eeprom contained + * within a plug-in module + */ int (*module_info)(struct phy_device *dev, struct ethtool_modinfo *modinfo); - /* Get the eeprom information from the plug-in module */ + /** + * @module_eeprom: Get the eeprom information from the plug-in + * module + */ int (*module_eeprom)(struct phy_device *dev, struct ethtool_eeprom *ee, u8 *data); - /* Start a cable test */ + /** @cable_test_start: Start a cable test */ int (*cable_test_start)(struct phy_device *dev); - /* Start a raw TDR cable test */ + /** @cable_test_tdr_start: Start a raw TDR cable test */ int (*cable_test_tdr_start)(struct phy_device *dev, const struct phy_tdr_config *config); - /* Once per second, or on interrupt, request the status of the - * test. + /** + * @cable_test_get_status: Once per second, or on interrupt, + * request the status of the test. 
*/ int (*cable_test_get_status)(struct phy_device *dev, bool *finished); - /* Get statistics from the phy using ethtool */ + /* Get statistics from the PHY using ethtool */ + /** @get_sset_count: Number of statistic counters */ int (*get_sset_count)(struct phy_device *dev); + /** @get_strings: Names of the statistic counters */ void (*get_strings)(struct phy_device *dev, u8 *data); + /** @get_stats: Return the statistic counter values */ void (*get_stats)(struct phy_device *dev, struct ethtool_stats *stats, u64 *data); /* Get and Set PHY tunables */ + /** @get_tunable: Return the value of a tunable */ int (*get_tunable)(struct phy_device *dev, struct ethtool_tunable *tuna, void *data); + /** @set_tunable: Set the value of a tunable */ int (*set_tunable)(struct phy_device *dev, struct ethtool_tunable *tuna, const void *data); + /** @set_loopback: Set the loopback mode of the PHY */ int (*set_loopback)(struct phy_device *dev, bool enable); + /** @get_sqi: Get the signal quality indication */ int (*get_sqi)(struct phy_device *dev); + /** @get_sqi_max: Get the maximum signal quality indication */ int (*get_sqi_max)(struct phy_device *dev); }; #define to_phy_driver(d) container_of(to_mdio_common_driver(d), \ @@ -890,6 +1033,24 @@ static inline int __phy_modify_changed(struct phy_device *phydev, u32 regnum, */ int phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum); +/** + * phy_read_mmd_poll_timeout - Periodically poll a PHY register until a + * condition is met or a timeout occurs + * + * @phydev: The phy_device struct + * @devaddr: The MMD to read from + * @regnum: The register on the MMD to read + * @val: Variable to read the register into + * @cond: Break condition (usually involving @val) + * @sleep_us: Maximum time to sleep between reads in us (0 + * tight-loops). Should be less than ~20ms since usleep_range + * is used (see Documentation/timers/timers-howto.rst). 
+ * @timeout_us: Timeout in us, 0 means never timeout + * @sleep_before_read: if it is true, sleep @sleep_us before read. + * Returns 0 on success and -ETIMEDOUT upon a timeout. In either + * case, the last read value at @regnum is stored in @val. Must not + * be called from atomic context if sleep_us or timeout_us are used. + */ #define phy_read_mmd_poll_timeout(phydev, devaddr, regnum, val, cond, \ sleep_us, timeout_us, sleep_before_read) \ ({ \ @@ -1161,7 +1322,7 @@ static inline bool phy_is_internal(struct phy_device *phydev) /** * phy_interface_mode_is_rgmii - Convenience function for testing if a * PHY interface mode is RGMII (all variants) - * @mode: the phy_interface_t enum + * @mode: the &phy_interface_t enum */ static inline bool phy_interface_mode_is_rgmii(phy_interface_t mode) { @@ -1170,11 +1331,11 @@ static inline bool phy_interface_mode_is_rgmii(phy_interface_t mode) }; /** - * phy_interface_mode_is_8023z() - does the phy interface mode use 802.3z + * phy_interface_mode_is_8023z() - does the PHY interface mode use 802.3z * negotiation * @mode: one of &enum phy_interface_t * - * Returns true if the phy interface mode uses the 16-bit negotiation + * Returns true if the PHY interface mode uses the 16-bit negotiation * word as defined in 802.3z. (See 802.3-2015 37.2.1 Config_Reg encoding) */ static inline bool phy_interface_mode_is_8023z(phy_interface_t mode) @@ -1193,7 +1354,7 @@ static inline bool phy_interface_is_rgmii(struct phy_device *phydev) return phy_interface_mode_is_rgmii(phydev->interface); }; -/* +/** * phy_is_pseudo_fixed_link - Convenience function for testing if this * PHY is the CPU port facing side of an Ethernet switch, or similar. 
* @phydev: the phy_device struct @@ -1566,8 +1727,9 @@ static inline int mdiobus_register_board_info(const struct mdio_board_info *i, /** - * module_phy_driver() - Helper macro for registering PHY drivers + * phy_module_driver() - Helper macro for registering PHY drivers * @__phy_drivers: array of PHY drivers to register + * @__count: Numbers of members in array * * Helper macro for PHY drivers which do not do anything special in module * init/exit. Each module may only use this macro once, and calling it diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index c738abeb3265..dc763ca9413c 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -96,7 +96,8 @@ struct inet_connection_sock { void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq); struct hlist_node icsk_listen_portaddr_node; unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu); - __u8 icsk_ca_state:6, + __u8 icsk_ca_state:5, + icsk_ca_initialized:1, icsk_ca_setsockopt:1, icsk_ca_dst_locked:1; __u8 icsk_retransmits; diff --git a/include/net/tcp.h b/include/net/tcp.h index 852f0d71dd40..3601dea931a6 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1104,7 +1104,7 @@ void tcp_get_available_congestion_control(char *buf, size_t len); void tcp_get_allowed_congestion_control(char *buf, size_t len); int tcp_set_allowed_congestion_control(char *allowed); int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, - bool reinit, bool cap_net_admin); + bool cap_net_admin); u32 tcp_slow_start(struct tcp_sock *tp, u32 acked); void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked); diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 0ac4e7fba086..3105bbb6cdcf 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -710,8 +710,8 @@ int ocelot_vlan_add(struct ocelot *ocelot, int port, u16 vid, bool pvid, int ocelot_vlan_del(struct ocelot *ocelot, int port, u16 vid); int 
ocelot_hwstamp_get(struct ocelot *ocelot, int port, struct ifreq *ifr); int ocelot_hwstamp_set(struct ocelot *ocelot, int port, struct ifreq *ifr); -int ocelot_port_add_txtstamp_skb(struct ocelot_port *ocelot_port, - struct sk_buff *skb); +void ocelot_port_add_txtstamp_skb(struct ocelot *ocelot, int port, + struct sk_buff *clone); void ocelot_get_txtstamp(struct ocelot *ocelot); void ocelot_port_set_maxlen(struct ocelot *ocelot, int port, size_t sdu); int ocelot_get_max_mtu(struct ocelot *ocelot, int port); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8dda13880957..a22812561064 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -124,6 +124,7 @@ enum bpf_cmd { BPF_ENABLE_STATS, BPF_ITER_CREATE, BPF_LINK_DETACH, + BPF_PROG_BIND_MAP, }; enum bpf_map_type { @@ -658,6 +659,12 @@ union bpf_attr { __u32 flags; } iter_create; + struct { /* struct used by BPF_PROG_BIND_MAP command */ + __u32 prog_fd; + __u32 map_fd; + __u32 flags; /* extra flags */ + } prog_bind_map; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF @@ -1447,8 +1454,8 @@ union bpf_attr { * Return * The return value depends on the result of the test, and can be: * - * * 0, if the *skb* task belongs to the cgroup2. - * * 1, if the *skb* task does not belong to the cgroup2. + * * 0, if current task belongs to the cgroup2. + * * 1, if current task does not belong to the cgroup2. * * A negative error code, if an error occurred. * * long bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) @@ -3349,38 +3356,38 @@ union bpf_attr { * Description * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * struct tcp_sock *bpf_skc_to_tcp_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *tcp_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. 
+ * *sk* if casting is valid, or **NULL** otherwise. * * struct tcp_timewait_sock *bpf_skc_to_tcp_timewait_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *tcp_timewait_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * struct tcp_request_sock *bpf_skc_to_tcp_request_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * struct udp6_sock *bpf_skc_to_udp6_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags) * Description * Return a user or a kernel stack in bpf program provided buffer. * To achieve this, the helper needs *task*, which is a valid - * pointer to struct task_struct. To store the stacktrace, the - * bpf program provides *buf* with a nonnegative *size*. + * pointer to **struct task_struct**. To store the stacktrace, the + * bpf program provides *buf* with a nonnegative *size*. * * The last argument, *flags*, holds the number of stack frames to * skip (from 0 to 255), masked with @@ -3410,12 +3417,12 @@ union bpf_attr { * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags) * Description * Load header option. Support reading a particular TCP header - * option for bpf program (BPF_PROG_TYPE_SOCK_OPS). + * option for bpf program (**BPF_PROG_TYPE_SOCK_OPS**). * * If *flags* is 0, it will search the option from the - * sock_ops->skb_data. The comment in "struct bpf_sock_ops" + * *skops*\ **->skb_data**. The comment in **struct bpf_sock_ops** * has details on what skb_data contains under different - * sock_ops->op. + * *skops*\ **->op**. 
* * The first byte of the *searchby_res* specifies the * kind that it wants to search. @@ -3435,7 +3442,7 @@ union bpf_attr { * [ 254, 4, 0xeB, 0x9F, 0, 0, .... 0 ]. * * To search for the standard window scale option (3), - * the searchby_res should be [ 3, 0, 0, .... 0 ]. + * the *searchby_res* should be [ 3, 0, 0, .... 0 ]. * Note, kind-length must be 0 for regular option. * * Searching for No-Op (0) and End-of-Option-List (1) are @@ -3445,27 +3452,30 @@ union bpf_attr { * of a header option. * * Supported flags: + * * * **BPF_LOAD_HDR_OPT_TCP_SYN** to search from the * saved_syn packet or the just-received syn packet. * * Return - * >0 when found, the header option is copied to *searchby_res*. - * The return value is the total length copied. + * > 0 when found, the header option is copied to *searchby_res*. + * The return value is the total length copied. On failure, a + * negative error code is returned: * - * **-EINVAL** If param is invalid + * **-EINVAL** if a parameter is invalid. * - * **-ENOMSG** The option is not found + * **-ENOMSG** if the option is not found. * - * **-ENOENT** No syn packet available when - * **BPF_LOAD_HDR_OPT_TCP_SYN** is used + * **-ENOENT** if no syn packet is available when + * **BPF_LOAD_HDR_OPT_TCP_SYN** is used. * - * **-ENOSPC** Not enough space. Only *len* number of - * bytes are copied. + * **-ENOSPC** if there is not enough space. Only *len* number of + * bytes are copied. * - * **-EFAULT** Cannot parse the header options in the packet + * **-EFAULT** on failure to parse the header options in the + * packet. * - * **-EPERM** This helper cannot be used under the - * current sock_ops->op. + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. * * long bpf_store_hdr_opt(struct bpf_sock_ops *skops, const void *from, u32 len, u64 flags) * Description @@ -3483,44 +3493,44 @@ union bpf_attr { * by searching the same option in the outgoing skb. 
* * This helper can only be called during - * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**. * * Return * 0 on success, or negative error in case of failure: * - * **-EINVAL** If param is invalid + * **-EINVAL** if a parameter is invalid. * - * **-ENOSPC** Not enough space in the header. - * Nothing has been written + * **-ENOSPC** if there is not enough space in the header. + * Nothing has been written. * - * **-EEXIST** The option has already existed + * **-EEXIST** if the option already exists. * - * **-EFAULT** Cannot parse the existing header options + * **-EFAULT** on failure to parse the existing header options. * - * **-EPERM** This helper cannot be used under the - * current sock_ops->op. + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. * * long bpf_reserve_hdr_opt(struct bpf_sock_ops *skops, u32 len, u64 flags) * Description * Reserve *len* bytes for the bpf header option. The - * space will be used by bpf_store_hdr_opt() later in - * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * space will be used by **bpf_store_hdr_opt**\ () later in + * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**. * - * If bpf_reserve_hdr_opt() is called multiple times, + * If **bpf_reserve_hdr_opt**\ () is called multiple times, * the total number of bytes will be reserved. * * This helper can only be called during - * BPF_SOCK_OPS_HDR_OPT_LEN_CB. + * **BPF_SOCK_OPS_HDR_OPT_LEN_CB**. * * Return * 0 on success, or negative error in case of failure: * - * **-EINVAL** if param is invalid + * **-EINVAL** if a parameter is invalid. * - * **-ENOSPC** Not enough space in the header. + * **-ENOSPC** if there is not enough space in the header. * - * **-EPERM** This helper cannot be used under the - * current sock_ops->op. + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. 
* * void *bpf_inode_storage_get(struct bpf_map *map, void *inode, void *value, u64 flags) * Description @@ -3560,9 +3570,9 @@ union bpf_attr { * * long bpf_d_path(struct path *path, char *buf, u32 sz) * Description - * Return full path for given 'struct path' object, which - * needs to be the kernel BTF 'path' object. The path is - * returned in the provided buffer 'buf' of size 'sz' and + * Return full path for given **struct path** object, which + * needs to be the kernel BTF *path* object. The path is + * returned in the provided buffer *buf* of size *sz* and * is zero terminated. * * Return @@ -3573,7 +3583,7 @@ union bpf_attr { * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr) * Description * Read *size* bytes from user space address *user_ptr* and store - * the data in *dst*. This is a wrapper of copy_from_user(). + * the data in *dst*. This is a wrapper of **copy_from_user**\ (). * Return * 0 on success, or a negative error in case of failure. */ diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 75a2ac479247..4c687686aa8f 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -457,6 +457,8 @@ enum { MDBA_MDB_EATTR_TIMER, MDBA_MDB_EATTR_SRC_LIST, MDBA_MDB_EATTR_GROUP_MODE, + MDBA_MDB_EATTR_SOURCE, + MDBA_MDB_EATTR_RTPROT, __MDBA_MDB_EATTR_MAX }; #define MDBA_MDB_EATTR_MAX (__MDBA_MDB_EATTR_MAX - 1) @@ -516,6 +518,8 @@ struct br_mdb_entry { __u8 state; #define MDB_FLAGS_OFFLOAD (1 << 0) #define MDB_FLAGS_FAST_LEAVE (1 << 1) +#define MDB_FLAGS_STAR_EXCL (1 << 2) +#define MDB_FLAGS_BLOCKED (1 << 3) __u8 flags; __u16 vid; struct { @@ -530,10 +534,23 @@ struct br_mdb_entry { enum { MDBA_SET_ENTRY_UNSPEC, MDBA_SET_ENTRY, + MDBA_SET_ENTRY_ATTRS, __MDBA_SET_ENTRY_MAX, }; #define MDBA_SET_ENTRY_MAX (__MDBA_SET_ENTRY_MAX - 1) +/* [MDBA_SET_ENTRY_ATTRS] = { + * [MDBE_ATTR_xxx] + * ... 
+ * } + */ +enum { + MDBE_ATTR_UNSPEC, + MDBE_ATTR_SOURCE, + __MDBE_ATTR_MAX, +}; +#define MDBE_ATTR_MAX (__MDBE_ATTR_MAX - 1) + /* Embedded inside LINK_XSTATS_TYPE_BRIDGE */ enum { BRIDGE_XSTATS_UNSPEC, diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index e046fb7d17cd..e5fd31268ae0 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -898,6 +898,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, struct bpf_prog *old, struct bpf_prog *new) { + u8 *old_addr, *new_addr, *old_bypass_addr; struct prog_poke_elem *elem; struct bpf_array_aux *aux; @@ -918,12 +919,13 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, * there could be danger of use after free otherwise. * 2) Initially when we start tracking aux, the program * is not JITed yet and also does not have a kallsyms - * entry. We skip these as poke->ip_stable is not - * active yet. The JIT will do the final fixup before - * setting it stable. The various poke->ip_stable are - * successively activated, so tail call updates can - * arrive from here while JIT is still finishing its - * final fixup for non-activated poke entries. + * entry. We skip these as poke->tailcall_target_stable + * is not active yet. The JIT will do the final fixup + * before setting it stable. The various + * poke->tailcall_target_stable are successively + * activated, so tail call updates can arrive from here + * while JIT is still finishing its final fixup for + * non-activated poke entries. * 3) On program teardown, the program's kallsym entry gets * removed out of RCU callback, but we can only untrack * from sleepable context, therefore bpf_arch_text_poke() @@ -940,7 +942,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, * 5) Any other error happening below from bpf_arch_text_poke() * is a unexpected bug. 
*/ - if (!READ_ONCE(poke->ip_stable)) + if (!READ_ONCE(poke->tailcall_target_stable)) continue; if (poke->reason != BPF_POKE_REASON_TAIL_CALL) continue; @@ -948,12 +950,39 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, poke->tail_call.key != key) continue; - ret = bpf_arch_text_poke(poke->ip, BPF_MOD_JUMP, - old ? (u8 *)old->bpf_func + - poke->adj_off : NULL, - new ? (u8 *)new->bpf_func + - poke->adj_off : NULL); - BUG_ON(ret < 0 && ret != -EINVAL); + old_bypass_addr = old ? NULL : poke->bypass_addr; + old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL; + new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL; + + if (new) { + ret = bpf_arch_text_poke(poke->tailcall_target, + BPF_MOD_JUMP, + old_addr, new_addr); + BUG_ON(ret < 0 && ret != -EINVAL); + if (!old) { + ret = bpf_arch_text_poke(poke->tailcall_bypass, + BPF_MOD_JUMP, + poke->bypass_addr, + NULL); + BUG_ON(ret < 0 && ret != -EINVAL); + } + } else { + ret = bpf_arch_text_poke(poke->tailcall_bypass, + BPF_MOD_JUMP, + old_bypass_addr, + poke->bypass_addr); + BUG_ON(ret < 0 && ret != -EINVAL); + /* let other CPUs finish the execution of program + * so that it will not be possible to expose them + * to invalid nop, stack unwind, nop state + */ + if (!ret) + synchronize_rcu(); + ret = bpf_arch_text_poke(poke->tailcall_target, + BPF_MOD_JUMP, + old_addr, NULL); + BUG_ON(ret < 0 && ret != -EINVAL); + } } } } diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 75be02799c0f..6edff97ad594 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -249,9 +249,7 @@ const struct bpf_map_ops inode_storage_map_ops = { .map_owner_storage_ptr = inode_storage_ptr, }; -BTF_ID_LIST(bpf_inode_storage_btf_ids) -BTF_ID_UNUSED -BTF_ID(struct, inode) +BTF_ID_LIST_SINGLE(bpf_inode_storage_btf_ids, struct, inode) const struct bpf_func_proto bpf_inode_storage_get_proto = { .func = bpf_inode_storage_get, @@ -259,9 +257,9 @@ const struct 
bpf_func_proto bpf_inode_storage_get_proto = { .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &bpf_inode_storage_btf_ids[0], .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, - .btf_id = bpf_inode_storage_btf_ids, }; const struct bpf_func_proto bpf_inode_storage_delete_proto = { @@ -270,5 +268,5 @@ const struct bpf_func_proto bpf_inode_storage_delete_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID, - .btf_id = bpf_inode_storage_btf_ids, + .arg2_btf_id = &bpf_inode_storage_btf_ids[0], }; diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index ffa7d11fc2bd..5d3a7af9ba9b 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -159,7 +159,7 @@ void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem) { RCU_INIT_POINTER(selem->local_storage, local_storage); - hlist_add_head(&selem->snode, &local_storage->list); + hlist_add_head_rcu(&selem->snode, &local_storage->list); } void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index f9ac6935ab3c..5d3c36e13139 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4193,19 +4193,6 @@ again: return true; } -int btf_resolve_helper_id(struct bpf_verifier_log *log, - const struct bpf_func_proto *fn, int arg) -{ - int id; - - if (fn->arg_type[arg] != ARG_PTR_TO_BTF_ID || !btf_vmlinux) - return -EINVAL; - id = fn->btf_id[arg]; - if (!id || id > btf_vmlinux->nr_types) - return -EINVAL; - return id; -} - static int __get_type_size(struct btf *btf, u32 btf_id, const struct btf_type **bad_type) { @@ -4772,7 +4759,7 @@ static int btf_id_cmp_func(const void *a, const void *b) return *pa - *pb; } -bool btf_id_set_contains(struct btf_id_set *set, u32 id) +bool btf_id_set_contains(const struct btf_id_set *set, u32 id) { return 
bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL; } diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ed0b3578867c..c4811b139caa 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -98,6 +98,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag fp->jit_requested = ebpf_jit_enabled(); INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode); + mutex_init(&fp->aux->used_maps_mutex); return fp; } @@ -253,6 +254,7 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, void __bpf_prog_free(struct bpf_prog *fp) { if (fp->aux) { + mutex_destroy(&fp->aux->used_maps_mutex); free_percpu(fp->aux->stats); kfree(fp->aux->poke_tab); kfree(fp->aux); @@ -773,7 +775,8 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, if (size > poke_tab_max) return -ENOSPC; - if (poke->ip || poke->ip_stable || poke->adj_off) + if (poke->tailcall_target || poke->tailcall_target_stable || + poke->tailcall_bypass || poke->adj_off || poke->bypass_addr) return -EINVAL; switch (poke->reason) { @@ -1747,8 +1750,9 @@ bool bpf_prog_array_compatible(struct bpf_array *array, static int bpf_check_tail_call(const struct bpf_prog *fp) { struct bpf_prog_aux *aux = fp->aux; - int i; + int i, ret = 0; + mutex_lock(&aux->used_maps_mutex); for (i = 0; i < aux->used_map_cnt; i++) { struct bpf_map *map = aux->used_maps[i]; struct bpf_array *array; @@ -1757,11 +1761,15 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) continue; array = container_of(map, struct bpf_array, map); - if (!bpf_prog_array_compatible(array, fp)) - return -EINVAL; + if (!bpf_prog_array_compatible(array, fp)) { + ret = -EINVAL; + goto out; + } } - return 0; +out: + mutex_unlock(&aux->used_maps_mutex); + return ret; } static void bpf_prog_select_func(struct bpf_prog *fp) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index a2fa006f430e..06065fa27124 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -665,18 +665,17 @@ 
BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, return __bpf_get_stack(regs, task, NULL, buf, size, flags); } -BTF_ID_LIST(bpf_get_task_stack_btf_ids) -BTF_ID(struct, task_struct) +BTF_ID_LIST_SINGLE(bpf_get_task_stack_btf_ids, struct, task_struct) const struct bpf_func_proto bpf_get_task_stack_proto = { .func = bpf_get_task_stack, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_get_task_stack_btf_ids[0], .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, - .btf_id = bpf_get_task_stack_btf_ids, }; BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 178c147350f5..34268491d2de 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2345,12 +2345,8 @@ void bpf_link_put(struct bpf_link *link) if (!atomic64_dec_and_test(&link->refcnt)) return; - if (in_atomic()) { - INIT_WORK(&link->work, bpf_link_put_deferred); - schedule_work(&link->work); - } else { - bpf_link_free(link); - } + INIT_WORK(&link->work, bpf_link_put_deferred); + schedule_work(&link->work); } static int bpf_link_release(struct inode *inode, struct file *filp) @@ -3162,21 +3158,25 @@ static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, const struct bpf_map *map; int i; + mutex_lock(&prog->aux->used_maps_mutex); for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) { map = prog->aux->used_maps[i]; if (map == (void *)addr) { *type = BPF_PSEUDO_MAP_FD; - return map; + goto out; } if (!map->ops->map_direct_value_meta) continue; if (!map->ops->map_direct_value_meta(map, addr, off)) { *type = BPF_PSEUDO_MAP_VALUE; - return map; + goto out; } } + map = NULL; - return NULL; +out: + mutex_unlock(&prog->aux->used_maps_mutex); + return map; } static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, @@ -3294,6 +3294,7 @@ static int bpf_prog_get_info_by_fd(struct file 
*file, memcpy(info.tag, prog->tag, sizeof(prog->tag)); memcpy(info.name, prog->aux->name, sizeof(prog->aux->name)); + mutex_lock(&prog->aux->used_maps_mutex); ulen = info.nr_map_ids; info.nr_map_ids = prog->aux->used_map_cnt; ulen = min_t(u32, info.nr_map_ids, ulen); @@ -3303,9 +3304,12 @@ static int bpf_prog_get_info_by_fd(struct file *file, for (i = 0; i < ulen; i++) if (put_user(prog->aux->used_maps[i]->id, - &user_map_ids[i])) + &user_map_ids[i])) { + mutex_unlock(&prog->aux->used_maps_mutex); return -EFAULT; + } } + mutex_unlock(&prog->aux->used_maps_mutex); err = set_info_rec_size(&info); if (err) @@ -4153,6 +4157,66 @@ static int bpf_iter_create(union bpf_attr *attr) return err; } +#define BPF_PROG_BIND_MAP_LAST_FIELD prog_bind_map.flags + +static int bpf_prog_bind_map(union bpf_attr *attr) +{ + struct bpf_prog *prog; + struct bpf_map *map; + struct bpf_map **used_maps_old, **used_maps_new; + int i, ret = 0; + + if (CHECK_ATTR(BPF_PROG_BIND_MAP)) + return -EINVAL; + + if (attr->prog_bind_map.flags) + return -EINVAL; + + prog = bpf_prog_get(attr->prog_bind_map.prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + map = bpf_map_get(attr->prog_bind_map.map_fd); + if (IS_ERR(map)) { + ret = PTR_ERR(map); + goto out_prog_put; + } + + mutex_lock(&prog->aux->used_maps_mutex); + + used_maps_old = prog->aux->used_maps; + + for (i = 0; i < prog->aux->used_map_cnt; i++) + if (used_maps_old[i] == map) + goto out_unlock; + + used_maps_new = kmalloc_array(prog->aux->used_map_cnt + 1, + sizeof(used_maps_new[0]), + GFP_KERNEL); + if (!used_maps_new) { + ret = -ENOMEM; + goto out_unlock; + } + + memcpy(used_maps_new, used_maps_old, + sizeof(used_maps_old[0]) * prog->aux->used_map_cnt); + used_maps_new[prog->aux->used_map_cnt] = map; + + prog->aux->used_map_cnt++; + prog->aux->used_maps = used_maps_new; + + kfree(used_maps_old); + +out_unlock: + mutex_unlock(&prog->aux->used_maps_mutex); + + if (ret) + bpf_map_put(map); +out_prog_put: + bpf_prog_put(prog); + return ret; 
+} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr; @@ -4286,6 +4350,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_LINK_DETACH: err = link_detach(&attr); break; + case BPF_PROG_BIND_MAP: + err = bpf_prog_bind_map(&attr); + break; default: err = -EINVAL; break; diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 99af4cea1102..5b6af30bfbcd 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -22,7 +22,8 @@ struct bpf_iter_seq_task_info { }; static struct task_struct *task_seq_get_next(struct pid_namespace *ns, - u32 *tid) + u32 *tid, + bool skip_if_dup_files) { struct task_struct *task = NULL; struct pid *pid; @@ -36,6 +37,12 @@ retry: if (!task) { ++*tid; goto retry; + } else if (skip_if_dup_files && task->tgid != task->pid && + task->files == task->group_leader->files) { + put_task_struct(task); + task = NULL; + ++*tid; + goto retry; } } rcu_read_unlock(); @@ -48,7 +55,7 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos) struct bpf_iter_seq_task_info *info = seq->private; struct task_struct *task; - task = task_seq_get_next(info->common.ns, &info->tid); + task = task_seq_get_next(info->common.ns, &info->tid, false); if (!task) return NULL; @@ -65,7 +72,7 @@ static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos) ++*pos; ++info->tid; put_task_struct((struct task_struct *)v); - task = task_seq_get_next(info->common.ns, &info->tid); + task = task_seq_get_next(info->common.ns, &info->tid, false); if (!task) return NULL; @@ -148,7 +155,7 @@ again: curr_files = *fstruct; curr_fd = info->fd; } else { - curr_task = task_seq_get_next(ns, &curr_tid); + curr_task = task_seq_get_next(ns, &curr_tid, true); if (!curr_task) return NULL; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 86fdebb5ffd8..42dee5dcbc74 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -238,7 +238,6 @@ struct 
bpf_call_arg_meta { u64 msize_max_value; int ref_obj_id; int func_id; - u32 btf_id; }; struct btf *btf_vmlinux; @@ -436,6 +435,15 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type) return type == ARG_PTR_TO_SOCK_COMMON; } +static bool arg_type_may_be_null(enum bpf_arg_type type) +{ + return type == ARG_PTR_TO_MAP_VALUE_OR_NULL || + type == ARG_PTR_TO_MEM_OR_NULL || + type == ARG_PTR_TO_CTX_OR_NULL || + type == ARG_PTR_TO_SOCKET_OR_NULL || + type == ARG_PTR_TO_ALLOC_MEM_OR_NULL; +} + /* Determine whether the function releases some resources allocated by another * function call. The first reference type argument will be assumed to be * released by release_reference(). @@ -1490,6 +1498,13 @@ static int check_subprogs(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++) { u8 code = insn[i].code; + if (code == (BPF_JMP | BPF_CALL) && + insn[i].imm == BPF_FUNC_tail_call && + insn[i].src_reg != BPF_PSEUDO_CALL) + subprog[cur_subprog].has_tail_call = true; + if (BPF_CLASS(code) == BPF_LD && + (BPF_MODE(code) == BPF_ABS || BPF_MODE(code) == BPF_IND)) + subprog[cur_subprog].has_ld_abs = true; if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) goto next; if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) @@ -2979,10 +2994,37 @@ static int check_max_stack_depth(struct bpf_verifier_env *env) int depth = 0, frame = 0, idx = 0, i = 0, subprog_end; struct bpf_subprog_info *subprog = env->subprog_info; struct bpf_insn *insn = env->prog->insnsi; + bool tail_call_reachable = false; int ret_insn[MAX_CALL_FRAMES]; int ret_prog[MAX_CALL_FRAMES]; + int j; process_func: + /* protect against potential stack overflow that might happen when + * bpf2bpf calls get combined with tailcalls. Limit the caller's stack + * depth for such case down to 256 so that the worst case scenario + * would result in 8k stack size (32 which is tailcall limit * 256 = + * 8k). 
+ * + * To get the idea what might happen, see an example: + * func1 -> sub rsp, 128 + * subfunc1 -> sub rsp, 256 + * tailcall1 -> add rsp, 256 + * func2 -> sub rsp, 192 (total stack size = 128 + 192 = 320) + * subfunc2 -> sub rsp, 64 + * subfunc22 -> sub rsp, 128 + * tailcall2 -> add rsp, 128 + * func3 -> sub rsp, 32 (total stack size 128 + 192 + 64 + 32 = 416) + * + * tailcall will unwind the current stack frame but it will not get rid + * of caller's stack as shown on the example above. + */ + if (idx && subprog[idx].has_tail_call && depth >= 256) { + verbose(env, + "tail_calls are not allowed when call stack of previous frames is %d bytes. Too large\n", + depth); + return -EACCES; + } /* round up to 32-bytes, since this is granularity * of interpreter stack size */ @@ -3011,6 +3053,10 @@ continue_func: i); return -EFAULT; } + + if (subprog[idx].has_tail_call) + tail_call_reachable = true; + frame++; if (frame >= MAX_CALL_FRAMES) { verbose(env, "the call stack of %d frames is too deep !\n", @@ -3019,6 +3065,15 @@ continue_func: } goto process_func; } + /* if tail call got detected across bpf2bpf calls then mark each of the + * currently present subprog frames as tail call reachable subprogs; + * this info will be utilized by JIT so that we will be preserving the + * tail call counter throughout bpf2bpf calls combined with tailcalls + */ + if (tail_call_reachable) + for (j = 0; j < frame; j++) + subprog[ret_prog[j]].tail_call_reachable = true; + /* end of for() loop means the last insn of the 'subprog' * was reached. 
Doesn't matter whether it was JA or EXIT */ @@ -3594,18 +3649,6 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, struct bpf_func_state *state = func(env, reg); int err, min_off, max_off, i, j, slot, spi; - if (reg->type != PTR_TO_STACK) { - /* Allow zero-byte read from NULL, regardless of pointer type */ - if (zero_size_allowed && access_size == 0 && - register_is_null(reg)) - return 0; - - verbose(env, "R%d type=%s expected=%s\n", regno, - reg_type_str[reg->type], - reg_type_str[PTR_TO_STACK]); - return -EACCES; - } - if (tnum_is_const(reg->var_off)) { min_off = max_off = reg->var_off.value + reg->off; err = __check_stack_boundary(env, regno, min_off, access_size, @@ -3750,9 +3793,19 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, access_size, zero_size_allowed, "rdwr", &env->prog->aux->max_rdwr_access); - default: /* scalar_value|ptr_to_stack or invalid ptr */ + case PTR_TO_STACK: return check_stack_boundary(env, regno, access_size, zero_size_allowed, meta); + default: /* scalar_value or invalid ptr */ + /* Allow zero-byte read from NULL, regardless of pointer type */ + if (zero_size_allowed && access_size == 0 && + register_is_null(reg)) + return 0; + + verbose(env, "R%d type=%s expected=%s\n", regno, + reg_type_str[reg->type], + reg_type_str[PTR_TO_STACK]); + return -EACCES; } } @@ -3784,10 +3837,6 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, struct bpf_map *map = reg->map_ptr; u64 val = reg->var_off.value; - if (reg->type != PTR_TO_MAP_VALUE) { - verbose(env, "R%d is not a pointer to map_value\n", regno); - return -EINVAL; - } if (!is_const) { verbose(env, "R%d doesn't have constant offset. 
bpf_spin_lock has to be at the constant offset\n", @@ -3854,12 +3903,6 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type) type == ARG_CONST_SIZE_OR_ZERO; } -static bool arg_type_is_alloc_mem_ptr(enum bpf_arg_type type) -{ - return type == ARG_PTR_TO_ALLOC_MEM || - type == ARG_PTR_TO_ALLOC_MEM_OR_NULL; -} - static bool arg_type_is_alloc_size(enum bpf_arg_type type) { return type == ARG_CONST_ALLOC_SIZE_OR_ZERO; @@ -3908,14 +3951,114 @@ static int resolve_map_arg_type(struct bpf_verifier_env *env, return 0; } +struct bpf_reg_types { + const enum bpf_reg_type types[10]; +}; + +static const struct bpf_reg_types map_key_value_types = { + .types = { + PTR_TO_STACK, + PTR_TO_PACKET, + PTR_TO_PACKET_META, + PTR_TO_MAP_VALUE, + }, +}; + +static const struct bpf_reg_types sock_types = { + .types = { + PTR_TO_SOCK_COMMON, + PTR_TO_SOCKET, + PTR_TO_TCP_SOCK, + PTR_TO_XDP_SOCK, + }, +}; + +static const struct bpf_reg_types mem_types = { + .types = { + PTR_TO_STACK, + PTR_TO_PACKET, + PTR_TO_PACKET_META, + PTR_TO_MAP_VALUE, + PTR_TO_MEM, + PTR_TO_RDONLY_BUF, + PTR_TO_RDWR_BUF, + }, +}; + +static const struct bpf_reg_types int_ptr_types = { + .types = { + PTR_TO_STACK, + PTR_TO_PACKET, + PTR_TO_PACKET_META, + PTR_TO_MAP_VALUE, + }, +}; + +static const struct bpf_reg_types fullsock_types = { .types = { PTR_TO_SOCKET } }; +static const struct bpf_reg_types scalar_types = { .types = { SCALAR_VALUE } }; +static const struct bpf_reg_types context_types = { .types = { PTR_TO_CTX } }; +static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM } }; +static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } }; +static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; +static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; + +static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { + [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, + [ARG_PTR_TO_MAP_VALUE] = 
&map_key_value_types, + [ARG_PTR_TO_UNINIT_MAP_VALUE] = &map_key_value_types, + [ARG_PTR_TO_MAP_VALUE_OR_NULL] = &map_key_value_types, + [ARG_CONST_SIZE] = &scalar_types, + [ARG_CONST_SIZE_OR_ZERO] = &scalar_types, + [ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types, + [ARG_CONST_MAP_PTR] = &const_map_ptr_types, + [ARG_PTR_TO_CTX] = &context_types, + [ARG_PTR_TO_CTX_OR_NULL] = &context_types, + [ARG_PTR_TO_SOCK_COMMON] = &sock_types, + [ARG_PTR_TO_SOCKET] = &fullsock_types, + [ARG_PTR_TO_SOCKET_OR_NULL] = &fullsock_types, + [ARG_PTR_TO_BTF_ID] = &btf_ptr_types, + [ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types, + [ARG_PTR_TO_MEM] = &mem_types, + [ARG_PTR_TO_MEM_OR_NULL] = &mem_types, + [ARG_PTR_TO_UNINIT_MEM] = &mem_types, + [ARG_PTR_TO_ALLOC_MEM] = &alloc_mem_types, + [ARG_PTR_TO_ALLOC_MEM_OR_NULL] = &alloc_mem_types, + [ARG_PTR_TO_INT] = &int_ptr_types, + [ARG_PTR_TO_LONG] = &int_ptr_types, +}; + +static int check_reg_type(struct bpf_verifier_env *env, u32 regno, + const struct bpf_reg_types *compatible) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; + enum bpf_reg_type expected, type = reg->type; + int i, j; + + for (i = 0; i < ARRAY_SIZE(compatible->types); i++) { + expected = compatible->types[i]; + if (expected == NOT_INIT) + break; + + if (type == expected) + return 0; + } + + verbose(env, "R%d type=%s expected=", regno, reg_type_str[type]); + for (j = 0; j + 1 < i; j++) + verbose(env, "%s, ", reg_type_str[compatible->types[j]]); + verbose(env, "%s\n", reg_type_str[compatible->types[j]]); + return -EACCES; +} + static int check_func_arg(struct bpf_verifier_env *env, u32 arg, struct bpf_call_arg_meta *meta, const struct bpf_func_proto *fn) { u32 regno = BPF_REG_1 + arg; struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; - enum bpf_reg_type expected_type, type = reg->type; enum bpf_arg_type arg_type = fn->arg_type[arg]; + const struct bpf_reg_types *compatible; + enum bpf_reg_type type = reg->type; int err = 0; if (arg_type == 
ARG_DONTCARE) @@ -3948,125 +4091,48 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, return err; } - if (arg_type == ARG_PTR_TO_MAP_KEY || - arg_type == ARG_PTR_TO_MAP_VALUE || - arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE || - arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) { - expected_type = PTR_TO_STACK; - if (register_is_null(reg) && - arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) - /* final test in check_stack_boundary() */; - else if (!type_is_pkt_pointer(type) && - type != PTR_TO_MAP_VALUE && - type != expected_type) - goto err_type; - } else if (arg_type == ARG_CONST_SIZE || - arg_type == ARG_CONST_SIZE_OR_ZERO || - arg_type == ARG_CONST_ALLOC_SIZE_OR_ZERO) { - expected_type = SCALAR_VALUE; - if (type != expected_type) - goto err_type; - } else if (arg_type == ARG_CONST_MAP_PTR) { - expected_type = CONST_PTR_TO_MAP; - if (type != expected_type) - goto err_type; - } else if (arg_type == ARG_PTR_TO_CTX || - arg_type == ARG_PTR_TO_CTX_OR_NULL) { - expected_type = PTR_TO_CTX; - if (!(register_is_null(reg) && - arg_type == ARG_PTR_TO_CTX_OR_NULL)) { - if (type != expected_type) - goto err_type; - err = check_ctx_reg(env, reg, regno); - if (err < 0) - return err; + if (register_is_null(reg) && arg_type_may_be_null(arg_type)) + /* A NULL register has a SCALAR_VALUE type, so skip + * type checking. 
+ */ + goto skip_type_check; + + compatible = compatible_reg_types[arg_type]; + if (!compatible) { + verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type); + return -EFAULT; + } + + err = check_reg_type(env, regno, compatible); + if (err) + return err; + + if (type == PTR_TO_BTF_ID) { + const u32 *btf_id = fn->arg_btf_id[arg]; + + if (!btf_id) { + verbose(env, "verifier internal error: missing BTF ID\n"); + return -EFAULT; } - } else if (arg_type == ARG_PTR_TO_SOCK_COMMON) { - expected_type = PTR_TO_SOCK_COMMON; - /* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */ - if (!type_is_sk_pointer(type)) - goto err_type; - if (reg->ref_obj_id) { - if (meta->ref_obj_id) { - verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", - regno, reg->ref_obj_id, - meta->ref_obj_id); - return -EFAULT; - } - meta->ref_obj_id = reg->ref_obj_id; - } - } else if (arg_type == ARG_PTR_TO_SOCKET || - arg_type == ARG_PTR_TO_SOCKET_OR_NULL) { - expected_type = PTR_TO_SOCKET; - if (!(register_is_null(reg) && - arg_type == ARG_PTR_TO_SOCKET_OR_NULL)) { - if (type != expected_type) - goto err_type; - } - } else if (arg_type == ARG_PTR_TO_BTF_ID) { - bool ids_match = false; - - expected_type = PTR_TO_BTF_ID; - if (type != expected_type) - goto err_type; - if (!fn->check_btf_id) { - if (reg->btf_id != meta->btf_id) { - ids_match = btf_struct_ids_match(&env->log, reg->off, reg->btf_id, - meta->btf_id); - if (!ids_match) { - verbose(env, "Helper has type %s got %s in R%d\n", - kernel_type_name(meta->btf_id), - kernel_type_name(reg->btf_id), regno); - return -EACCES; - } - } - } else if (!fn->check_btf_id(reg->btf_id, arg)) { - verbose(env, "Helper does not support %s in R%d\n", - kernel_type_name(reg->btf_id), regno); + if (!btf_struct_ids_match(&env->log, reg->off, reg->btf_id, *btf_id)) { + verbose(env, "R%d is of type %s but %s is expected\n", + regno, kernel_type_name(reg->btf_id), kernel_type_name(*btf_id)); return -EACCES; } - if 
((reg->off && !ids_match) || !tnum_is_const(reg->var_off) || reg->var_off.value) { + if (!tnum_is_const(reg->var_off) || reg->var_off.value) { verbose(env, "R%d is a pointer to in-kernel struct with non-zero offset\n", regno); return -EACCES; } - } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { - if (meta->func_id == BPF_FUNC_spin_lock) { - if (process_spin_lock(env, regno, true)) - return -EACCES; - } else if (meta->func_id == BPF_FUNC_spin_unlock) { - if (process_spin_lock(env, regno, false)) - return -EACCES; - } else { - verbose(env, "verifier internal error\n"); - return -EFAULT; - } - } else if (arg_type_is_mem_ptr(arg_type)) { - expected_type = PTR_TO_STACK; - /* One exception here. In case function allows for NULL to be - * passed in as argument, it's a SCALAR_VALUE type. Final test - * happens during stack boundary checking. - */ - if (register_is_null(reg) && - (arg_type == ARG_PTR_TO_MEM_OR_NULL || - arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL)) - /* final test in check_stack_boundary() */; - else if (!type_is_pkt_pointer(type) && - type != PTR_TO_MAP_VALUE && - type != PTR_TO_MEM && - type != PTR_TO_RDONLY_BUF && - type != PTR_TO_RDWR_BUF && - type != expected_type) - goto err_type; - meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; - } else if (arg_type_is_alloc_mem_ptr(arg_type)) { - expected_type = PTR_TO_MEM; - if (register_is_null(reg) && - arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL) - /* final test in check_stack_boundary() */; - else if (type != expected_type) - goto err_type; + } else if (type == PTR_TO_CTX) { + err = check_ctx_reg(env, reg, regno); + if (err < 0) + return err; + } + +skip_type_check: + if (reg->ref_obj_id) { if (meta->ref_obj_id) { verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", regno, reg->ref_obj_id, @@ -4074,15 +4140,6 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, return -EFAULT; } meta->ref_obj_id = reg->ref_obj_id; - } else if (arg_type_is_int_ptr(arg_type)) { - 
expected_type = PTR_TO_STACK; - if (!type_is_pkt_pointer(type) && - type != PTR_TO_MAP_VALUE && - type != expected_type) - goto err_type; - } else { - verbose(env, "unsupported arg_type %d\n", arg_type); - return -EFAULT; } if (arg_type == ARG_CONST_MAP_PTR) { @@ -4121,6 +4178,22 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, err = check_helper_mem_access(env, regno, meta->map_ptr->value_size, false, meta); + } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { + if (meta->func_id == BPF_FUNC_spin_lock) { + if (process_spin_lock(env, regno, true)) + return -EACCES; + } else if (meta->func_id == BPF_FUNC_spin_unlock) { + if (process_spin_lock(env, regno, false)) + return -EACCES; + } else { + verbose(env, "verifier internal error\n"); + return -EFAULT; + } + } else if (arg_type_is_mem_ptr(arg_type)) { + /* The access to this pointer is only checked when we hit the + * next is_mem_size argument below. + */ + meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MEM); } else if (arg_type_is_mem_size(arg_type)) { bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); @@ -4186,10 +4259,6 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, } return err; -err_type: - verbose(env, "R%d type=%s expected=%s\n", regno, - reg_type_str[type], reg_type_str[expected_type]); - return -EACCES; } static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id) @@ -4224,6 +4293,11 @@ static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id) return false; } +static bool allow_tail_call_in_subprogs(struct bpf_verifier_env *env) +{ + return env->prog->jit_requested && IS_ENABLED(CONFIG_X86_64); +} + static int check_map_func_compatibility(struct bpf_verifier_env *env, struct bpf_map *map, int func_id) { @@ -4339,8 +4413,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_FUNC_tail_call: if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) goto error; - if (env->subprog_cnt > 1) { - verbose(env, 
"tail_calls are not allowed in programs with bpf-to-bpf calls\n"); + if (env->subprog_cnt > 1 && !allow_tail_call_in_subprogs(env)) { + verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n"); return -EINVAL; } break; @@ -4495,10 +4569,22 @@ static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id) return count <= 1; } +static bool check_btf_id_ok(const struct bpf_func_proto *fn) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) + if (fn->arg_type[i] == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i]) + return false; + + return true; +} + static int check_func_proto(const struct bpf_func_proto *fn, int func_id) { return check_raw_mode_ok(fn) && check_arg_pair_ok(fn) && + check_btf_id_ok(fn) && check_refcount_ok(fn, func_id) ? 0 : -EINVAL; } @@ -4894,11 +4980,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn meta.func_id = func_id; /* check args */ for (i = 0; i < 5; i++) { - if (!fn->check_btf_id) { - err = btf_resolve_helper_id(&env->log, fn, i); - if (err > 0) - meta.btf_id = err; - } err = check_func_arg(env, i, &meta, fn); if (err) return err; @@ -5317,6 +5398,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst, reg_type_str[ptr_reg->type]); return -EACCES; case CONST_PTR_TO_MAP: + /* smin_val represents the known value */ + if (known && smin_val == 0 && opcode == BPF_ADD) + break; + /* fall-through */ case PTR_TO_PACKET_END: case PTR_TO_SOCKET: case PTR_TO_SOCKET_OR_NULL: @@ -7461,18 +7546,6 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } - if (env->subprog_cnt > 1) { - /* when program has LD_ABS insn JITs and interpreter assume - * that r1 == ctx == skb which is not the case for callees - * that can have arbitrary arguments. It's problematic - * for main prog as well since JITs would need to analyze - * all functions in order to make proper register save/restore - * decisions in the main prog. 
Hence disallow LD_ABS with calls - */ - verbose(env, "BPF_LD_[ABS|IND] instructions cannot be mixed with bpf-to-bpf calls\n"); - return -EINVAL; - } - if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || BPF_SIZE(insn->code) == BPF_DW || (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { @@ -7883,6 +7956,23 @@ err_free: return ret; } +static int check_abnormal_return(struct bpf_verifier_env *env) +{ + int i; + + for (i = 1; i < env->subprog_cnt; i++) { + if (env->subprog_info[i].has_ld_abs) { + verbose(env, "LD_ABS is not allowed in subprogs without BTF\n"); + return -EINVAL; + } + if (env->subprog_info[i].has_tail_call) { + verbose(env, "tail_call is not allowed in subprogs without BTF\n"); + return -EINVAL; + } + } + return 0; +} + /* The minimum supported BTF func info size */ #define MIN_BPF_FUNCINFO_SIZE 8 #define MAX_FUNCINFO_REC_SIZE 252 @@ -7891,20 +7981,24 @@ static int check_btf_func(struct bpf_verifier_env *env, const union bpf_attr *attr, union bpf_attr __user *uattr) { + const struct btf_type *type, *func_proto, *ret_type; u32 i, nfuncs, urec_size, min_size; u32 krec_size = sizeof(struct bpf_func_info); struct bpf_func_info *krecord; struct bpf_func_info_aux *info_aux = NULL; - const struct btf_type *type; struct bpf_prog *prog; const struct btf *btf; void __user *urecord; u32 prev_offset = 0; + bool scalar_return; int ret = -ENOMEM; nfuncs = attr->func_info_cnt; - if (!nfuncs) + if (!nfuncs) { + if (check_abnormal_return(env)) + return -EINVAL; return 0; + } if (nfuncs != env->subprog_cnt) { verbose(env, "number of funcs in func_info doesn't match number of subprogs\n"); @@ -7952,25 +8046,23 @@ static int check_btf_func(struct bpf_verifier_env *env, } /* check insn_off */ + ret = -EINVAL; if (i == 0) { if (krecord[i].insn_off) { verbose(env, "nonzero insn_off %u for the first func info record", krecord[i].insn_off); - ret = -EINVAL; goto err_free; } } else if (krecord[i].insn_off <= prev_offset) { verbose(env, "same or smaller insn offset (%u) than 
previous func info record (%u)", krecord[i].insn_off, prev_offset); - ret = -EINVAL; goto err_free; } if (env->subprog_info[i].start != krecord[i].insn_off) { verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n"); - ret = -EINVAL; goto err_free; } @@ -7979,10 +8071,26 @@ static int check_btf_func(struct bpf_verifier_env *env, if (!type || !btf_type_is_func(type)) { verbose(env, "invalid type id %d in func info", krecord[i].type_id); - ret = -EINVAL; goto err_free; } info_aux[i].linkage = BTF_INFO_VLEN(type->info); + + func_proto = btf_type_by_id(btf, type->type); + if (unlikely(!func_proto || !btf_type_is_func_proto(func_proto))) + /* btf_func_check() already verified it during BTF load */ + goto err_free; + ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL); + scalar_return = + btf_type_is_small_int(ret_type) || btf_type_is_enum(ret_type); + if (i && !scalar_return && env->subprog_info[i].has_ld_abs) { + verbose(env, "LD_ABS is only allowed in functions that return 'int'.\n"); + goto err_free; + } + if (i && !scalar_return && env->subprog_info[i].has_tail_call) { + verbose(env, "tail_call is only allowed in functions that return 'int'.\n"); + goto err_free; + } + prev_offset = krecord[i].insn_off; urecord += urec_size; } @@ -8143,8 +8251,11 @@ static int check_btf_info(struct bpf_verifier_env *env, struct btf *btf; int err; - if (!attr->func_info_cnt && !attr->line_info_cnt) + if (!attr->func_info_cnt && !attr->line_info_cnt) { + if (check_abnormal_return(env)) + return -EINVAL; return 0; + } btf = btf_get_by_fd(attr->prog_btf_fd); if (IS_ERR(btf)) @@ -9619,6 +9730,18 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len } } +static void adjust_poke_descs(struct bpf_prog *prog, u32 len) +{ + struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab; + int i, sz = prog->aux->size_poke_tab; + struct bpf_jit_poke_descriptor *desc; + + for (i = 0; i < sz; i++) { + desc = &tab[i]; + desc->insn_idx 
+= len - 1; + } +} + static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, const struct bpf_insn *patch, u32 len) { @@ -9635,6 +9758,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of if (adjust_insn_aux_data(env, new_prog, off, len)) return NULL; adjust_subprog_starts(env, off, len); + adjust_poke_descs(new_prog, len); return new_prog; } @@ -10165,6 +10289,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog, **func, *tmp; int i, j, subprog_start, subprog_end = 0, len, subprog; + struct bpf_map *map_ptr; struct bpf_insn *insn; void *old_bpf_func; int err, num_exentries; @@ -10232,6 +10357,31 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->btf = prog->aux->btf; func[i]->aux->func_info = prog->aux->func_info; + for (j = 0; j < prog->aux->size_poke_tab; j++) { + u32 insn_idx = prog->aux->poke_tab[j].insn_idx; + int ret; + + if (!(insn_idx >= subprog_start && + insn_idx <= subprog_end)) + continue; + + ret = bpf_jit_add_poke_descriptor(func[i], + &prog->aux->poke_tab[j]); + if (ret < 0) { + verbose(env, "adding tail call poke descriptor failed\n"); + goto out_free; + } + + func[i]->insnsi[insn_idx - subprog_start].imm = ret + 1; + + map_ptr = func[i]->aux->poke_tab[ret].tail_call.map; + ret = map_ptr->ops->map_poke_track(map_ptr, func[i]->aux); + if (ret < 0) { + verbose(env, "tracking tail call prog failed\n"); + goto out_free; + } + } + /* Use bpf_prog_F_tag to indicate functions in stack traces. 
* Long term would need debug info to populate names */ @@ -10250,6 +10400,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) num_exentries++; } func[i]->aux->num_exentries = num_exentries; + func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable; func[i] = bpf_int_jit_compile(func[i]); if (!func[i]->jited) { err = -ENOTSUPP; @@ -10257,6 +10408,19 @@ static int jit_subprogs(struct bpf_verifier_env *env) } cond_resched(); } + + /* Untrack main program's aux structs so that during map_poke_run() + * we will not stumble upon the unfilled poke descriptors; each + * of the main program's poke descs got distributed across subprogs + * and got tracked onto map, so we are sure that none of them will + * be missed after the operation below + */ + for (i = 0; i < prog->aux->size_poke_tab; i++) { + map_ptr = prog->aux->poke_tab[i].tail_call.map; + + map_ptr->ops->map_poke_untrack(map_ptr, prog->aux); + } + /* at this point all bpf functions were successfully JITed * now populate all bpf_calls with correct addresses and * run last pass of JIT @@ -10325,9 +10489,16 @@ static int jit_subprogs(struct bpf_verifier_env *env) bpf_prog_free_unused_jited_linfo(prog); return 0; out_free: - for (i = 0; i < env->subprog_cnt; i++) - if (func[i]) - bpf_jit_free(func[i]); + for (i = 0; i < env->subprog_cnt; i++) { + if (!func[i]) + continue; + + for (j = 0; j < func[i]->aux->size_poke_tab; j++) { + map_ptr = func[i]->aux->poke_tab[j].tail_call.map; + map_ptr->ops->map_poke_untrack(map_ptr, func[i]->aux); + } + bpf_jit_free(func[i]); + } kfree(func); out_undo_insn: /* cleanup main prog to be interpreted */ @@ -10361,6 +10532,13 @@ static int fixup_call_args(struct bpf_verifier_env *env) return err; } #ifndef CONFIG_BPF_JIT_ALWAYS_ON + if (env->subprog_cnt > 1 && env->prog->aux->tail_call_reachable) { + /* When JIT fails the progs with bpf2bpf calls and tail_calls + * have to be rejected, since interpreter doesn't support them yet. 
+ */ + verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n"); + return -EINVAL; + } for (i = 0; i < prog->len; i++, insn++) { if (insn->code != (BPF_JMP | BPF_CALL) || insn->src_reg != BPF_PSEUDO_CALL) @@ -10524,8 +10702,9 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) * the program array. */ prog->cb_access = 1; - env->prog->aux->stack_depth = MAX_BPF_STACK; - env->prog->aux->max_pkt_offset = MAX_PACKET_OFF; + if (!allow_tail_call_in_subprogs(env)) + prog->aux->stack_depth = MAX_BPF_STACK; + prog->aux->max_pkt_offset = MAX_PACKET_OFF; /* mark bpf_tail_call as different opcode to avoid * conditional branch in the interpeter for every normal @@ -10545,6 +10724,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) .reason = BPF_POKE_REASON_TAIL_CALL, .tail_call.map = BPF_MAP_PTR(aux->map_ptr_state), .tail_call.key = bpf_map_key_immediate(aux), + .insn_idx = i + delta, }; ret = bpf_jit_add_poke_descriptor(prog, &desc); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b2a5380eb187..36508f46a8db 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -743,19 +743,18 @@ out: return err; } -BTF_ID_LIST(bpf_seq_printf_btf_ids) -BTF_ID(struct, seq_file) +BTF_ID_LIST_SINGLE(btf_seq_file_ids, struct, seq_file) static const struct bpf_func_proto bpf_seq_printf_proto = { .func = bpf_seq_printf, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_seq_file_ids[0], .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_PTR_TO_MEM_OR_NULL, .arg5_type = ARG_CONST_SIZE_OR_ZERO, - .btf_id = bpf_seq_printf_btf_ids, }; BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len) @@ -763,17 +762,14 @@ BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len) return seq_write(m, data, len) ? 
-EOVERFLOW : 0; } -BTF_ID_LIST(bpf_seq_write_btf_ids) -BTF_ID(struct, seq_file) - static const struct bpf_func_proto bpf_seq_write_proto = { .func = bpf_seq_write, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_seq_file_ids[0], .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, - .btf_id = bpf_seq_write_btf_ids, }; static __always_inline int @@ -1118,6 +1114,14 @@ BPF_CALL_3(bpf_d_path, struct path *, path, char *, buf, u32, sz) } BTF_SET_START(btf_allowlist_d_path) +#ifdef CONFIG_SECURITY +BTF_ID(func, security_file_permission) +BTF_ID(func, security_inode_getattr) +BTF_ID(func, security_file_open) +#endif +#ifdef CONFIG_SECURITY_PATH +BTF_ID(func, security_path_truncate) +#endif BTF_ID(func, vfs_truncate) BTF_ID(func, vfs_fallocate) BTF_ID(func, dentry_open) @@ -1130,17 +1134,16 @@ static bool bpf_d_path_allowed(const struct bpf_prog *prog) return btf_id_set_contains(&btf_allowlist_d_path, prog->aux->attach_btf_id); } -BTF_ID_LIST(bpf_d_path_btf_ids) -BTF_ID(struct, path) +BTF_ID_LIST_SINGLE(bpf_d_path_btf_ids, struct, path) static const struct bpf_func_proto bpf_d_path_proto = { .func = bpf_d_path, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_d_path_btf_ids[0], .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, - .btf_id = bpf_d_path_btf_ids, .allowed = bpf_d_path_allowed, }; diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 0746fe2c2c04..9af99c39b9fd 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -221,7 +221,7 @@ static u8 batadv_mcast_mla_rtr_flags_bridge_get(struct batadv_priv *bat_priv, * address here, only IPv6 ones */ if (br_ip_entry->addr.proto == htons(ETH_P_IPV6) && - ipv6_addr_is_ll_all_routers(&br_ip_entry->addr.u.ip6)) + ipv6_addr_is_ll_all_routers(&br_ip_entry->addr.dst.ip6)) flags &= ~BATADV_MCAST_WANT_NO_RTR6; list_del(&br_ip_entry->list); @@ -562,10 +562,10 @@ 
out: static void batadv_mcast_mla_br_addr_cpy(char *dst, const struct br_ip *src) { if (src->proto == htons(ETH_P_IP)) - ip_eth_mc_map(src->u.ip4, dst); + ip_eth_mc_map(src->dst.ip4, dst); #if IS_ENABLED(CONFIG_IPV6) else if (src->proto == htons(ETH_P_IPV6)) - ipv6_eth_mc_map(&src->u.ip6, dst); + ipv6_eth_mc_map(&src->dst.ip6, dst); #endif else eth_zero_addr(dst); @@ -609,11 +609,11 @@ static int batadv_mcast_mla_bridge_get(struct net_device *dev, continue; if (tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && - ipv4_is_local_multicast(br_ip_entry->addr.u.ip4)) + ipv4_is_local_multicast(br_ip_entry->addr.dst.ip4)) continue; if (!(tvlv_flags & BATADV_MCAST_WANT_NO_RTR4) && - !ipv4_is_local_multicast(br_ip_entry->addr.u.ip4)) + !ipv4_is_local_multicast(br_ip_entry->addr.dst.ip4)) continue; } @@ -623,11 +623,11 @@ static int batadv_mcast_mla_bridge_get(struct net_device *dev, continue; if (tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && - ipv6_addr_is_ll_all_nodes(&br_ip_entry->addr.u.ip6)) + ipv6_addr_is_ll_all_nodes(&br_ip_entry->addr.dst.ip6)) continue; if (!(tvlv_flags & BATADV_MCAST_WANT_NO_RTR6) && - IPV6_ADDR_MC_SCOPE(&br_ip_entry->addr.u.ip6) > + IPV6_ADDR_MC_SCOPE(&br_ip_entry->addr.dst.ip6) > IPV6_ADDR_SCOPE_LINKLOCAL) continue; } diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 7629b63f6f30..e28ffadd1371 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -274,14 +274,23 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst, struct net_bridge *br = netdev_priv(dev); struct net_bridge_port *prev = NULL; struct net_bridge_port_group *p; + bool allow_mode_include = true; struct hlist_node *rp; rp = rcu_dereference(hlist_first_rcu(&br->router_list)); - p = mdst ? 
rcu_dereference(mdst->ports) : NULL; + if (mdst) { + p = rcu_dereference(mdst->ports); + if (br_multicast_should_handle_mode(br, mdst->addr.proto) && + br_multicast_is_star_g(&mdst->addr)) + allow_mode_include = false; + } else { + p = NULL; + } + while (p || rp) { struct net_bridge_port *port, *lport, *rport; - lport = p ? p->port : NULL; + lport = p ? p->key.port : NULL; rport = hlist_entry_safe(rp, struct net_bridge_port, rlist); if ((unsigned long)lport > (unsigned long)rport) { @@ -292,6 +301,10 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst, local_orig); goto delivered; } + if ((!allow_mode_include && + p->filter_mode == MCAST_INCLUDE) || + (p->flags & MDB_PG_FLAGS_BLOCKED)) + goto delivered; } else { port = rport; } diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 00f1651a6aba..e15bab19a012 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -62,19 +62,33 @@ static void __mdb_entry_fill_flags(struct br_mdb_entry *e, unsigned char flags) e->flags |= MDB_FLAGS_OFFLOAD; if (flags & MDB_PG_FLAGS_FAST_LEAVE) e->flags |= MDB_FLAGS_FAST_LEAVE; + if (flags & MDB_PG_FLAGS_STAR_EXCL) + e->flags |= MDB_FLAGS_STAR_EXCL; + if (flags & MDB_PG_FLAGS_BLOCKED) + e->flags |= MDB_FLAGS_BLOCKED; } -static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip) +static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip, + struct nlattr **mdb_attrs) { memset(ip, 0, sizeof(struct br_ip)); ip->vid = entry->vid; ip->proto = entry->addr.proto; - if (ip->proto == htons(ETH_P_IP)) - ip->u.ip4 = entry->addr.u.ip4; + switch (ip->proto) { + case htons(ETH_P_IP): + ip->dst.ip4 = entry->addr.u.ip4; + if (mdb_attrs && mdb_attrs[MDBE_ATTR_SOURCE]) + ip->src.ip4 = nla_get_in_addr(mdb_attrs[MDBE_ATTR_SOURCE]); + break; #if IS_ENABLED(CONFIG_IPV6) - else - ip->u.ip6 = entry->addr.u.ip6; + case htons(ETH_P_IPV6): + ip->dst.ip6 = entry->addr.u.ip6; + if (mdb_attrs && mdb_attrs[MDBE_ATTR_SOURCE]) + ip->src.ip6 = 
nla_get_in6_addr(mdb_attrs[MDBE_ATTR_SOURCE]); + break; #endif + } + } static int __mdb_fill_srcs(struct sk_buff *skb, @@ -91,14 +105,14 @@ static int __mdb_fill_srcs(struct sk_buff *skb, return -EMSGSIZE; hlist_for_each_entry_rcu(ent, &p->src_list, node, - lockdep_is_held(&p->port->br->multicast_lock)) { + lockdep_is_held(&p->key.port->br->multicast_lock)) { nest_ent = nla_nest_start(skb, MDBA_MDB_SRCLIST_ENTRY); if (!nest_ent) goto out_cancel_err; switch (ent->addr.proto) { case htons(ETH_P_IP): if (nla_put_in_addr(skb, MDBA_MDB_SRCATTR_ADDRESS, - ent->addr.u.ip4)) { + ent->addr.src.ip4)) { nla_nest_cancel(skb, nest_ent); goto out_cancel_err; } @@ -106,7 +120,7 @@ static int __mdb_fill_srcs(struct sk_buff *skb, #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): if (nla_put_in6_addr(skb, MDBA_MDB_SRCATTR_ADDRESS, - &ent->addr.u.ip6)) { + &ent->addr.src.ip6)) { nla_nest_cancel(skb, nest_ent); goto out_cancel_err; } @@ -146,7 +160,7 @@ static int __mdb_fill_info(struct sk_buff *skb, memset(&e, 0, sizeof(e)); if (p) { - ifindex = p->port->dev->ifindex; + ifindex = p->key.port->dev->ifindex; mtimer = &p->timer; flags = p->flags; } else { @@ -158,10 +172,10 @@ static int __mdb_fill_info(struct sk_buff *skb, e.ifindex = ifindex; e.vid = mp->addr.vid; if (mp->addr.proto == htons(ETH_P_IP)) - e.addr.u.ip4 = mp->addr.u.ip4; + e.addr.u.ip4 = mp->addr.dst.ip4; #if IS_ENABLED(CONFIG_IPV6) if (mp->addr.proto == htons(ETH_P_IPV6)) - e.addr.u.ip6 = mp->addr.u.ip6; + e.addr.u.ip6 = mp->addr.dst.ip6; #endif e.addr.proto = mp->addr.proto; nest_ent = nla_nest_start_noflag(skb, @@ -172,30 +186,47 @@ static int __mdb_fill_info(struct sk_buff *skb, if (nla_put_nohdr(skb, sizeof(e), &e) || nla_put_u32(skb, MDBA_MDB_EATTR_TIMER, - br_timer_value(mtimer))) { - nla_nest_cancel(skb, nest_ent); - return -EMSGSIZE; - } + br_timer_value(mtimer))) + goto nest_err; + switch (mp->addr.proto) { case htons(ETH_P_IP): - dump_srcs_mode = !!(p && mp->br->multicast_igmp_version == 3); + dump_srcs_mode 
= !!(mp->br->multicast_igmp_version == 3); + if (mp->addr.src.ip4) { + if (nla_put_in_addr(skb, MDBA_MDB_EATTR_SOURCE, + mp->addr.src.ip4)) + goto nest_err; + break; + } break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): - dump_srcs_mode = !!(p && mp->br->multicast_mld_version == 2); + dump_srcs_mode = !!(mp->br->multicast_mld_version == 2); + if (!ipv6_addr_any(&mp->addr.src.ip6)) { + if (nla_put_in6_addr(skb, MDBA_MDB_EATTR_SOURCE, + &mp->addr.src.ip6)) + goto nest_err; + break; + } break; #endif } - if (dump_srcs_mode && - (__mdb_fill_srcs(skb, p) || - nla_put_u8(skb, MDBA_MDB_EATTR_GROUP_MODE, p->filter_mode))) { - nla_nest_cancel(skb, nest_ent); - return -EMSGSIZE; + if (p) { + if (nla_put_u8(skb, MDBA_MDB_EATTR_RTPROT, p->rt_protocol)) + goto nest_err; + if (dump_srcs_mode && + (__mdb_fill_srcs(skb, p) || + nla_put_u8(skb, MDBA_MDB_EATTR_GROUP_MODE, + p->filter_mode))) + goto nest_err; } - nla_nest_end(skb, nest_ent); return 0; + +nest_err: + nla_nest_cancel(skb, nest_ent); + return -EMSGSIZE; } static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, @@ -236,7 +267,7 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, for (pp = &mp->ports; (p = rcu_dereference(*pp)) != NULL; pp = &p->next) { - if (!p->port) + if (!p->key.port) continue; if (pidx < s_pidx) goto skip_pg; @@ -393,15 +424,24 @@ static size_t rtnl_mdb_nlmsg_size(struct net_bridge_port_group *pg) if (!pg) goto out; - switch (pg->addr.proto) { + /* MDBA_MDB_EATTR_RTPROT */ + nlmsg_size += nla_total_size(sizeof(u8)); + + switch (pg->key.addr.proto) { case htons(ETH_P_IP): - if (pg->port->br->multicast_igmp_version == 2) + /* MDBA_MDB_EATTR_SOURCE */ + if (pg->key.addr.src.ip4) + nlmsg_size += nla_total_size(sizeof(__be32)); + if (pg->key.port->br->multicast_igmp_version == 2) goto out; addr_size = sizeof(__be32); break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): - if (pg->port->br->multicast_mld_version == 1) + /* 
MDBA_MDB_EATTR_SOURCE */ + if (!ipv6_addr_any(&pg->key.addr.src.ip6)) + nlmsg_size += nla_total_size(sizeof(struct in6_addr)); + if (pg->key.port->br->multicast_mld_version == 1) goto out; addr_size = sizeof(struct in6_addr); break; @@ -450,7 +490,7 @@ static void br_mdb_complete(struct net_device *dev, int err, void *priv) goto out; for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL; pp = &p->next) { - if (p->port != port) + if (p->key.port != port) continue; p->flags |= MDB_PG_FLAGS_OFFLOAD; } @@ -474,10 +514,10 @@ static void br_mdb_switchdev_host_port(struct net_device *dev, }; if (mp->addr.proto == htons(ETH_P_IP)) - ip_eth_mc_map(mp->addr.u.ip4, mdb.addr); + ip_eth_mc_map(mp->addr.dst.ip4, mdb.addr); #if IS_ENABLED(CONFIG_IPV6) else - ipv6_eth_mc_map(&mp->addr.u.ip6, mdb.addr); + ipv6_eth_mc_map(&mp->addr.dst.ip6, mdb.addr); #endif mdb.obj.orig_dev = dev; @@ -520,26 +560,26 @@ void br_mdb_notify(struct net_device *dev, if (pg) { if (mp->addr.proto == htons(ETH_P_IP)) - ip_eth_mc_map(mp->addr.u.ip4, mdb.addr); + ip_eth_mc_map(mp->addr.dst.ip4, mdb.addr); #if IS_ENABLED(CONFIG_IPV6) else - ipv6_eth_mc_map(&mp->addr.u.ip6, mdb.addr); + ipv6_eth_mc_map(&mp->addr.dst.ip6, mdb.addr); #endif - mdb.obj.orig_dev = pg->port->dev; + mdb.obj.orig_dev = pg->key.port->dev; switch (type) { case RTM_NEWMDB: complete_info = kmalloc(sizeof(*complete_info), GFP_ATOMIC); if (!complete_info) break; - complete_info->port = pg->port; + complete_info->port = pg->key.port; complete_info->ip = mp->addr; mdb.obj.complete_priv = complete_info; mdb.obj.complete = br_mdb_complete; - if (switchdev_port_obj_add(pg->port->dev, &mdb.obj, NULL)) + if (switchdev_port_obj_add(pg->key.port->dev, &mdb.obj, NULL)) kfree(complete_info); break; case RTM_DELMDB: - switchdev_port_obj_del(pg->port->dev, &mdb.obj); + switchdev_port_obj_del(pg->key.port->dev, &mdb.obj); break; } } else { @@ -629,33 +669,94 @@ errout: rtnl_set_sk_err(net, RTNLGRP_MDB, err); } -static bool 
is_valid_mdb_entry(struct br_mdb_entry *entry) +static bool is_valid_mdb_entry(struct br_mdb_entry *entry, + struct netlink_ext_ack *extack) { - if (entry->ifindex == 0) + if (entry->ifindex == 0) { + NL_SET_ERR_MSG_MOD(extack, "Zero entry ifindex is not allowed"); return false; + } if (entry->addr.proto == htons(ETH_P_IP)) { - if (!ipv4_is_multicast(entry->addr.u.ip4)) + if (!ipv4_is_multicast(entry->addr.u.ip4)) { + NL_SET_ERR_MSG_MOD(extack, "IPv4 entry group address is not multicast"); return false; - if (ipv4_is_local_multicast(entry->addr.u.ip4)) + } + if (ipv4_is_local_multicast(entry->addr.u.ip4)) { + NL_SET_ERR_MSG_MOD(extack, "IPv4 entry group address is local multicast"); return false; + } #if IS_ENABLED(CONFIG_IPV6) } else if (entry->addr.proto == htons(ETH_P_IPV6)) { - if (ipv6_addr_is_ll_all_nodes(&entry->addr.u.ip6)) + if (ipv6_addr_is_ll_all_nodes(&entry->addr.u.ip6)) { + NL_SET_ERR_MSG_MOD(extack, "IPv6 entry group address is link-local all nodes"); return false; + } #endif - } else + } else { + NL_SET_ERR_MSG_MOD(extack, "Unknown entry protocol"); return false; - if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY) + } + + if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY) { + NL_SET_ERR_MSG_MOD(extack, "Unknown entry state"); return false; - if (entry->vid >= VLAN_VID_MASK) + } + if (entry->vid >= VLAN_VID_MASK) { + NL_SET_ERR_MSG_MOD(extack, "Invalid entry VLAN id"); return false; + } + + return true; +} + +static bool is_valid_mdb_source(struct nlattr *attr, __be16 proto, + struct netlink_ext_ack *extack) +{ + switch (proto) { + case htons(ETH_P_IP): + if (nla_len(attr) != sizeof(struct in_addr)) { + NL_SET_ERR_MSG_MOD(extack, "IPv4 invalid source address length"); + return false; + } + if (ipv4_is_multicast(nla_get_in_addr(attr))) { + NL_SET_ERR_MSG_MOD(extack, "IPv4 multicast source address is not allowed"); + return false; + } + break; +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): { + struct in6_addr 
src; + + if (nla_len(attr) != sizeof(struct in6_addr)) { + NL_SET_ERR_MSG_MOD(extack, "IPv6 invalid source address length"); + return false; + } + src = nla_get_in6_addr(attr); + if (ipv6_addr_is_multicast(&src)) { + NL_SET_ERR_MSG_MOD(extack, "IPv6 multicast source address is not allowed"); + return false; + } + break; + } +#endif + default: + NL_SET_ERR_MSG_MOD(extack, "Invalid protocol used with source address"); + return false; + } return true; } +static const struct nla_policy br_mdbe_attrs_pol[MDBE_ATTR_MAX + 1] = { + [MDBE_ATTR_SOURCE] = NLA_POLICY_RANGE(NLA_BINARY, + sizeof(struct in_addr), + sizeof(struct in6_addr)), +}; + static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh, - struct net_device **pdev, struct br_mdb_entry **pentry) + struct net_device **pdev, struct br_mdb_entry **pentry, + struct nlattr **mdb_attrs, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct br_mdb_entry *entry; @@ -671,51 +772,86 @@ static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh, bpm = nlmsg_data(nlh); if (bpm->ifindex == 0) { - pr_info("PF_BRIDGE: br_mdb_parse() with invalid ifindex\n"); + NL_SET_ERR_MSG_MOD(extack, "Invalid bridge ifindex"); return -EINVAL; } dev = __dev_get_by_index(net, bpm->ifindex); if (dev == NULL) { - pr_info("PF_BRIDGE: br_mdb_parse() with unknown ifindex\n"); + NL_SET_ERR_MSG_MOD(extack, "Bridge device doesn't exist"); return -ENODEV; } if (!(dev->priv_flags & IFF_EBRIDGE)) { - pr_info("PF_BRIDGE: br_mdb_parse() with non-bridge\n"); + NL_SET_ERR_MSG_MOD(extack, "Device is not a bridge"); return -EOPNOTSUPP; } *pdev = dev; - if (!tb[MDBA_SET_ENTRY] || - nla_len(tb[MDBA_SET_ENTRY]) != sizeof(struct br_mdb_entry)) { - pr_info("PF_BRIDGE: br_mdb_parse() with invalid attr\n"); + if (!tb[MDBA_SET_ENTRY]) { + NL_SET_ERR_MSG_MOD(extack, "Missing MDBA_SET_ENTRY attribute"); + return -EINVAL; + } + if (nla_len(tb[MDBA_SET_ENTRY]) != sizeof(struct br_mdb_entry)) { + NL_SET_ERR_MSG_MOD(extack, "Invalid 
MDBA_SET_ENTRY attribute length"); return -EINVAL; } entry = nla_data(tb[MDBA_SET_ENTRY]); - if (!is_valid_mdb_entry(entry)) { - pr_info("PF_BRIDGE: br_mdb_parse() with invalid entry\n"); + if (!is_valid_mdb_entry(entry, extack)) return -EINVAL; + *pentry = entry; + + if (tb[MDBA_SET_ENTRY_ATTRS]) { + err = nla_parse_nested(mdb_attrs, MDBE_ATTR_MAX, + tb[MDBA_SET_ENTRY_ATTRS], + br_mdbe_attrs_pol, extack); + if (err) + return err; + if (mdb_attrs[MDBE_ATTR_SOURCE] && + !is_valid_mdb_source(mdb_attrs[MDBE_ATTR_SOURCE], + entry->addr.proto, extack)) + return -EINVAL; + } else { + memset(mdb_attrs, 0, + sizeof(struct nlattr *) * (MDBE_ATTR_MAX + 1)); } - *pentry = entry; return 0; } static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, - struct br_ip *group, struct br_mdb_entry *entry) + struct br_mdb_entry *entry, + struct nlattr **mdb_attrs, + struct netlink_ext_ack *extack) { - struct net_bridge_mdb_entry *mp; + struct net_bridge_mdb_entry *mp, *star_mp; struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; + struct br_ip group, star_group; unsigned long now = jiffies; + u8 filter_mode; int err; - mp = br_mdb_ip_get(br, group); + __mdb_entry_to_br_ip(entry, &group, mdb_attrs); + + /* host join errors which can happen before creating the group */ + if (!port) { + /* don't allow any flags for host-joined groups */ + if (entry->state) { + NL_SET_ERR_MSG_MOD(extack, "Flags are not allowed for host groups"); + return -EINVAL; + } + if (!br_multicast_is_star_g(&group)) { + NL_SET_ERR_MSG_MOD(extack, "Groups with sources cannot be manually host joined"); + return -EINVAL; + } + } + + mp = br_mdb_ip_get(br, &group); if (!mp) { - mp = br_multicast_new_group(br, group); + mp = br_multicast_new_group(br, &group); err = PTR_ERR_OR_ZERO(mp); if (err) return err; @@ -723,11 +859,10 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, /* host join */ if (!port) { - /* don't allow any flags for 
host-joined groups */ - if (entry->state) - return -EINVAL; - if (mp->host_joined) + if (mp->host_joined) { + NL_SET_ERR_MSG_MOD(extack, "Group is already joined by host"); return -EEXIST; + } br_multicast_host_join(mp, false); br_mdb_notify(br->dev, mp, NULL, RTM_NEWMDB); @@ -738,56 +873,69 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL; pp = &p->next) { - if (p->port == port) + if (p->key.port == port) { + NL_SET_ERR_MSG_MOD(extack, "Group is already joined by port"); return -EEXIST; - if ((unsigned long)p->port < (unsigned long)port) + } + if ((unsigned long)p->key.port < (unsigned long)port) break; } - p = br_multicast_new_port_group(port, group, *pp, entry->state, NULL, - MCAST_EXCLUDE); - if (unlikely(!p)) + filter_mode = br_multicast_is_star_g(&group) ? MCAST_EXCLUDE : + MCAST_INCLUDE; + + p = br_multicast_new_port_group(port, &group, *pp, entry->state, NULL, + filter_mode, RTPROT_STATIC); + if (unlikely(!p)) { + NL_SET_ERR_MSG_MOD(extack, "Couldn't allocate new port group"); return -ENOMEM; + } rcu_assign_pointer(*pp, p); if (entry->state == MDB_TEMPORARY) mod_timer(&p->timer, now + br->multicast_membership_interval); br_mdb_notify(br->dev, mp, p, RTM_NEWMDB); + /* if we are adding a new EXCLUDE port group (*,G) it needs to be also + * added to all S,G entries for proper replication, if we are adding + * a new INCLUDE port (S,G) then all of *,G EXCLUDE ports need to be + * added to it for proper replication + */ + if (br_multicast_should_handle_mode(br, group.proto)) { + switch (filter_mode) { + case MCAST_EXCLUDE: + br_multicast_star_g_handle_mode(p, MCAST_EXCLUDE); + break; + case MCAST_INCLUDE: + star_group = p->key.addr; + memset(&star_group.src, 0, sizeof(star_group.src)); + star_mp = br_mdb_ip_get(br, &star_group); + if (star_mp) + br_multicast_sg_add_exclude_ports(star_mp, p); + break; + } + } return 0; } static int __br_mdb_add(struct net *net, struct 
net_bridge *br, - struct br_mdb_entry *entry) + struct net_bridge_port *p, + struct br_mdb_entry *entry, + struct nlattr **mdb_attrs, + struct netlink_ext_ack *extack) { - struct br_ip ip; - struct net_device *dev; - struct net_bridge_port *p = NULL; int ret; - if (!netif_running(br->dev) || !br_opt_get(br, BROPT_MULTICAST_ENABLED)) - return -EINVAL; - - if (entry->ifindex != br->dev->ifindex) { - dev = __dev_get_by_index(net, entry->ifindex); - if (!dev) - return -ENODEV; - - p = br_port_get_rtnl(dev); - if (!p || p->br != br || p->state == BR_STATE_DISABLED) - return -EINVAL; - } - - __mdb_entry_to_br_ip(entry, &ip); - spin_lock_bh(&br->multicast_lock); - ret = br_mdb_add_group(br, p, &ip, entry); + ret = br_mdb_add_group(br, p, entry, mdb_attrs, extack); spin_unlock_bh(&br->multicast_lock); + return ret; } static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + struct nlattr *mdb_attrs[MDBE_ATTR_MAX + 1]; struct net *net = sock_net(skb->sk); struct net_bridge_vlan_group *vg; struct net_bridge_port *p = NULL; @@ -797,20 +945,43 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct net_bridge *br; int err; - err = br_mdb_parse(skb, nlh, &dev, &entry); + err = br_mdb_parse(skb, nlh, &dev, &entry, mdb_attrs, extack); if (err < 0) return err; br = netdev_priv(dev); + if (!netif_running(br->dev)) { + NL_SET_ERR_MSG_MOD(extack, "Bridge device is not running"); + return -EINVAL; + } + + if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) { + NL_SET_ERR_MSG_MOD(extack, "Bridge's multicast processing is disabled"); + return -EINVAL; + } + if (entry->ifindex != br->dev->ifindex) { pdev = __dev_get_by_index(net, entry->ifindex); - if (!pdev) + if (!pdev) { + NL_SET_ERR_MSG_MOD(extack, "Port net device doesn't exist"); return -ENODEV; + } p = br_port_get_rtnl(pdev); - if (!p || p->br != br || p->state == BR_STATE_DISABLED) + if (!p) { + NL_SET_ERR_MSG_MOD(extack, "Net device is not a bridge port"); + return -EINVAL; 
+ } + + if (p->br != br) { + NL_SET_ERR_MSG_MOD(extack, "Port belongs to a different bridge device"); return -EINVAL; + } + if (p->state == BR_STATE_DISABLED) { + NL_SET_ERR_MSG_MOD(extack, "Port is in disabled state"); + return -EINVAL; + } vg = nbp_vlan_group(p); } else { vg = br_vlan_group(br); @@ -822,18 +993,19 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (br_vlan_enabled(br->dev) && vg && entry->vid == 0) { list_for_each_entry(v, &vg->vlan_list, vlist) { entry->vid = v->vid; - err = __br_mdb_add(net, br, entry); + err = __br_mdb_add(net, br, p, entry, mdb_attrs, extack); if (err) break; } } else { - err = __br_mdb_add(net, br, entry); + err = __br_mdb_add(net, br, p, entry, mdb_attrs, extack); } return err; } -static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) +static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry, + struct nlattr **mdb_attrs) { struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; @@ -844,7 +1016,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) if (!netif_running(br->dev) || !br_opt_get(br, BROPT_MULTICAST_ENABLED)) return -EINVAL; - __mdb_entry_to_br_ip(entry, &ip); + __mdb_entry_to_br_ip(entry, &ip, mdb_attrs); spin_lock_bh(&br->multicast_lock); mp = br_mdb_ip_get(br, &ip); @@ -864,10 +1036,10 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL; pp = &p->next) { - if (!p->port || p->port->dev->ifindex != entry->ifindex) + if (!p->key.port || p->key.port->dev->ifindex != entry->ifindex) continue; - if (p->port->state == BR_STATE_DISABLED) + if (p->key.port->state == BR_STATE_DISABLED) goto unlock; br_multicast_del_pg(mp, p, pp); @@ -883,6 +1055,7 @@ unlock: static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + struct nlattr *mdb_attrs[MDBE_ATTR_MAX + 1]; struct net *net = sock_net(skb->sk); 
struct net_bridge_vlan_group *vg; struct net_bridge_port *p = NULL; @@ -892,7 +1065,7 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, struct net_bridge *br; int err; - err = br_mdb_parse(skb, nlh, &dev, &entry); + err = br_mdb_parse(skb, nlh, &dev, &entry, mdb_attrs, extack); if (err < 0) return err; @@ -917,10 +1090,10 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, if (br_vlan_enabled(br->dev) && vg && entry->vid == 0) { list_for_each_entry(v, &vg->vlan_list, vlist) { entry->vid = v->vid; - err = __br_mdb_del(br, entry); + err = __br_mdb_del(br, entry, mdb_attrs); } } else { - err = __br_mdb_del(br, entry); + err = __br_mdb_del(br, entry, mdb_attrs); } return err; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index e77f1e27caf7..66eb62ded192 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -41,6 +41,13 @@ static const struct rhashtable_params br_mdb_rht_params = { .automatic_shrinking = true, }; +static const struct rhashtable_params br_sg_port_rht_params = { + .head_offset = offsetof(struct net_bridge_port_group, rhnode), + .key_offset = offsetof(struct net_bridge_port_group, key), + .key_len = sizeof(struct net_bridge_port_group_sg_key), + .automatic_shrinking = true, +}; + static void br_multicast_start_querier(struct net_bridge *br, struct bridge_mcast_own_query *query); static void br_multicast_add_router(struct net_bridge *br, @@ -59,6 +66,26 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br, const struct in6_addr *group, __u16 vid, const unsigned char *src); #endif +static struct net_bridge_port_group * +__br_multicast_add_group(struct net_bridge *br, + struct net_bridge_port *port, + struct br_ip *group, + const unsigned char *src, + u8 filter_mode, + bool igmpv2_mldv1, + bool blocked); +static void br_multicast_find_del_pg(struct net_bridge *br, + struct net_bridge_port_group *pg); + +static struct net_bridge_port_group * +br_sg_port_find(struct 
net_bridge *br, + struct net_bridge_port_group_sg_key *sg_p) +{ + lockdep_assert_held_once(&br->multicast_lock); + + return rhashtable_lookup_fast(&br->sg_port_tbl, sg_p, + br_sg_port_rht_params); +} static struct net_bridge_mdb_entry *br_mdb_ip_get_rcu(struct net_bridge *br, struct br_ip *dst) @@ -86,7 +113,7 @@ static struct net_bridge_mdb_entry *br_mdb_ip4_get(struct net_bridge *br, struct br_ip br_dst; memset(&br_dst, 0, sizeof(br_dst)); - br_dst.u.ip4 = dst; + br_dst.dst.ip4 = dst; br_dst.proto = htons(ETH_P_IP); br_dst.vid = vid; @@ -101,7 +128,7 @@ static struct net_bridge_mdb_entry *br_mdb_ip6_get(struct net_bridge *br, struct br_ip br_dst; memset(&br_dst, 0, sizeof(br_dst)); - br_dst.u.ip6 = *dst; + br_dst.dst.ip6 = *dst; br_dst.proto = htons(ETH_P_IPV6); br_dst.vid = vid; @@ -126,11 +153,29 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, switch (skb->protocol) { case htons(ETH_P_IP): - ip.u.ip4 = ip_hdr(skb)->daddr; + ip.dst.ip4 = ip_hdr(skb)->daddr; + if (br->multicast_igmp_version == 3) { + struct net_bridge_mdb_entry *mdb; + + ip.src.ip4 = ip_hdr(skb)->saddr; + mdb = br_mdb_ip_get_rcu(br, &ip); + if (mdb) + return mdb; + ip.src.ip4 = 0; + } break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): - ip.u.ip6 = ipv6_hdr(skb)->daddr; + ip.dst.ip6 = ipv6_hdr(skb)->daddr; + if (br->multicast_mld_version == 2) { + struct net_bridge_mdb_entry *mdb; + + ip.src.ip6 = ipv6_hdr(skb)->saddr; + mdb = br_mdb_ip_get_rcu(br, &ip); + if (mdb) + return mdb; + memset(&ip.src.ip6, 0, sizeof(ip.src.ip6)); + } break; #endif default: @@ -140,6 +185,326 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, return br_mdb_ip_get_rcu(br, &ip); } +static bool br_port_group_equal(struct net_bridge_port_group *p, + struct net_bridge_port *port, + const unsigned char *src) +{ + if (p->key.port != port) + return false; + + if (!(port->flags & BR_MULTICAST_TO_UNICAST)) + return true; + + return ether_addr_equal(src, p->eth_addr); +} + +static void 
__fwd_add_star_excl(struct net_bridge_port_group *pg, + struct br_ip *sg_ip) +{ + struct net_bridge_port_group_sg_key sg_key; + struct net_bridge *br = pg->key.port->br; + struct net_bridge_port_group *src_pg; + + memset(&sg_key, 0, sizeof(sg_key)); + sg_key.port = pg->key.port; + sg_key.addr = *sg_ip; + if (br_sg_port_find(br, &sg_key)) + return; + + src_pg = __br_multicast_add_group(br, pg->key.port, sg_ip, pg->eth_addr, + MCAST_INCLUDE, false, false); + if (IS_ERR_OR_NULL(src_pg) || + src_pg->rt_protocol != RTPROT_KERNEL) + return; + + src_pg->flags |= MDB_PG_FLAGS_STAR_EXCL; +} + +static void __fwd_del_star_excl(struct net_bridge_port_group *pg, + struct br_ip *sg_ip) +{ + struct net_bridge_port_group_sg_key sg_key; + struct net_bridge *br = pg->key.port->br; + struct net_bridge_port_group *src_pg; + + memset(&sg_key, 0, sizeof(sg_key)); + sg_key.port = pg->key.port; + sg_key.addr = *sg_ip; + src_pg = br_sg_port_find(br, &sg_key); + if (!src_pg || !(src_pg->flags & MDB_PG_FLAGS_STAR_EXCL) || + src_pg->rt_protocol != RTPROT_KERNEL) + return; + + br_multicast_find_del_pg(br, src_pg); +} + +/* When a port group transitions to (or is added as) EXCLUDE we need to add it + * to all other ports' S,G entries which are not blocked by the current group + * for proper replication, the assumption is that any S,G blocked entries + * are already added so the S,G,port lookup should skip them. + * When a port group transitions from EXCLUDE -> INCLUDE mode or is being + * deleted we need to remove it from all ports' S,G entries where it was + * automatically installed before (i.e. where it's MDB_PG_FLAGS_STAR_EXCL). 
+ */ +void br_multicast_star_g_handle_mode(struct net_bridge_port_group *pg, + u8 filter_mode) +{ + struct net_bridge *br = pg->key.port->br; + struct net_bridge_port_group *pg_lst; + struct net_bridge_mdb_entry *mp; + struct br_ip sg_ip; + + if (WARN_ON(!br_multicast_is_star_g(&pg->key.addr))) + return; + + mp = br_mdb_ip_get(br, &pg->key.addr); + if (!mp) + return; + + memset(&sg_ip, 0, sizeof(sg_ip)); + sg_ip = pg->key.addr; + for (pg_lst = mlock_dereference(mp->ports, br); + pg_lst; + pg_lst = mlock_dereference(pg_lst->next, br)) { + struct net_bridge_group_src *src_ent; + + if (pg_lst == pg) + continue; + hlist_for_each_entry(src_ent, &pg_lst->src_list, node) { + if (!(src_ent->flags & BR_SGRP_F_INSTALLED)) + continue; + sg_ip.src = src_ent->addr.src; + switch (filter_mode) { + case MCAST_INCLUDE: + __fwd_del_star_excl(pg, &sg_ip); + break; + case MCAST_EXCLUDE: + __fwd_add_star_excl(pg, &sg_ip); + break; + } + } + } +} + +/* called when adding a new S,G with host_joined == false by default */ +static void br_multicast_sg_host_state(struct net_bridge_mdb_entry *star_mp, + struct net_bridge_port_group *sg) +{ + struct net_bridge_mdb_entry *sg_mp; + + if (WARN_ON(!br_multicast_is_star_g(&star_mp->addr))) + return; + if (!star_mp->host_joined) + return; + + sg_mp = br_mdb_ip_get(star_mp->br, &sg->key.addr); + if (!sg_mp) + return; + sg_mp->host_joined = true; +} + +/* set the host_joined state of all of *,G's S,G entries */ +static void br_multicast_star_g_host_state(struct net_bridge_mdb_entry *star_mp) +{ + struct net_bridge *br = star_mp->br; + struct net_bridge_mdb_entry *sg_mp; + struct net_bridge_port_group *pg; + struct br_ip sg_ip; + + if (WARN_ON(!br_multicast_is_star_g(&star_mp->addr))) + return; + + memset(&sg_ip, 0, sizeof(sg_ip)); + sg_ip = star_mp->addr; + for (pg = mlock_dereference(star_mp->ports, br); + pg; + pg = mlock_dereference(pg->next, br)) { + struct net_bridge_group_src *src_ent; + + hlist_for_each_entry(src_ent, &pg->src_list, node) { + 
if (!(src_ent->flags & BR_SGRP_F_INSTALLED)) + continue; + sg_ip.src = src_ent->addr.src; + sg_mp = br_mdb_ip_get(br, &sg_ip); + if (!sg_mp) + continue; + sg_mp->host_joined = star_mp->host_joined; + } + } +} + +static void br_multicast_sg_del_exclude_ports(struct net_bridge_mdb_entry *sgmp) +{ + struct net_bridge_port_group __rcu **pp; + struct net_bridge_port_group *p; + + /* *,G exclude ports are only added to S,G entries */ + if (WARN_ON(br_multicast_is_star_g(&sgmp->addr))) + return; + + /* we need the STAR_EXCLUDE ports if there are non-STAR_EXCLUDE ports + * we should ignore perm entries since they're managed by user-space + */ + for (pp = &sgmp->ports; + (p = mlock_dereference(*pp, sgmp->br)) != NULL; + pp = &p->next) + if (!(p->flags & (MDB_PG_FLAGS_STAR_EXCL | + MDB_PG_FLAGS_PERMANENT))) + return; + + /* currently the host can only have joined the *,G which means + * we treat it as EXCLUDE {}, so for an S,G it's considered a + * STAR_EXCLUDE entry and we can safely leave it + */ + sgmp->host_joined = false; + + for (pp = &sgmp->ports; + (p = mlock_dereference(*pp, sgmp->br)) != NULL;) { + if (!(p->flags & MDB_PG_FLAGS_PERMANENT)) + br_multicast_del_pg(sgmp, p, pp); + else + pp = &p->next; + } +} + +void br_multicast_sg_add_exclude_ports(struct net_bridge_mdb_entry *star_mp, + struct net_bridge_port_group *sg) +{ + struct net_bridge_port_group_sg_key sg_key; + struct net_bridge *br = star_mp->br; + struct net_bridge_port_group *pg; + + if (WARN_ON(br_multicast_is_star_g(&sg->key.addr))) + return; + if (WARN_ON(!br_multicast_is_star_g(&star_mp->addr))) + return; + + br_multicast_sg_host_state(star_mp, sg); + memset(&sg_key, 0, sizeof(sg_key)); + sg_key.addr = sg->key.addr; + /* we need to add all exclude ports to the S,G */ + for (pg = mlock_dereference(star_mp->ports, br); + pg; + pg = mlock_dereference(pg->next, br)) { + struct net_bridge_port_group *src_pg; + + if (pg == sg || pg->filter_mode == MCAST_INCLUDE) + continue; + + sg_key.port = pg->key.port; 
+ if (br_sg_port_find(br, &sg_key)) + continue; + + src_pg = __br_multicast_add_group(br, pg->key.port, + &sg->key.addr, + sg->eth_addr, + MCAST_INCLUDE, false, false); + if (IS_ERR_OR_NULL(src_pg) || + src_pg->rt_protocol != RTPROT_KERNEL) + continue; + src_pg->flags |= MDB_PG_FLAGS_STAR_EXCL; + } +} + +static void br_multicast_fwd_src_add(struct net_bridge_group_src *src) +{ + struct net_bridge_mdb_entry *star_mp; + struct net_bridge_port_group *sg; + struct br_ip sg_ip; + + if (src->flags & BR_SGRP_F_INSTALLED) + return; + + memset(&sg_ip, 0, sizeof(sg_ip)); + sg_ip = src->pg->key.addr; + sg_ip.src = src->addr.src; + sg = __br_multicast_add_group(src->br, src->pg->key.port, &sg_ip, + src->pg->eth_addr, MCAST_INCLUDE, false, + !timer_pending(&src->timer)); + if (IS_ERR_OR_NULL(sg)) + return; + src->flags |= BR_SGRP_F_INSTALLED; + sg->flags &= ~MDB_PG_FLAGS_STAR_EXCL; + + /* if it was added by user-space as perm we can skip next steps */ + if (sg->rt_protocol != RTPROT_KERNEL && + (sg->flags & MDB_PG_FLAGS_PERMANENT)) + return; + + /* the kernel is now responsible for removing this S,G */ + del_timer(&sg->timer); + star_mp = br_mdb_ip_get(src->br, &src->pg->key.addr); + if (!star_mp) + return; + + br_multicast_sg_add_exclude_ports(star_mp, sg); +} + +static void br_multicast_fwd_src_remove(struct net_bridge_group_src *src) +{ + struct net_bridge_port_group *p, *pg = src->pg; + struct net_bridge_port_group __rcu **pp; + struct net_bridge_mdb_entry *mp; + struct br_ip sg_ip; + + memset(&sg_ip, 0, sizeof(sg_ip)); + sg_ip = pg->key.addr; + sg_ip.src = src->addr.src; + + mp = br_mdb_ip_get(src->br, &sg_ip); + if (!mp) + return; + + for (pp = &mp->ports; + (p = mlock_dereference(*pp, src->br)) != NULL; + pp = &p->next) { + if (!br_port_group_equal(p, pg->key.port, pg->eth_addr)) + continue; + + if (p->rt_protocol != RTPROT_KERNEL && + (p->flags & MDB_PG_FLAGS_PERMANENT)) + break; + + br_multicast_del_pg(mp, p, pp); + break; + } + src->flags &= ~BR_SGRP_F_INSTALLED; +} + 
+/* install S,G and based on src's timer enable or disable forwarding */ +static void br_multicast_fwd_src_handle(struct net_bridge_group_src *src) +{ + struct net_bridge_port_group_sg_key sg_key; + struct net_bridge_port_group *sg; + u8 old_flags; + + br_multicast_fwd_src_add(src); + + memset(&sg_key, 0, sizeof(sg_key)); + sg_key.addr = src->pg->key.addr; + sg_key.addr.src = src->addr.src; + sg_key.port = src->pg->key.port; + + sg = br_sg_port_find(src->br, &sg_key); + if (!sg || (sg->flags & MDB_PG_FLAGS_PERMANENT)) + return; + + old_flags = sg->flags; + if (timer_pending(&src->timer)) + sg->flags &= ~MDB_PG_FLAGS_BLOCKED; + else + sg->flags |= MDB_PG_FLAGS_BLOCKED; + + if (old_flags != sg->flags) { + struct net_bridge_mdb_entry *sg_mp; + + sg_mp = br_mdb_ip_get(src->br, &sg_key.addr); + if (!sg_mp) + return; + br_mdb_notify(src->br->dev, sg_mp, sg, RTM_NEWMDB); + } +} + static void br_multicast_destroy_mdb_entry(struct net_bridge_mcast_gc *gc) { struct net_bridge_mdb_entry *mp; @@ -169,7 +534,8 @@ static void br_multicast_group_expired(struct timer_list *t) struct net_bridge *br = mp->br; spin_lock(&br->multicast_lock); - if (!netif_running(br->dev) || timer_pending(&mp->timer)) + if (hlist_unhashed(&mp->mdb_node) || !netif_running(br->dev) || + timer_pending(&mp->timer)) goto out; br_multicast_host_leave(mp, true); @@ -194,8 +560,9 @@ static void br_multicast_destroy_group_src(struct net_bridge_mcast_gc *gc) static void br_multicast_del_group_src(struct net_bridge_group_src *src) { - struct net_bridge *br = src->pg->port->br; + struct net_bridge *br = src->pg->key.port->br; + br_multicast_fwd_src_remove(src); hlist_del_init_rcu(&src->node); src->pg->src_ents--; hlist_add_head(&src->mcast_gc.gc_node, &br->mcast_gc_list); @@ -219,15 +586,21 @@ void br_multicast_del_pg(struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, struct net_bridge_port_group __rcu **pp) { - struct net_bridge *br = pg->port->br; + struct net_bridge *br = pg->key.port->br; 
struct net_bridge_group_src *ent; struct hlist_node *tmp; + rhashtable_remove_fast(&br->sg_port_tbl, &pg->rhnode, + br_sg_port_rht_params); rcu_assign_pointer(*pp, pg->next); hlist_del_init(&pg->mglist); hlist_for_each_entry_safe(ent, tmp, &pg->src_list, node) br_multicast_del_group_src(ent); br_mdb_notify(br->dev, mp, pg, RTM_DELMDB); + if (!br_multicast_is_star_g(&mp->addr)) + br_multicast_sg_del_exclude_ports(mp); + else + br_multicast_star_g_handle_mode(pg, MCAST_INCLUDE); hlist_add_head(&pg->mcast_gc.gc_node, &br->mcast_gc_list); queue_work(system_long_wq, &br->mcast_gc_work); @@ -242,7 +615,7 @@ static void br_multicast_find_del_pg(struct net_bridge *br, struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; - mp = br_mdb_ip_get(br, &pg->addr); + mp = br_mdb_ip_get(br, &pg->key.addr); if (WARN_ON(!mp)) return; @@ -263,7 +636,7 @@ static void br_multicast_port_group_expired(struct timer_list *t) { struct net_bridge_port_group *pg = from_timer(pg, t, timer); struct net_bridge_group_src *src_ent; - struct net_bridge *br = pg->port->br; + struct net_bridge *br = pg->key.port->br; struct hlist_node *tmp; bool changed; @@ -284,7 +657,10 @@ static void br_multicast_port_group_expired(struct timer_list *t) if (hlist_empty(&pg->src_list)) { br_multicast_find_del_pg(br, pg); } else if (changed) { - struct net_bridge_mdb_entry *mp = br_mdb_ip_get(br, &pg->addr); + struct net_bridge_mdb_entry *mp = br_mdb_ip_get(br, &pg->key.addr); + + if (changed && br_multicast_is_star_g(&pg->key.addr)) + br_multicast_star_g_handle_mode(pg, MCAST_INCLUDE); if (WARN_ON(!mp)) goto out; @@ -312,7 +688,7 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, u8 sflag, u8 *igmp_type, bool *need_rexmit) { - struct net_bridge_port *p = pg ? pg->port : NULL; + struct net_bridge_port *p = pg ? 
pg->key.port : NULL; struct net_bridge_group_src *ent; size_t pkt_size, igmp_hdr_size; unsigned long now = jiffies; @@ -423,7 +799,7 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, if (over_lmqt == time_after(ent->timer.expires, lmqt) && ent->src_query_rexmit_cnt > 0) { - ihv3->srcs[lmqt_srcs++] = ent->addr.u.ip4; + ihv3->srcs[lmqt_srcs++] = ent->addr.src.ip4; ent->src_query_rexmit_cnt--; if (need_rexmit && ent->src_query_rexmit_cnt) *need_rexmit = true; @@ -458,7 +834,7 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, u8 sflag, u8 *igmp_type, bool *need_rexmit) { - struct net_bridge_port *p = pg ? pg->port : NULL; + struct net_bridge_port *p = pg ? pg->key.port : NULL; struct net_bridge_group_src *ent; size_t pkt_size, mld_hdr_size; unsigned long now = jiffies; @@ -584,7 +960,7 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, if (over_llqt == time_after(ent->timer.expires, llqt) && ent->src_query_rexmit_cnt > 0) { - mld2q->mld2q_srcs[llqt_srcs++] = ent->addr.u.ip6; + mld2q->mld2q_srcs[llqt_srcs++] = ent->addr.src.ip6; ent->src_query_rexmit_cnt--; if (need_rexmit && ent->src_query_rexmit_cnt) *need_rexmit = true; @@ -625,9 +1001,9 @@ static struct sk_buff *br_multicast_alloc_query(struct net_bridge *br, switch (group->proto) { case htons(ETH_P_IP): - ip4_dst = ip_dst ? ip_dst->u.ip4 : htonl(INADDR_ALLHOSTS_GROUP); + ip4_dst = ip_dst ? 
ip_dst->dst.ip4 : htonl(INADDR_ALLHOSTS_GROUP); return br_ip4_multicast_alloc_query(br, pg, - ip4_dst, group->u.ip4, + ip4_dst, group->dst.ip4, with_srcs, over_lmqt, sflag, igmp_type, need_rexmit); @@ -636,13 +1012,13 @@ static struct sk_buff *br_multicast_alloc_query(struct net_bridge *br, struct in6_addr ip6_dst; if (ip_dst) - ip6_dst = ip_dst->u.ip6; + ip6_dst = ip_dst->dst.ip6; else ipv6_addr_set(&ip6_dst, htonl(0xff020000), 0, 0, htonl(1)); return br_ip6_multicast_alloc_query(br, pg, - &ip6_dst, &group->u.ip6, + &ip6_dst, &group->dst.ip6, with_srcs, over_lmqt, sflag, igmp_type, need_rexmit); @@ -704,7 +1080,10 @@ static void br_multicast_group_src_expired(struct timer_list *t) if (!hlist_empty(&pg->src_list)) goto out; br_multicast_find_del_pg(br, pg); + } else { + br_multicast_fwd_src_handle(src); } + out: spin_unlock(&br->multicast_lock); } @@ -717,13 +1096,13 @@ br_multicast_find_group_src(struct net_bridge_port_group *pg, struct br_ip *ip) switch (ip->proto) { case htons(ETH_P_IP): hlist_for_each_entry(ent, &pg->src_list, node) - if (ip->u.ip4 == ent->addr.u.ip4) + if (ip->src.ip4 == ent->addr.src.ip4) return ent; break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): hlist_for_each_entry(ent, &pg->src_list, node) - if (!ipv6_addr_cmp(&ent->addr.u.ip6, &ip->u.ip6)) + if (!ipv6_addr_cmp(&ent->addr.src.ip6, &ip->src.ip6)) return ent; break; #endif @@ -742,14 +1121,14 @@ br_multicast_new_group_src(struct net_bridge_port_group *pg, struct br_ip *src_i switch (src_ip->proto) { case htons(ETH_P_IP): - if (ipv4_is_zeronet(src_ip->u.ip4) || - ipv4_is_multicast(src_ip->u.ip4)) + if (ipv4_is_zeronet(src_ip->src.ip4) || + ipv4_is_multicast(src_ip->src.ip4)) return NULL; break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): - if (ipv6_addr_any(&src_ip->u.ip6) || - ipv6_addr_is_multicast(&src_ip->u.ip6)) + if (ipv6_addr_any(&src_ip->src.ip6) || + ipv6_addr_is_multicast(&src_ip->src.ip6)) return NULL; break; #endif @@ -760,7 +1139,7 @@ 
br_multicast_new_group_src(struct net_bridge_port_group *pg, struct br_ip *src_i return NULL; grp_src->pg = pg; - grp_src->br = pg->port->br; + grp_src->br = pg->key.port->br; grp_src->addr = *src_ip; grp_src->mcast_gc.destroy = br_multicast_destroy_group_src; timer_setup(&grp_src->timer, br_multicast_group_src_expired, 0); @@ -777,7 +1156,8 @@ struct net_bridge_port_group *br_multicast_new_port_group( struct net_bridge_port_group __rcu *next, unsigned char flags, const unsigned char *src, - u8 filter_mode) + u8 filter_mode, + u8 rt_protocol) { struct net_bridge_port_group *p; @@ -785,12 +1165,21 @@ struct net_bridge_port_group *br_multicast_new_port_group( if (unlikely(!p)) return NULL; - p->addr = *group; - p->port = port; + p->key.addr = *group; + p->key.port = port; p->flags = flags; p->filter_mode = filter_mode; + p->rt_protocol = rt_protocol; p->mcast_gc.destroy = br_multicast_destroy_port_group; INIT_HLIST_HEAD(&p->src_list); + + if (!br_multicast_is_star_g(group) && + rhashtable_lookup_insert_fast(&port->br->sg_port_tbl, &p->rhnode, + br_sg_port_rht_params)) { + kfree(p); + return NULL; + } + rcu_assign_pointer(p->next, next); timer_setup(&p->timer, br_multicast_port_group_expired, 0); timer_setup(&p->rexmit_timer, br_multicast_port_group_rexmit, 0); @@ -804,23 +1193,12 @@ struct net_bridge_port_group *br_multicast_new_port_group( return p; } -static bool br_port_group_equal(struct net_bridge_port_group *p, - struct net_bridge_port *port, - const unsigned char *src) -{ - if (p->port != port) - return false; - - if (!(port->flags & BR_MULTICAST_TO_UNICAST)) - return true; - - return ether_addr_equal(src, p->eth_addr); -} - void br_multicast_host_join(struct net_bridge_mdb_entry *mp, bool notify) { if (!mp->host_joined) { mp->host_joined = true; + if (br_multicast_is_star_g(&mp->addr)) + br_multicast_star_g_host_state(mp); if (notify) br_mdb_notify(mp->br->dev, mp, NULL, RTM_NEWMDB); } @@ -833,32 +1211,33 @@ void br_multicast_host_leave(struct 
net_bridge_mdb_entry *mp, bool notify) return; mp->host_joined = false; + if (br_multicast_is_star_g(&mp->addr)) + br_multicast_star_g_host_state(mp); if (notify) br_mdb_notify(mp->br->dev, mp, NULL, RTM_DELMDB); } -static int br_multicast_add_group(struct net_bridge *br, - struct net_bridge_port *port, - struct br_ip *group, - const unsigned char *src, - u8 filter_mode, - bool igmpv2_mldv1) +static struct net_bridge_port_group * +__br_multicast_add_group(struct net_bridge *br, + struct net_bridge_port *port, + struct br_ip *group, + const unsigned char *src, + u8 filter_mode, + bool igmpv2_mldv1, + bool blocked) { struct net_bridge_port_group __rcu **pp; - struct net_bridge_port_group *p; + struct net_bridge_port_group *p = NULL; struct net_bridge_mdb_entry *mp; unsigned long now = jiffies; - int err; - spin_lock(&br->multicast_lock); if (!netif_running(br->dev) || (port && port->state == BR_STATE_DISABLED)) goto out; mp = br_multicast_new_group(br, group); - err = PTR_ERR(mp); if (IS_ERR(mp)) - goto err; + return ERR_PTR(PTR_ERR(mp)); if (!port) { br_multicast_host_join(mp, true); @@ -870,14 +1249,19 @@ static int br_multicast_add_group(struct net_bridge *br, pp = &p->next) { if (br_port_group_equal(p, port, src)) goto found; - if ((unsigned long)p->port < (unsigned long)port) + if ((unsigned long)p->key.port < (unsigned long)port) break; } - p = br_multicast_new_port_group(port, group, *pp, 0, src, filter_mode); - if (unlikely(!p)) - goto err; + p = br_multicast_new_port_group(port, group, *pp, 0, src, filter_mode, + RTPROT_KERNEL); + if (unlikely(!p)) { + p = ERR_PTR(-ENOMEM); + goto out; + } rcu_assign_pointer(*pp, p); + if (blocked) + p->flags |= MDB_PG_FLAGS_BLOCKED; br_mdb_notify(br->dev, mp, p, RTM_NEWMDB); found: @@ -885,10 +1269,26 @@ found: mod_timer(&p->timer, now + br->multicast_membership_interval); out: - err = 0; + return p; +} -err: +static int br_multicast_add_group(struct net_bridge *br, + struct net_bridge_port *port, + struct br_ip *group, + 
const unsigned char *src, + u8 filter_mode, + bool igmpv2_mldv1) +{ + struct net_bridge_port_group *pg; + int err; + + spin_lock(&br->multicast_lock); + pg = __br_multicast_add_group(br, port, group, src, filter_mode, + igmpv2_mldv1, false); + /* NULL is considered valid for host joined groups */ + err = IS_ERR(pg) ? PTR_ERR(pg) : 0; spin_unlock(&br->multicast_lock); + return err; } @@ -906,7 +1306,7 @@ static int br_ip4_multicast_add_group(struct net_bridge *br, return 0; memset(&br_group, 0, sizeof(br_group)); - br_group.u.ip4 = group; + br_group.dst.ip4 = group; br_group.proto = htons(ETH_P_IP); br_group.vid = vid; filter_mode = igmpv2 ? MCAST_EXCLUDE : MCAST_INCLUDE; @@ -930,7 +1330,7 @@ static int br_ip6_multicast_add_group(struct net_bridge *br, return 0; memset(&br_group, 0, sizeof(br_group)); - br_group.u.ip6 = *group; + br_group.dst.ip6 = *group; br_group.proto = htons(ETH_P_IPV6); br_group.vid = vid; filter_mode = mldv1 ? MCAST_EXCLUDE : MCAST_INCLUDE; @@ -1019,10 +1419,10 @@ static void br_multicast_select_own_querier(struct net_bridge *br, struct sk_buff *skb) { if (ip->proto == htons(ETH_P_IP)) - br->ip4_querier.addr.u.ip4 = ip_hdr(skb)->saddr; + br->ip4_querier.addr.src.ip4 = ip_hdr(skb)->saddr; #if IS_ENABLED(CONFIG_IPV6) else - br->ip6_querier.addr.u.ip6 = ipv6_hdr(skb)->saddr; + br->ip6_querier.addr.src.ip6 = ipv6_hdr(skb)->saddr; #endif } @@ -1079,7 +1479,7 @@ static void br_multicast_send_query(struct net_bridge *br, !br_opt_get(br, BROPT_MULTICAST_QUERIER)) return; - memset(&br_group.u, 0, sizeof(br_group.u)); + memset(&br_group.dst, 0, sizeof(br_group.dst)); if (port ? 
(own_query == &port->ip4_own_query) : (own_query == &br->ip4_own_query)) { @@ -1145,7 +1545,7 @@ static void br_multicast_port_group_rexmit(struct timer_list *t) { struct net_bridge_port_group *pg = from_timer(pg, t, rexmit_timer); struct bridge_mcast_other_query *other_query = NULL; - struct net_bridge *br = pg->port->br; + struct net_bridge *br = pg->key.port->br; bool need_rexmit = false; spin_lock(&br->multicast_lock); @@ -1154,7 +1554,7 @@ static void br_multicast_port_group_rexmit(struct timer_list *t) !br_opt_get(br, BROPT_MULTICAST_QUERIER)) goto out; - if (pg->addr.proto == htons(ETH_P_IP)) + if (pg->key.addr.proto == htons(ETH_P_IP)) other_query = &br->ip4_other_query; #if IS_ENABLED(CONFIG_IPV6) else @@ -1166,11 +1566,11 @@ static void br_multicast_port_group_rexmit(struct timer_list *t) if (pg->grp_query_rexmit_cnt) { pg->grp_query_rexmit_cnt--; - __br_multicast_send_query(br, pg->port, pg, &pg->addr, - &pg->addr, false, 1, NULL); + __br_multicast_send_query(br, pg->key.port, pg, &pg->key.addr, + &pg->key.addr, false, 1, NULL); } - __br_multicast_send_query(br, pg->port, pg, &pg->addr, - &pg->addr, true, 0, &need_rexmit); + __br_multicast_send_query(br, pg->key.port, pg, &pg->key.addr, + &pg->key.addr, true, 0, &need_rexmit); if (pg->grp_query_rexmit_cnt || need_rexmit) mod_timer(&pg->rexmit_timer, jiffies + @@ -1301,10 +1701,17 @@ static int __grp_src_delete_marked(struct net_bridge_port_group *pg) return deleted; } +static void __grp_src_mod_timer(struct net_bridge_group_src *src, + unsigned long expires) +{ + mod_timer(&src->timer, expires); + br_multicast_fwd_src_handle(src); +} + static void __grp_src_query_marked_and_rexmit(struct net_bridge_port_group *pg) { struct bridge_mcast_other_query *other_query = NULL; - struct net_bridge *br = pg->port->br; + struct net_bridge *br = pg->key.port->br; u32 lmqc = br->multicast_last_member_count; unsigned long lmqt, lmi, now = jiffies; struct net_bridge_group_src *ent; @@ -1313,7 +1720,7 @@ static void 
__grp_src_query_marked_and_rexmit(struct net_bridge_port_group *pg) !br_opt_get(br, BROPT_MULTICAST_ENABLED)) return; - if (pg->addr.proto == htons(ETH_P_IP)) + if (pg->key.addr.proto == htons(ETH_P_IP)) other_query = &br->ip4_other_query; #if IS_ENABLED(CONFIG_IPV6) else @@ -1329,7 +1736,7 @@ static void __grp_src_query_marked_and_rexmit(struct net_bridge_port_group *pg) other_query && !timer_pending(&other_query->timer)) ent->src_query_rexmit_cnt = lmqc; - mod_timer(&ent->timer, lmqt); + __grp_src_mod_timer(ent, lmqt); } } } @@ -1338,8 +1745,8 @@ static void __grp_src_query_marked_and_rexmit(struct net_bridge_port_group *pg) !other_query || timer_pending(&other_query->timer)) return; - __br_multicast_send_query(br, pg->port, pg, &pg->addr, - &pg->addr, true, 1, NULL); + __br_multicast_send_query(br, pg->key.port, pg, &pg->key.addr, + &pg->key.addr, true, 1, NULL); lmi = now + br->multicast_last_member_interval; if (!timer_pending(&pg->rexmit_timer) || @@ -1350,14 +1757,14 @@ static void __grp_src_query_marked_and_rexmit(struct net_bridge_port_group *pg) static void __grp_send_query_and_rexmit(struct net_bridge_port_group *pg) { struct bridge_mcast_other_query *other_query = NULL; - struct net_bridge *br = pg->port->br; + struct net_bridge *br = pg->key.port->br; unsigned long now = jiffies, lmi; if (!netif_running(br->dev) || !br_opt_get(br, BROPT_MULTICAST_ENABLED)) return; - if (pg->addr.proto == htons(ETH_P_IP)) + if (pg->key.addr.proto == htons(ETH_P_IP)) other_query = &br->ip4_other_query; #if IS_ENABLED(CONFIG_IPV6) else @@ -1368,8 +1775,8 @@ static void __grp_send_query_and_rexmit(struct net_bridge_port_group *pg) other_query && !timer_pending(&other_query->timer)) { lmi = now + br->multicast_last_member_interval; pg->grp_query_rexmit_cnt = br->multicast_last_member_count - 1; - __br_multicast_send_query(br, pg->port, pg, &pg->addr, - &pg->addr, false, 0, NULL); + __br_multicast_send_query(br, pg->key.port, pg, &pg->key.addr, + &pg->key.addr, false, 0, 
NULL); if (!timer_pending(&pg->rexmit_timer) || time_after(pg->rexmit_timer.expires, lmi)) mod_timer(&pg->rexmit_timer, lmi); @@ -1389,7 +1796,7 @@ static void __grp_send_query_and_rexmit(struct net_bridge_port_group *pg) static bool br_multicast_isinc_allow(struct net_bridge_port_group *pg, void *srcs, u32 nsrcs, size_t src_size) { - struct net_bridge *br = pg->port->br; + struct net_bridge *br = pg->key.port->br; struct net_bridge_group_src *ent; unsigned long now = jiffies; bool changed = false; @@ -1397,9 +1804,9 @@ static bool br_multicast_isinc_allow(struct net_bridge_port_group *pg, u32 src_idx; memset(&src_ip, 0, sizeof(src_ip)); - src_ip.proto = pg->addr.proto; + src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { - memcpy(&src_ip.u, srcs, src_size); + memcpy(&src_ip.src, srcs, src_size); ent = br_multicast_find_group_src(pg, &src_ip); if (!ent) { ent = br_multicast_new_group_src(pg, &src_ip); @@ -1408,7 +1815,7 @@ static bool br_multicast_isinc_allow(struct net_bridge_port_group *pg, } if (ent) - mod_timer(&ent->timer, now + br_multicast_gmi(br)); + __grp_src_mod_timer(ent, now + br_multicast_gmi(br)); srcs += src_size; } @@ -1431,14 +1838,16 @@ static void __grp_src_isexc_incl(struct net_bridge_port_group *pg, ent->flags |= BR_SGRP_F_DELETE; memset(&src_ip, 0, sizeof(src_ip)); - src_ip.proto = pg->addr.proto; + src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { - memcpy(&src_ip.u, srcs, src_size); + memcpy(&src_ip.src, srcs, src_size); ent = br_multicast_find_group_src(pg, &src_ip); if (ent) ent->flags &= ~BR_SGRP_F_DELETE; else - br_multicast_new_group_src(pg, &src_ip); + ent = br_multicast_new_group_src(pg, &src_ip); + if (ent) + br_multicast_fwd_src_handle(ent); srcs += src_size; } @@ -1454,7 +1863,7 @@ static void __grp_src_isexc_incl(struct net_bridge_port_group *pg, static bool __grp_src_isexc_excl(struct net_bridge_port_group *pg, void *srcs, u32 nsrcs, size_t src_size) { - struct 
net_bridge *br = pg->port->br; + struct net_bridge *br = pg->key.port->br; struct net_bridge_group_src *ent; unsigned long now = jiffies; bool changed = false; @@ -1465,17 +1874,17 @@ static bool __grp_src_isexc_excl(struct net_bridge_port_group *pg, ent->flags |= BR_SGRP_F_DELETE; memset(&src_ip, 0, sizeof(src_ip)); - src_ip.proto = pg->addr.proto; + src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { - memcpy(&src_ip.u, srcs, src_size); + memcpy(&src_ip.src, srcs, src_size); ent = br_multicast_find_group_src(pg, &src_ip); if (ent) { ent->flags &= ~BR_SGRP_F_DELETE; } else { ent = br_multicast_new_group_src(pg, &src_ip); if (ent) { - mod_timer(&ent->timer, - now + br_multicast_gmi(br)); + __grp_src_mod_timer(ent, + now + br_multicast_gmi(br)); changed = true; } } @@ -1491,12 +1900,13 @@ static bool __grp_src_isexc_excl(struct net_bridge_port_group *pg, static bool br_multicast_isexc(struct net_bridge_port_group *pg, void *srcs, u32 nsrcs, size_t src_size) { - struct net_bridge *br = pg->port->br; + struct net_bridge *br = pg->key.port->br; bool changed = false; switch (pg->filter_mode) { case MCAST_INCLUDE: __grp_src_isexc_incl(pg, srcs, nsrcs, src_size); + br_multicast_star_g_handle_mode(pg, MCAST_EXCLUDE); changed = true; break; case MCAST_EXCLUDE: @@ -1517,7 +1927,7 @@ static bool br_multicast_isexc(struct net_bridge_port_group *pg, static bool __grp_src_toin_incl(struct net_bridge_port_group *pg, void *srcs, u32 nsrcs, size_t src_size) { - struct net_bridge *br = pg->port->br; + struct net_bridge *br = pg->key.port->br; u32 src_idx, to_send = pg->src_ents; struct net_bridge_group_src *ent; unsigned long now = jiffies; @@ -1528,9 +1938,9 @@ static bool __grp_src_toin_incl(struct net_bridge_port_group *pg, ent->flags |= BR_SGRP_F_SEND; memset(&src_ip, 0, sizeof(src_ip)); - src_ip.proto = pg->addr.proto; + src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { - memcpy(&src_ip.u, srcs, src_size); + 
memcpy(&src_ip.src, srcs, src_size); ent = br_multicast_find_group_src(pg, &src_ip); if (ent) { ent->flags &= ~BR_SGRP_F_SEND; @@ -1541,7 +1951,7 @@ static bool __grp_src_toin_incl(struct net_bridge_port_group *pg, changed = true; } if (ent) - mod_timer(&ent->timer, now + br_multicast_gmi(br)); + __grp_src_mod_timer(ent, now + br_multicast_gmi(br)); srcs += src_size; } @@ -1559,7 +1969,7 @@ static bool __grp_src_toin_incl(struct net_bridge_port_group *pg, static bool __grp_src_toin_excl(struct net_bridge_port_group *pg, void *srcs, u32 nsrcs, size_t src_size) { - struct net_bridge *br = pg->port->br; + struct net_bridge *br = pg->key.port->br; u32 src_idx, to_send = pg->src_ents; struct net_bridge_group_src *ent; unsigned long now = jiffies; @@ -1571,9 +1981,9 @@ static bool __grp_src_toin_excl(struct net_bridge_port_group *pg, ent->flags |= BR_SGRP_F_SEND; memset(&src_ip, 0, sizeof(src_ip)); - src_ip.proto = pg->addr.proto; + src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { - memcpy(&src_ip.u, srcs, src_size); + memcpy(&src_ip.src, srcs, src_size); ent = br_multicast_find_group_src(pg, &src_ip); if (ent) { if (timer_pending(&ent->timer)) { @@ -1586,7 +1996,7 @@ static bool __grp_src_toin_excl(struct net_bridge_port_group *pg, changed = true; } if (ent) - mod_timer(&ent->timer, now + br_multicast_gmi(br)); + __grp_src_mod_timer(ent, now + br_multicast_gmi(br)); srcs += src_size; } @@ -1632,17 +2042,19 @@ static void __grp_src_toex_incl(struct net_bridge_port_group *pg, ent->flags = (ent->flags & ~BR_SGRP_F_SEND) | BR_SGRP_F_DELETE; memset(&src_ip, 0, sizeof(src_ip)); - src_ip.proto = pg->addr.proto; + src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { - memcpy(&src_ip.u, srcs, src_size); + memcpy(&src_ip.src, srcs, src_size); ent = br_multicast_find_group_src(pg, &src_ip); if (ent) { ent->flags = (ent->flags & ~BR_SGRP_F_DELETE) | BR_SGRP_F_SEND; to_send++; } else { - br_multicast_new_group_src(pg, 
&src_ip); + ent = br_multicast_new_group_src(pg, &src_ip); } + if (ent) + br_multicast_fwd_src_handle(ent); srcs += src_size; } @@ -1670,16 +2082,16 @@ static bool __grp_src_toex_excl(struct net_bridge_port_group *pg, ent->flags = (ent->flags & ~BR_SGRP_F_SEND) | BR_SGRP_F_DELETE; memset(&src_ip, 0, sizeof(src_ip)); - src_ip.proto = pg->addr.proto; + src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { - memcpy(&src_ip.u, srcs, src_size); + memcpy(&src_ip.src, srcs, src_size); ent = br_multicast_find_group_src(pg, &src_ip); if (ent) { ent->flags &= ~BR_SGRP_F_DELETE; } else { ent = br_multicast_new_group_src(pg, &src_ip); if (ent) { - mod_timer(&ent->timer, pg->timer.expires); + __grp_src_mod_timer(ent, pg->timer.expires); changed = true; } } @@ -1701,12 +2113,13 @@ static bool __grp_src_toex_excl(struct net_bridge_port_group *pg, static bool br_multicast_toex(struct net_bridge_port_group *pg, void *srcs, u32 nsrcs, size_t src_size) { - struct net_bridge *br = pg->port->br; + struct net_bridge *br = pg->key.port->br; bool changed = false; switch (pg->filter_mode) { case MCAST_INCLUDE: __grp_src_toex_incl(pg, srcs, nsrcs, src_size); + br_multicast_star_g_handle_mode(pg, MCAST_EXCLUDE); changed = true; break; case MCAST_EXCLUDE: @@ -1734,9 +2147,9 @@ static void __grp_src_block_incl(struct net_bridge_port_group *pg, ent->flags &= ~BR_SGRP_F_SEND; memset(&src_ip, 0, sizeof(src_ip)); - src_ip.proto = pg->addr.proto; + src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { - memcpy(&src_ip.u, srcs, src_size); + memcpy(&src_ip.src, srcs, src_size); ent = br_multicast_find_group_src(pg, &src_ip); if (ent) { ent->flags |= BR_SGRP_F_SEND; @@ -1749,7 +2162,7 @@ static void __grp_src_block_incl(struct net_bridge_port_group *pg, __grp_src_query_marked_and_rexmit(pg); if (pg->filter_mode == MCAST_INCLUDE && hlist_empty(&pg->src_list)) - br_multicast_find_del_pg(pg->port->br, pg); + 
br_multicast_find_del_pg(pg->key.port->br, pg); } /* State Msg type New state Actions @@ -1768,14 +2181,14 @@ static bool __grp_src_block_excl(struct net_bridge_port_group *pg, ent->flags &= ~BR_SGRP_F_SEND; memset(&src_ip, 0, sizeof(src_ip)); - src_ip.proto = pg->addr.proto; + src_ip.proto = pg->key.addr.proto; for (src_idx = 0; src_idx < nsrcs; src_idx++) { - memcpy(&src_ip.u, srcs, src_size); + memcpy(&src_ip.src, srcs, src_size); ent = br_multicast_find_group_src(pg, &src_ip); if (!ent) { ent = br_multicast_new_group_src(pg, &src_ip); if (ent) { - mod_timer(&ent->timer, pg->timer.expires); + __grp_src_mod_timer(ent, pg->timer.expires); changed = true; } } @@ -2071,16 +2484,16 @@ static bool br_ip4_multicast_select_querier(struct net_bridge *br, !timer_pending(&br->ip4_other_query.timer)) goto update; - if (!br->ip4_querier.addr.u.ip4) + if (!br->ip4_querier.addr.src.ip4) goto update; - if (ntohl(saddr) <= ntohl(br->ip4_querier.addr.u.ip4)) + if (ntohl(saddr) <= ntohl(br->ip4_querier.addr.src.ip4)) goto update; return false; update: - br->ip4_querier.addr.u.ip4 = saddr; + br->ip4_querier.addr.src.ip4 = saddr; /* update protected by general multicast_lock by caller */ rcu_assign_pointer(br->ip4_querier.port, port); @@ -2097,13 +2510,13 @@ static bool br_ip6_multicast_select_querier(struct net_bridge *br, !timer_pending(&br->ip6_other_query.timer)) goto update; - if (ipv6_addr_cmp(saddr, &br->ip6_querier.addr.u.ip6) <= 0) + if (ipv6_addr_cmp(saddr, &br->ip6_querier.addr.src.ip6) <= 0) goto update; return false; update: - br->ip6_querier.addr.u.ip6 = *saddr; + br->ip6_querier.addr.src.ip6 = *saddr; /* update protected by general multicast_lock by caller */ rcu_assign_pointer(br->ip6_querier.port, port); @@ -2118,10 +2531,10 @@ static bool br_multicast_select_querier(struct net_bridge *br, { switch (saddr->proto) { case htons(ETH_P_IP): - return br_ip4_multicast_select_querier(br, port, saddr->u.ip4); + return br_ip4_multicast_select_querier(br, port, 
saddr->src.ip4); #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): - return br_ip6_multicast_select_querier(br, port, &saddr->u.ip6); + return br_ip6_multicast_select_querier(br, port, &saddr->src.ip6); #endif } @@ -2263,7 +2676,7 @@ static void br_ip4_multicast_query(struct net_bridge *br, if (!group) { saddr.proto = htons(ETH_P_IP); - saddr.u.ip4 = iph->saddr; + saddr.src.ip4 = iph->saddr; br_multicast_query_received(br, port, &br->ip4_other_query, &saddr, max_delay); @@ -2351,7 +2764,7 @@ static int br_ip6_multicast_query(struct net_bridge *br, if (is_general_query) { saddr.proto = htons(ETH_P_IPV6); - saddr.u.ip6 = ipv6_hdr(skb)->saddr; + saddr.src.ip6 = ipv6_hdr(skb)->saddr; br_multicast_query_received(br, port, &br->ip6_other_query, &saddr, max_delay); @@ -2475,7 +2888,7 @@ br_multicast_leave_group(struct net_bridge *br, for (p = mlock_dereference(mp->ports, br); p != NULL; p = mlock_dereference(p->next, br)) { - if (p->port != port) + if (p->key.port != port) continue; if (!hlist_unhashed(&p->mglist) && @@ -2506,7 +2919,7 @@ static void br_ip4_multicast_leave_group(struct net_bridge *br, own_query = port ? &port->ip4_own_query : &br->ip4_own_query; memset(&br_group, 0, sizeof(br_group)); - br_group.u.ip4 = group; + br_group.dst.ip4 = group; br_group.proto = htons(ETH_P_IP); br_group.vid = vid; @@ -2530,7 +2943,7 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br, own_query = port ? 
&port->ip6_own_query : &br->ip6_own_query; memset(&br_group, 0, sizeof(br_group)); - br_group.u.ip6 = *group; + br_group.dst.ip6 = *group; br_group.proto = htons(ETH_P_IPV6); br_group.vid = vid; @@ -3235,7 +3648,7 @@ int br_multicast_list_adjacent(struct net_device *dev, if (!entry) goto unlock; - entry->addr = group->addr; + entry->addr = group->key.addr; list_add(&entry->list, br_ip_list); count++; } @@ -3492,10 +3905,23 @@ void br_multicast_get_stats(const struct net_bridge *br, int br_mdb_hash_init(struct net_bridge *br) { - return rhashtable_init(&br->mdb_hash_tbl, &br_mdb_rht_params); + int err; + + err = rhashtable_init(&br->sg_port_tbl, &br_sg_port_rht_params); + if (err) + return err; + + err = rhashtable_init(&br->mdb_hash_tbl, &br_mdb_rht_params); + if (err) { + rhashtable_destroy(&br->sg_port_tbl); + return err; + } + + return 0; } void br_mdb_hash_fini(struct net_bridge *br) { + rhashtable_destroy(&br->sg_port_tbl); rhashtable_destroy(&br->mdb_hash_tbl); } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index a23d2bae56e1..345118e35c42 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -213,11 +213,14 @@ struct net_bridge_fdb_entry { #define MDB_PG_FLAGS_PERMANENT BIT(0) #define MDB_PG_FLAGS_OFFLOAD BIT(1) #define MDB_PG_FLAGS_FAST_LEAVE BIT(2) +#define MDB_PG_FLAGS_STAR_EXCL BIT(3) +#define MDB_PG_FLAGS_BLOCKED BIT(4) #define PG_SRC_ENT_LIMIT 32 #define BR_SGRP_F_DELETE BIT(0) #define BR_SGRP_F_SEND BIT(1) +#define BR_SGRP_F_INSTALLED BIT(2) struct net_bridge_mcast_gc { struct hlist_node gc_node; @@ -238,14 +241,19 @@ struct net_bridge_group_src { struct rcu_head rcu; }; -struct net_bridge_port_group { +struct net_bridge_port_group_sg_key { struct net_bridge_port *port; - struct net_bridge_port_group __rcu *next; struct br_ip addr; +}; + +struct net_bridge_port_group { + struct net_bridge_port_group __rcu *next; + struct net_bridge_port_group_sg_key key; unsigned char eth_addr[ETH_ALEN] __aligned(2); unsigned char 
flags; unsigned char filter_mode; unsigned char grp_query_rexmit_cnt; + unsigned char rt_protocol; struct hlist_head src_list; unsigned int src_ents; @@ -253,6 +261,7 @@ struct net_bridge_port_group { struct timer_list rexmit_timer; struct hlist_node mglist; + struct rhash_head rhnode; struct net_bridge_mcast_gc mcast_gc; struct rcu_head rcu; }; @@ -440,6 +449,7 @@ struct net_bridge { unsigned long multicast_startup_query_interval; struct rhashtable mdb_hash_tbl; + struct rhashtable sg_port_tbl; struct hlist_head mcast_gc_list; struct hlist_head mdb_list; @@ -804,7 +814,7 @@ struct net_bridge_port_group * br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group, struct net_bridge_port_group __rcu *next, unsigned char flags, const unsigned char *src, - u8 filter_mode); + u8 filter_mode, u8 rt_protocol); int br_mdb_hash_init(struct net_bridge *br); void br_mdb_hash_fini(struct net_bridge *br); void br_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, @@ -825,6 +835,10 @@ void br_mdb_init(void); void br_mdb_uninit(void); void br_multicast_host_join(struct net_bridge_mdb_entry *mp, bool notify); void br_multicast_host_leave(struct net_bridge_mdb_entry *mp, bool notify); +void br_multicast_star_g_handle_mode(struct net_bridge_port_group *pg, + u8 filter_mode); +void br_multicast_sg_add_exclude_ports(struct net_bridge_mdb_entry *star_mp, + struct net_bridge_port_group *sg); #define mlock_dereference(X, br) \ rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock)) @@ -873,6 +887,35 @@ static inline bool br_multicast_querier_exists(struct net_bridge *br, } } +static inline bool br_multicast_is_star_g(const struct br_ip *ip) +{ + switch (ip->proto) { + case htons(ETH_P_IP): + return ipv4_is_zeronet(ip->src.ip4); +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + return ipv6_addr_any(&ip->src.ip6); +#endif + default: + return false; + } +} + +static inline bool br_multicast_should_handle_mode(const struct net_bridge 
*br, + __be16 proto) +{ + switch (proto) { + case htons(ETH_P_IP): + return !!(br->multicast_igmp_version == 3); +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + return !!(br->multicast_mld_version == 2); +#endif + default: + return false; + } +} + static inline int br_multicast_igmp_type(const struct sk_buff *skb) { return BR_INPUT_SKB_CB(skb)->igmp; diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index a0d1a3265b71..838efc682cff 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -12,7 +12,6 @@ #include <net/sock.h> #include <uapi/linux/sock_diag.h> #include <uapi/linux/btf.h> -#include <linux/btf_ids.h> DEFINE_BPF_STORAGE_CACHE(sk_cache); @@ -379,19 +378,15 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = { .arg2_type = ARG_PTR_TO_SOCKET, }; -BTF_ID_LIST(sk_storage_btf_ids) -BTF_ID_UNUSED -BTF_ID(struct, sock) - const struct bpf_func_proto sk_storage_get_btf_proto = { .func = bpf_sk_storage_get, .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK], .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, - .btf_id = sk_storage_btf_ids, }; const struct bpf_func_proto sk_storage_delete_btf_proto = { @@ -400,7 +395,7 @@ const struct bpf_func_proto sk_storage_delete_btf_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID, - .btf_id = sk_storage_btf_ids, + .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK], }; struct bpf_sk_storage_diag { @@ -679,6 +674,7 @@ struct bpf_iter_seq_sk_storage_map_info { static struct bpf_local_storage_elem * bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info, struct bpf_local_storage_elem *prev_selem) + __acquires(RCU) __releases(RCU) { struct bpf_local_storage *sk_storage; struct bpf_local_storage_elem *selem; @@ -697,16 +693,16 @@ bpf_sk_storage_map_seq_find_next(struct 
bpf_iter_seq_sk_storage_map_info *info, selem = prev_selem; count = 0; while (selem) { - selem = hlist_entry_safe(selem->map_node.next, + selem = hlist_entry_safe(rcu_dereference(hlist_next_rcu(&selem->map_node)), struct bpf_local_storage_elem, map_node); if (!selem) { /* not found, unlock and go to the next bucket */ b = &smap->buckets[bucket_id++]; - raw_spin_unlock_bh(&b->lock); + rcu_read_unlock(); skip_elems = 0; break; } - sk_storage = rcu_dereference_raw(selem->local_storage); + sk_storage = rcu_dereference(selem->local_storage); if (sk_storage) { info->skip_elems = skip_elems + count; return selem; @@ -716,10 +712,10 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info, for (i = bucket_id; i < (1U << smap->bucket_log); i++) { b = &smap->buckets[i]; - raw_spin_lock_bh(&b->lock); + rcu_read_lock(); count = 0; - hlist_for_each_entry(selem, &b->list, map_node) { - sk_storage = rcu_dereference_raw(selem->local_storage); + hlist_for_each_entry_rcu(selem, &b->list, map_node) { + sk_storage = rcu_dereference(selem->local_storage); if (sk_storage && count >= skip_elems) { info->bucket_id = i; info->skip_elems = count; @@ -727,7 +723,7 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info, } count++; } - raw_spin_unlock_bh(&b->lock); + rcu_read_unlock(); skip_elems = 0; } @@ -786,7 +782,7 @@ static int __bpf_sk_storage_map_seq_show(struct seq_file *seq, ctx.meta = &meta; ctx.map = info->map; if (selem) { - sk_storage = rcu_dereference_raw(selem->local_storage); + sk_storage = rcu_dereference(selem->local_storage); ctx.sk = sk_storage->owner; ctx.value = SDATA(selem)->data; } @@ -802,18 +798,12 @@ static int bpf_sk_storage_map_seq_show(struct seq_file *seq, void *v) } static void bpf_sk_storage_map_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) { - struct bpf_iter_seq_sk_storage_map_info *info = seq->private; - struct bpf_local_storage_map *smap; - struct bpf_local_storage_map_bucket *b; - - if (!v) { 
+ if (!v) (void)__bpf_sk_storage_map_seq_show(seq, v); - } else { - smap = (struct bpf_local_storage_map *)info->map; - b = &smap->buckets[info->bucket_id]; - raw_spin_unlock_bh(&b->lock); - } + else + rcu_read_unlock(); } static int bpf_iter_init_sk_storage_map(void *priv_data, diff --git a/net/core/dev.c b/net/core/dev.c index 38a172a63318..873b50ac9668 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5442,15 +5442,20 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) if (new) { u32 i; + mutex_lock(&new->aux->used_maps_mutex); + /* generic XDP does not work with DEVMAPs that can * have a bpf_prog installed on an entry */ for (i = 0; i < new->aux->used_map_cnt; i++) { - if (dev_map_can_have_prog(new->aux->used_maps[i])) - return -EINVAL; - if (cpu_map_prog_allowed(new->aux->used_maps[i])) + if (dev_map_can_have_prog(new->aux->used_maps[i]) || + cpu_map_prog_allowed(new->aux->used_maps[i])) { + mutex_unlock(&new->aux->used_maps_mutex); return -EINVAL; + } } + + mutex_unlock(&new->aux->used_maps_mutex); } switch (xdp->command) { @@ -10016,6 +10021,8 @@ int netdev_refcnt_read(const struct net_device *dev) } EXPORT_SYMBOL(netdev_refcnt_read); +#define WAIT_REFS_MIN_MSECS 1 +#define WAIT_REFS_MAX_MSECS 250 /** * netdev_wait_allrefs - wait until all references are gone. * @dev: target net_device @@ -10028,8 +10035,6 @@ EXPORT_SYMBOL(netdev_refcnt_read); * We can get stuck here if buggy protocols don't correctly * call dev_put. 
*/ -#define WAIT_REFS_MIN_MSECS 1 -#define WAIT_REFS_MAX_MSECS 250 static void netdev_wait_allrefs(struct net_device *dev) { unsigned long rebroadcast_time, warning_time; diff --git a/net/core/filter.c b/net/core/filter.c index 08f577114acc..706f8db0ccf8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3803,19 +3803,18 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; -BTF_ID_LIST(bpf_skb_output_btf_ids) -BTF_ID(struct, sk_buff) +BTF_ID_LIST_SINGLE(bpf_skb_output_btf_ids, struct, sk_buff) const struct bpf_func_proto bpf_skb_output_proto = { .func = bpf_skb_event_output, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_skb_output_btf_ids[0], .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM, .arg5_type = ARG_CONST_SIZE_OR_ZERO, - .btf_id = bpf_skb_output_btf_ids, }; static unsigned short bpf_tunnel_key_af(u64 flags) @@ -4199,19 +4198,18 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; -BTF_ID_LIST(bpf_xdp_output_btf_ids) -BTF_ID(struct, xdp_buff) +BTF_ID_LIST_SINGLE(bpf_xdp_output_btf_ids, struct, xdp_buff) const struct bpf_func_proto bpf_xdp_output_proto = { .func = bpf_xdp_event_output, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_xdp_output_btf_ids[0], .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM, .arg5_type = ARG_CONST_SIZE_OR_ZERO, - .btf_id = bpf_xdp_output_btf_ids, }; BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb) @@ -4313,10 +4311,8 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -#define SOCKOPT_CC_REINIT (1 << 0) - static int _bpf_setsockopt(struct sock *sk, int level, int optname, - char *optval, int optlen, u32 flags) + char *optval, int optlen) { char devname[IFNAMSIZ]; int val, valbool; @@ 
-4449,13 +4445,11 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, sk->sk_prot->setsockopt == tcp_setsockopt) { if (optname == TCP_CONGESTION) { char name[TCP_CA_NAME_MAX]; - bool reinit = flags & SOCKOPT_CC_REINIT; strncpy(name, optval, min_t(long, optlen, TCP_CA_NAME_MAX-1)); name[TCP_CA_NAME_MAX-1] = 0; - ret = tcp_set_congestion_control(sk, name, false, - reinit, true); + ret = tcp_set_congestion_control(sk, name, false, true); } else { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -4615,9 +4609,7 @@ err_clear: BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx, int, level, int, optname, char *, optval, int, optlen) { - u32 flags = 0; - return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen, - flags); + return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = { @@ -4651,11 +4643,7 @@ static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = { BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { - u32 flags = 0; - if (bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN) - flags |= SOCKOPT_CC_REINIT; - return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen, - flags); + return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = { @@ -9908,17 +9896,6 @@ BTF_SOCK_TYPE_xxx u32 btf_sock_ids[MAX_BTF_SOCK_TYPE]; #endif -static bool check_arg_btf_id(u32 btf_id, u32 arg) -{ - int i; - - /* only one argument, no need to check arg */ - for (i = 0; i < MAX_BTF_SOCK_TYPE; i++) - if (btf_sock_ids[i] == btf_id) - return true; - return false; -} - BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk) { /* tcp6_sock type is not generated in dwarf and hence btf, @@ -9937,7 +9914,7 @@ const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = { .gpl_only = 
false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID, - .check_btf_id = check_arg_btf_id, + .arg1_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP6], }; @@ -9954,7 +9931,7 @@ const struct bpf_func_proto bpf_skc_to_tcp_sock_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID, - .check_btf_id = check_arg_btf_id, + .arg1_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP], }; @@ -9978,7 +9955,7 @@ const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID, - .check_btf_id = check_arg_btf_id, + .arg1_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_TW], }; @@ -10002,7 +9979,7 @@ const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID, - .check_btf_id = check_arg_btf_id, + .arg1_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ], }; @@ -10024,6 +10001,6 @@ const struct bpf_func_proto bpf_skc_to_udp6_sock_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID, - .check_btf_id = check_arg_btf_id, + .arg1_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UDP6], }; diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 078386d7d9a2..e1f05e3fa1d0 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -2,6 +2,7 @@ /* Copyright (c) 2017 - 2018 Covalent IO, Inc. 
http://covalent.io */ #include <linux/bpf.h> +#include <linux/btf_ids.h> #include <linux/filter.h> #include <linux/errno.h> #include <linux/file.h> @@ -382,7 +383,7 @@ static void *sock_map_lookup(struct bpf_map *map, void *key) struct sock *sk; sk = __sock_map_lookup_elem(map, *(u32 *)key); - if (!sk || !sk_fullsock(sk)) + if (!sk) return NULL; if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) return NULL; @@ -703,6 +704,109 @@ const struct bpf_func_proto bpf_msg_redirect_map_proto = { .arg4_type = ARG_ANYTHING, }; +struct sock_map_seq_info { + struct bpf_map *map; + struct sock *sk; + u32 index; +}; + +struct bpf_iter__sockmap { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct bpf_map *, map); + __bpf_md_ptr(void *, key); + __bpf_md_ptr(struct sock *, sk); +}; + +DEFINE_BPF_ITER_FUNC(sockmap, struct bpf_iter_meta *meta, + struct bpf_map *map, void *key, + struct sock *sk) + +static void *sock_map_seq_lookup_elem(struct sock_map_seq_info *info) +{ + if (unlikely(info->index >= info->map->max_entries)) + return NULL; + + info->sk = __sock_map_lookup_elem(info->map, info->index); + + /* can't return sk directly, since that might be NULL */ + return info; +} + +static void *sock_map_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct sock_map_seq_info *info = seq->private; + + if (*pos == 0) + ++*pos; + + /* pairs with sock_map_seq_stop */ + rcu_read_lock(); + return sock_map_seq_lookup_elem(info); +} + +static void *sock_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct sock_map_seq_info *info = seq->private; + + ++*pos; + ++info->index; + + return sock_map_seq_lookup_elem(info); +} + +static int sock_map_seq_show(struct seq_file *seq, void *v) +{ + struct sock_map_seq_info *info = seq->private; + struct bpf_iter__sockmap ctx = {}; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, !v); + if (!prog) + return 0; + + ctx.meta = &meta; + ctx.map = 
info->map; + if (v) { + ctx.key = &info->index; + ctx.sk = info->sk; + } + + return bpf_iter_run_prog(prog, &ctx); +} + +static void sock_map_seq_stop(struct seq_file *seq, void *v) +{ + if (!v) + (void)sock_map_seq_show(seq, NULL); + + /* pairs with sock_map_seq_start */ + rcu_read_unlock(); +} + +static const struct seq_operations sock_map_seq_ops = { + .start = sock_map_seq_start, + .next = sock_map_seq_next, + .stop = sock_map_seq_stop, + .show = sock_map_seq_show, +}; + +static int sock_map_init_seq_private(void *priv_data, + struct bpf_iter_aux_info *aux) +{ + struct sock_map_seq_info *info = priv_data; + + info->map = aux->map; + return 0; +} + +static const struct bpf_iter_seq_info sock_map_iter_seq_info = { + .seq_ops = &sock_map_seq_ops, + .init_seq_private = sock_map_init_seq_private, + .seq_priv_size = sizeof(struct sock_map_seq_info), +}; + static int sock_map_btf_id; const struct bpf_map_ops sock_map_ops = { .map_meta_equal = bpf_map_meta_equal, @@ -717,6 +821,7 @@ const struct bpf_map_ops sock_map_ops = { .map_check_btf = map_check_no_btf, .map_btf_name = "bpf_stab", .map_btf_id = &sock_map_btf_id, + .iter_seq_info = &sock_map_iter_seq_info, }; struct bpf_shtab_elem { @@ -953,7 +1058,7 @@ static int sock_hash_get_next_key(struct bpf_map *map, void *key, if (!elem) goto find_first_elem; - elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&elem->node)), + elem_next = hlist_entry_safe(rcu_dereference(hlist_next_rcu(&elem->node)), struct bpf_shtab_elem, node); if (elem_next) { memcpy(key_next, elem_next->key, key_size); @@ -965,7 +1070,7 @@ static int sock_hash_get_next_key(struct bpf_map *map, void *key, find_first_elem: for (; i < htab->buckets_num; i++) { head = &sock_hash_select_bucket(htab, i)->head; - elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), + elem_next = hlist_entry_safe(rcu_dereference(hlist_first_rcu(head)), struct bpf_shtab_elem, node); if (elem_next) { memcpy(key_next, elem_next->key, key_size); @@ 
-1110,7 +1215,7 @@ static void *sock_hash_lookup(struct bpf_map *map, void *key) struct sock *sk; sk = __sock_hash_lookup_elem(map, key); - if (!sk || !sk_fullsock(sk)) + if (!sk) return NULL; if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) return NULL; @@ -1199,6 +1304,117 @@ const struct bpf_func_proto bpf_msg_redirect_hash_proto = { .arg4_type = ARG_ANYTHING, }; +struct sock_hash_seq_info { + struct bpf_map *map; + struct bpf_shtab *htab; + u32 bucket_id; +}; + +static void *sock_hash_seq_find_next(struct sock_hash_seq_info *info, + struct bpf_shtab_elem *prev_elem) +{ + const struct bpf_shtab *htab = info->htab; + struct bpf_shtab_bucket *bucket; + struct bpf_shtab_elem *elem; + struct hlist_node *node; + + /* try to find next elem in the same bucket */ + if (prev_elem) { + node = rcu_dereference(hlist_next_rcu(&prev_elem->node)); + elem = hlist_entry_safe(node, struct bpf_shtab_elem, node); + if (elem) + return elem; + + /* no more elements, continue in the next bucket */ + info->bucket_id++; + } + + for (; info->bucket_id < htab->buckets_num; info->bucket_id++) { + bucket = &htab->buckets[info->bucket_id]; + node = rcu_dereference(hlist_first_rcu(&bucket->head)); + elem = hlist_entry_safe(node, struct bpf_shtab_elem, node); + if (elem) + return elem; + } + + return NULL; +} + +static void *sock_hash_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct sock_hash_seq_info *info = seq->private; + + if (*pos == 0) + ++*pos; + + /* pairs with sock_hash_seq_stop */ + rcu_read_lock(); + return sock_hash_seq_find_next(info, NULL); +} + +static void *sock_hash_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct sock_hash_seq_info *info = seq->private; + + ++*pos; + return sock_hash_seq_find_next(info, v); +} + +static int sock_hash_seq_show(struct seq_file *seq, void *v) +{ + struct sock_hash_seq_info *info = seq->private; + struct bpf_iter__sockmap ctx = {}; + struct bpf_shtab_elem *elem = v; + struct bpf_iter_meta meta; + struct 
bpf_prog *prog; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, !elem); + if (!prog) + return 0; + + ctx.meta = &meta; + ctx.map = info->map; + if (elem) { + ctx.key = elem->key; + ctx.sk = elem->sk; + } + + return bpf_iter_run_prog(prog, &ctx); +} + +static void sock_hash_seq_stop(struct seq_file *seq, void *v) +{ + if (!v) + (void)sock_hash_seq_show(seq, NULL); + + /* pairs with sock_hash_seq_start */ + rcu_read_unlock(); +} + +static const struct seq_operations sock_hash_seq_ops = { + .start = sock_hash_seq_start, + .next = sock_hash_seq_next, + .stop = sock_hash_seq_stop, + .show = sock_hash_seq_show, +}; + +static int sock_hash_init_seq_private(void *priv_data, + struct bpf_iter_aux_info *aux) +{ + struct sock_hash_seq_info *info = priv_data; + + info->map = aux->map; + info->htab = container_of(aux->map, struct bpf_shtab, map); + return 0; +} + +static const struct bpf_iter_seq_info sock_hash_iter_seq_info = { + .seq_ops = &sock_hash_seq_ops, + .init_seq_private = sock_hash_init_seq_private, + .seq_priv_size = sizeof(struct sock_hash_seq_info), +}; + static int sock_hash_map_btf_id; const struct bpf_map_ops sock_hash_ops = { .map_meta_equal = bpf_map_meta_equal, @@ -1213,6 +1429,7 @@ const struct bpf_map_ops sock_hash_ops = { .map_check_btf = map_check_no_btf, .map_btf_name = "bpf_shtab", .map_btf_id = &sock_hash_map_btf_id, + .iter_seq_info = &sock_hash_iter_seq_info, }; static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) @@ -1323,3 +1540,62 @@ void sock_map_close(struct sock *sk, long timeout) release_sock(sk); saved_close(sk, timeout); } + +static int sock_map_iter_attach_target(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux) +{ + struct bpf_map *map; + int err = -EINVAL; + + if (!linfo->map.map_fd) + return -EBADF; + + map = bpf_map_get_with_uref(linfo->map.map_fd); + if (IS_ERR(map)) + return PTR_ERR(map); + + if (map->map_type != BPF_MAP_TYPE_SOCKMAP && + map->map_type != 
BPF_MAP_TYPE_SOCKHASH) + goto put_map; + + if (prog->aux->max_rdonly_access > map->key_size) { + err = -EACCES; + goto put_map; + } + + aux->map = map; + return 0; + +put_map: + bpf_map_put_with_uref(map); + return err; +} + +static void sock_map_iter_detach_target(struct bpf_iter_aux_info *aux) +{ + bpf_map_put_with_uref(aux->map); +} + +static struct bpf_iter_reg sock_map_iter_reg = { + .target = "sockmap", + .attach_target = sock_map_iter_attach_target, + .detach_target = sock_map_iter_detach_target, + .show_fdinfo = bpf_iter_map_show_fdinfo, + .fill_link_info = bpf_iter_map_fill_link_info, + .ctx_arg_info_size = 2, + .ctx_arg_info = { + { offsetof(struct bpf_iter__sockmap, key), + PTR_TO_RDONLY_BUF_OR_NULL }, + { offsetof(struct bpf_iter__sockmap, sk), + PTR_TO_BTF_ID_OR_NULL }, + }, +}; + +static int __init bpf_sockmap_iter_init(void) +{ + sock_map_iter_reg.ctx_arg_info[1].btf_id = + btf_sock_ids[BTF_SOCK_TYPE_SOCK]; + return bpf_iter_reg_target(&sock_map_iter_reg); +} +late_initcall(bpf_sockmap_iter_init); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 2da656d984ef..0348dbab4131 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -7,6 +7,7 @@ #ifndef __DSA_PRIV_H #define __DSA_PRIV_H +#include <linux/if_bridge.h> #include <linux/phy.h> #include <linux/netdevice.h> #include <linux/netpoll.h> @@ -194,6 +195,71 @@ dsa_slave_to_master(const struct net_device *dev) return dp->cpu_dp->master; } +/* If under a bridge with vlan_filtering=0, make sure to send pvid-tagged + * frames as untagged, since the bridge will not untag them. 
+ */ +static inline struct sk_buff *dsa_untag_bridge_pvid(struct sk_buff *skb) +{ + struct dsa_port *dp = dsa_slave_to_port(skb->dev); + struct vlan_ethhdr *hdr = vlan_eth_hdr(skb); + struct net_device *br = dp->bridge_dev; + struct net_device *dev = skb->dev; + struct net_device *upper_dev; + struct list_head *iter; + u16 vid, pvid, proto; + int err; + + if (!br || br_vlan_enabled(br)) + return skb; + + err = br_vlan_get_proto(br, &proto); + if (err) + return skb; + + /* Move VLAN tag from data to hwaccel */ + if (!skb_vlan_tag_present(skb) && hdr->h_vlan_proto == htons(proto)) { + skb = skb_vlan_untag(skb); + if (!skb) + return NULL; + } + + if (!skb_vlan_tag_present(skb)) + return skb; + + vid = skb_vlan_tag_get_id(skb); + + /* We already run under an RCU read-side critical section since + * we are called from netif_receive_skb_list_internal(). + */ + err = br_vlan_get_pvid_rcu(dev, &pvid); + if (err) + return skb; + + if (vid != pvid) + return skb; + + /* The sad part about attempting to untag from DSA is that we + * don't know, unless we check, if the skb will end up in + * the bridge's data path - br_allowed_ingress() - or not. + * For example, there might be an 8021q upper for the + * default_pvid of the bridge, which will steal VLAN-tagged traffic + * from the bridge's data path. This is a configuration that DSA + * supports because vlan_filtering is 0. In that case, we should + * definitely keep the tag, to make sure it keeps working. 
+ */ + netdev_for_each_upper_dev_rcu(dev, upper_dev, iter) { + if (!is_vlan_dev(upper_dev)) + continue; + + if (vid == vlan_dev_vlan_id(upper_dev)) + return skb; + } + + __vlan_hwaccel_clear_tag(skb); + + return skb; +} + /* switch.c */ int dsa_switch_register_notifier(struct dsa_switch *ds); void dsa_switch_unregister_notifier(struct dsa_switch *ds); diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index cc8512b5f9e2..1dab212a294f 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -140,6 +140,11 @@ static struct sk_buff *brcm_tag_rcv_ll(struct sk_buff *skb, /* Remove Broadcom tag and update checksum */ skb_pull_rcsum(skb, BRCM_TAG_LEN); + /* Set the MAC header to where it should point for + * dsa_untag_bridge_pvid() to parse the correct VLAN header. + */ + skb_set_mac_header(skb, -ETH_HLEN); + skb->offload_fwd_mark = 1; return skb; @@ -191,7 +196,7 @@ static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, nskb->data - ETH_HLEN - BRCM_TAG_LEN, 2 * ETH_ALEN); - return nskb; + return dsa_untag_bridge_pvid(nskb); } static const struct dsa_device_ops brcm_netdev_ops = { @@ -219,8 +224,14 @@ static struct sk_buff *brcm_tag_rcv_prepend(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { + struct sk_buff *nskb; + /* tag is prepended to the packet */ - return brcm_tag_rcv_ll(skb, dev, pt, ETH_HLEN); + nskb = brcm_tag_rcv_ll(skb, dev, pt, ETH_HLEN); + if (!nskb) + return nskb; + + return dsa_untag_bridge_pvid(nskb); } static const struct dsa_device_ops brcm_prepend_netdev_ops = { diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c index b4fc05cafaa6..d1a7e224adff 100644 --- a/net/dsa/tag_ocelot.c +++ b/net/dsa/tag_ocelot.c @@ -137,6 +137,7 @@ static struct sk_buff *ocelot_xmit(struct sk_buff *skb, struct net_device *netdev) { struct dsa_port *dp = dsa_slave_to_port(netdev); + struct sk_buff *clone = DSA_SKB_CB(skb)->clone; struct dsa_switch *ds = dp->ds; struct ocelot *ocelot = ds->priv; struct ocelot_port 
*ocelot_port; @@ -159,9 +160,8 @@ static struct sk_buff *ocelot_xmit(struct sk_buff *skb, qos_class = skb->priority; packing(injection, &qos_class, 19, 17, OCELOT_TAG_LEN, PACK, 0); - if (ocelot->ptp && (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) { - struct sk_buff *clone = DSA_SKB_CB(skb)->clone; - + /* TX timestamping was requested */ + if (clone) { rew_op = ocelot_port->ptp_cmd; /* Retrieve timestamp ID populated inside skb->cb[0] of the * clone by ocelot_port_add_txtstamp_skb diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index e3939f76b024..74a2ef598c31 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -28,23 +28,18 @@ static u32 unsupported_ops[] = { static const struct btf_type *tcp_sock_type; static u32 tcp_sock_id, sock_id; -static int btf_sk_storage_get_ids[5]; static struct bpf_func_proto btf_sk_storage_get_proto __read_mostly; - -static int btf_sk_storage_delete_ids[5]; static struct bpf_func_proto btf_sk_storage_delete_proto __read_mostly; -static void convert_sk_func_proto(struct bpf_func_proto *to, int *to_btf_ids, - const struct bpf_func_proto *from) +static void convert_sk_func_proto(struct bpf_func_proto *to, const struct bpf_func_proto *from) { int i; *to = *from; - to->btf_id = to_btf_ids; for (i = 0; i < ARRAY_SIZE(to->arg_type); i++) { if (to->arg_type[i] == ARG_PTR_TO_SOCKET) { to->arg_type[i] = ARG_PTR_TO_BTF_ID; - to->btf_id[i] = tcp_sock_id; + to->arg_btf_id[i] = &tcp_sock_id; } } } @@ -64,12 +59,8 @@ static int bpf_tcp_ca_init(struct btf *btf) tcp_sock_id = type_id; tcp_sock_type = btf_type_by_id(btf, tcp_sock_id); - convert_sk_func_proto(&btf_sk_storage_get_proto, - btf_sk_storage_get_ids, - &bpf_sk_storage_get_proto); - convert_sk_func_proto(&btf_sk_storage_delete_proto, - btf_sk_storage_delete_ids, - &bpf_sk_storage_delete_proto); + convert_sk_func_proto(&btf_sk_storage_get_proto, &bpf_sk_storage_get_proto); + convert_sk_func_proto(&btf_sk_storage_delete_proto, &bpf_sk_storage_delete_proto); return 0; } 
@@ -185,8 +176,8 @@ static const struct bpf_func_proto bpf_tcp_send_ack_proto = { /* In case we want to report error later */ .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &tcp_sock_id, .arg2_type = ARG_ANYTHING, - .btf_id = &tcp_sock_id, }; static const struct bpf_func_proto * diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 65057744fac8..2a8bfa89a515 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2696,6 +2696,7 @@ int tcp_disconnect(struct sock *sk, int flags) if (icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); + icsk->icsk_ca_initialized = 0; tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; tcp_clear_retrans(tp); @@ -3047,7 +3048,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname, name[val] = 0; lock_sock(sk); - err = tcp_set_congestion_control(sk, name, true, true, + err = tcp_set_congestion_control(sk, name, true, ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)); release_sock(sk); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 62878cf26d9c..db47ac24d057 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -176,7 +176,7 @@ void tcp_assign_congestion_control(struct sock *sk) void tcp_init_congestion_control(struct sock *sk) { - const struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); tcp_sk(sk)->prior_ssthresh = 0; if (icsk->icsk_ca_ops->init) @@ -185,6 +185,7 @@ void tcp_init_congestion_control(struct sock *sk) INET_ECN_xmit(sk); else INET_ECN_dontxmit(sk); + icsk->icsk_ca_initialized = 1; } static void tcp_reinit_congestion_control(struct sock *sk, @@ -340,7 +341,7 @@ out: * already initialized. 
*/ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, - bool reinit, bool cap_net_admin) + bool cap_net_admin) { struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; @@ -361,28 +362,14 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, goto out; } - if (!ca) { + if (!ca) err = -ENOENT; - } else if (!load) { - const struct tcp_congestion_ops *old_ca = icsk->icsk_ca_ops; - - if (bpf_try_module_get(ca, ca->owner)) { - if (reinit) { - tcp_reinit_congestion_control(sk, ca); - } else { - icsk->icsk_ca_ops = ca; - bpf_module_put(old_ca, old_ca->owner); - } - } else { - err = -EBUSY; - } - } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) { + else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) err = -EPERM; - } else if (!bpf_try_module_get(ca, ca->owner)) { + else if (!bpf_try_module_get(ca, ca->owner)) err = -EBUSY; - } else { + else tcp_reinit_congestion_control(sk, ca); - } out: rcu_read_unlock(); return err; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 50834e7f958e..02d0e2fb77c0 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5885,8 +5885,10 @@ void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb) tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk)); tp->snd_cwnd_stamp = tcp_jiffies32; + icsk->icsk_ca_initialized = 0; bpf_skops_established(sk, bpf_op, skb); - tcp_init_congestion_control(sk); + if (!icsk->icsk_ca_initialized) + tcp_init_congestion_control(sk); tcp_init_buffer_space(sk); } diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 5eb6662f562a..3895697f8540 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -33,12 +33,6 @@ static DEFINE_PER_CPU(struct list_head, xskmap_flush_list); -bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) -{ - return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) && - (xs->pool->fq || READ_ONCE(xs->fq_tmp)); -} - void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool) { 
if (pool->cached_need_wakeup & XDP_WAKEUP_RX) @@ -717,6 +711,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) dev, qid); if (err) { xp_destroy(xs->pool); + xs->pool = NULL; sockfd_put(sock); goto out_unlock; } diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h index da1f73e43924..b9e896cee5bb 100644 --- a/net/xdp/xsk.h +++ b/net/xdp/xsk.h @@ -39,7 +39,6 @@ static inline struct xdp_sock *xdp_sk(struct sock *sk) return (struct xdp_sock *)sk; } -bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, struct xdp_sock **map_entry); int xsk_map_inc(struct xsk_map *map); diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 795d7c81c0ca..e63fadd000db 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -287,7 +287,7 @@ static struct xsk_dma_map *xp_create_dma_map(struct device *dev, struct net_devi return NULL; dma_map->dma_pages = kvcalloc(nr_pages, sizeof(*dma_map->dma_pages), GFP_KERNEL); - if (!dma_map) { + if (!dma_map->dma_pages) { kfree(dma_map); return NULL; } @@ -296,7 +296,7 @@ static struct xsk_dma_map *xp_create_dma_map(struct device *dev, struct net_devi dma_map->dev = dev; dma_map->dma_need_sync = false; dma_map->dma_pages_cnt = nr_pages; - refcount_set(&dma_map->users, 0); + refcount_set(&dma_map->users, 1); list_add(&dma_map->list, &umem->xsk_dma_list); return dma_map; } @@ -369,7 +369,6 @@ static int xp_init_dma_info(struct xsk_buff_pool *pool, struct xsk_dma_map *dma_ pool->dev = dma_map->dev; pool->dma_pages_cnt = dma_map->dma_pages_cnt; pool->dma_need_sync = dma_map->dma_need_sync; - refcount_inc(&dma_map->users); memcpy(pool->dma_pages, dma_map->dma_pages, pool->dma_pages_cnt * sizeof(*pool->dma_pages)); @@ -390,6 +389,7 @@ int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, if (err) return err; + refcount_inc(&dma_map->users); return 0; } diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c index 
5bd8ea9d206a..c014217f5fa7 100644 --- a/net/xdp/xsk_diag.c +++ b/net/xdp/xsk_diag.c @@ -59,22 +59,20 @@ static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb) du.num_pages = umem->npgs; du.chunk_size = umem->chunk_size; du.headroom = umem->headroom; - du.ifindex = pool->netdev ? pool->netdev->ifindex : 0; - du.queue_id = pool->queue_id; + du.ifindex = (pool && pool->netdev) ? pool->netdev->ifindex : 0; + du.queue_id = pool ? pool->queue_id : 0; du.flags = 0; if (umem->zc) du.flags |= XDP_DU_F_ZEROCOPY; du.refs = refcount_read(&umem->users); err = nla_put(nlskb, XDP_DIAG_UMEM, sizeof(du), &du); - - if (!err && pool->fq) + if (!err && pool && pool->fq) err = xsk_diag_put_ring(pool->fq, XDP_DIAG_UMEM_FILL_RING, nlskb); - if (!err && pool->cq) { - err = xsk_diag_put_ring(pool->cq, XDP_DIAG_UMEM_COMPLETION_RING, - nlskb); - } + if (!err && pool && pool->cq) + err = xsk_diag_put_ring(pool->cq, + XDP_DIAG_UMEM_COMPLETION_RING, nlskb); return err; } diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index 2a4fd6677155..0c5df593bc56 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -185,11 +185,6 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, xs = (struct xdp_sock *)sock->sk; - if (!xsk_is_setup_for_bpf_map(xs)) { - sockfd_put(sock); - return -EOPNOTSUPP; - } - map_entry = &m->xsk_map[i]; node = xsk_map_node_alloc(m, map_entry); if (IS_ERR(node)) { diff --git a/samples/bpf/.gitignore b/samples/bpf/.gitignore index 034800c4d1e6..b2f29bc8dc43 100644 --- a/samples/bpf/.gitignore +++ b/samples/bpf/.gitignore @@ -50,4 +50,5 @@ xdp_rxq_info xdp_sample_pkts xdp_tx_iptunnel xdpsock +xsk_fwd testfile.img diff --git a/samples/bpf/sockex3_user.c b/samples/bpf/sockex3_user.c index 4dbee7427d47..7793f6a6ae7e 100644 --- a/samples/bpf/sockex3_user.c +++ b/samples/bpf/sockex3_user.c @@ -29,8 +29,8 @@ int main(int argc, char **argv) struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; struct bpf_program *prog; struct bpf_object *obj; 
+ const char *section; char filename[256]; - const char *title; FILE *f; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); @@ -58,8 +58,8 @@ int main(int argc, char **argv) bpf_object__for_each_program(prog, obj) { fd = bpf_program__fd(prog); - title = bpf_program__title(prog, false); - if (sscanf(title, "socket/%d", &key) != 1) { + section = bpf_program__section_name(prog); + if (sscanf(section, "socket/%d", &key) != 1) { fprintf(stderr, "ERROR: finding prog failed\n"); goto cleanup; } diff --git a/samples/bpf/spintest_user.c b/samples/bpf/spintest_user.c index 847da9284fa8..f090d0dc60d6 100644 --- a/samples/bpf/spintest_user.c +++ b/samples/bpf/spintest_user.c @@ -17,7 +17,7 @@ int main(int ac, char **argv) long key, next_key, value; struct bpf_program *prog; int map_fd, i, j = 0; - const char *title; + const char *section; struct ksym *sym; if (setrlimit(RLIMIT_MEMLOCK, &r)) { @@ -51,8 +51,8 @@ int main(int ac, char **argv) } bpf_object__for_each_program(prog, obj) { - title = bpf_program__title(prog, false); - if (sscanf(title, "kprobe/%s", symbol) != 1) + section = bpf_program__section_name(prog); + if (sscanf(section, "kprobe/%s", symbol) != 1) continue; /* Attach prog only when symbol exists */ diff --git a/samples/bpf/test_map_in_map_kern.c b/samples/bpf/test_map_in_map_kern.c index 8def45c5b697..b0200c8eac09 100644 --- a/samples/bpf/test_map_in_map_kern.c +++ b/samples/bpf/test_map_in_map_kern.c @@ -103,10 +103,9 @@ static __always_inline int do_inline_hash_lookup(void *inner_map, u32 port) return result ? 
*result : -ENOENT; } -SEC("kprobe/" SYSCALL(sys_connect)) +SEC("kprobe/__sys_connect") int trace_sys_connect(struct pt_regs *ctx) { - struct pt_regs *real_regs = (struct pt_regs *)PT_REGS_PARM1_CORE(ctx); struct sockaddr_in6 *in6; u16 test_case, port, dst6[8]; int addrlen, ret, inline_ret, ret_key = 0; @@ -114,8 +113,8 @@ int trace_sys_connect(struct pt_regs *ctx) void *outer_map, *inner_map; bool inline_hash = false; - in6 = (struct sockaddr_in6 *)PT_REGS_PARM2_CORE(real_regs); - addrlen = (int)PT_REGS_PARM3_CORE(real_regs); + in6 = (struct sockaddr_in6 *)PT_REGS_PARM2_CORE(ctx); + addrlen = (int)PT_REGS_PARM3_CORE(ctx); if (addrlen != sizeof(*in6)) return 0; diff --git a/samples/bpf/tracex5_user.c b/samples/bpf/tracex5_user.c index 98dad57a96c4..c17d3fb5fd64 100644 --- a/samples/bpf/tracex5_user.c +++ b/samples/bpf/tracex5_user.c @@ -39,8 +39,8 @@ int main(int ac, char **argv) struct bpf_program *prog; struct bpf_object *obj; int key, fd, progs_fd; + const char *section; char filename[256]; - const char *title; FILE *f; setrlimit(RLIMIT_MEMLOCK, &r); @@ -78,9 +78,9 @@ int main(int ac, char **argv) } bpf_object__for_each_program(prog, obj) { - title = bpf_program__title(prog, false); + section = bpf_program__section_name(prog); /* register only syscalls to PROG_ARRAY */ - if (sscanf(title, "kprobe/%d", &key) != 1) + if (sscanf(section, "kprobe/%d", &key) != 1) continue; fd = bpf_program__fd(prog); diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c index 004c0622c913..3dd366e9474d 100644 --- a/samples/bpf/xdp_redirect_cpu_user.c +++ b/samples/bpf/xdp_redirect_cpu_user.c @@ -111,7 +111,7 @@ static void print_avail_progs(struct bpf_object *obj) bpf_object__for_each_program(pos, obj) { if (bpf_program__is_xdp(pos)) - printf(" %s\n", bpf_program__title(pos, false)); + printf(" %s\n", bpf_program__section_name(pos)); } } diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index 4cead341ae57..b220173dbe1e 100644 --- 
a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -78,6 +78,7 @@ static int opt_pkt_count; static u16 opt_pkt_size = MIN_PKT_SIZE; static u32 opt_pkt_fill_pattern = 0x12345678; static bool opt_extra_stats; +static bool opt_quiet; static int opt_poll; static int opt_interval = 1; static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP; @@ -718,6 +719,7 @@ static struct option long_options[] = { {"tx-pkt-size", required_argument, 0, 's'}, {"tx-pkt-pattern", required_argument, 0, 'P'}, {"extra-stats", no_argument, 0, 'x'}, + {"quiet", no_argument, 0, 'Q'}, {0, 0, 0, 0} }; @@ -753,6 +755,7 @@ static void usage(const char *prog) " Min size: %d, Max size %d.\n" " -P, --tx-pkt-pattern=nPacket fill pattern. Default: 0x%x\n" " -x, --extra-stats Display extra statistics.\n" + " -Q, --quiet Do not display any stats.\n" "\n"; fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE, opt_batch_size, MIN_PKT_SIZE, MIN_PKT_SIZE, @@ -768,7 +771,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:x", + c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:xQ", long_options, &option_index); if (c == -1) break; @@ -852,6 +855,9 @@ static void parse_command_line(int argc, char **argv) case 'x': opt_extra_stats = 1; break; + case 'Q': + opt_quiet = 1; + break; default: usage(basename(argv[0])); } @@ -897,6 +903,14 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk, if (!xsk->outstanding_tx) return; + /* In copy mode, Tx is driven by a syscall so we need to use e.g. sendto() to + * really send the packets. In zero-copy mode we do not have to do this, since Tx + * is driven by the NAPI loop. So as an optimization, we do not have to call + * sendto() all the time in zero-copy mode for l2fwd. + */ + if (opt_xdp_bind_flags & XDP_COPY) + kick_tx(xsk); + ndescs = (xsk->outstanding_tx > opt_batch_size) ? 
opt_batch_size : xsk->outstanding_tx; @@ -1117,6 +1131,7 @@ static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds) while (ret != rcvd) { if (ret < 0) exit_with_error(-ret); + complete_tx_l2fwd(xsk, fds); if (xsk_ring_prod__needs_wakeup(&xsk->tx)) kick_tx(xsk); ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx); @@ -1277,9 +1292,11 @@ int main(int argc, char **argv) setlocale(LC_ALL, ""); - ret = pthread_create(&pt, NULL, poller, NULL); - if (ret) - exit_with_error(ret); + if (!opt_quiet) { + ret = pthread_create(&pt, NULL, poller, NULL); + if (ret) + exit_with_error(ret); + } prev_time = get_nsecs(); start_time = prev_time; @@ -1293,7 +1310,8 @@ int main(int argc, char **argv) benchmark_done = true; - pthread_join(pt, NULL); + if (!opt_quiet) + pthread_join(pt, NULL); xdpsock_cleanup(); diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index e6e2d9e5ff48..dbde59d343b1 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -341,9 +341,9 @@ fi vmlinux_link vmlinux "${kallsymso}" ${btf_vmlinux_bin_o} # fill in BTF IDs -if [ -n "${CONFIG_DEBUG_INFO_BTF}" ]; then -info BTFIDS vmlinux -${RESOLVE_BTFIDS} vmlinux +if [ -n "${CONFIG_DEBUG_INFO_BTF}" -a -n "${CONFIG_BPF}" ]; then + info BTFIDS vmlinux + ${RESOLVE_BTFIDS} vmlinux fi if [ -n "${CONFIG_BUILDTIME_TABLE_SORT}" ]; then diff --git a/tools/bpf/bpftool/Documentation/Makefile b/tools/bpf/bpftool/Documentation/Makefile index 815ac9804aee..f33cb02de95c 100644 --- a/tools/bpf/bpftool/Documentation/Makefile +++ b/tools/bpf/bpftool/Documentation/Makefile @@ -19,7 +19,7 @@ man8dir = $(mandir)/man8 # Load targets for building eBPF helpers man page. 
include ../../Makefile.helpers -MAN8_RST = $(filter-out $(HELPERS_RST),$(wildcard *.rst)) +MAN8_RST = $(wildcard bpftool*.rst) _DOC_MAN8 = $(patsubst %.rst,%.8,$(MAN8_RST)) DOC_MAN8 = $(addprefix $(OUTPUT),$(_DOC_MAN8)) @@ -28,12 +28,23 @@ man: man8 helpers man8: $(DOC_MAN8) RST2MAN_DEP := $(shell command -v rst2man 2>/dev/null) +RST2MAN_OPTS += --verbose + +list_pages = $(sort $(basename $(filter-out $(1),$(MAN8_RST)))) +see_also = $(subst " ",, \ + "\n" \ + "SEE ALSO\n" \ + "========\n" \ + "\t**bpf**\ (2),\n" \ + "\t**bpf-helpers**\\ (7)" \ + $(foreach page,$(call list_pages,$(1)),",\n\t**$(page)**\\ (8)") \ + "\n") $(OUTPUT)%.8: %.rst ifndef RST2MAN_DEP $(error "rst2man not found, but required to generate man pages") endif - $(QUIET_GEN)rst2man $< > $@ + $(QUIET_GEN)( cat $< ; printf "%b" $(call see_also,$<) ) | rst2man $(RST2MAN_OPTS) > $@ clean: helpers-clean $(call QUIET_CLEAN, Documentation) diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst b/tools/bpf/bpftool/Documentation/bpftool-btf.rst index 896f4c6c2870..ff4d327a582e 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-btf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst @@ -71,26 +71,12 @@ DESCRIPTION OPTIONS ======= - -h, --help - Print short generic help message (similar to **bpftool help**). - - -V, --version - Print version number (similar to **bpftool version**). - - -j, --json - Generate JSON output. For commands that cannot produce JSON, this - option has no effect. - - -p, --pretty - Generate human-readable JSON output. Implies **-j**. - - -d, --debug - Print all logs available from libbpf, including debug-level - information. + .. include:: common_options.rst EXAMPLES ======== **# bpftool btf dump id 1226** + :: [1] PTR '(anon)' type_id=2 @@ -104,6 +90,7 @@ EXAMPLES This gives an example of default output for all supported BTF kinds. **$ cat prog.c** + :: struct fwd_struct; @@ -144,6 +131,7 @@ This gives an example of default output for all supported BTF kinds. 
} **$ bpftool btf dump file prog.o** + :: [1] PTR '(anon)' type_id=2 @@ -229,20 +217,3 @@ All the standard ways to specify map or program are supported: **# bpftool btf dump prog tag b88e0a09b1d9759d** **# bpftool btf dump prog pinned /sys/fs/bpf/prog_name** - -SEE ALSO -======== - **bpf**\ (2), - **bpf-helpers**\ (7), - **bpftool**\ (8), - **bpftool-btf**\ (8), - **bpftool-cgroup**\ (8), - **bpftool-feature**\ (8), - **bpftool-gen**\ (8), - **bpftool-iter**\ (8), - **bpftool-link**\ (8), - **bpftool-map**\ (8), - **bpftool-net**\ (8), - **bpftool-perf**\ (8), - **bpftool-prog**\ (8), - **bpftool-struct_ops**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst index a226aee3574f..790944c35602 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst @@ -116,26 +116,11 @@ DESCRIPTION OPTIONS ======= - -h, --help - Print short generic help message (similar to **bpftool help**). - - -V, --version - Print version number (similar to **bpftool version**). - - -j, --json - Generate JSON output. For commands that cannot produce JSON, this - option has no effect. - - -p, --pretty - Generate human-readable JSON output. Implies **-j**. + .. include:: common_options.rst -f, --bpffs Show file names of pinned programs. - -d, --debug - Print all logs available from libbpf, including debug-level - information. 
- EXAMPLES ======== | @@ -158,19 +143,3 @@ EXAMPLES :: ID AttachType AttachFlags Name - -SEE ALSO -======== - **bpf**\ (2), - **bpf-helpers**\ (7), - **bpftool**\ (8), - **bpftool-btf**\ (8), - **bpftool-feature**\ (8), - **bpftool-gen**\ (8), - **bpftool-iter**\ (8), - **bpftool-link**\ (8), - **bpftool-map**\ (8), - **bpftool-net**\ (8), - **bpftool-perf**\ (8), - **bpftool-prog**\ (8), - **bpftool-struct_ops**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst b/tools/bpf/bpftool/Documentation/bpftool-feature.rst index 8609f06e71de..dd3771bdbc57 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst @@ -71,35 +71,4 @@ DESCRIPTION OPTIONS ======= - -h, --help - Print short generic help message (similar to **bpftool help**). - - -V, --version - Print version number (similar to **bpftool version**). - - -j, --json - Generate JSON output. For commands that cannot produce JSON, this - option has no effect. - - -p, --pretty - Generate human-readable JSON output. Implies **-j**. - - -d, --debug - Print all logs available from libbpf, including debug-level - information. - -SEE ALSO -======== - **bpf**\ (2), - **bpf-helpers**\ (7), - **bpftool**\ (8), - **bpftool-btf**\ (8), - **bpftool-cgroup**\ (8), - **bpftool-gen**\ (8), - **bpftool-iter**\ (8), - **bpftool-link**\ (8), - **bpftool-map**\ (8), - **bpftool-net**\ (8), - **bpftool-perf**\ (8), - **bpftool-prog**\ (8), - **bpftool-struct_ops**\ (8) + .. include:: common_options.rst diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst index df85dbd962c0..84cf0639696f 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst @@ -126,26 +126,12 @@ DESCRIPTION OPTIONS ======= - -h, --help - Print short generic help message (similar to **bpftool help**). 
- - -V, --version - Print version number (similar to **bpftool version**). - - -j, --json - Generate JSON output. For commands that cannot produce JSON, - this option has no effect. - - -p, --pretty - Generate human-readable JSON output. Implies **-j**. - - -d, --debug - Print all logs available from libbpf, including debug-level - information. + .. include:: common_options.rst EXAMPLES ======== **$ cat example.c** + :: #include <stdbool.h> @@ -187,6 +173,7 @@ This is example BPF application with two BPF programs and a mix of BPF maps and global variables. **$ bpftool gen skeleton example.o** + :: /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ @@ -241,6 +228,7 @@ and global variables. #endif /* __EXAMPLE_SKEL_H__ */ **$ cat example_user.c** + :: #include "example.skel.h" @@ -283,6 +271,7 @@ and global variables. } **# ./example_user** + :: my_map name: my_map @@ -290,19 +279,3 @@ and global variables. my_static_var: 7 This is a stripped-out version of skeleton generated for above example code. - -SEE ALSO -======== - **bpf**\ (2), - **bpf-helpers**\ (7), - **bpftool**\ (8), - **bpftool-btf**\ (8), - **bpftool-cgroup**\ (8), - **bpftool-feature**\ (8), - **bpftool-iter**\ (8), - **bpftool-link**\ (8), - **bpftool-map**\ (8), - **bpftool-net**\ (8), - **bpftool-perf**\ (8), - **bpftool-prog**\ (8), - **bpftool-struct_ops**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-iter.rst b/tools/bpf/bpftool/Documentation/bpftool-iter.rst index 070ffacb42b5..51f49bead619 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-iter.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-iter.rst @@ -51,16 +51,7 @@ DESCRIPTION OPTIONS ======= - -h, --help - Print short generic help message (similar to **bpftool help**). - - -V, --version - Print version number (similar to **bpftool version**). - - -d, --debug - Print all logs available, even debug-level information. This - includes logs from libbpf as well as from the verifier, when - attempting to load programs. 
+ .. include:: common_options.rst EXAMPLES ======== @@ -77,19 +68,3 @@ EXAMPLES Create a file-based bpf iterator from bpf_iter_hashmap.o and map with id 20, and pin it to /sys/fs/bpf/my_hashmap - -SEE ALSO -======== - **bpf**\ (2), - **bpf-helpers**\ (7), - **bpftool**\ (8), - **bpftool-btf**\ (8), - **bpftool-cgroup**\ (8), - **bpftool-feature**\ (8), - **bpftool-gen**\ (8), - **bpftool-link**\ (8), - **bpftool-map**\ (8), - **bpftool-net**\ (8), - **bpftool-perf**\ (8), - **bpftool-prog**\ (8), - **bpftool-struct_ops**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-link.rst b/tools/bpf/bpftool/Documentation/bpftool-link.rst index 4a52e7a93339..5f7db2a837cc 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-link.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-link.rst @@ -21,7 +21,7 @@ LINK COMMANDS | **bpftool** **link { show | list }** [*LINK*] | **bpftool** **link pin** *LINK* *FILE* -| **bpftool** **link detach *LINK* +| **bpftool** **link detach** *LINK* | **bpftool** **link help** | | *LINK* := { **id** *LINK_ID* | **pinned** *FILE* } @@ -62,18 +62,7 @@ DESCRIPTION OPTIONS ======= - -h, --help - Print short generic help message (similar to **bpftool help**). - - -V, --version - Print version number (similar to **bpftool version**). - - -j, --json - Generate JSON output. For commands that cannot produce JSON, this - option has no effect. - - -p, --pretty - Generate human-readable JSON output. Implies **-j**. + .. include:: common_options.rst -f, --bpffs When showing BPF links, show file names of pinned @@ -83,10 +72,6 @@ OPTIONS Do not automatically attempt to mount any virtual file system (such as tracefs or BPF virtual file system) when necessary. - -d, --debug - Print all logs available, even debug-level information. This - includes logs from libbpf. 
- EXAMPLES ======== **# bpftool link show** @@ -121,20 +106,3 @@ EXAMPLES :: -rw------- 1 root root 0 Apr 23 21:39 link - - -SEE ALSO -======== - **bpf**\ (2), - **bpf-helpers**\ (7), - **bpftool**\ (8), - **bpftool-btf**\ (8), - **bpftool-cgroup**\ (8), - **bpftool-feature**\ (8), - **bpftool-gen**\ (8), - **bpftool-iter**\ (8), - **bpftool-map**\ (8), - **bpftool-net**\ (8), - **bpftool-perf**\ (8), - **bpftool-prog**\ (8), - **bpftool-struct_ops**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 083db6c2fc67..dade10cdf295 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -23,7 +23,8 @@ MAP COMMANDS | **bpftool** **map** { **show** | **list** } [*MAP*] | **bpftool** **map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* \ -| **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**dev** *NAME*] +| **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**inner_map** *MAP*] \ +| [**dev** *NAME*] | **bpftool** **map dump** *MAP* | **bpftool** **map update** *MAP* [**key** *DATA*] [**value** *VALUE*] [*UPDATE_FLAGS*] | **bpftool** **map lookup** *MAP* [**key** *DATA*] @@ -67,7 +68,7 @@ DESCRIPTION maps. On such kernels bpftool will automatically emit this information as well. - **bpftool map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**dev** *NAME*] + **bpftool map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**inner_map** *MAP*] [**dev** *NAME*] Create a new map with given parameters and pin it to *bpffs* as *FILE*. @@ -75,6 +76,11 @@ DESCRIPTION desired flags, e.g. 1024 for **BPF_F_MMAPABLE** (see bpf.h UAPI header for existing flags). 
+ To create maps of type array-of-maps or hash-of-maps, the + **inner_map** keyword must be used to pass an inner map. The + kernel needs it to collect metadata related to the inner maps + that the new map will work with. + Keyword **dev** expects a network interface name, and is used to request hardware offload for the map. @@ -155,18 +161,7 @@ DESCRIPTION OPTIONS ======= - -h, --help - Print short generic help message (similar to **bpftool help**). - - -V, --version - Print version number (similar to **bpftool version**). - - -j, --json - Generate JSON output. For commands that cannot produce JSON, this - option has no effect. - - -p, --pretty - Generate human-readable JSON output. Implies **-j**. + .. include:: common_options.rst -f, --bpffs Show file names of pinned maps. @@ -175,13 +170,10 @@ OPTIONS Do not automatically attempt to mount any virtual file system (such as tracefs or BPF virtual file system) when necessary. - -d, --debug - Print all logs available from libbpf, including debug-level - information. - EXAMPLES ======== **# bpftool map show** + :: 10: hash name some_map flags 0x0 @@ -203,6 +195,7 @@ The following three commands are equivalent: **# bpftool map dump id 10** + :: key: 00 01 02 03 value: 00 01 02 03 04 05 06 07 @@ -210,6 +203,7 @@ The following three commands are equivalent: Found 2 elements **# bpftool map getnext id 10 key 0 1 2 3** + :: key: @@ -276,19 +270,3 @@ would be lost as soon as bpftool exits). 
key: 00 00 00 00 value: 22 02 00 00 Found 1 element - -SEE ALSO -======== - **bpf**\ (2), - **bpf-helpers**\ (7), - **bpftool**\ (8), - **bpftool-btf**\ (8), - **bpftool-cgroup**\ (8), - **bpftool-feature**\ (8), - **bpftool-gen**\ (8), - **bpftool-iter**\ (8), - **bpftool-link**\ (8), - **bpftool-net**\ (8), - **bpftool-perf**\ (8), - **bpftool-prog**\ (8), - **bpftool-struct_ops**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst index aa7450736179..d8165d530937 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-net.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst @@ -75,22 +75,7 @@ DESCRIPTION OPTIONS ======= - -h, --help - Print short generic help message (similar to **bpftool help**). - - -V, --version - Print version number (similar to **bpftool version**). - - -j, --json - Generate JSON output. For commands that cannot produce JSON, this - option has no effect. - - -p, --pretty - Generate human-readable JSON output. Implies **-j**. - - -d, --debug - Print all logs available from libbpf, including debug-level - information. + .. include:: common_options.rst EXAMPLES ======== @@ -187,20 +172,3 @@ EXAMPLES :: xdp: - - -SEE ALSO -======== - **bpf**\ (2), - **bpf-helpers**\ (7), - **bpftool**\ (8), - **bpftool-btf**\ (8), - **bpftool-cgroup**\ (8), - **bpftool-feature**\ (8), - **bpftool-gen**\ (8), - **bpftool-iter**\ (8), - **bpftool-link**\ (8), - **bpftool-map**\ (8), - **bpftool-perf**\ (8), - **bpftool-prog**\ (8), - **bpftool-struct_ops**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst b/tools/bpf/bpftool/Documentation/bpftool-perf.rst index 9c592b7c6775..e958ce91de72 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-perf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-perf.rst @@ -40,22 +40,7 @@ DESCRIPTION OPTIONS ======= - -h, --help - Print short generic help message (similar to **bpftool help**). 
- - -V, --version - Print version number (similar to **bpftool version**). - - -j, --json - Generate JSON output. For commands that cannot produce JSON, this - option has no effect. - - -p, --pretty - Generate human-readable JSON output. Implies **-j**. - - -d, --debug - Print all logs available from libbpf, including debug-level - information. + .. include:: common_options.rst EXAMPLES ======== @@ -78,20 +63,3 @@ EXAMPLES {"pid":21765,"fd":5,"prog_id":7,"fd_type":"kretprobe","func":"__x64_sys_nanosleep","offset":0}, \ {"pid":21767,"fd":5,"prog_id":8,"fd_type":"tracepoint","tracepoint":"sys_enter_nanosleep"}, \ {"pid":21800,"fd":5,"prog_id":9,"fd_type":"uprobe","filename":"/home/yhs/a.out","offset":1159}] - - -SEE ALSO -======== - **bpf**\ (2), - **bpf-helpers**\ (7), - **bpftool**\ (8), - **bpftool-btf**\ (8), - **bpftool-cgroup**\ (8), - **bpftool-feature**\ (8), - **bpftool-gen**\ (8), - **bpftool-iter**\ (8), - **bpftool-link**\ (8), - **bpftool-map**\ (8), - **bpftool-net**\ (8), - **bpftool-prog**\ (8), - **bpftool-struct_ops**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 82e356b664e8..358c7309d419 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -210,18 +210,7 @@ DESCRIPTION OPTIONS ======= - -h, --help - Print short generic help message (similar to **bpftool help**). - - -V, --version - Print version number (similar to **bpftool version**). - - -j, --json - Generate JSON output. For commands that cannot produce JSON, this - option has no effect. - - -p, --pretty - Generate human-readable JSON output. Implies **-j**. + .. include:: common_options.rst -f, --bpffs When showing BPF programs, show file names of pinned @@ -234,11 +223,6 @@ OPTIONS Do not automatically attempt to mount any virtual file system (such as tracefs or BPF virtual file system) when necessary. 
- -d, --debug - Print all logs available, even debug-level information. This - includes logs from libbpf as well as from the verifier, when - attempting to load programs. - EXAMPLES ======== **# bpftool prog show** @@ -342,19 +326,3 @@ EXAMPLES 40176203 cycles (83.05%) 42518139 instructions # 1.06 insns per cycle (83.39%) 123 llc_misses # 2.89 LLC misses per million insns (83.15%) - -SEE ALSO -======== - **bpf**\ (2), - **bpf-helpers**\ (7), - **bpftool**\ (8), - **bpftool-btf**\ (8), - **bpftool-cgroup**\ (8), - **bpftool-feature**\ (8), - **bpftool-gen**\ (8), - **bpftool-iter**\ (8), - **bpftool-link**\ (8), - **bpftool-map**\ (8), - **bpftool-net**\ (8), - **bpftool-perf**\ (8), - **bpftool-struct_ops**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst index d93cd1cb8b0f..506e70ee78e9 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst @@ -60,23 +60,7 @@ DESCRIPTION OPTIONS ======= - -h, --help - Print short generic help message (similar to **bpftool help**). - - -V, --version - Print version number (similar to **bpftool version**). - - -j, --json - Generate JSON output. For commands that cannot produce JSON, this - option has no effect. - - -p, --pretty - Generate human-readable JSON output. Implies **-j**. - - -d, --debug - Print all logs available, even debug-level information. This - includes logs from libbpf as well as from the verifier, when - attempting to load programs. + .. 
include:: common_options.rst EXAMPLES ======== @@ -98,20 +82,3 @@ EXAMPLES :: Registered tcp_congestion_ops cubic id 110 - - -SEE ALSO -======== - **bpf**\ (2), - **bpf-helpers**\ (7), - **bpftool**\ (8), - **bpftool-btf**\ (8), - **bpftool-cgroup**\ (8), - **bpftool-feature**\ (8), - **bpftool-gen**\ (8), - **bpftool-iter**\ (8), - **bpftool-link**\ (8), - **bpftool-map**\ (8), - **bpftool-net**\ (8), - **bpftool-perf**\ (8), - **bpftool-prog**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst index 420d4d5df8b6..e7d949334961 100644 --- a/tools/bpf/bpftool/Documentation/bpftool.rst +++ b/tools/bpf/bpftool/Documentation/bpftool.rst @@ -46,18 +46,7 @@ DESCRIPTION OPTIONS ======= - -h, --help - Print short help message (similar to **bpftool help**). - - -V, --version - Print version number (similar to **bpftool version**). - - -j, --json - Generate JSON output. For commands that cannot produce JSON, this - option has no effect. - - -p, --pretty - Generate human-readable JSON output. Implies **-j**. + .. include:: common_options.rst -m, --mapcompat Allow loading maps with unknown map definitions. @@ -65,24 +54,3 @@ OPTIONS -n, --nomount Do not automatically attempt to mount any virtual file system (such as tracefs or BPF virtual file system) when necessary. - - -d, --debug - Print all logs available, even debug-level information. This - includes logs from libbpf as well as from the verifier, when - attempting to load programs. 
- -SEE ALSO -======== - **bpf**\ (2), - **bpf-helpers**\ (7), - **bpftool-btf**\ (8), - **bpftool-cgroup**\ (8), - **bpftool-feature**\ (8), - **bpftool-gen**\ (8), - **bpftool-iter**\ (8), - **bpftool-link**\ (8), - **bpftool-map**\ (8), - **bpftool-net**\ (8), - **bpftool-perf**\ (8), - **bpftool-prog**\ (8), - **bpftool-struct_ops**\ (8) diff --git a/tools/bpf/bpftool/Documentation/common_options.rst b/tools/bpf/bpftool/Documentation/common_options.rst new file mode 100644 index 000000000000..05d06c74dcaa --- /dev/null +++ b/tools/bpf/bpftool/Documentation/common_options.rst @@ -0,0 +1,22 @@ +-h, --help + Print short help message (similar to **bpftool help**). + +-V, --version + Print version number (similar to **bpftool version**), and optional + features that were included when bpftool was compiled. Optional + features include linking against libbfd to provide the disassembler + for JIT-ted programs (**bpftool prog dump jited**) and usage of BPF + skeletons (some features like **bpftool prog profile** or showing + pids associated to BPF objects may rely on it). + +-j, --json + Generate JSON output. For commands that cannot produce JSON, this + option has no effect. + +-p, --pretty + Generate human-readable JSON output. Implies **-j**. + +-d, --debug + Print all logs available, even debug-level information. This includes + logs from libbpf as well as from the verifier, when attempting to + load programs. 
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 7b68e3c0a5fb..3f1da30c4da6 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -709,9 +709,26 @@ _bpftool() "$cur" ) ) return 0 ;; - key|value|flags|name|entries) + key|value|flags|entries) return 0 ;; + inner_map) + COMPREPLY=( $( compgen -W "$MAP_TYPE" -- "$cur" ) ) + return 0 + ;; + id) + _bpftool_get_map_ids + ;; + name) + case $pprev in + inner_map) + _bpftool_get_map_names + ;; + *) + return 0 + ;; + esac + ;; *) _bpftool_once_attr 'type' _bpftool_once_attr 'key' @@ -719,6 +736,9 @@ _bpftool() _bpftool_once_attr 'entries' _bpftool_once_attr 'name' _bpftool_once_attr 'flags' + if _bpftool_search_list 'array_of_maps' 'hash_of_maps'; then + _bpftool_once_attr 'inner_map' + fi _bpftool_once_attr 'dev' return 0 ;; diff --git a/tools/bpf/bpftool/json_writer.c b/tools/bpf/bpftool/json_writer.c index 86501cd3c763..7fea83bedf48 100644 --- a/tools/bpf/bpftool/json_writer.c +++ b/tools/bpf/bpftool/json_writer.c @@ -119,6 +119,12 @@ void jsonw_pretty(json_writer_t *self, bool on) self->pretty = on; } +void jsonw_reset(json_writer_t *self) +{ + assert(self->depth == 0); + self->sep = '\0'; +} + /* Basic blocks */ static void jsonw_begin(json_writer_t *self, int c) { diff --git a/tools/bpf/bpftool/json_writer.h b/tools/bpf/bpftool/json_writer.h index 35cf1f00f96c..8ace65cdb92f 100644 --- a/tools/bpf/bpftool/json_writer.h +++ b/tools/bpf/bpftool/json_writer.h @@ -27,6 +27,9 @@ void jsonw_destroy(json_writer_t **self_p); /* Cause output to have pretty whitespace */ void jsonw_pretty(json_writer_t *self, bool on); +/* Reset separator to create new JSON */ +void jsonw_reset(json_writer_t *self); + /* Add property name */ void jsonw_name(json_writer_t *self, const char *name); diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c index 4a191fcbeb82..682daaa49e6a 100644 --- a/tools/bpf/bpftool/main.c +++ 
b/tools/bpf/bpftool/main.c @@ -70,13 +70,42 @@ static int do_help(int argc, char **argv) static int do_version(int argc, char **argv) { +#ifdef HAVE_LIBBFD_SUPPORT + const bool has_libbfd = true; +#else + const bool has_libbfd = false; +#endif +#ifdef BPFTOOL_WITHOUT_SKELETONS + const bool has_skeletons = false; +#else + const bool has_skeletons = true; +#endif + if (json_output) { - jsonw_start_object(json_wtr); + jsonw_start_object(json_wtr); /* root object */ + jsonw_name(json_wtr, "version"); jsonw_printf(json_wtr, "\"%s\"", BPFTOOL_VERSION); - jsonw_end_object(json_wtr); + + jsonw_name(json_wtr, "features"); + jsonw_start_object(json_wtr); /* features */ + jsonw_bool_field(json_wtr, "libbfd", has_libbfd); + jsonw_bool_field(json_wtr, "skeletons", has_skeletons); + jsonw_end_object(json_wtr); /* features */ + + jsonw_end_object(json_wtr); /* root object */ } else { + unsigned int nb_features = 0; + printf("%s v%s\n", bin_name, BPFTOOL_VERSION); + printf("features:"); + if (has_libbfd) { + printf(" libbfd"); + nb_features++; + } + if (has_skeletons) + printf("%s skeletons", nb_features++ ? 
"," : ""); + printf("\n"); } return 0; } diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index bc0071228f88..a7efbd84fbcc 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -213,8 +213,9 @@ static void print_entry_json(struct bpf_map_info *info, unsigned char *key, jsonw_end_object(json_wtr); } -static void print_entry_error(struct bpf_map_info *info, unsigned char *key, - const char *error_msg) +static void +print_entry_error_msg(struct bpf_map_info *info, unsigned char *key, + const char *error_msg) { int msg_size = strlen(error_msg); bool single_line, break_names; @@ -232,6 +233,40 @@ static void print_entry_error(struct bpf_map_info *info, unsigned char *key, printf("\n"); } +static void +print_entry_error(struct bpf_map_info *map_info, void *key, int lookup_errno) +{ + /* For prog_array maps or arrays of maps, failure to lookup the value + * means there is no entry for that key. Do not print an error message + * in that case. + */ + if ((map_is_map_of_maps(map_info->type) || + map_is_map_of_progs(map_info->type)) && lookup_errno == ENOENT) + return; + + if (json_output) { + jsonw_start_object(json_wtr); /* entry */ + jsonw_name(json_wtr, "key"); + print_hex_data_json(key, map_info->key_size); + jsonw_name(json_wtr, "value"); + jsonw_start_object(json_wtr); /* error */ + jsonw_string_field(json_wtr, "error", strerror(lookup_errno)); + jsonw_end_object(json_wtr); /* error */ + jsonw_end_object(json_wtr); /* entry */ + } else { + const char *msg = NULL; + + if (lookup_errno == ENOENT) + msg = "<no entry>"; + else if (lookup_errno == ENOSPC && + map_info->type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) + msg = "<cannot read>"; + + print_entry_error_msg(map_info, key, + msg ? 
: strerror(lookup_errno)); + } +} + static void print_entry_plain(struct bpf_map_info *info, unsigned char *key, unsigned char *value) { @@ -713,56 +748,23 @@ static int dump_map_elem(int fd, void *key, void *value, struct bpf_map_info *map_info, struct btf *btf, json_writer_t *btf_wtr) { - int num_elems = 0; - int lookup_errno; - - if (!bpf_map_lookup_elem(fd, key, value)) { - if (json_output) { - print_entry_json(map_info, key, value, btf); - } else { - if (btf) { - struct btf_dumper d = { - .btf = btf, - .jw = btf_wtr, - .is_plain_text = true, - }; - - do_dump_btf(&d, map_info, key, value); - } else { - print_entry_plain(map_info, key, value); - } - num_elems++; - } - return num_elems; + if (bpf_map_lookup_elem(fd, key, value)) { + print_entry_error(map_info, key, errno); + return -1; } - /* lookup error handling */ - lookup_errno = errno; - - if (map_is_map_of_maps(map_info->type) || - map_is_map_of_progs(map_info->type)) - return 0; - if (json_output) { - jsonw_start_object(json_wtr); - jsonw_name(json_wtr, "key"); - print_hex_data_json(key, map_info->key_size); - jsonw_name(json_wtr, "value"); - jsonw_start_object(json_wtr); - jsonw_string_field(json_wtr, "error", strerror(lookup_errno)); - jsonw_end_object(json_wtr); - jsonw_end_object(json_wtr); - } else { - const char *msg = NULL; - - if (lookup_errno == ENOENT) - msg = "<no entry>"; - else if (lookup_errno == ENOSPC && - map_info->type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) - msg = "<cannot read>"; + print_entry_json(map_info, key, value, btf); + } else if (btf) { + struct btf_dumper d = { + .btf = btf, + .jw = btf_wtr, + .is_plain_text = true, + }; - print_entry_error(map_info, key, - msg ? 
: strerror(lookup_errno)); + do_dump_btf(&d, map_info, key, value); + } else { + print_entry_plain(map_info, key, value); } return 0; @@ -873,7 +875,8 @@ map_dump(int fd, struct bpf_map_info *info, json_writer_t *wtr, err = 0; break; } - num_elems += dump_map_elem(fd, key, value, info, btf, wtr); + if (!dump_map_elem(fd, key, value, info, btf, wtr)) + num_elems++; prev_key = key; } @@ -1247,7 +1250,7 @@ static int do_create(int argc, char **argv) { struct bpf_create_map_attr attr = { NULL, }; const char *pinfile; - int err, fd; + int err = -1, fd; if (!REQ_ARGS(7)) return -1; @@ -1262,13 +1265,13 @@ static int do_create(int argc, char **argv) if (attr.map_type) { p_err("map type already specified"); - return -1; + goto exit; } attr.map_type = map_type_from_str(*argv); if ((int)attr.map_type < 0) { p_err("unrecognized map type: %s", *argv); - return -1; + goto exit; } NEXT_ARG(); } else if (is_prefix(*argv, "name")) { @@ -1277,43 +1280,56 @@ static int do_create(int argc, char **argv) } else if (is_prefix(*argv, "key")) { if (parse_u32_arg(&argc, &argv, &attr.key_size, "key size")) - return -1; + goto exit; } else if (is_prefix(*argv, "value")) { if (parse_u32_arg(&argc, &argv, &attr.value_size, "value size")) - return -1; + goto exit; } else if (is_prefix(*argv, "entries")) { if (parse_u32_arg(&argc, &argv, &attr.max_entries, "max entries")) - return -1; + goto exit; } else if (is_prefix(*argv, "flags")) { if (parse_u32_arg(&argc, &argv, &attr.map_flags, "flags")) - return -1; + goto exit; } else if (is_prefix(*argv, "dev")) { NEXT_ARG(); if (attr.map_ifindex) { p_err("offload device already specified"); - return -1; + goto exit; } attr.map_ifindex = if_nametoindex(*argv); if (!attr.map_ifindex) { p_err("unrecognized netdevice '%s': %s", *argv, strerror(errno)); - return -1; + goto exit; } NEXT_ARG(); + } else if (is_prefix(*argv, "inner_map")) { + struct bpf_map_info info = {}; + __u32 len = sizeof(info); + int inner_map_fd; + + NEXT_ARG(); + if (!REQ_ARGS(2)) + 
usage(); + inner_map_fd = map_parse_fd_and_info(&argc, &argv, + &info, &len); + if (inner_map_fd < 0) + return -1; + attr.inner_map_fd = inner_map_fd; } else { p_err("unknown arg %s", *argv); - return -1; + goto exit; } } if (!attr.name) { p_err("map name not specified"); - return -1; + goto exit; } set_max_rlimit(); @@ -1321,17 +1337,22 @@ static int do_create(int argc, char **argv) fd = bpf_create_map_xattr(&attr); if (fd < 0) { p_err("map create failed: %s", strerror(errno)); - return -1; + goto exit; } err = do_pin_fd(fd, pinfile); close(fd); if (err) - return err; + goto exit; if (json_output) jsonw_null(json_wtr); - return 0; + +exit: + if (attr.inner_map_fd > 0) + close(attr.inner_map_fd); + + return err; } static int do_pop_dequeue(int argc, char **argv) @@ -1417,7 +1438,7 @@ static int do_help(int argc, char **argv) "Usage: %1$s %2$s { show | list } [MAP]\n" " %1$s %2$s create FILE type TYPE key KEY_SIZE value VALUE_SIZE \\\n" " entries MAX_ENTRIES name NAME [flags FLAGS] \\\n" - " [dev NAME]\n" + " [inner_map MAP] [dev NAME]\n" " %1$s %2$s dump MAP\n" " %1$s %2$s update MAP [key DATA] [value VALUE] [UPDATE_FLAGS]\n" " %1$s %2$s lookup MAP [key DATA]\n" diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index d393eb8263a6..d942c1e3372c 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -29,6 +29,9 @@ #include "main.h" #include "xlated_dumper.h" +#define BPF_METADATA_PREFIX "bpf_metadata_" +#define BPF_METADATA_PREFIX_LEN (sizeof(BPF_METADATA_PREFIX) - 1) + const char * const prog_type_name[] = { [BPF_PROG_TYPE_UNSPEC] = "unspec", [BPF_PROG_TYPE_SOCKET_FILTER] = "socket_filter", @@ -151,6 +154,198 @@ static void show_prog_maps(int fd, __u32 num_maps) } } +static void *find_metadata(int prog_fd, struct bpf_map_info *map_info) +{ + struct bpf_prog_info prog_info; + __u32 prog_info_len; + __u32 map_info_len; + void *value = NULL; + __u32 *map_ids; + int nr_maps; + int key = 0; + int map_fd; + int ret; + __u32 i; + + 
memset(&prog_info, 0, sizeof(prog_info)); + prog_info_len = sizeof(prog_info); + ret = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &prog_info_len); + if (ret) + return NULL; + + if (!prog_info.nr_map_ids) + return NULL; + + map_ids = calloc(prog_info.nr_map_ids, sizeof(__u32)); + if (!map_ids) + return NULL; + + nr_maps = prog_info.nr_map_ids; + memset(&prog_info, 0, sizeof(prog_info)); + prog_info.nr_map_ids = nr_maps; + prog_info.map_ids = ptr_to_u64(map_ids); + prog_info_len = sizeof(prog_info); + + ret = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &prog_info_len); + if (ret) + goto free_map_ids; + + for (i = 0; i < prog_info.nr_map_ids; i++) { + map_fd = bpf_map_get_fd_by_id(map_ids[i]); + if (map_fd < 0) + goto free_map_ids; + + memset(map_info, 0, sizeof(*map_info)); + map_info_len = sizeof(*map_info); + ret = bpf_obj_get_info_by_fd(map_fd, map_info, &map_info_len); + if (ret < 0) { + close(map_fd); + goto free_map_ids; + } + + if (map_info->type != BPF_MAP_TYPE_ARRAY || + map_info->key_size != sizeof(int) || + map_info->max_entries != 1 || + !map_info->btf_value_type_id || + !strstr(map_info->name, ".rodata")) { + close(map_fd); + continue; + } + + value = malloc(map_info->value_size); + if (!value) { + close(map_fd); + goto free_map_ids; + } + + if (bpf_map_lookup_elem(map_fd, &key, value)) { + close(map_fd); + free(value); + value = NULL; + goto free_map_ids; + } + + close(map_fd); + break; + } + +free_map_ids: + free(map_ids); + return value; +} + +static bool has_metadata_prefix(const char *s) +{ + return strncmp(s, BPF_METADATA_PREFIX, BPF_METADATA_PREFIX_LEN) == 0; +} + +static void show_prog_metadata(int fd, __u32 num_maps) +{ + const struct btf_type *t_datasec, *t_var; + struct bpf_map_info map_info; + struct btf_var_secinfo *vsi; + bool printed_header = false; + struct btf *btf = NULL; + unsigned int i, vlen; + void *value = NULL; + const char *name; + int err; + + if (!num_maps) + return; + + memset(&map_info, 0, sizeof(map_info)); + value = 
find_metadata(fd, &map_info); + if (!value) + return; + + err = btf__get_from_id(map_info.btf_id, &btf); + if (err || !btf) + goto out_free; + + t_datasec = btf__type_by_id(btf, map_info.btf_value_type_id); + if (!btf_is_datasec(t_datasec)) + goto out_free; + + vlen = btf_vlen(t_datasec); + vsi = btf_var_secinfos(t_datasec); + + /* We don't proceed to check the kinds of the elements of the DATASEC. + * The verifier enforces them to be BTF_KIND_VAR. + */ + + if (json_output) { + struct btf_dumper d = { + .btf = btf, + .jw = json_wtr, + .is_plain_text = false, + }; + + for (i = 0; i < vlen; i++, vsi++) { + t_var = btf__type_by_id(btf, vsi->type); + name = btf__name_by_offset(btf, t_var->name_off); + + if (!has_metadata_prefix(name)) + continue; + + if (!printed_header) { + jsonw_name(json_wtr, "metadata"); + jsonw_start_object(json_wtr); + printed_header = true; + } + + jsonw_name(json_wtr, name + BPF_METADATA_PREFIX_LEN); + err = btf_dumper_type(&d, t_var->type, value + vsi->offset); + if (err) { + p_err("btf dump failed: %d", err); + break; + } + } + if (printed_header) + jsonw_end_object(json_wtr); + } else { + json_writer_t *btf_wtr = jsonw_new(stdout); + struct btf_dumper d = { + .btf = btf, + .jw = btf_wtr, + .is_plain_text = true, + }; + + if (!btf_wtr) { + p_err("jsonw alloc failed"); + goto out_free; + } + + for (i = 0; i < vlen; i++, vsi++) { + t_var = btf__type_by_id(btf, vsi->type); + name = btf__name_by_offset(btf, t_var->name_off); + + if (!has_metadata_prefix(name)) + continue; + + if (!printed_header) { + printf("\tmetadata:"); + printed_header = true; + } + + printf("\n\t\t%s = ", name + BPF_METADATA_PREFIX_LEN); + + jsonw_reset(btf_wtr); + err = btf_dumper_type(&d, t_var->type, value + vsi->offset); + if (err) { + p_err("btf dump failed: %d", err); + break; + } + } + if (printed_header) + jsonw_destroy(&btf_wtr); + } + +out_free: + btf__free(btf); + free(value); +} + static void print_prog_header_json(struct bpf_prog_info *info) { 
jsonw_uint_field(json_wtr, "id", info->id); @@ -228,6 +423,8 @@ static void print_prog_json(struct bpf_prog_info *info, int fd) emit_obj_refs_json(&refs_table, info->id, json_wtr); + show_prog_metadata(fd, info->nr_map_ids); + jsonw_end_object(json_wtr); } @@ -297,6 +494,8 @@ static void print_prog_plain(struct bpf_prog_info *info, int fd) emit_obj_refs_plain(&refs_table, info->id, "\n\tpids "); printf("\n"); + + show_prog_metadata(fd, info->nr_map_ids); } static int show_prog(int fd) @@ -1304,7 +1503,7 @@ static int load_with_options(int argc, char **argv, bool first_prog_only) enum bpf_prog_type prog_type = common_prog_type; if (prog_type == BPF_PROG_TYPE_UNSPEC) { - const char *sec_name = bpf_program__title(pos, false); + const char *sec_name = bpf_program__section_name(pos); err = get_prog_type_by_name(sec_name, &prog_type, &expected_attach_type); @@ -1398,7 +1597,7 @@ static int load_with_options(int argc, char **argv, bool first_prog_only) err = bpf_obj_pin(bpf_program__fd(prog), pinfile); if (err) { p_err("failed to pin program %s", - bpf_program__title(prog, false)); + bpf_program__section_name(prog)); goto err_close_obj; } } else { diff --git a/tools/bpf/resolve_btfids/Makefile b/tools/bpf/resolve_btfids/Makefile index fe8eb537688b..66cb92136de4 100644 --- a/tools/bpf/resolve_btfids/Makefile +++ b/tools/bpf/resolve_btfids/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only include ../../scripts/Makefile.include +include ../../scripts/Makefile.arch ifeq ($(srctree),) srctree := $(patsubst %/,%,$(dir $(CURDIR))) @@ -29,6 +30,7 @@ endif AR = $(HOSTAR) CC = $(HOSTCC) LD = $(HOSTLD) +ARCH = $(HOSTARCH) OUTPUT ?= $(srctree)/tools/bpf/resolve_btfids/ diff --git a/tools/include/linux/btf_ids.h b/tools/include/linux/btf_ids.h index 210b086188a3..57890b357f85 100644 --- a/tools/include/linux/btf_ids.h +++ b/tools/include/linux/btf_ids.h @@ -76,6 +76,13 @@ extern u32 name[]; #define BTF_ID_LIST_GLOBAL(name) \ __BTF_ID_LIST(name, globl) +/* The 
BTF_ID_LIST_SINGLE macro defines a BTF_ID_LIST with + * a single entry. + */ +#define BTF_ID_LIST_SINGLE(name, prefix, typename) \ + BTF_ID_LIST(name) \ + BTF_ID(prefix, typename) + /* * The BTF_ID_UNUSED macro defines 4 zero bytes. * It's used when we want to define 'unused' entry @@ -140,6 +147,7 @@ extern struct btf_id_set name; #define BTF_ID(prefix, name) #define BTF_ID_UNUSED #define BTF_ID_LIST_GLOBAL(name) u32 name[1]; +#define BTF_ID_LIST_SINGLE(name, prefix, typename) static u32 name[1]; #define BTF_SET_START(name) static struct btf_id_set name = { 0 }; #define BTF_SET_START_GLOBAL(name) static struct btf_id_set name = { 0 }; #define BTF_SET_END(name) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 8dda13880957..a22812561064 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -124,6 +124,7 @@ enum bpf_cmd { BPF_ENABLE_STATS, BPF_ITER_CREATE, BPF_LINK_DETACH, + BPF_PROG_BIND_MAP, }; enum bpf_map_type { @@ -658,6 +659,12 @@ union bpf_attr { __u32 flags; } iter_create; + struct { /* struct used by BPF_PROG_BIND_MAP command */ + __u32 prog_fd; + __u32 map_fd; + __u32 flags; /* extra flags */ + } prog_bind_map; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF @@ -1447,8 +1454,8 @@ union bpf_attr { * Return * The return value depends on the result of the test, and can be: * - * * 0, if the *skb* task belongs to the cgroup2. - * * 1, if the *skb* task does not belong to the cgroup2. + * * 0, if current task belongs to the cgroup2. + * * 1, if current task does not belong to the cgroup2. * * A negative error code, if an error occurred. * * long bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) @@ -3349,38 +3356,38 @@ union bpf_attr { * Description * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. 
* * struct tcp_sock *bpf_skc_to_tcp_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *tcp_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * struct tcp_timewait_sock *bpf_skc_to_tcp_timewait_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *tcp_timewait_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * struct tcp_request_sock *bpf_skc_to_tcp_request_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * struct udp6_sock *bpf_skc_to_udp6_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags) * Description * Return a user or a kernel stack in bpf program provided buffer. * To achieve this, the helper needs *task*, which is a valid - * pointer to struct task_struct. To store the stacktrace, the - * bpf program provides *buf* with a nonnegative *size*. + * pointer to **struct task_struct**. To store the stacktrace, the + * bpf program provides *buf* with a nonnegative *size*. * * The last argument, *flags*, holds the number of stack frames to * skip (from 0 to 255), masked with @@ -3410,12 +3417,12 @@ union bpf_attr { * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags) * Description * Load header option. Support reading a particular TCP header - * option for bpf program (BPF_PROG_TYPE_SOCK_OPS). + * option for bpf program (**BPF_PROG_TYPE_SOCK_OPS**). * * If *flags* is 0, it will search the option from the - * sock_ops->skb_data. 
The comment in "struct bpf_sock_ops" + * *skops*\ **->skb_data**. The comment in **struct bpf_sock_ops** * has details on what skb_data contains under different - * sock_ops->op. + * *skops*\ **->op**. * * The first byte of the *searchby_res* specifies the * kind that it wants to search. @@ -3435,7 +3442,7 @@ union bpf_attr { * [ 254, 4, 0xeB, 0x9F, 0, 0, .... 0 ]. * * To search for the standard window scale option (3), - * the searchby_res should be [ 3, 0, 0, .... 0 ]. + * the *searchby_res* should be [ 3, 0, 0, .... 0 ]. * Note, kind-length must be 0 for regular option. * * Searching for No-Op (0) and End-of-Option-List (1) are @@ -3445,27 +3452,30 @@ union bpf_attr { * of a header option. * * Supported flags: + * * * **BPF_LOAD_HDR_OPT_TCP_SYN** to search from the * saved_syn packet or the just-received syn packet. * * Return - * >0 when found, the header option is copied to *searchby_res*. - * The return value is the total length copied. + * > 0 when found, the header option is copied to *searchby_res*. + * The return value is the total length copied. On failure, a + * negative error code is returned: * - * **-EINVAL** If param is invalid + * **-EINVAL** if a parameter is invalid. * - * **-ENOMSG** The option is not found + * **-ENOMSG** if the option is not found. * - * **-ENOENT** No syn packet available when - * **BPF_LOAD_HDR_OPT_TCP_SYN** is used + * **-ENOENT** if no syn packet is available when + * **BPF_LOAD_HDR_OPT_TCP_SYN** is used. * - * **-ENOSPC** Not enough space. Only *len* number of - * bytes are copied. + * **-ENOSPC** if there is not enough space. Only *len* number of + * bytes are copied. * - * **-EFAULT** Cannot parse the header options in the packet + * **-EFAULT** on failure to parse the header options in the + * packet. * - * **-EPERM** This helper cannot be used under the - * current sock_ops->op. + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. 
* * long bpf_store_hdr_opt(struct bpf_sock_ops *skops, const void *from, u32 len, u64 flags) * Description @@ -3483,44 +3493,44 @@ union bpf_attr { * by searching the same option in the outgoing skb. * * This helper can only be called during - * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**. * * Return * 0 on success, or negative error in case of failure: * - * **-EINVAL** If param is invalid + * **-EINVAL** If param is invalid. * - * **-ENOSPC** Not enough space in the header. - * Nothing has been written + * **-ENOSPC** if there is not enough space in the header. + * Nothing has been written * - * **-EEXIST** The option has already existed + * **-EEXIST** if the option already exists. * - * **-EFAULT** Cannot parse the existing header options + * **-EFAULT** on failure to parse the existing header options. * - * **-EPERM** This helper cannot be used under the - * current sock_ops->op. + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. * * long bpf_reserve_hdr_opt(struct bpf_sock_ops *skops, u32 len, u64 flags) * Description * Reserve *len* bytes for the bpf header option. The - * space will be used by bpf_store_hdr_opt() later in - * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * space will be used by **bpf_store_hdr_opt**\ () later in + * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**. * - * If bpf_reserve_hdr_opt() is called multiple times, + * If **bpf_reserve_hdr_opt**\ () is called multiple times, * the total number of bytes will be reserved. * * This helper can only be called during - * BPF_SOCK_OPS_HDR_OPT_LEN_CB. + * **BPF_SOCK_OPS_HDR_OPT_LEN_CB**. * * Return * 0 on success, or negative error in case of failure: * - * **-EINVAL** if param is invalid + * **-EINVAL** if a parameter is invalid. * - * **-ENOSPC** Not enough space in the header. + * **-ENOSPC** if there is not enough space in the header. * - * **-EPERM** This helper cannot be used under the - * current sock_ops->op.
+ * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. * * void *bpf_inode_storage_get(struct bpf_map *map, void *inode, void *value, u64 flags) * Description @@ -3560,9 +3570,9 @@ union bpf_attr { * * long bpf_d_path(struct path *path, char *buf, u32 sz) * Description - * Return full path for given 'struct path' object, which - * needs to be the kernel BTF 'path' object. The path is - * returned in the provided buffer 'buf' of size 'sz' and + * Return full path for given **struct path** object, which + * needs to be the kernel BTF *path* object. The path is + * returned in the provided buffer *buf* of size *sz* and * is zero terminated. * * Return @@ -3573,7 +3583,7 @@ union bpf_attr { * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr) * Description * Read *size* bytes from user space address *user_ptr* and store - * the data in *dst*. This is a wrapper of copy_from_user(). + * the data in *dst*. This is a wrapper of **copy_from_user**\ (). * Return * 0 on success, or a negative error in case of failure. 
*/ diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 82b983ff6569..2baa1308737c 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -872,3 +872,19 @@ int bpf_enable_stats(enum bpf_stats_type type) return sys_bpf(BPF_ENABLE_STATS, &attr, sizeof(attr)); } + +int bpf_prog_bind_map(int prog_fd, int map_fd, + const struct bpf_prog_bind_opts *opts) +{ + union bpf_attr attr; + + if (!OPTS_VALID(opts, bpf_prog_bind_opts)) + return -EINVAL; + + memset(&attr, 0, sizeof(attr)); + attr.prog_bind_map.prog_fd = prog_fd; + attr.prog_bind_map.map_fd = map_fd; + attr.prog_bind_map.flags = OPTS_GET(opts, flags, 0); + + return sys_bpf(BPF_PROG_BIND_MAP, &attr, sizeof(attr)); +} diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 015d13f25fcc..8c1ac4b42f90 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -243,6 +243,14 @@ LIBBPF_API int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, enum bpf_stats_type; /* defined in up-to-date linux/bpf.h */ LIBBPF_API int bpf_enable_stats(enum bpf_stats_type type); +struct bpf_prog_bind_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + __u32 flags; +}; +#define bpf_prog_bind_opts__last_field flags + +LIBBPF_API int bpf_prog_bind_map(int prog_fd, int map_fd, + const struct bpf_prog_bind_opts *opts); #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index 91f0ad0e0325..2a55320d87d0 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -57,14 +57,16 @@ LIBBPF_API struct btf_ext *btf_ext__new(__u8 *data, __u32 size); LIBBPF_API void btf_ext__free(struct btf_ext *btf_ext); LIBBPF_API const void *btf_ext__get_raw_data(const struct btf_ext *btf_ext, __u32 *size); -LIBBPF_API int btf_ext__reloc_func_info(const struct btf *btf, - const struct btf_ext *btf_ext, - const char *sec_name, __u32 insns_cnt, - void **func_info, __u32 *cnt); -LIBBPF_API int btf_ext__reloc_line_info(const struct btf *btf, - const struct 
btf_ext *btf_ext, - const char *sec_name, __u32 insns_cnt, - void **line_info, __u32 *cnt); +LIBBPF_API LIBBPF_DEPRECATED("btf_ext__reloc_func_info was never meant as a public API and has wrong assumptions embedded in it; it will be removed in the future libbpf versions") +int btf_ext__reloc_func_info(const struct btf *btf, + const struct btf_ext *btf_ext, + const char *sec_name, __u32 insns_cnt, + void **func_info, __u32 *cnt); +LIBBPF_API LIBBPF_DEPRECATED("btf_ext__reloc_line_info was never meant as a public API and has wrong assumptions embedded in it; it will be removed in the future libbpf versions") +int btf_ext__reloc_line_info(const struct btf *btf, + const struct btf_ext *btf_ext, + const char *sec_name, __u32 insns_cnt, + void **line_info, __u32 *cnt); LIBBPF_API __u32 btf_ext__func_info_rec_size(const struct btf_ext *btf_ext); LIBBPF_API __u32 btf_ext__line_info_rec_size(const struct btf_ext *btf_ext); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 46d727b45c81..32dc444224d8 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -73,8 +73,6 @@ #define __printf(a, b) __attribute__((format(printf, a, b))) static struct bpf_map *bpf_object__add_map(struct bpf_object *obj); -static struct bpf_program *bpf_object__find_prog_by_idx(struct bpf_object *obj, - int idx); static const struct btf_type * skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id); @@ -176,6 +174,8 @@ enum kern_feature_id { FEAT_EXP_ATTACH_TYPE, /* bpf_probe_read_{kernel,user}[_str] helpers */ FEAT_PROBE_READ_KERN, + /* BPF_PROG_BIND_MAP is supported */ + FEAT_PROG_BIND_MAP, __FEAT_CNT, }; @@ -193,6 +193,7 @@ struct reloc_desc { int insn_idx; int map_idx; int sym_off; + bool processed; }; struct bpf_sec_def; @@ -217,20 +218,45 @@ struct bpf_sec_def { * linux/filter.h. */ struct bpf_program { - /* Index in elf obj file, for relocation use. 
*/ - int idx; - char *name; - int prog_ifindex; - char *section_name; const struct bpf_sec_def *sec_def; - /* section_name with / replaced by _; makes recursive pinning + char *sec_name; + size_t sec_idx; + /* this program's instruction offset (in number of instructions) + * within its containing ELF section + */ + size_t sec_insn_off; + /* number of original instructions in ELF section belonging to this + * program, not taking into account subprogram instructions possibly + * appended later during relocation + */ + size_t sec_insn_cnt; + /* Offset (in number of instructions) of the start of instruction + * belonging to this BPF program within its containing main BPF + * program. For the entry-point (main) BPF program, this is always + * zero. For a sub-program, this gets reset before each of main BPF + * programs are processed and relocated and is used to determine + * whether sub-program was already appended to the main program, and + * if yes, at which instruction offset. + */ + size_t sub_insn_off; + + char *name; + /* sec_name with / replaced by _; makes recursive pinning * in bpf_object__pin_programs easier */ char *pin_name; + + /* instructions that belong to BPF program; insns[0] is located at + * sec_insn_off instruction within its ELF section in ELF file, so + * when mapping ELF file instruction index to the local instruction, + * one needs to subtract sec_insn_off; and vice versa.
+ */ struct bpf_insn *insns; - size_t insns_cnt, main_prog_cnt; - enum bpf_prog_type type; - bool load; + /* actual number of instruction in this BPF program's image; for + * entry-point BPF programs this includes the size of main program + * itself plus all the used sub-programs, appended at the end + */ + size_t insns_cnt; struct reloc_desc *reloc_desc; int nr_reloc; @@ -246,7 +272,10 @@ struct bpf_program { void *priv; bpf_program_clear_priv_t clear_priv; + bool load; + enum bpf_prog_type type; enum bpf_attach_type expected_attach_type; + int prog_ifindex; __u32 attach_btf_id; __u32 attach_prog_fd; void *func_info; @@ -382,9 +411,10 @@ struct bpf_object { struct extern_desc *externs; int nr_extern; int kconfig_map_idx; + int rodata_map_idx; bool loaded; - bool has_pseudo_calls; + bool has_subcalls; /* * Information when doing elf related work. Only valid if fd @@ -446,6 +476,8 @@ static Elf_Scn *elf_sec_by_name(const struct bpf_object *obj, const char *name); static int elf_sec_hdr(const struct bpf_object *obj, Elf_Scn *scn, GElf_Shdr *hdr); static const char *elf_sec_name(const struct bpf_object *obj, Elf_Scn *scn); static Elf_Data *elf_sec_data(const struct bpf_object *obj, Elf_Scn *scn); +static int elf_sym_by_sec_off(const struct bpf_object *obj, size_t sec_idx, + size_t off, __u32 sym_type, GElf_Sym *sym); void bpf_program__unload(struct bpf_program *prog) { @@ -486,152 +518,160 @@ static void bpf_program__exit(struct bpf_program *prog) bpf_program__unload(prog); zfree(&prog->name); - zfree(&prog->section_name); + zfree(&prog->sec_name); zfree(&prog->pin_name); zfree(&prog->insns); zfree(&prog->reloc_desc); prog->nr_reloc = 0; prog->insns_cnt = 0; - prog->idx = -1; + prog->sec_idx = -1; } static char *__bpf_program__pin_name(struct bpf_program *prog) { char *name, *p; - name = p = strdup(prog->section_name); + name = p = strdup(prog->sec_name); while ((p = strchr(p, '/'))) *p = '_'; return name; } +static bool insn_is_subprog_call(const struct bpf_insn 
*insn) +{ + return BPF_CLASS(insn->code) == BPF_JMP && + BPF_OP(insn->code) == BPF_CALL && + BPF_SRC(insn->code) == BPF_K && + insn->src_reg == BPF_PSEUDO_CALL && + insn->dst_reg == 0 && + insn->off == 0; +} + static int -bpf_program__init(void *data, size_t size, const char *section_name, int idx, - struct bpf_program *prog) +bpf_object__init_prog(struct bpf_object *obj, struct bpf_program *prog, + const char *name, size_t sec_idx, const char *sec_name, + size_t sec_off, void *insn_data, size_t insn_data_sz) { - const size_t bpf_insn_sz = sizeof(struct bpf_insn); + int i; - if (size == 0 || size % bpf_insn_sz) { - pr_warn("corrupted section '%s', size: %zu\n", - section_name, size); + if (insn_data_sz == 0 || insn_data_sz % BPF_INSN_SZ || sec_off % BPF_INSN_SZ) { + pr_warn("sec '%s': corrupted program '%s', offset %zu, size %zu\n", + sec_name, name, sec_off, insn_data_sz); return -EINVAL; } memset(prog, 0, sizeof(*prog)); + prog->obj = obj; + + prog->sec_idx = sec_idx; + prog->sec_insn_off = sec_off / BPF_INSN_SZ; + prog->sec_insn_cnt = insn_data_sz / BPF_INSN_SZ; + /* insns_cnt can later be increased by appending used subprograms */ + prog->insns_cnt = prog->sec_insn_cnt; + + prog->type = BPF_PROG_TYPE_UNSPEC; + prog->load = true; - prog->section_name = strdup(section_name); - if (!prog->section_name) { - pr_warn("failed to alloc name for prog under section(%d) %s\n", - idx, section_name); + prog->instances.fds = NULL; + prog->instances.nr = -1; + + prog->sec_name = strdup(sec_name); + if (!prog->sec_name) + goto errout; + + prog->name = strdup(name); + if (!prog->name) goto errout; - } prog->pin_name = __bpf_program__pin_name(prog); - if (!prog->pin_name) { - pr_warn("failed to alloc pin name for prog under section(%d) %s\n", - idx, section_name); + if (!prog->pin_name) goto errout; - } - prog->insns = malloc(size); - if (!prog->insns) { - pr_warn("failed to alloc insns for prog under section %s\n", - section_name); + prog->insns = malloc(insn_data_sz); + if 
(!prog->insns) goto errout; + memcpy(prog->insns, insn_data, insn_data_sz); + + for (i = 0; i < prog->insns_cnt; i++) { + if (insn_is_subprog_call(&prog->insns[i])) { + obj->has_subcalls = true; + break; + } } - prog->insns_cnt = size / bpf_insn_sz; - memcpy(prog->insns, data, size); - prog->idx = idx; - prog->instances.fds = NULL; - prog->instances.nr = -1; - prog->type = BPF_PROG_TYPE_UNSPEC; - prog->load = true; return 0; errout: + pr_warn("sec '%s': failed to allocate memory for prog '%s'\n", sec_name, name); bpf_program__exit(prog); return -ENOMEM; } static int -bpf_object__add_program(struct bpf_object *obj, void *data, size_t size, - const char *section_name, int idx) +bpf_object__add_programs(struct bpf_object *obj, Elf_Data *sec_data, + const char *sec_name, int sec_idx) { - struct bpf_program prog, *progs; + struct bpf_program *prog, *progs; + void *data = sec_data->d_buf; + size_t sec_sz = sec_data->d_size, sec_off, prog_sz; int nr_progs, err; - - err = bpf_program__init(data, size, section_name, idx, &prog); - if (err) - return err; + const char *name; + GElf_Sym sym; progs = obj->programs; nr_progs = obj->nr_programs; + sec_off = 0; - progs = libbpf_reallocarray(progs, nr_progs + 1, sizeof(progs[0])); - if (!progs) { - /* - * In this case the original obj->programs - * is still valid, so don't need special treat for - * bpf_close_object(). 
- */ - pr_warn("failed to alloc a new program under section '%s'\n", - section_name); - bpf_program__exit(&prog); - return -ENOMEM; - } - - pr_debug("elf: found program '%s'\n", prog.section_name); - obj->programs = progs; - obj->nr_programs = nr_progs + 1; - prog.obj = obj; - progs[nr_progs] = prog; - return 0; -} - -static int -bpf_object__init_prog_names(struct bpf_object *obj) -{ - Elf_Data *symbols = obj->efile.symbols; - struct bpf_program *prog; - size_t pi, si; + while (sec_off < sec_sz) { + if (elf_sym_by_sec_off(obj, sec_idx, sec_off, STT_FUNC, &sym)) { + pr_warn("sec '%s': failed to find program symbol at offset %zu\n", + sec_name, sec_off); + return -LIBBPF_ERRNO__FORMAT; + } - for (pi = 0; pi < obj->nr_programs; pi++) { - const char *name = NULL; + prog_sz = sym.st_size; - prog = &obj->programs[pi]; + name = elf_sym_str(obj, sym.st_name); + if (!name) { + pr_warn("sec '%s': failed to get symbol name for offset %zu\n", + sec_name, sec_off); + return -LIBBPF_ERRNO__FORMAT; + } - for (si = 0; si < symbols->d_size / sizeof(GElf_Sym) && !name; si++) { - GElf_Sym sym; + if (sec_off + prog_sz > sec_sz) { + pr_warn("sec '%s': program at offset %zu crosses section boundary\n", + sec_name, sec_off); + return -LIBBPF_ERRNO__FORMAT; + } - if (!gelf_getsym(symbols, si, &sym)) - continue; - if (sym.st_shndx != prog->idx) - continue; - if (GELF_ST_BIND(sym.st_info) != STB_GLOBAL) - continue; + pr_debug("sec '%s': found program '%s' at insn offset %zu (%zu bytes), code size %zu insns (%zu bytes)\n", + sec_name, name, sec_off / BPF_INSN_SZ, sec_off, prog_sz / BPF_INSN_SZ, prog_sz); - name = elf_sym_str(obj, sym.st_name); - if (!name) { - pr_warn("prog '%s': failed to get symbol name\n", - prog->section_name); - return -LIBBPF_ERRNO__LIBELF; - } + progs = libbpf_reallocarray(progs, nr_progs + 1, sizeof(*progs)); + if (!progs) { + /* + * In this case the original obj->programs + * is still valid, so don't need special treat for + * bpf_close_object(). 
+ */ + pr_warn("sec '%s': failed to alloc memory for new program '%s'\n", + sec_name, name); + return -ENOMEM; } + obj->programs = progs; - if (!name && prog->idx == obj->efile.text_shndx) - name = ".text"; + prog = &progs[nr_progs]; - if (!name) { - pr_warn("prog '%s': failed to find program symbol\n", - prog->section_name); - return -EINVAL; - } + err = bpf_object__init_prog(obj, prog, name, sec_idx, sec_name, + sec_off, data + sec_off, prog_sz); + if (err) + return err; - prog->name = strdup(name); - if (!prog->name) - return -ENOMEM; + nr_progs++; + obj->nr_programs = nr_progs; + + sec_off += prog_sz; } return 0; @@ -1033,6 +1073,7 @@ static struct bpf_object *bpf_object__new(const char *path, obj->efile.bss_shndx = -1; obj->efile.st_ops_shndx = -1; obj->kconfig_map_idx = -1; + obj->rodata_map_idx = -1; obj->kern_version = get_kernel_version(); obj->loaded = false; @@ -1391,6 +1432,8 @@ static int bpf_object__init_global_data_maps(struct bpf_object *obj) obj->efile.rodata->d_size); if (err) return err; + + obj->rodata_map_idx = obj->nr_maps - 1; } if (obj->efile.bss_shndx >= 0) { err = bpf_object__init_internal_map(obj, LIBBPF_MAP_BSS, @@ -2675,6 +2718,26 @@ static Elf_Data *elf_sec_data(const struct bpf_object *obj, Elf_Scn *scn) return data; } +static int elf_sym_by_sec_off(const struct bpf_object *obj, size_t sec_idx, + size_t off, __u32 sym_type, GElf_Sym *sym) +{ + Elf_Data *symbols = obj->efile.symbols; + size_t n = symbols->d_size / sizeof(GElf_Sym); + int i; + + for (i = 0; i < n; i++) { + if (!gelf_getsym(symbols, i, sym)) + continue; + if (sym->st_shndx != sec_idx || sym->st_value != off) + continue; + if (GELF_ST_TYPE(sym->st_info) != sym_type) + continue; + return 0; + } + + return -ENOENT; +} + static bool is_sec_name_dwarf(const char *name) { /* approximation, but the actual list is too long */ @@ -2715,19 +2778,55 @@ static bool ignore_elf_section(GElf_Shdr *hdr, const char *name) return false; } +static int cmp_progs(const void *_a, const void 
*_b) +{ + const struct bpf_program *a = _a; + const struct bpf_program *b = _b; + + if (a->sec_idx != b->sec_idx) + return a->sec_idx < b->sec_idx ? -1 : 1; + + /* sec_insn_off can't be the same within the section */ + return a->sec_insn_off < b->sec_insn_off ? -1 : 1; +} + static int bpf_object__elf_collect(struct bpf_object *obj) { Elf *elf = obj->efile.elf; Elf_Data *btf_ext_data = NULL; Elf_Data *btf_data = NULL; - Elf_Scn *scn = NULL; int idx = 0, err = 0; + const char *name; + Elf_Data *data; + Elf_Scn *scn; + GElf_Shdr sh; + /* a bunch of ELF parsing functionality depends on processing symbols, + * so do the first pass and find the symbol table + */ + scn = NULL; while ((scn = elf_nextscn(elf, scn)) != NULL) { - const char *name; - GElf_Shdr sh; - Elf_Data *data; + if (elf_sec_hdr(obj, scn, &sh)) + return -LIBBPF_ERRNO__FORMAT; + + if (sh.sh_type == SHT_SYMTAB) { + if (obj->efile.symbols) { + pr_warn("elf: multiple symbol tables in %s\n", obj->path); + return -LIBBPF_ERRNO__FORMAT; + } + data = elf_sec_data(obj, scn); + if (!data) + return -LIBBPF_ERRNO__FORMAT; + + obj->efile.symbols = data; + obj->efile.symbols_shndx = elf_ndxscn(scn); + obj->efile.strtabidx = sh.sh_link; + } + } + + scn = NULL; + while ((scn = elf_nextscn(elf, scn)) != NULL) { idx++; if (elf_sec_hdr(obj, scn, &sh)) @@ -2766,20 +2865,12 @@ static int bpf_object__elf_collect(struct bpf_object *obj) } else if (strcmp(name, BTF_EXT_ELF_SEC) == 0) { btf_ext_data = data; } else if (sh.sh_type == SHT_SYMTAB) { - if (obj->efile.symbols) { - pr_warn("elf: multiple symbol tables in %s\n", obj->path); - return -LIBBPF_ERRNO__FORMAT; - } - obj->efile.symbols = data; - obj->efile.symbols_shndx = idx; - obj->efile.strtabidx = sh.sh_link; + /* already processed during the first pass above */ } else if (sh.sh_type == SHT_PROGBITS && data->d_size > 0) { if (sh.sh_flags & SHF_EXECINSTR) { if (strcmp(name, ".text") == 0) obj->efile.text_shndx = idx; - err = bpf_object__add_program(obj, data->d_buf, - 
data->d_size, - name, idx); + err = bpf_object__add_programs(obj, data, name, idx); if (err) return err; } else if (strcmp(name, DATA_SEC) == 0) { @@ -2833,6 +2924,11 @@ static int bpf_object__elf_collect(struct bpf_object *obj) pr_warn("elf: symbol strings section missing or invalid in %s\n", obj->path); return -LIBBPF_ERRNO__FORMAT; } + + /* sort BPF programs by section name and in-section instruction offset + * for faster search */ + qsort(obj->programs, obj->nr_programs, sizeof(*obj->programs), cmp_progs); + return bpf_object__init_btf(obj, btf_data, btf_ext_data); } @@ -3157,20 +3253,6 @@ static int bpf_object__collect_externs(struct bpf_object *obj) return 0; } -static struct bpf_program * -bpf_object__find_prog_by_idx(struct bpf_object *obj, int idx) -{ - struct bpf_program *prog; - size_t i; - - for (i = 0; i < obj->nr_programs; i++) { - prog = &obj->programs[i]; - if (prog->idx == idx) - return prog; - } - return NULL; -} - struct bpf_program * bpf_object__find_program_by_title(const struct bpf_object *obj, const char *title) @@ -3178,12 +3260,18 @@ bpf_object__find_program_by_title(const struct bpf_object *obj, struct bpf_program *pos; bpf_object__for_each_program(pos, obj) { - if (pos->section_name && !strcmp(pos->section_name, title)) + if (pos->sec_name && !strcmp(pos->sec_name, title)) return pos; } return NULL; } +static bool prog_is_subprog(const struct bpf_object *obj, + const struct bpf_program *prog) +{ + return prog->sec_idx == obj->efile.text_shndx && obj->has_subcalls; +} + struct bpf_program * bpf_object__find_program_by_name(const struct bpf_object *obj, const char *name) @@ -3191,6 +3279,8 @@ bpf_object__find_program_by_name(const struct bpf_object *obj, struct bpf_program *prog; bpf_object__for_each_program(prog, obj) { + if (prog_is_subprog(obj, prog)) + continue; if (!strcmp(prog->name, name)) return prog; } @@ -3240,6 +3330,8 @@ static int bpf_program__record_reloc(struct bpf_program *prog, const char *sym_sec_name; struct bpf_map *map; 
+ reloc_desc->processed = false; + /* sub-program call relocation */ if (insn->code == (BPF_JMP | BPF_CALL)) { if (insn->src_reg != BPF_PSEUDO_CALL) { @@ -3261,7 +3353,6 @@ static int bpf_program__record_reloc(struct bpf_program *prog, reloc_desc->type = RELO_CALL; reloc_desc->insn_idx = insn_idx; reloc_desc->sym_off = sym->st_value; - obj->has_pseudo_calls = true; return 0; } @@ -3361,14 +3452,50 @@ static int bpf_program__record_reloc(struct bpf_program *prog, return 0; } +static bool prog_contains_insn(const struct bpf_program *prog, size_t insn_idx) +{ + return insn_idx >= prog->sec_insn_off && + insn_idx < prog->sec_insn_off + prog->sec_insn_cnt; +} + +static struct bpf_program *find_prog_by_sec_insn(const struct bpf_object *obj, + size_t sec_idx, size_t insn_idx) +{ + int l = 0, r = obj->nr_programs - 1, m; + struct bpf_program *prog; + + while (l < r) { + m = l + (r - l + 1) / 2; + prog = &obj->programs[m]; + + if (prog->sec_idx < sec_idx || + (prog->sec_idx == sec_idx && prog->sec_insn_off <= insn_idx)) + l = m; + else + r = m - 1; + } + /* matching program could be at index l, but it still might be the + * wrong one, so we need to double check conditions for the last time + */ + prog = &obj->programs[l]; + if (prog->sec_idx == sec_idx && prog_contains_insn(prog, insn_idx)) + return prog; + return NULL; +} + static int -bpf_program__collect_reloc(struct bpf_program *prog, GElf_Shdr *shdr, - Elf_Data *data, struct bpf_object *obj) +bpf_object__collect_prog_relos(struct bpf_object *obj, GElf_Shdr *shdr, Elf_Data *data) { Elf_Data *symbols = obj->efile.symbols; const char *relo_sec_name, *sec_name; size_t sec_idx = shdr->sh_info; + struct bpf_program *prog; + struct reloc_desc *relos; int err, i, nrels; + const char *sym_name; + __u32 insn_idx; + GElf_Sym sym; + GElf_Rel rel; relo_sec_name = elf_sec_str(obj, shdr->sh_name); sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, sec_idx)); @@ -3379,19 +3506,7 @@ bpf_program__collect_reloc(struct bpf_program *prog, 
GElf_Shdr *shdr, relo_sec_name, sec_idx, sec_name); nrels = shdr->sh_size / shdr->sh_entsize; - prog->reloc_desc = malloc(sizeof(*prog->reloc_desc) * nrels); - if (!prog->reloc_desc) { - pr_warn("failed to alloc memory in relocation\n"); - return -ENOMEM; - } - prog->nr_reloc = nrels; - for (i = 0; i < nrels; i++) { - const char *sym_name; - __u32 insn_idx; - GElf_Sym sym; - GElf_Rel rel; - if (!gelf_getrel(data, i, &rel)) { pr_warn("sec '%s': failed to get relo #%d\n", relo_sec_name, i); return -LIBBPF_ERRNO__FORMAT; @@ -3408,15 +3523,42 @@ bpf_program__collect_reloc(struct bpf_program *prog, GElf_Shdr *shdr, } insn_idx = rel.r_offset / BPF_INSN_SZ; - sym_name = elf_sym_str(obj, sym.st_name) ?: "<?>"; + /* relocations against static functions are recorded as + * relocations against the section that contains a function; + * in such case, symbol will be STT_SECTION and sym.st_name + * will point to empty string (0), so fetch section name + * instead + */ + if (GELF_ST_TYPE(sym.st_info) == STT_SECTION && sym.st_name == 0) + sym_name = elf_sec_name(obj, elf_sec_by_idx(obj, sym.st_shndx)); + else + sym_name = elf_sym_str(obj, sym.st_name); + sym_name = sym_name ?: "<?"; pr_debug("sec '%s': relo #%d: insn #%u against '%s'\n", relo_sec_name, i, insn_idx, sym_name); - err = bpf_program__record_reloc(prog, &prog->reloc_desc[i], + prog = find_prog_by_sec_insn(obj, sec_idx, insn_idx); + if (!prog) { + pr_warn("sec '%s': relo #%d: program not found in section '%s' for insn #%u\n", + relo_sec_name, i, sec_name, insn_idx); + return -LIBBPF_ERRNO__RELOC; + } + + relos = libbpf_reallocarray(prog->reloc_desc, + prog->nr_reloc + 1, sizeof(*relos)); + if (!relos) + return -ENOMEM; + prog->reloc_desc = relos; + + /* adjust insn_idx to local BPF program frame of reference */ + insn_idx -= prog->sec_insn_off; + err = bpf_program__record_reloc(prog, &relos[prog->nr_reloc], insn_idx, sym_name, &sym, &rel); if (err) return err; + + prog->nr_reloc++; } return 0; } @@ -3758,6 +3900,52 @@ 
static int probe_kern_probe_read_kernel(void) return probe_fd(bpf_load_program_xattr(&attr, NULL, 0)); } +static int probe_prog_bind_map(void) +{ + struct bpf_load_program_attr prg_attr; + struct bpf_create_map_attr map_attr; + char *cp, errmsg[STRERR_BUFSIZE]; + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int ret, map, prog; + + memset(&map_attr, 0, sizeof(map_attr)); + map_attr.map_type = BPF_MAP_TYPE_ARRAY; + map_attr.key_size = sizeof(int); + map_attr.value_size = 32; + map_attr.max_entries = 1; + + map = bpf_create_map_xattr(&map_attr); + if (map < 0) { + ret = -errno; + cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg)); + pr_warn("Error in %s():%s(%d). Couldn't create simple array map.\n", + __func__, cp, -ret); + return ret; + } + + memset(&prg_attr, 0, sizeof(prg_attr)); + prg_attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER; + prg_attr.insns = insns; + prg_attr.insns_cnt = ARRAY_SIZE(insns); + prg_attr.license = "GPL"; + + prog = bpf_load_program_xattr(&prg_attr, NULL, 0); + if (prog < 0) { + close(map); + return 0; + } + + ret = bpf_prog_bind_map(prog, map, NULL); + + close(map); + close(prog); + + return ret >= 0; +} + enum kern_feature_result { FEAT_UNKNOWN = 0, FEAT_SUPPORTED = 1, @@ -3798,6 +3986,9 @@ static struct kern_feature_desc { }, [FEAT_PROBE_READ_KERN] = { "bpf_probe_read_kernel() helper", probe_kern_probe_read_kernel, + }, + [FEAT_PROG_BIND_MAP] = { + "BPF_PROG_BIND_MAP support", probe_prog_bind_map, } }; @@ -4089,75 +4280,6 @@ err_out: return err; } -static int -check_btf_ext_reloc_err(struct bpf_program *prog, int err, - void *btf_prog_info, const char *info_name) -{ - if (err != -ENOENT) { - pr_warn("Error in loading %s for sec %s.\n", - info_name, prog->section_name); - return err; - } - - /* err == -ENOENT (i.e. prog->section_name not found in btf_ext) */ - - if (btf_prog_info) { - /* - * Some info has already been found but has problem - * in the last btf_ext reloc. Must have to error out. 
- */ - pr_warn("Error in relocating %s for sec %s.\n", - info_name, prog->section_name); - return err; - } - - /* Have problem loading the very first info. Ignore the rest. */ - pr_warn("Cannot find %s for main program sec %s. Ignore all %s.\n", - info_name, prog->section_name, info_name); - return 0; -} - -static int -bpf_program_reloc_btf_ext(struct bpf_program *prog, struct bpf_object *obj, - const char *section_name, __u32 insn_offset) -{ - int err; - - if (!insn_offset || prog->func_info) { - /* - * !insn_offset => main program - * - * For sub prog, the main program's func_info has to - * be loaded first (i.e. prog->func_info != NULL) - */ - err = btf_ext__reloc_func_info(obj->btf, obj->btf_ext, - section_name, insn_offset, - &prog->func_info, - &prog->func_info_cnt); - if (err) - return check_btf_ext_reloc_err(prog, err, - prog->func_info, - "bpf_func_info"); - - prog->func_info_rec_size = btf_ext__func_info_rec_size(obj->btf_ext); - } - - if (!insn_offset || prog->line_info) { - err = btf_ext__reloc_line_info(obj->btf, obj->btf_ext, - section_name, insn_offset, - &prog->line_info, - &prog->line_info_cnt); - if (err) - return check_btf_ext_reloc_err(prog, err, - prog->line_info, - "bpf_line_info"); - - prog->line_info_rec_size = btf_ext__line_info_rec_size(obj->btf_ext); - } - - return 0; -} - #define BPF_CORE_SPEC_MAX_LEN 64 /* represents BPF CO-RE field or array element accessor */ @@ -4927,8 +5049,7 @@ static int bpf_core_calc_field_relo(const struct bpf_program *prog, *val = sz; } else { pr_warn("prog '%s': relo %d at insn #%d can't be applied to array access\n", - bpf_program__title(prog, false), - relo->kind, relo->insn_off / 8); + prog->name, relo->kind, relo->insn_off / 8); return -EINVAL; } if (validate) @@ -4950,8 +5071,7 @@ static int bpf_core_calc_field_relo(const struct bpf_program *prog, if (byte_sz >= 8) { /* bitfield can't be read with 64-bit read */ pr_warn("prog '%s': relo %d at insn #%d can't be satisfied for bitfield\n", - 
bpf_program__title(prog, false), - relo->kind, relo->insn_off / 8); + prog->name, relo->kind, relo->insn_off / 8); return -E2BIG; } byte_sz *= 2; @@ -5116,8 +5236,8 @@ static int bpf_core_calc_relo(const struct bpf_program *prog, } else if (err == -EOPNOTSUPP) { /* EOPNOTSUPP means unknown/unsupported relocation */ pr_warn("prog '%s': relo #%d: unrecognized CO-RE relocation %s (%d) at insn #%d\n", - bpf_program__title(prog, false), relo_idx, - core_relo_kind_str(relo->kind), relo->kind, relo->insn_off / 8); + prog->name, relo_idx, core_relo_kind_str(relo->kind), + relo->kind, relo->insn_off / 8); } return err; @@ -5131,7 +5251,7 @@ static void bpf_core_poison_insn(struct bpf_program *prog, int relo_idx, int insn_idx, struct bpf_insn *insn) { pr_debug("prog '%s': relo #%d: substituting insn #%d w/ invalid insn\n", - bpf_program__title(prog, false), relo_idx, insn_idx); + prog->name, relo_idx, insn_idx); insn->code = BPF_JMP | BPF_CALL; insn->dst_reg = 0; insn->src_reg = 0; @@ -5175,6 +5295,11 @@ static int bpf_core_patch_insn(struct bpf_program *prog, if (relo->insn_off % BPF_INSN_SZ) return -EINVAL; insn_idx = relo->insn_off / BPF_INSN_SZ; + /* adjust insn_idx from section frame of reference to the local + * program's frame of reference; (sub-)program code is not yet + * relocated, so it's enough to just subtract in-section offset + */ + insn_idx = insn_idx - prog->sec_insn_off; insn = &prog->insns[insn_idx]; class = BPF_CLASS(insn->code); @@ -5198,14 +5323,14 @@ static int bpf_core_patch_insn(struct bpf_program *prog, return -EINVAL; if (res->validate && insn->imm != orig_val) { pr_warn("prog '%s': relo #%d: unexpected insn #%d (ALU/ALU64) value: got %u, exp %u -> %u\n", - bpf_program__title(prog, false), relo_idx, + prog->name, relo_idx, insn_idx, insn->imm, orig_val, new_val); return -EINVAL; } orig_val = insn->imm; insn->imm = new_val; pr_debug("prog '%s': relo #%d: patched insn #%d (ALU/ALU64) imm %u -> %u\n", - bpf_program__title(prog, false), relo_idx, 
insn_idx, + prog->name, relo_idx, insn_idx, orig_val, new_val); break; case BPF_LDX: @@ -5213,21 +5338,18 @@ static int bpf_core_patch_insn(struct bpf_program *prog, case BPF_STX: if (res->validate && insn->off != orig_val) { pr_warn("prog '%s': relo #%d: unexpected insn #%d (LDX/ST/STX) value: got %u, exp %u -> %u\n", - bpf_program__title(prog, false), relo_idx, - insn_idx, insn->off, orig_val, new_val); + prog->name, relo_idx, insn_idx, insn->off, orig_val, new_val); return -EINVAL; } if (new_val > SHRT_MAX) { pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) value too big: %u\n", - bpf_program__title(prog, false), relo_idx, - insn_idx, new_val); + prog->name, relo_idx, insn_idx, new_val); return -ERANGE; } orig_val = insn->off; insn->off = new_val; pr_debug("prog '%s': relo #%d: patched insn #%d (LDX/ST/STX) off %u -> %u\n", - bpf_program__title(prog, false), relo_idx, insn_idx, - orig_val, new_val); + prog->name, relo_idx, insn_idx, orig_val, new_val); break; case BPF_LD: { __u64 imm; @@ -5238,14 +5360,14 @@ static int bpf_core_patch_insn(struct bpf_program *prog, insn[1].code != 0 || insn[1].dst_reg != 0 || insn[1].src_reg != 0 || insn[1].off != 0) { pr_warn("prog '%s': relo #%d: insn #%d (LDIMM64) has unexpected form\n", - bpf_program__title(prog, false), relo_idx, insn_idx); + prog->name, relo_idx, insn_idx); return -EINVAL; } imm = insn[0].imm + ((__u64)insn[1].imm << 32); if (res->validate && imm != orig_val) { pr_warn("prog '%s': relo #%d: unexpected insn #%d (LDIMM64) value: got %llu, exp %u -> %u\n", - bpf_program__title(prog, false), relo_idx, + prog->name, relo_idx, insn_idx, (unsigned long long)imm, orig_val, new_val); return -EINVAL; @@ -5254,15 +5376,14 @@ static int bpf_core_patch_insn(struct bpf_program *prog, insn[0].imm = new_val; insn[1].imm = 0; /* currently only 32-bit values are supported */ pr_debug("prog '%s': relo #%d: patched insn #%d (LDIMM64) imm64 %llu -> %u\n", - bpf_program__title(prog, false), relo_idx, insn_idx, + prog->name, 
relo_idx, insn_idx, (unsigned long long)imm, new_val); break; } default: pr_warn("prog '%s': relo #%d: trying to relocate unrecognized insn #%d, code:0x%x, src:0x%x, dst:0x%x, off:0x%x, imm:0x%x\n", - bpf_program__title(prog, false), relo_idx, - insn_idx, insn->code, insn->src_reg, insn->dst_reg, - insn->off, insn->imm); + prog->name, relo_idx, insn_idx, insn->code, + insn->src_reg, insn->dst_reg, insn->off, insn->imm); return -EINVAL; } @@ -5392,7 +5513,6 @@ static int bpf_core_apply_relo(struct bpf_program *prog, const struct btf *targ_btf, struct hashmap *cand_cache) { - const char *prog_name = bpf_program__title(prog, false); struct bpf_core_spec local_spec, cand_spec, targ_spec = {}; const void *type_key = u32_as_hash_key(relo->type_id); struct bpf_core_relo_res cand_res, targ_res; @@ -5419,13 +5539,13 @@ static int bpf_core_apply_relo(struct bpf_program *prog, err = bpf_core_parse_spec(local_btf, local_id, spec_str, relo->kind, &local_spec); if (err) { pr_warn("prog '%s': relo #%d: parsing [%d] %s %s + %s failed: %d\n", - prog_name, relo_idx, local_id, btf_kind_str(local_type), + prog->name, relo_idx, local_id, btf_kind_str(local_type), str_is_empty(local_name) ? 
"<anon>" : local_name, spec_str, err); return -EINVAL; } - pr_debug("prog '%s': relo #%d: kind <%s> (%d), spec is ", prog_name, + pr_debug("prog '%s': relo #%d: kind <%s> (%d), spec is ", prog->name, relo_idx, core_relo_kind_str(relo->kind), relo->kind); bpf_core_dump_spec(LIBBPF_DEBUG, &local_spec); libbpf_print(LIBBPF_DEBUG, "\n"); @@ -5442,7 +5562,7 @@ static int bpf_core_apply_relo(struct bpf_program *prog, /* libbpf doesn't support candidate search for anonymous types */ if (str_is_empty(spec_str)) { pr_warn("prog '%s': relo #%d: <%s> (%d) relocation doesn't support anonymous types\n", - prog_name, relo_idx, core_relo_kind_str(relo->kind), relo->kind); + prog->name, relo_idx, core_relo_kind_str(relo->kind), relo->kind); return -EOPNOTSUPP; } @@ -5450,7 +5570,7 @@ static int bpf_core_apply_relo(struct bpf_program *prog, cand_ids = bpf_core_find_cands(local_btf, local_id, targ_btf); if (IS_ERR(cand_ids)) { pr_warn("prog '%s': relo #%d: target candidate search failed for [%d] %s %s: %ld", - prog_name, relo_idx, local_id, btf_kind_str(local_type), + prog->name, relo_idx, local_id, btf_kind_str(local_type), local_name, PTR_ERR(cand_ids)); return PTR_ERR(cand_ids); } @@ -5466,13 +5586,13 @@ static int bpf_core_apply_relo(struct bpf_program *prog, err = bpf_core_spec_match(&local_spec, targ_btf, cand_id, &cand_spec); if (err < 0) { pr_warn("prog '%s': relo #%d: error matching candidate #%d ", - prog_name, relo_idx, i); + prog->name, relo_idx, i); bpf_core_dump_spec(LIBBPF_WARN, &cand_spec); libbpf_print(LIBBPF_WARN, ": %d\n", err); return err; } - pr_debug("prog '%s': relo #%d: %s candidate #%d ", prog_name, + pr_debug("prog '%s': relo #%d: %s candidate #%d ", prog->name, relo_idx, err == 0 ? 
"non-matching" : "matching", i); bpf_core_dump_spec(LIBBPF_DEBUG, &cand_spec); libbpf_print(LIBBPF_DEBUG, "\n"); @@ -5492,7 +5612,7 @@ static int bpf_core_apply_relo(struct bpf_program *prog, * should all resolve to the same bit offset */ pr_warn("prog '%s': relo #%d: field offset ambiguity: %u != %u\n", - prog_name, relo_idx, cand_spec.bit_offset, + prog->name, relo_idx, cand_spec.bit_offset, targ_spec.bit_offset); return -EINVAL; } else if (cand_res.poison != targ_res.poison || cand_res.new_val != targ_res.new_val) { @@ -5501,7 +5621,7 @@ static int bpf_core_apply_relo(struct bpf_program *prog, * proceed due to ambiguity */ pr_warn("prog '%s': relo #%d: relocation decision ambiguity: %s %u != %s %u\n", - prog_name, relo_idx, + prog->name, relo_idx, cand_res.poison ? "failure" : "success", cand_res.new_val, targ_res.poison ? "failure" : "success", targ_res.new_val); return -EINVAL; @@ -5534,7 +5654,7 @@ static int bpf_core_apply_relo(struct bpf_program *prog, */ if (j == 0) { pr_debug("prog '%s': relo #%d: no matching targets found\n", - prog_name, relo_idx); + prog->name, relo_idx); /* calculate single target relo result explicitly */ err = bpf_core_calc_relo(prog, relo, relo_idx, &local_spec, NULL, &targ_res); @@ -5547,7 +5667,7 @@ patch_insn: err = bpf_core_patch_insn(prog, relo, relo_idx, &targ_res); if (err) { pr_warn("prog '%s': relo #%d: failed to patch insn at offset %d: %d\n", - prog_name, relo_idx, relo->insn_off, err); + prog->name, relo_idx, relo->insn_off, err); return -EINVAL; } @@ -5565,7 +5685,7 @@ bpf_object__relocate_core(struct bpf_object *obj, const char *targ_btf_path) struct bpf_program *prog; struct btf *targ_btf; const char *sec_name; - int i, err = 0; + int i, err = 0, insn_idx, sec_idx; if (obj->btf_ext->core_relo_info.len == 0) return 0; @@ -5592,24 +5712,37 @@ bpf_object__relocate_core(struct bpf_object *obj, const char *targ_btf_path) err = -EINVAL; goto out; } + /* bpf_object's ELF is gone by now so it's not easy to find + * section 
index by section name, but we can find *any* + * bpf_program within desired section name and use its + * prog->sec_idx to do a proper search by section index and + * instruction offset + */ prog = NULL; for (i = 0; i < obj->nr_programs; i++) { - if (!strcmp(obj->programs[i].section_name, sec_name)) { - prog = &obj->programs[i]; + prog = &obj->programs[i]; + if (strcmp(prog->sec_name, sec_name) == 0) break; - } } if (!prog) { - pr_warn("failed to find program '%s' for CO-RE offset relocation\n", - sec_name); - err = -EINVAL; - goto out; + pr_warn("sec '%s': failed to find a BPF program\n", sec_name); + return -ENOENT; } + sec_idx = prog->sec_idx; pr_debug("sec '%s': found %d CO-RE relocations\n", sec_name, sec->num_info); for_each_btf_ext_rec(seg, sec, i, rec) { + insn_idx = rec->insn_off / BPF_INSN_SZ; + prog = find_prog_by_sec_insn(obj, sec_idx, insn_idx); + if (!prog) { + pr_warn("sec '%s': failed to find program at insn #%d for CO-RE offset relocation #%d\n", + sec_name, insn_idx, i); + err = -EINVAL; + goto out; + } + err = bpf_core_apply_relo(prog, rec, i, obj->btf, targ_btf, cand_cache); if (err) { @@ -5633,89 +5766,32 @@ out: return err; } +/* Relocate data references within program code: + * - map references; + * - global variable references; + * - extern references. 
+ */ static int -bpf_program__reloc_text(struct bpf_program *prog, struct bpf_object *obj, - struct reloc_desc *relo) -{ - struct bpf_insn *insn, *new_insn; - struct bpf_program *text; - size_t new_cnt; - int err; - - if (prog->idx != obj->efile.text_shndx && prog->main_prog_cnt == 0) { - text = bpf_object__find_prog_by_idx(obj, obj->efile.text_shndx); - if (!text) { - pr_warn("no .text section found yet relo into text exist\n"); - return -LIBBPF_ERRNO__RELOC; - } - new_cnt = prog->insns_cnt + text->insns_cnt; - new_insn = libbpf_reallocarray(prog->insns, new_cnt, sizeof(*insn)); - if (!new_insn) { - pr_warn("oom in prog realloc\n"); - return -ENOMEM; - } - prog->insns = new_insn; - - if (obj->btf_ext) { - err = bpf_program_reloc_btf_ext(prog, obj, - text->section_name, - prog->insns_cnt); - if (err) - return err; - } - - memcpy(new_insn + prog->insns_cnt, text->insns, - text->insns_cnt * sizeof(*insn)); - prog->main_prog_cnt = prog->insns_cnt; - prog->insns_cnt = new_cnt; - pr_debug("added %zd insn from %s to prog %s\n", - text->insns_cnt, text->section_name, - prog->section_name); - } - - insn = &prog->insns[relo->insn_idx]; - insn->imm += relo->sym_off / 8 + prog->main_prog_cnt - relo->insn_idx; - return 0; -} - -static int -bpf_program__relocate(struct bpf_program *prog, struct bpf_object *obj) +bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog) { - int i, err; - - if (!prog) - return 0; - - if (obj->btf_ext) { - err = bpf_program_reloc_btf_ext(prog, obj, - prog->section_name, 0); - if (err) - return err; - } - - if (!prog->reloc_desc) - return 0; + int i; for (i = 0; i < prog->nr_reloc; i++) { struct reloc_desc *relo = &prog->reloc_desc[i]; struct bpf_insn *insn = &prog->insns[relo->insn_idx]; struct extern_desc *ext; - if (relo->insn_idx + 1 >= (int)prog->insns_cnt) { - pr_warn("relocation out of range: '%s'\n", - prog->section_name); - return -LIBBPF_ERRNO__RELOC; - } - switch (relo->type) { case RELO_LD64: insn[0].src_reg = 
BPF_PSEUDO_MAP_FD; insn[0].imm = obj->maps[relo->map_idx].fd; + relo->processed = true; break; case RELO_DATA: insn[0].src_reg = BPF_PSEUDO_MAP_VALUE; insn[1].imm = insn[0].imm + relo->sym_off; insn[0].imm = obj->maps[relo->map_idx].fd; + relo->processed = true; break; case RELO_EXTERN: ext = &obj->externs[relo->sym_off]; @@ -5727,11 +5803,10 @@ bpf_program__relocate(struct bpf_program *prog, struct bpf_object *obj) insn[0].imm = (__u32)ext->ksym.addr; insn[1].imm = ext->ksym.addr >> 32; } + relo->processed = true; break; case RELO_CALL: - err = bpf_program__reloc_text(prog, obj, relo); - if (err) - return err; + /* will be handled as a follow up pass */ break; default: pr_warn("prog '%s': relo #%d: bad relo type %d\n", @@ -5740,8 +5815,378 @@ bpf_program__relocate(struct bpf_program *prog, struct bpf_object *obj) } } - zfree(&prog->reloc_desc); - prog->nr_reloc = 0; + return 0; +} + +static int adjust_prog_btf_ext_info(const struct bpf_object *obj, + const struct bpf_program *prog, + const struct btf_ext_info *ext_info, + void **prog_info, __u32 *prog_rec_cnt, + __u32 *prog_rec_sz) +{ + void *copy_start = NULL, *copy_end = NULL; + void *rec, *rec_end, *new_prog_info; + const struct btf_ext_info_sec *sec; + size_t old_sz, new_sz; + const char *sec_name; + int i, off_adj; + + for_each_btf_ext_sec(ext_info, sec) { + sec_name = btf__name_by_offset(obj->btf, sec->sec_name_off); + if (!sec_name) + return -EINVAL; + if (strcmp(sec_name, prog->sec_name) != 0) + continue; + + for_each_btf_ext_rec(ext_info, sec, i, rec) { + __u32 insn_off = *(__u32 *)rec / BPF_INSN_SZ; + + if (insn_off < prog->sec_insn_off) + continue; + if (insn_off >= prog->sec_insn_off + prog->sec_insn_cnt) + break; + + if (!copy_start) + copy_start = rec; + copy_end = rec + ext_info->rec_size; + } + + if (!copy_start) + return -ENOENT; + + /* append func/line info of a given (sub-)program to the main + * program func/line info + */ + old_sz = (size_t)(*prog_rec_cnt) * ext_info->rec_size; + new_sz = 
old_sz + (copy_end - copy_start); + new_prog_info = realloc(*prog_info, new_sz); + if (!new_prog_info) + return -ENOMEM; + *prog_info = new_prog_info; + *prog_rec_cnt = new_sz / ext_info->rec_size; + memcpy(new_prog_info + old_sz, copy_start, copy_end - copy_start); + + /* Kernel instruction offsets are in units of 8-byte + * instructions, while .BTF.ext instruction offsets generated + * by Clang are in units of bytes. So convert Clang offsets + * into kernel offsets and adjust offset according to program + * relocated position. + */ + off_adj = prog->sub_insn_off - prog->sec_insn_off; + rec = new_prog_info + old_sz; + rec_end = new_prog_info + new_sz; + for (; rec < rec_end; rec += ext_info->rec_size) { + __u32 *insn_off = rec; + + *insn_off = *insn_off / BPF_INSN_SZ + off_adj; + } + *prog_rec_sz = ext_info->rec_size; + return 0; + } + + return -ENOENT; +} + +static int +reloc_prog_func_and_line_info(const struct bpf_object *obj, + struct bpf_program *main_prog, + const struct bpf_program *prog) +{ + int err; + + /* no .BTF.ext relocation if .BTF.ext is missing or kernel doesn't + * support func/line info + */ + if (!obj->btf_ext || !kernel_supports(FEAT_BTF_FUNC)) + return 0; + + /* only attempt func info relocation if main program's func_info + * relocation was successful + */ + if (main_prog != prog && !main_prog->func_info) + goto line_info; + + err = adjust_prog_btf_ext_info(obj, prog, &obj->btf_ext->func_info, + &main_prog->func_info, + &main_prog->func_info_cnt, + &main_prog->func_info_rec_size); + if (err) { + if (err != -ENOENT) { + pr_warn("prog '%s': error relocating .BTF.ext function info: %d\n", + prog->name, err); + return err; + } + if (main_prog->func_info) { + /* + * Some info has already been found but has problem + * in the last btf_ext reloc. Must have to error out. + */ + pr_warn("prog '%s': missing .BTF.ext function info.\n", prog->name); + return err; + } + /* Have problem loading the very first info. Ignore the rest. 
*/ + pr_warn("prog '%s': missing .BTF.ext function info for the main program, skipping all of .BTF.ext func info.\n", + prog->name); + } + +line_info: + /* don't relocate line info if main program's relocation failed */ + if (main_prog != prog && !main_prog->line_info) + return 0; + + err = adjust_prog_btf_ext_info(obj, prog, &obj->btf_ext->line_info, + &main_prog->line_info, + &main_prog->line_info_cnt, + &main_prog->line_info_rec_size); + if (err) { + if (err != -ENOENT) { + pr_warn("prog '%s': error relocating .BTF.ext line info: %d\n", + prog->name, err); + return err; + } + if (main_prog->line_info) { + /* + * Some info has already been found but has problem + * in the last btf_ext reloc. Must have to error out. + */ + pr_warn("prog '%s': missing .BTF.ext line info.\n", prog->name); + return err; + } + /* Have problem loading the very first info. Ignore the rest. */ + pr_warn("prog '%s': missing .BTF.ext line info for the main program, skipping all of .BTF.ext line info.\n", + prog->name); + } + return 0; +} + +static int cmp_relo_by_insn_idx(const void *key, const void *elem) +{ + size_t insn_idx = *(const size_t *)key; + const struct reloc_desc *relo = elem; + + if (insn_idx == relo->insn_idx) + return 0; + return insn_idx < relo->insn_idx ? 
-1 : 1; +} + +static struct reloc_desc *find_prog_insn_relo(const struct bpf_program *prog, size_t insn_idx) +{ + return bsearch(&insn_idx, prog->reloc_desc, prog->nr_reloc, + sizeof(*prog->reloc_desc), cmp_relo_by_insn_idx); +} + +static int +bpf_object__reloc_code(struct bpf_object *obj, struct bpf_program *main_prog, + struct bpf_program *prog) +{ + size_t sub_insn_idx, insn_idx, new_cnt; + struct bpf_program *subprog; + struct bpf_insn *insns, *insn; + struct reloc_desc *relo; + int err; + + err = reloc_prog_func_and_line_info(obj, main_prog, prog); + if (err) + return err; + + for (insn_idx = 0; insn_idx < prog->sec_insn_cnt; insn_idx++) { + insn = &main_prog->insns[prog->sub_insn_off + insn_idx]; + if (!insn_is_subprog_call(insn)) + continue; + + relo = find_prog_insn_relo(prog, insn_idx); + if (relo && relo->type != RELO_CALL) { + pr_warn("prog '%s': unexpected relo for insn #%zu, type %d\n", + prog->name, insn_idx, relo->type); + return -LIBBPF_ERRNO__RELOC; + } + if (relo) { + /* sub-program instruction index is a combination of + * an offset of a symbol pointed to by relocation and + * call instruction's imm field; for global functions, + * call always has imm = -1, but for static functions + * relocation is against STT_SECTION and insn->imm + * points to a start of a static function + */ + sub_insn_idx = relo->sym_off / BPF_INSN_SZ + insn->imm + 1; + } else { + /* if subprogram call is to a static function within + * the same ELF section, there won't be any relocation + * emitted, but it also means there is no additional + * offset necessary, insns->imm is relative to + * instruction's original position within the section + */ + sub_insn_idx = prog->sec_insn_off + insn_idx + insn->imm + 1; + } + + /* we enforce that sub-programs should be in .text section */ + subprog = find_prog_by_sec_insn(obj, obj->efile.text_shndx, sub_insn_idx); + if (!subprog) { + pr_warn("prog '%s': no .text section found yet sub-program call exists\n", + prog->name); + return 
-LIBBPF_ERRNO__RELOC; + } + + /* if it's the first call instruction calling into this + * subprogram (meaning this subprog hasn't been processed + * yet) within the context of current main program: + * - append it at the end of main program's instructions blob; + * - process it recursively, while current program is put on hold; + * - if that subprogram calls some other not yet processed + * subprogram, same thing will happen recursively until + * there are no more unprocessed subprograms left to append + * and relocate. + */ + if (subprog->sub_insn_off == 0) { + subprog->sub_insn_off = main_prog->insns_cnt; + + new_cnt = main_prog->insns_cnt + subprog->insns_cnt; + insns = libbpf_reallocarray(main_prog->insns, new_cnt, sizeof(*insns)); + if (!insns) { + pr_warn("prog '%s': failed to realloc prog code\n", main_prog->name); + return -ENOMEM; + } + main_prog->insns = insns; + main_prog->insns_cnt = new_cnt; + + memcpy(main_prog->insns + subprog->sub_insn_off, subprog->insns, + subprog->insns_cnt * sizeof(*insns)); + + pr_debug("prog '%s': added %zu insns from sub-prog '%s'\n", + main_prog->name, subprog->insns_cnt, subprog->name); + + err = bpf_object__reloc_code(obj, main_prog, subprog); + if (err) + return err; + } + + /* main_prog->insns memory could have been re-allocated, so + * calculate pointer again + */ + insn = &main_prog->insns[prog->sub_insn_off + insn_idx]; + /* calculate correct instruction position within current main + * prog; each main prog can have a different set of + * subprograms appended (potentially in different order as + * well), so position of any subprog can be different for + * different main programs */ + insn->imm = subprog->sub_insn_off - (prog->sub_insn_off + insn_idx) - 1; + + if (relo) + relo->processed = true; + + pr_debug("prog '%s': insn #%zu relocated, imm %d points to subprog '%s' (now at %zu offset)\n", + prog->name, insn_idx, insn->imm, subprog->name, subprog->sub_insn_off); + } + + return 0; +} + +/* + * Relocate sub-program 
calls. + * + * Algorithm operates as follows. Each entry-point BPF program (referred to as + * main prog) is processed separately. Each subprog (a non-entry function + * that can be called from either entry progs or other subprogs) gets its + * sub_insn_off reset to zero. This serves as an indicator that this subprogram + * hasn't yet been appended and relocated within current main prog. Once it's + * relocated, sub_insn_off will point at the position within current main prog + * where given subprog was appended. This will further be used to relocate all + * the call instructions jumping into this subprog. + * + * We start with main program and process all call instructions. If the call + * is into a subprog that hasn't been processed (i.e., subprog->sub_insn_off + * is zero), subprog instructions are appended at the end of main program's + * instruction array. Then main program is "put on hold" while we recursively + * process newly appended subprogram. If that subprogram calls into another + * subprogram that hasn't been appended, new subprogram is appended again to + * the *main* prog's instructions (subprog's instructions are always left + * untouched, as they need to be in unmodified state for subsequent main progs + * and subprog instructions are always sent only as part of a main prog) and + * the process continues recursively. Once all the subprogs called from a main + * prog or any of its subprogs are appended (and relocated), all their + * positions within finalized instructions array are known, so it's easy to + * rewrite call instructions with correct relative offsets, corresponding to + * desired target subprog. + * + * It's important to realize that some subprogs might not be called from some + * main prog and any of its called/used subprogs. Those will keep their + * subprog->sub_insn_off as zero at all times and won't be appended to current + * main prog and won't be relocated within the context of current main prog. 
+ * They might still be used from other main progs later. + * + * Visually this process can be shown as below. Suppose we have two main + * programs mainA and mainB and BPF object contains three subprogs: subA, + * subB, and subC. mainA calls only subA, mainB calls only subC, but subA and + * subC both call subB: + * + * +--------+ +-------+ + * | v v | + * +--+---+ +--+-+-+ +---+--+ + * | subA | | subB | | subC | + * +--+---+ +------+ +---+--+ + * ^ ^ + * | | + * +---+-------+ +------+----+ + * | mainA | | mainB | + * +-----------+ +-----------+ + * + * We'll start relocating mainA, will find subA, append it and start + * processing subA recursively: + * + * +-----------+------+ + * | mainA | subA | + * +-----------+------+ + * + * At this point we notice that subB is used from subA, so we append it and + * relocate (there are no further subcalls from subB): + * + * +-----------+------+------+ + * | mainA | subA | subB | + * +-----------+------+------+ + * + * At this point, we relocate subA calls, then go one level up and finish with + * relocating mainA calls. mainA is done. + * + * For mainB the process is similar but results in a different order. We start with + * mainB and skip subA and subB, as mainB never calls them (at least + * directly), but we see subC is needed, so we append and start processing it: + * + * +-----------+------+ + * | mainB | subC | + * +-----------+------+ + * Now we see subC needs subB, so we go back to it, append and relocate it: + * + * +-----------+------+------+ + * | mainB | subC | subB | + * +-----------+------+------+ + * + * At this point we unwind recursion, relocate calls in subC, then in mainB. 
+ */ +static int +bpf_object__relocate_calls(struct bpf_object *obj, struct bpf_program *prog) +{ + struct bpf_program *subprog; + int i, j, err; + + /* mark all subprogs as not relocated (yet) within the context of + * current main program + */ + for (i = 0; i < obj->nr_programs; i++) { + subprog = &obj->programs[i]; + if (!prog_is_subprog(obj, subprog)) + continue; + + subprog->sub_insn_off = 0; + for (j = 0; j < subprog->nr_reloc; j++) + if (subprog->reloc_desc[j].type == RELO_CALL) + subprog->reloc_desc[j].processed = false; + } + + err = bpf_object__reloc_code(obj, prog, prog); + if (err) + return err; + + return 0; } @@ -5760,37 +6205,45 @@ bpf_object__relocate(struct bpf_object *obj, const char *targ_btf_path) return err; } } - /* ensure .text is relocated first, as it's going to be copied as-is - * later for sub-program calls + /* relocate data references first for all programs and sub-programs, + * as they don't change relative to code locations, so subsequent + * subprogram processing won't need to re-calculate any of them */ for (i = 0; i < obj->nr_programs; i++) { prog = &obj->programs[i]; - if (prog->idx != obj->efile.text_shndx) - continue; - - err = bpf_program__relocate(prog, obj); + err = bpf_object__relocate_data(obj, prog); if (err) { pr_warn("prog '%s': failed to relocate data references: %d\n", prog->name, err); return err; } - break; } - /* now relocate everything but .text, which by now is relocated - * properly, so we can copy raw sub-program instructions as is safely + /* now relocate subprogram calls and append used subprograms to main + * programs; each copy of subprogram code needs to be relocated + * differently for each main program, because its code location might + * have changed */ for (i = 0; i < obj->nr_programs; i++) { prog = &obj->programs[i]; - if (prog->idx == obj->efile.text_shndx) + /* sub-program's sub-calls are relocated within the context of + * its main program only + */ + if (prog_is_subprog(obj, prog)) continue; - err 
= bpf_program__relocate(prog, obj); + err = bpf_object__relocate_calls(obj, prog); if (err) { pr_warn("prog '%s': failed to relocate calls: %d\n", prog->name, err); return err; } } + /* free up relocation descriptors */ + for (i = 0; i < obj->nr_programs; i++) { + prog = &obj->programs[i]; + zfree(&prog->reloc_desc); + prog->nr_reloc = 0; + } return 0; } @@ -5910,41 +6363,53 @@ static int bpf_object__collect_map_relos(struct bpf_object *obj, return 0; } -static int bpf_object__collect_reloc(struct bpf_object *obj) +static int cmp_relocs(const void *_a, const void *_b) { - int i, err; + const struct reloc_desc *a = _a; + const struct reloc_desc *b = _b; - if (!obj_elf_valid(obj)) { - pr_warn("Internal error: elf object is closed\n"); - return -LIBBPF_ERRNO__INTERNAL; - } + if (a->insn_idx != b->insn_idx) + return a->insn_idx < b->insn_idx ? -1 : 1; + + /* no two relocations should have the same insn_idx, but ... */ + if (a->type != b->type) + return a->type < b->type ? -1 : 1; + + return 0; +} + +static int bpf_object__collect_relos(struct bpf_object *obj) +{ + int i, err; for (i = 0; i < obj->efile.nr_reloc_sects; i++) { GElf_Shdr *shdr = &obj->efile.reloc_sects[i].shdr; Elf_Data *data = obj->efile.reloc_sects[i].data; int idx = shdr->sh_info; - struct bpf_program *prog; if (shdr->sh_type != SHT_REL) { pr_warn("internal error at %d\n", __LINE__); return -LIBBPF_ERRNO__INTERNAL; } - if (idx == obj->efile.st_ops_shndx) { + if (idx == obj->efile.st_ops_shndx) err = bpf_object__collect_st_ops_relos(obj, shdr, data); - } else if (idx == obj->efile.btf_maps_shndx) { + else if (idx == obj->efile.btf_maps_shndx) err = bpf_object__collect_map_relos(obj, shdr, data); - } else { - prog = bpf_object__find_prog_by_idx(obj, idx); - if (!prog) { - pr_warn("relocation failed: no prog in section(%d)\n", idx); - return -LIBBPF_ERRNO__RELOC; - } - err = bpf_program__collect_reloc(prog, shdr, data, obj); - } + else + err = bpf_object__collect_prog_relos(obj, shdr, data); if (err) 
return err; } + + for (i = 0; i < obj->nr_programs; i++) { + struct bpf_program *p = &obj->programs[i]; + + if (!p->nr_reloc) + continue; + + qsort(p->reloc_desc, p->nr_reloc, sizeof(*p->reloc_desc), cmp_relocs); + } return 0; } @@ -6058,6 +6523,20 @@ retry_load: if (ret >= 0) { if (log_buf && load_attr.log_level) pr_debug("verifier log:\n%s", log_buf); + + if (prog->obj->rodata_map_idx >= 0 && + kernel_supports(FEAT_PROG_BIND_MAP)) { + struct bpf_map *rodata_map = + &prog->obj->maps[prog->obj->rodata_map_idx]; + + if (bpf_prog_bind_map(ret, bpf_map__fd(rodata_map), NULL)) { + cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); + pr_warn("prog '%s': failed to bind .rodata map: %s\n", + prog->name, cp); + /* Don't fail hard if can't bind rodata. */ + } + } + *pfd = ret; ret = 0; goto out; @@ -6110,8 +6589,7 @@ int bpf_program__load(struct bpf_program *prog, char *license, __u32 kern_ver) int err = 0, fd, i, btf_id; if (prog->obj->loaded) { - pr_warn("prog '%s'('%s'): can't load after object was loaded\n", - prog->name, prog->section_name); + pr_warn("prog '%s': can't load after object was loaded\n", prog->name); return -EINVAL; } @@ -6127,7 +6605,7 @@ int bpf_program__load(struct bpf_program *prog, char *license, __u32 kern_ver) if (prog->instances.nr < 0 || !prog->instances.fds) { if (prog->preprocessor) { pr_warn("Internal error: can't load program '%s'\n", - prog->section_name); + prog->name); return -LIBBPF_ERRNO__INTERNAL; } @@ -6142,8 +6620,8 @@ int bpf_program__load(struct bpf_program *prog, char *license, __u32 kern_ver) if (!prog->preprocessor) { if (prog->instances.nr != 1) { - pr_warn("Program '%s' is inconsistent: nr(%d) != 1\n", - prog->section_name, prog->instances.nr); + pr_warn("prog '%s': inconsistent nr(%d) != 1\n", + prog->name, prog->instances.nr); } err = load_program(prog, prog->insns, prog->insns_cnt, license, kern_ver, &fd); @@ -6161,13 +6639,13 @@ int bpf_program__load(struct bpf_program *prog, char *license, __u32 kern_ver) 
prog->insns_cnt, &result); if (err) { pr_warn("Preprocessing the %dth instance of program '%s' failed\n", - i, prog->section_name); + i, prog->name); goto out; } if (!result.new_insn_ptr || !result.new_insn_cnt) { pr_debug("Skip loading the %dth instance of program '%s'\n", - i, prog->section_name); + i, prog->name); prog->instances.fds[i] = -1; if (result.pfd) *result.pfd = -1; @@ -6178,7 +6656,7 @@ int bpf_program__load(struct bpf_program *prog, char *license, __u32 kern_ver) result.new_insn_cnt, license, kern_ver, &fd); if (err) { pr_warn("Loading the %dth instance of program '%s' failed\n", - i, prog->section_name); + i, prog->name); goto out; } @@ -6188,18 +6666,12 @@ int bpf_program__load(struct bpf_program *prog, char *license, __u32 kern_ver) } out: if (err) - pr_warn("failed to load program '%s'\n", prog->section_name); + pr_warn("failed to load program '%s'\n", prog->name); zfree(&prog->insns); prog->insns_cnt = 0; return err; } -static bool bpf_program__is_function_storage(const struct bpf_program *prog, - const struct bpf_object *obj) -{ - return prog->idx == obj->efile.text_shndx && obj->has_pseudo_calls; -} - static int bpf_object__load_progs(struct bpf_object *obj, int log_level) { @@ -6216,7 +6688,7 @@ bpf_object__load_progs(struct bpf_object *obj, int log_level) for (i = 0; i < obj->nr_programs; i++) { prog = &obj->programs[i]; - if (bpf_program__is_function_storage(prog, obj)) + if (prog_is_subprog(obj, prog)) continue; if (!prog->load) { pr_debug("prog '%s': skipped loading\n", prog->name); @@ -6280,14 +6752,13 @@ __bpf_object__open(const char *path, const void *obj_buf, size_t obj_buf_sz, err = err ? : bpf_object__collect_externs(obj); err = err ? : bpf_object__finalize_btf(obj); err = err ? : bpf_object__init_maps(obj, opts); - err = err ? : bpf_object__init_prog_names(obj); - err = err ? : bpf_object__collect_reloc(obj); + err = err ? 
: bpf_object__collect_relos(obj); if (err) goto out; bpf_object__elf_finish(obj); bpf_object__for_each_program(prog, obj) { - prog->sec_def = find_sec_def(prog->section_name); + prog->sec_def = find_sec_def(prog->sec_name); if (!prog->sec_def) /* couldn't guess, but user might manually specify */ continue; @@ -6668,7 +7139,7 @@ int bpf_program__pin_instance(struct bpf_program *prog, const char *path, if (instance < 0 || instance >= prog->instances.nr) { pr_warn("invalid prog instance %d of prog %s (max %d)\n", - instance, prog->section_name, prog->instances.nr); + instance, prog->name, prog->instances.nr); return -EINVAL; } @@ -6699,7 +7170,7 @@ int bpf_program__unpin_instance(struct bpf_program *prog, const char *path, if (instance < 0 || instance >= prog->instances.nr) { pr_warn("invalid prog instance %d of prog %s (max %d)\n", - instance, prog->section_name, prog->instances.nr); + instance, prog->name, prog->instances.nr); return -EINVAL; } @@ -6729,8 +7200,7 @@ int bpf_program__pin(struct bpf_program *prog, const char *path) } if (prog->instances.nr <= 0) { - pr_warn("no instances of prog %s to pin\n", - prog->section_name); + pr_warn("no instances of prog %s to pin\n", prog->name); return -EINVAL; } @@ -6792,8 +7262,7 @@ int bpf_program__unpin(struct bpf_program *prog, const char *path) } if (prog->instances.nr <= 0) { - pr_warn("no instances of prog %s to pin\n", - prog->section_name); + pr_warn("no instances of prog %s to pin\n", prog->name); return -EINVAL; } @@ -7285,7 +7754,7 @@ bpf_program__next(struct bpf_program *prev, const struct bpf_object *obj) do { prog = __bpf_program__iter(prog, obj, true); - } while (prog && bpf_program__is_function_storage(prog, obj)); + } while (prog && prog_is_subprog(obj, prog)); return prog; } @@ -7297,7 +7766,7 @@ bpf_program__prev(struct bpf_program *next, const struct bpf_object *obj) do { prog = __bpf_program__iter(prog, obj, false); - } while (prog && bpf_program__is_function_storage(prog, obj)); + } while (prog && 
prog_is_subprog(obj, prog)); return prog; } @@ -7328,11 +7797,16 @@ const char *bpf_program__name(const struct bpf_program *prog) return prog->name; } +const char *bpf_program__section_name(const struct bpf_program *prog) +{ + return prog->sec_name; +} + const char *bpf_program__title(const struct bpf_program *prog, bool needs_copy) { const char *title; - title = prog->section_name; + title = prog->sec_name; if (needs_copy) { title = strdup(title); if (!title) { @@ -7405,14 +7879,14 @@ int bpf_program__nth_fd(const struct bpf_program *prog, int n) if (n >= prog->instances.nr || n < 0) { pr_warn("Can't get the %dth fd from program %s: only %d instances\n", - n, prog->section_name, prog->instances.nr); + n, prog->name, prog->instances.nr); return -EINVAL; } fd = prog->instances.fds[n]; if (fd < 0) { pr_warn("%dth instance of program '%s' is invalid\n", - n, prog->section_name); + n, prog->name); return -ENOENT; } @@ -7772,7 +8246,7 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, const struct btf *btf; struct bpf_map *map; Elf_Data *symbols; - unsigned int moff; + unsigned int moff, insn_idx; const char *name; __u32 member_idx; GElf_Sym sym; @@ -7817,6 +8291,12 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, map->name, (size_t)rel.r_offset, shdr_idx); return -LIBBPF_ERRNO__RELOC; } + if (sym.st_value % BPF_INSN_SZ) { + pr_warn("struct_ops reloc %s: invalid target program offset %llu\n", + map->name, (unsigned long long)sym.st_value); + return -LIBBPF_ERRNO__FORMAT; + } + insn_idx = sym.st_value / BPF_INSN_SZ; member = find_member_by_offset(st_ops->type, moff * 8); if (!member) { @@ -7833,7 +8313,7 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, return -EINVAL; } - prog = bpf_object__find_prog_by_idx(obj, shdr_idx); + prog = find_prog_by_sec_insn(obj, shdr_idx, insn_idx); if (!prog) { pr_warn("struct_ops reloc %s: cannot find prog at shdr_idx %u to relocate func ptr %s\n", map->name, shdr_idx, name); 
@@ -7843,7 +8323,7 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, if (prog->type == BPF_PROG_TYPE_UNSPEC) { const struct bpf_sec_def *sec_def; - sec_def = find_sec_def(prog->section_name); + sec_def = find_sec_def(prog->sec_name); if (sec_def && sec_def->prog_type != BPF_PROG_TYPE_STRUCT_OPS) { /* for pr_warn */ @@ -7866,7 +8346,7 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, invalid_prog: pr_warn("struct_ops reloc %s: cannot use prog %s in sec %s with type %u attach_btf_id %u expected_attach_type %u for func ptr %s\n", - map->name, prog->name, prog->section_name, prog->type, + map->name, prog->name, prog->sec_name, prog->type, prog->attach_btf_id, prog->expected_attach_type, name); return -EINVAL; } @@ -7970,7 +8450,7 @@ static int libbpf_find_attach_btf_id(struct bpf_program *prog) { enum bpf_attach_type attach_type = prog->expected_attach_type; __u32 attach_prog_fd = prog->attach_prog_fd; - const char *name = prog->section_name; + const char *name = prog->sec_name; int i, err; if (!name) @@ -8497,14 +8977,14 @@ struct bpf_link *bpf_program__attach_perf_event(struct bpf_program *prog, int prog_fd, err; if (pfd < 0) { - pr_warn("program '%s': invalid perf event FD %d\n", - bpf_program__title(prog, false), pfd); + pr_warn("prog '%s': invalid perf event FD %d\n", + prog->name, pfd); return ERR_PTR(-EINVAL); } prog_fd = bpf_program__fd(prog); if (prog_fd < 0) { - pr_warn("program '%s': can't attach BPF program w/o FD (did you load it?)\n", - bpf_program__title(prog, false)); + pr_warn("prog '%s': can't attach BPF program w/o FD (did you load it?)\n", + prog->name); return ERR_PTR(-EINVAL); } @@ -8517,20 +8997,18 @@ struct bpf_link *bpf_program__attach_perf_event(struct bpf_program *prog, if (ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd) < 0) { err = -errno; free(link); - pr_warn("program '%s': failed to attach to pfd %d: %s\n", - bpf_program__title(prog, false), pfd, - libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + 
pr_warn("prog '%s': failed to attach to pfd %d: %s\n", + prog->name, pfd, libbpf_strerror_r(err, errmsg, sizeof(errmsg))); if (err == -EPROTO) - pr_warn("program '%s': try add PERF_SAMPLE_CALLCHAIN to or remove exclude_callchain_[kernel|user] from pfd %d\n", - bpf_program__title(prog, false), pfd); + pr_warn("prog '%s': try add PERF_SAMPLE_CALLCHAIN to or remove exclude_callchain_[kernel|user] from pfd %d\n", + prog->name, pfd); return ERR_PTR(err); } if (ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0) < 0) { err = -errno; free(link); - pr_warn("program '%s': failed to enable pfd %d: %s\n", - bpf_program__title(prog, false), pfd, - libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + pr_warn("prog '%s': failed to enable pfd %d: %s\n", + prog->name, pfd, libbpf_strerror_r(err, errmsg, sizeof(errmsg))); return ERR_PTR(err); } return link; @@ -8652,9 +9130,8 @@ struct bpf_link *bpf_program__attach_kprobe(struct bpf_program *prog, pfd = perf_event_open_probe(false /* uprobe */, retprobe, func_name, 0 /* offset */, -1 /* pid */); if (pfd < 0) { - pr_warn("program '%s': failed to create %s '%s' perf event: %s\n", - bpf_program__title(prog, false), - retprobe ? "kretprobe" : "kprobe", func_name, + pr_warn("prog '%s': failed to create %s '%s' perf event: %s\n", + prog->name, retprobe ? "kretprobe" : "kprobe", func_name, libbpf_strerror_r(pfd, errmsg, sizeof(errmsg))); return ERR_PTR(pfd); } @@ -8662,9 +9139,8 @@ struct bpf_link *bpf_program__attach_kprobe(struct bpf_program *prog, if (IS_ERR(link)) { close(pfd); err = PTR_ERR(link); - pr_warn("program '%s': failed to attach to %s '%s': %s\n", - bpf_program__title(prog, false), - retprobe ? "kretprobe" : "kprobe", func_name, + pr_warn("prog '%s': failed to attach to %s '%s': %s\n", + prog->name, retprobe ? 
"kretprobe" : "kprobe", func_name, libbpf_strerror_r(err, errmsg, sizeof(errmsg))); return link; } @@ -8677,7 +9153,7 @@ static struct bpf_link *attach_kprobe(const struct bpf_sec_def *sec, const char *func_name; bool retprobe; - func_name = bpf_program__title(prog, false) + sec->len; + func_name = prog->sec_name + sec->len; retprobe = strcmp(sec->sec, "kretprobe/") == 0; return bpf_program__attach_kprobe(prog, retprobe, func_name); @@ -8695,9 +9171,8 @@ struct bpf_link *bpf_program__attach_uprobe(struct bpf_program *prog, pfd = perf_event_open_probe(true /* uprobe */, retprobe, binary_path, func_offset, pid); if (pfd < 0) { - pr_warn("program '%s': failed to create %s '%s:0x%zx' perf event: %s\n", - bpf_program__title(prog, false), - retprobe ? "uretprobe" : "uprobe", + pr_warn("prog '%s': failed to create %s '%s:0x%zx' perf event: %s\n", + prog->name, retprobe ? "uretprobe" : "uprobe", binary_path, func_offset, libbpf_strerror_r(pfd, errmsg, sizeof(errmsg))); return ERR_PTR(pfd); @@ -8706,9 +9181,8 @@ struct bpf_link *bpf_program__attach_uprobe(struct bpf_program *prog, if (IS_ERR(link)) { close(pfd); err = PTR_ERR(link); - pr_warn("program '%s': failed to attach to %s '%s:0x%zx': %s\n", - bpf_program__title(prog, false), - retprobe ? "uretprobe" : "uprobe", + pr_warn("prog '%s': failed to attach to %s '%s:0x%zx': %s\n", + prog->name, retprobe ? 
"uretprobe" : "uprobe", binary_path, func_offset, libbpf_strerror_r(err, errmsg, sizeof(errmsg))); return link; @@ -8776,9 +9250,8 @@ struct bpf_link *bpf_program__attach_tracepoint(struct bpf_program *prog, pfd = perf_event_open_tracepoint(tp_category, tp_name); if (pfd < 0) { - pr_warn("program '%s': failed to create tracepoint '%s/%s' perf event: %s\n", - bpf_program__title(prog, false), - tp_category, tp_name, + pr_warn("prog '%s': failed to create tracepoint '%s/%s' perf event: %s\n", + prog->name, tp_category, tp_name, libbpf_strerror_r(pfd, errmsg, sizeof(errmsg))); return ERR_PTR(pfd); } @@ -8786,9 +9259,8 @@ struct bpf_link *bpf_program__attach_tracepoint(struct bpf_program *prog, if (IS_ERR(link)) { close(pfd); err = PTR_ERR(link); - pr_warn("program '%s': failed to attach to tracepoint '%s/%s': %s\n", - bpf_program__title(prog, false), - tp_category, tp_name, + pr_warn("prog '%s': failed to attach to tracepoint '%s/%s': %s\n", + prog->name, tp_category, tp_name, libbpf_strerror_r(err, errmsg, sizeof(errmsg))); return link; } @@ -8801,7 +9273,7 @@ static struct bpf_link *attach_tp(const struct bpf_sec_def *sec, char *sec_name, *tp_cat, *tp_name; struct bpf_link *link; - sec_name = strdup(bpf_program__title(prog, false)); + sec_name = strdup(prog->sec_name); if (!sec_name) return ERR_PTR(-ENOMEM); @@ -8830,8 +9302,7 @@ struct bpf_link *bpf_program__attach_raw_tracepoint(struct bpf_program *prog, prog_fd = bpf_program__fd(prog); if (prog_fd < 0) { - pr_warn("program '%s': can't attach before loaded\n", - bpf_program__title(prog, false)); + pr_warn("prog '%s': can't attach before loaded\n", prog->name); return ERR_PTR(-EINVAL); } @@ -8844,9 +9315,8 @@ struct bpf_link *bpf_program__attach_raw_tracepoint(struct bpf_program *prog, if (pfd < 0) { pfd = -errno; free(link); - pr_warn("program '%s': failed to attach to raw tracepoint '%s': %s\n", - bpf_program__title(prog, false), tp_name, - libbpf_strerror_r(pfd, errmsg, sizeof(errmsg))); + pr_warn("prog '%s': 
failed to attach to raw tracepoint '%s': %s\n", + prog->name, tp_name, libbpf_strerror_r(pfd, errmsg, sizeof(errmsg))); return ERR_PTR(pfd); } link->fd = pfd; @@ -8856,7 +9326,7 @@ struct bpf_link *bpf_program__attach_raw_tracepoint(struct bpf_program *prog, static struct bpf_link *attach_raw_tp(const struct bpf_sec_def *sec, struct bpf_program *prog) { - const char *tp_name = bpf_program__title(prog, false) + sec->len; + const char *tp_name = prog->sec_name + sec->len; return bpf_program__attach_raw_tracepoint(prog, tp_name); } @@ -8870,8 +9340,7 @@ static struct bpf_link *bpf_program__attach_btf_id(struct bpf_program *prog) prog_fd = bpf_program__fd(prog); if (prog_fd < 0) { - pr_warn("program '%s': can't attach before loaded\n", - bpf_program__title(prog, false)); + pr_warn("prog '%s': can't attach before loaded\n", prog->name); return ERR_PTR(-EINVAL); } @@ -8884,9 +9353,8 @@ static struct bpf_link *bpf_program__attach_btf_id(struct bpf_program *prog) if (pfd < 0) { pfd = -errno; free(link); - pr_warn("program '%s': failed to attach: %s\n", - bpf_program__title(prog, false), - libbpf_strerror_r(pfd, errmsg, sizeof(errmsg))); + pr_warn("prog '%s': failed to attach: %s\n", + prog->name, libbpf_strerror_r(pfd, errmsg, sizeof(errmsg))); return ERR_PTR(pfd); } link->fd = pfd; @@ -8932,8 +9400,7 @@ bpf_program__attach_fd(struct bpf_program *prog, int target_fd, prog_fd = bpf_program__fd(prog); if (prog_fd < 0) { - pr_warn("program '%s': can't attach before loaded\n", - bpf_program__title(prog, false)); + pr_warn("prog '%s': can't attach before loaded\n", prog->name); return ERR_PTR(-EINVAL); } @@ -8947,8 +9414,8 @@ bpf_program__attach_fd(struct bpf_program *prog, int target_fd, if (link_fd < 0) { link_fd = -errno; free(link); - pr_warn("program '%s': failed to attach to %s: %s\n", - bpf_program__title(prog, false), target_name, + pr_warn("prog '%s': failed to attach to %s: %s\n", + prog->name, target_name, libbpf_strerror_r(link_fd, errmsg, sizeof(errmsg))); return 
ERR_PTR(link_fd); } @@ -8992,8 +9459,7 @@ bpf_program__attach_iter(struct bpf_program *prog, prog_fd = bpf_program__fd(prog); if (prog_fd < 0) { - pr_warn("program '%s': can't attach before loaded\n", - bpf_program__title(prog, false)); + pr_warn("prog '%s': can't attach before loaded\n", prog->name); return ERR_PTR(-EINVAL); } @@ -9007,9 +9473,8 @@ bpf_program__attach_iter(struct bpf_program *prog, if (link_fd < 0) { link_fd = -errno; free(link); - pr_warn("program '%s': failed to attach to iterator: %s\n", - bpf_program__title(prog, false), - libbpf_strerror_r(link_fd, errmsg, sizeof(errmsg))); + pr_warn("prog '%s': failed to attach to iterator: %s\n", + prog->name, libbpf_strerror_r(link_fd, errmsg, sizeof(errmsg))); return ERR_PTR(link_fd); } link->fd = link_fd; @@ -9020,7 +9485,7 @@ struct bpf_link *bpf_program__attach(struct bpf_program *prog) { const struct bpf_sec_def *sec_def; - sec_def = find_sec_def(bpf_program__title(prog, false)); + sec_def = find_sec_def(prog->sec_name); if (!sec_def || !sec_def->attach_fn) return ERR_PTR(-ESRCH); @@ -10090,12 +10555,11 @@ int bpf_object__attach_skeleton(struct bpf_object_skeleton *s) struct bpf_program *prog = *s->progs[i].prog; struct bpf_link **link = s->progs[i].link; const struct bpf_sec_def *sec_def; - const char *sec_name = bpf_program__title(prog, false); if (!prog->load) continue; - sec_def = find_sec_def(sec_name); + sec_def = find_sec_def(prog->sec_name); if (!sec_def || !sec_def->attach_fn) continue; diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 308e0ded8f14..a750f67a23f6 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -198,8 +198,9 @@ LIBBPF_API void bpf_program__set_ifindex(struct bpf_program *prog, __u32 ifindex); LIBBPF_API const char *bpf_program__name(const struct bpf_program *prog); -LIBBPF_API const char *bpf_program__title(const struct bpf_program *prog, - bool needs_copy); +LIBBPF_API const char *bpf_program__section_name(const struct bpf_program *prog); 
+LIBBPF_API LIBBPF_DEPRECATED("BPF program title is confusing term; please use bpf_program__section_name() instead") +const char *bpf_program__title(const struct bpf_program *prog, bool needs_copy); LIBBPF_API bool bpf_program__autoload(const struct bpf_program *prog); LIBBPF_API int bpf_program__set_autoload(struct bpf_program *prog, bool autoload); diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 3fedcdc4ae2f..5f054dadf082 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -302,6 +302,8 @@ LIBBPF_0.1.0 { LIBBPF_0.2.0 { global: + bpf_prog_bind_map; + bpf_program__section_name; perf_buffer__buffer_cnt; perf_buffer__buffer_fd; perf_buffer__epoll_fd; diff --git a/tools/lib/bpf/libbpf_common.h b/tools/lib/bpf/libbpf_common.h index a23ae1ac27eb..947d8bd8a7bb 100644 --- a/tools/lib/bpf/libbpf_common.h +++ b/tools/lib/bpf/libbpf_common.h @@ -15,6 +15,8 @@ #define LIBBPF_API __attribute__((visibility("default"))) #endif +#define LIBBPF_DEPRECATED(msg) __attribute__((deprecated(msg))) + /* Helper macro to declare and initialize libbpf options struct * * This dance with uninitialized declaration, followed by memset to zero, diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c index 49c324594792..30b4ca5d2ac7 100644 --- a/tools/lib/bpf/xsk.c +++ b/tools/lib/bpf/xsk.c @@ -20,6 +20,7 @@ #include <linux/if_ether.h> #include <linux/if_packet.h> #include <linux/if_xdp.h> +#include <linux/kernel.h> #include <linux/list.h> #include <linux/sockios.h> #include <net/if.h> diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c index 2feb751516ab..0374adcb223c 100644 --- a/tools/perf/util/bpf-loader.c +++ b/tools/perf/util/bpf-loader.c @@ -328,12 +328,6 @@ config_bpf_program(struct bpf_program *prog) probe_conf.no_inlines = false; probe_conf.force_add = false; - config_str = bpf_program__title(prog, false); - if (IS_ERR(config_str)) { - pr_debug("bpf: unable to get title for program\n"); - return PTR_ERR(config_str); - } - 
priv = calloc(sizeof(*priv), 1); if (!priv) { pr_debug("bpf: failed to alloc priv\n"); @@ -341,6 +335,7 @@ config_bpf_program(struct bpf_program *prog) } pev = &priv->pev; + config_str = bpf_program__section_name(prog); pr_debug("bpf: config program '%s'\n", config_str); err = parse_prog_config(config_str, &main_str, &is_tp, pev); if (err) @@ -454,10 +449,7 @@ preproc_gen_prologue(struct bpf_program *prog, int n, if (err) { const char *title; - title = bpf_program__title(prog, false); - if (!title) - title = "[unknown]"; - + title = bpf_program__section_name(prog); pr_debug("Failed to generate prologue for program %s\n", title); return err; diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 9a0946ddb705..e8fed558b8b8 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -15,7 +15,6 @@ test_sock test_sock_addr test_sock_fields urandom_read -test_btf test_sockmap test_lirc_mode2_user get_cgroup_id_user diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 65d3d9aaeb31..59a5fa5fe837 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -33,7 +33,7 @@ LDLIBS += -lcap -lelf -lz -lrt -lpthread # Order correspond to 'make run_tests' order TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ test_verifier_log test_dev_cgroup test_tcpbpf_user \ - test_sock test_btf test_sockmap get_cgroup_id_user test_socket_cookie \ + test_sock test_sockmap get_cgroup_id_user test_socket_cookie \ test_cgroup_storage \ test_netcnt test_tcpnotify_user test_sock_fields test_sysctl \ test_progs-no_alu32 \ @@ -68,7 +68,8 @@ TEST_PROGS := test_kmod.sh \ test_tc_edt.sh \ test_xdping.sh \ test_bpftool_build.sh \ - test_bpftool.sh + test_bpftool.sh \ + test_bpftool_metadata.sh \ TEST_PROGS_EXTENDED := with_addr.sh \ with_tunnels.sh \ @@ -176,6 +177,11 @@ $(DEFAULT_BPFTOOL): $(wildcard 
$(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOLDIR) \ OUTPUT=$(BUILD_DIR)/bpftool/ \ prefix= DESTDIR=$(SCRATCH_DIR)/ install + $(Q)mkdir -p $(BUILD_DIR)/bpftool/Documentation + $(Q)RST2MAN_OPTS="--exit-status=1" $(MAKE) $(submake_extras) \ + -C $(BPFTOOLDIR)/Documentation \ + OUTPUT=$(BUILD_DIR)/bpftool/Documentation/ \ + prefix= DESTDIR=$(SCRATCH_DIR)/ install $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ ../../../include/uapi/linux/bpf.h \ diff --git a/tools/testing/selftests/bpf/flow_dissector_load.h b/tools/testing/selftests/bpf/flow_dissector_load.h index daeaeb518894..7290401ec172 100644 --- a/tools/testing/selftests/bpf/flow_dissector_load.h +++ b/tools/testing/selftests/bpf/flow_dissector_load.h @@ -23,7 +23,13 @@ static inline int bpf_flow_load(struct bpf_object **obj, if (ret) return ret; - main_prog = bpf_object__find_program_by_title(*obj, section_name); + main_prog = NULL; + bpf_object__for_each_program(prog, *obj) { + if (strcmp(section_name, bpf_program__section_name(prog)) == 0) { + main_prog = prog; + break; + } + } if (!main_prog) return -1; diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c index 7375d9a6d242..fe1a83b9875c 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c @@ -132,17 +132,38 @@ static void test_task_stack(void) bpf_iter_task_stack__destroy(skel); } +static void *do_nothing(void *arg) +{ + pthread_exit(arg); +} + static void test_task_file(void) { struct bpf_iter_task_file *skel; + pthread_t thread_id; + void *ret; skel = bpf_iter_task_file__open_and_load(); if (CHECK(!skel, "bpf_iter_task_file__open_and_load", "skeleton open_and_load failed\n")) return; + skel->bss->tgid = getpid(); + + if (CHECK(pthread_create(&thread_id, NULL, &do_nothing, NULL), + "pthread_create", "pthread_create failed\n")) + goto done; + 
do_dummy_read(skel->progs.dump_task_file); + if (CHECK(pthread_join(thread_id, &ret) || ret != NULL, + "pthread_join", "pthread_join failed\n")) + goto done; + + CHECK(skel->bss->count != 0, "check_count", + "invalid non pthread file visit count %d\n", skel->bss->count); + +done: bpf_iter_task_file__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c index e9f2f12ba06b..e698ee6bb6c2 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c @@ -49,6 +49,7 @@ void test_bpf_verif_scale(void) { "test_verif_scale3.o", BPF_PROG_TYPE_SCHED_CLS }, { "pyperf_global.o", BPF_PROG_TYPE_RAW_TRACEPOINT }, + { "pyperf_subprogs.o", BPF_PROG_TYPE_RAW_TRACEPOINT }, /* full unroll by llvm */ { "pyperf50.o", BPF_PROG_TYPE_RAW_TRACEPOINT }, @@ -86,6 +87,9 @@ void test_bpf_verif_scale(void) { "strobemeta_nounroll1.o", BPF_PROG_TYPE_RAW_TRACEPOINT }, { "strobemeta_nounroll2.o", BPF_PROG_TYPE_RAW_TRACEPOINT }, + /* non-inlined subprogs */ + { "strobemeta_subprogs.o", BPF_PROG_TYPE_RAW_TRACEPOINT }, + { "test_sysctl_loop1.o", BPF_PROG_TYPE_CGROUP_SYSCTL }, { "test_sysctl_loop2.o", BPF_PROG_TYPE_CGROUP_SYSCTL }, diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index c75fc6447186..93162484c2ca 100644 --- a/tools/testing/selftests/bpf/test_btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -24,40 +24,17 @@ #include "bpf_rlimit.h" #include "bpf_util.h" -#include "test_btf.h" +#include "../test_btf.h" +#include "test_progs.h" #define MAX_INSNS 512 #define MAX_SUBPROGS 16 -static uint32_t pass_cnt; -static uint32_t error_cnt; -static uint32_t skip_cnt; +static int duration = 0; +static bool always_log; -#define CHECK(condition, format...) 
({ \ - int __ret = !!(condition); \ - if (__ret) { \ - fprintf(stderr, "%s:%d:FAIL ", __func__, __LINE__); \ - fprintf(stderr, format); \ - } \ - __ret; \ -}) - -static int count_result(int err) -{ - if (err) - error_cnt++; - else - pass_cnt++; - - fprintf(stderr, "\n"); - return err; -} - -static int __base_pr(enum libbpf_print_level level __attribute__((unused)), - const char *format, va_list args) -{ - return vfprintf(stderr, format, args); -} +#undef CHECK +#define CHECK(condition, format...) _CHECK(condition, "check", duration, format) #define BTF_END_RAW 0xdeadbeef #define NAME_TBD 0xdeadb33f @@ -69,21 +46,6 @@ static int __base_pr(enum libbpf_print_level level __attribute__((unused)), #define MAX_NR_RAW_U32 1024 #define BTF_LOG_BUF_SIZE 65535 -static struct args { - unsigned int raw_test_num; - unsigned int file_test_num; - unsigned int get_info_test_num; - unsigned int info_raw_test_num; - unsigned int dedup_test_num; - bool raw_test; - bool file_test; - bool get_info_test; - bool pprint_test; - bool always_log; - bool info_raw_test; - bool dedup_test; -} args; - static char btf_log_buf[BTF_LOG_BUF_SIZE]; static struct btf_header hdr_tmpl = { @@ -3664,7 +3626,7 @@ done: return raw_btf; } -static int do_test_raw(unsigned int test_num) +static void do_test_raw(unsigned int test_num) { struct btf_raw_test *test = &raw_tests[test_num - 1]; struct bpf_create_map_attr create_attr = {}; @@ -3674,15 +3636,16 @@ static int do_test_raw(unsigned int test_num) void *raw_btf; int err; - fprintf(stderr, "BTF raw test[%u] (%s): ", test_num, test->descr); + if (!test__start_subtest(test->descr)) + return; + raw_btf = btf_raw_create(&hdr_tmpl, test->raw_types, test->str_sec, test->str_sec_size, &raw_btf_size, NULL); - if (!raw_btf) - return -1; + return; hdr = raw_btf; @@ -3694,7 +3657,7 @@ static int do_test_raw(unsigned int test_num) *btf_log_buf = '\0'; btf_fd = bpf_load_btf(raw_btf, raw_btf_size, btf_log_buf, BTF_LOG_BUF_SIZE, - args.always_log); + always_log); 
free(raw_btf); err = ((btf_fd == -1) != test->btf_load_err); @@ -3725,32 +3688,12 @@ static int do_test_raw(unsigned int test_num) map_fd, test->map_create_err); done: - if (!err) - fprintf(stderr, "OK"); - - if (*btf_log_buf && (err || args.always_log)) + if (*btf_log_buf && (err || always_log)) fprintf(stderr, "\n%s", btf_log_buf); - if (btf_fd != -1) close(btf_fd); if (map_fd != -1) close(map_fd); - - return err; -} - -static int test_raw(void) -{ - unsigned int i; - int err = 0; - - if (args.raw_test_num) - return count_result(do_test_raw(args.raw_test_num)); - - for (i = 1; i <= ARRAY_SIZE(raw_tests); i++) - err |= count_result(do_test_raw(i)); - - return err; } struct btf_get_info_test { @@ -3814,11 +3757,6 @@ const struct btf_get_info_test get_info_tests[] = { }, }; -static inline __u64 ptr_to_u64(const void *ptr) -{ - return (__u64)(unsigned long)ptr; -} - static int test_big_btf_info(unsigned int test_num) { const struct btf_get_info_test *test = &get_info_tests[test_num - 1]; @@ -3851,7 +3789,7 @@ static int test_big_btf_info(unsigned int test_num) btf_fd = bpf_load_btf(raw_btf, raw_btf_size, btf_log_buf, BTF_LOG_BUF_SIZE, - args.always_log); + always_log); if (CHECK(btf_fd == -1, "errno:%d", errno)) { err = -1; goto done; @@ -3892,7 +3830,7 @@ static int test_big_btf_info(unsigned int test_num) fprintf(stderr, "OK"); done: - if (*btf_log_buf && (err || args.always_log)) + if (*btf_log_buf && (err || always_log)) fprintf(stderr, "\n%s", btf_log_buf); free(raw_btf); @@ -3939,7 +3877,7 @@ static int test_btf_id(unsigned int test_num) btf_fd[0] = bpf_load_btf(raw_btf, raw_btf_size, btf_log_buf, BTF_LOG_BUF_SIZE, - args.always_log); + always_log); if (CHECK(btf_fd[0] == -1, "errno:%d", errno)) { err = -1; goto done; @@ -4024,7 +3962,7 @@ static int test_btf_id(unsigned int test_num) fprintf(stderr, "OK"); done: - if (*btf_log_buf && (err || args.always_log)) + if (*btf_log_buf && (err || always_log)) fprintf(stderr, "\n%s", btf_log_buf); free(raw_btf); @@ 
-4039,7 +3977,7 @@ done: return err; } -static int do_test_get_info(unsigned int test_num) +static void do_test_get_info(unsigned int test_num) { const struct btf_get_info_test *test = &get_info_tests[test_num - 1]; unsigned int raw_btf_size, user_btf_size, expected_nbytes; @@ -4048,11 +3986,14 @@ static int do_test_get_info(unsigned int test_num) int btf_fd = -1, err, ret; uint32_t info_len; - fprintf(stderr, "BTF GET_INFO test[%u] (%s): ", - test_num, test->descr); + if (!test__start_subtest(test->descr)) + return; - if (test->special_test) - return test->special_test(test_num); + if (test->special_test) { + err = test->special_test(test_num); + if (CHECK(err, "failed: %d\n", err)) + return; + } raw_btf = btf_raw_create(&hdr_tmpl, test->raw_types, @@ -4061,7 +4002,7 @@ static int do_test_get_info(unsigned int test_num) &raw_btf_size, NULL); if (!raw_btf) - return -1; + return; *btf_log_buf = '\0'; @@ -4073,7 +4014,7 @@ static int do_test_get_info(unsigned int test_num) btf_fd = bpf_load_btf(raw_btf, raw_btf_size, btf_log_buf, BTF_LOG_BUF_SIZE, - args.always_log); + always_log); if (CHECK(btf_fd == -1, "errno:%d", errno)) { err = -1; goto done; @@ -4114,7 +4055,7 @@ static int do_test_get_info(unsigned int test_num) fprintf(stderr, "OK"); done: - if (*btf_log_buf && (err || args.always_log)) + if (*btf_log_buf && (err || always_log)) fprintf(stderr, "\n%s", btf_log_buf); free(raw_btf); @@ -4122,22 +4063,6 @@ done: if (btf_fd != -1) close(btf_fd); - - return err; -} - -static int test_get_info(void) -{ - unsigned int i; - int err = 0; - - if (args.get_info_test_num) - return count_result(do_test_get_info(args.get_info_test_num)); - - for (i = 1; i <= ARRAY_SIZE(get_info_tests); i++) - err |= count_result(do_test_get_info(i)); - - return err; } struct btf_file_test { @@ -4151,7 +4076,7 @@ static struct btf_file_test file_tests[] = { { .file = "test_btf_nokv.o", .btf_kv_notfound = true, }, }; -static int do_test_file(unsigned int test_num) +static void 
do_test_file(unsigned int test_num) { const struct btf_file_test *test = &file_tests[test_num - 1]; const char *expected_fnames[] = {"_dummy_tracepoint", @@ -4169,17 +4094,17 @@ static int do_test_file(unsigned int test_num) struct bpf_map *map; int i, err, prog_fd; - fprintf(stderr, "BTF libbpf test[%u] (%s): ", test_num, - test->file); + if (!test__start_subtest(test->file)) + return; btf = btf__parse_elf(test->file, &btf_ext); if (IS_ERR(btf)) { if (PTR_ERR(btf) == -ENOENT) { - fprintf(stderr, "SKIP. No ELF %s found", BTF_ELF_SEC); - skip_cnt++; - return 0; + printf("%s:SKIP: No ELF %s found", __func__, BTF_ELF_SEC); + test__skip(); + return; } - return PTR_ERR(btf); + return; } btf__free(btf); @@ -4188,7 +4113,7 @@ static int do_test_file(unsigned int test_num) obj = bpf_object__open(test->file); if (CHECK(IS_ERR(obj), "obj: %ld", PTR_ERR(obj))) - return PTR_ERR(obj); + return; prog = bpf_program__next(NULL, obj); if (CHECK(!prog, "Cannot find bpf_prog")) { @@ -4310,21 +4235,6 @@ skip: done: free(func_info); bpf_object__close(obj); - return err; -} - -static int test_file(void) -{ - unsigned int i; - int err = 0; - - if (args.file_test_num) - return count_result(do_test_file(args.file_test_num)); - - for (i = 1; i <= ARRAY_SIZE(file_tests); i++) - err |= count_result(do_test_file(i)); - - return err; } const char *pprint_enum_str[] = { @@ -4428,7 +4338,7 @@ static struct btf_raw_test pprint_test_template[] = { .value_size = sizeof(struct pprint_mapv), .key_type_id = 3, /* unsigned int */ .value_type_id = 16, /* struct pprint_mapv */ - .max_entries = 128 * 1024, + .max_entries = 128, }, { @@ -4493,7 +4403,7 @@ static struct btf_raw_test pprint_test_template[] = { .value_size = sizeof(struct pprint_mapv), .key_type_id = 3, /* unsigned int */ .value_type_id = 16, /* struct pprint_mapv */ - .max_entries = 128 * 1024, + .max_entries = 128, }, { @@ -4564,7 +4474,7 @@ static struct btf_raw_test pprint_test_template[] = { .value_size = sizeof(struct pprint_mapv), 
.key_type_id = 3, /* unsigned int */ .value_type_id = 16, /* struct pprint_mapv */ - .max_entries = 128 * 1024, + .max_entries = 128, }, #ifdef __SIZEOF_INT128__ @@ -4591,7 +4501,7 @@ static struct btf_raw_test pprint_test_template[] = { .value_size = sizeof(struct pprint_mapv_int128), .key_type_id = 1, .value_type_id = 4, - .max_entries = 128 * 1024, + .max_entries = 128, .mapv_kind = PPRINT_MAPV_KIND_INT128, }, #endif @@ -4790,7 +4700,7 @@ static int check_line(const char *expected_line, int nexpected_line, } -static int do_test_pprint(int test_num) +static void do_test_pprint(int test_num) { const struct btf_raw_test *test = &pprint_test_template[test_num]; enum pprint_mapv_kind_t mapv_kind = test->mapv_kind; @@ -4809,18 +4719,20 @@ static int do_test_pprint(int test_num) uint8_t *raw_btf; ssize_t nread; - fprintf(stderr, "%s(#%d)......", test->descr, test_num); + if (!test__start_subtest(test->descr)) + return; + raw_btf = btf_raw_create(&hdr_tmpl, test->raw_types, test->str_sec, test->str_sec_size, &raw_btf_size, NULL); if (!raw_btf) - return -1; + return; *btf_log_buf = '\0'; btf_fd = bpf_load_btf(raw_btf, raw_btf_size, btf_log_buf, BTF_LOG_BUF_SIZE, - args.always_log); + always_log); free(raw_btf); if (CHECK(btf_fd == -1, "errno:%d", errno)) { @@ -4971,7 +4883,7 @@ done: free(mapv); if (!err) fprintf(stderr, "OK"); - if (*btf_log_buf && (err || args.always_log)) + if (*btf_log_buf && (err || always_log)) fprintf(stderr, "\n%s", btf_log_buf); if (btf_fd != -1) close(btf_fd); @@ -4981,14 +4893,11 @@ done: fclose(pin_file); unlink(pin_path); free(line); - - return err; } -static int test_pprint(void) +static void test_pprint(void) { unsigned int i; - int err = 0; /* test various maps with the first test template */ for (i = 0; i < ARRAY_SIZE(pprint_tests_meta); i++) { @@ -4999,7 +4908,7 @@ static int test_pprint(void) pprint_test_template[0].lossless_map = pprint_tests_meta[i].lossless_map; pprint_test_template[0].percpu_map = pprint_tests_meta[i].percpu_map; - 
err |= count_result(do_test_pprint(0)); + do_test_pprint(0); } /* test rest test templates with the first map */ @@ -5010,10 +4919,8 @@ static int test_pprint(void) pprint_test_template[i].ordered_map = pprint_tests_meta[0].ordered_map; pprint_test_template[i].lossless_map = pprint_tests_meta[0].lossless_map; pprint_test_template[i].percpu_map = pprint_tests_meta[0].percpu_map; - err |= count_result(do_test_pprint(i)); + do_test_pprint(i); } - - return err; } #define BPF_LINE_INFO_ENC(insn_off, file_off, line_off, line_num, line_col) \ @@ -6178,7 +6085,7 @@ done: return err; } -static int do_test_info_raw(unsigned int test_num) +static void do_test_info_raw(unsigned int test_num) { const struct prog_info_raw_test *test = &info_raw_tests[test_num - 1]; unsigned int raw_btf_size, linfo_str_off, linfo_size; @@ -6187,18 +6094,19 @@ static int do_test_info_raw(unsigned int test_num) const char *ret_next_str; union bpf_attr attr = {}; - fprintf(stderr, "BTF prog info raw test[%u] (%s): ", test_num, test->descr); + if (!test__start_subtest(test->descr)) + return; + raw_btf = btf_raw_create(&hdr_tmpl, test->raw_types, test->str_sec, test->str_sec_size, &raw_btf_size, &ret_next_str); - if (!raw_btf) - return -1; + return; *btf_log_buf = '\0'; btf_fd = bpf_load_btf(raw_btf, raw_btf_size, btf_log_buf, BTF_LOG_BUF_SIZE, - args.always_log); + always_log); free(raw_btf); if (CHECK(btf_fd == -1, "invalid btf_fd errno:%d", errno)) { @@ -6206,7 +6114,7 @@ static int do_test_info_raw(unsigned int test_num) goto done; } - if (*btf_log_buf && args.always_log) + if (*btf_log_buf && always_log) fprintf(stderr, "\n%s", btf_log_buf); *btf_log_buf = '\0'; @@ -6261,10 +6169,7 @@ static int do_test_info_raw(unsigned int test_num) goto done; done: - if (!err) - fprintf(stderr, "OK"); - - if (*btf_log_buf && (err || args.always_log)) + if (*btf_log_buf && (err || always_log)) fprintf(stderr, "\n%s", btf_log_buf); if (btf_fd != -1) @@ -6274,22 +6179,6 @@ done: if (!IS_ERR(patched_linfo)) 
free(patched_linfo); - - return err; -} - -static int test_info_raw(void) -{ - unsigned int i; - int err = 0; - - if (args.info_raw_test_num) - return count_result(do_test_info_raw(args.info_raw_test_num)); - - for (i = 1; i <= ARRAY_SIZE(info_raw_tests); i++) - err |= count_result(do_test_info_raw(i)); - - return err; } struct btf_raw_data { @@ -6754,7 +6643,7 @@ static void dump_btf_strings(const char *strs, __u32 len) } } -static int do_test_dedup(unsigned int test_num) +static void do_test_dedup(unsigned int test_num) { const struct btf_dedup_test *test = &dedup_tests[test_num - 1]; __u32 test_nr_types, expect_nr_types, test_btf_size, expect_btf_size; @@ -6769,13 +6658,15 @@ static int do_test_dedup(unsigned int test_num) void *raw_btf; int err = 0, i; - fprintf(stderr, "BTF dedup test[%u] (%s):", test_num, test->descr); + if (!test__start_subtest(test->descr)) + return; raw_btf = btf_raw_create(&hdr_tmpl, test->input.raw_types, test->input.str_sec, test->input.str_sec_size, &raw_btf_size, &ret_test_next_str); if (!raw_btf) - return -1; + return; + test_btf = btf__new((__u8 *)raw_btf, raw_btf_size); free(raw_btf); if (CHECK(IS_ERR(test_btf), "invalid test_btf errno:%ld", @@ -6789,7 +6680,7 @@ static int do_test_dedup(unsigned int test_num) test->expect.str_sec_size, &raw_btf_size, &ret_expect_next_str); if (!raw_btf) - return -1; + return; expect_btf = btf__new((__u8 *)raw_btf, raw_btf_size); free(raw_btf); if (CHECK(IS_ERR(expect_btf), "invalid expect_btf errno:%ld", @@ -6894,174 +6785,27 @@ static int do_test_dedup(unsigned int test_num) } done: - if (!err) - fprintf(stderr, "OK"); if (!IS_ERR(test_btf)) btf__free(test_btf); if (!IS_ERR(expect_btf)) btf__free(expect_btf); - - return err; } -static int test_dedup(void) +void test_btf(void) { - unsigned int i; - int err = 0; + int i; - if (args.dedup_test_num) - return count_result(do_test_dedup(args.dedup_test_num)); + always_log = env.verbosity > VERBOSE_NONE; + for (i = 1; i <= ARRAY_SIZE(raw_tests); i++) + 
do_test_raw(i); + for (i = 1; i <= ARRAY_SIZE(get_info_tests); i++) + do_test_get_info(i); + for (i = 1; i <= ARRAY_SIZE(file_tests); i++) + do_test_file(i); + for (i = 1; i <= ARRAY_SIZE(info_raw_tests); i++) + do_test_info_raw(i); for (i = 1; i <= ARRAY_SIZE(dedup_tests); i++) - err |= count_result(do_test_dedup(i)); - - return err; -} - -static void usage(const char *cmd) -{ - fprintf(stderr, "Usage: %s [-l] [[-r btf_raw_test_num (1 - %zu)] |\n" - "\t[-g btf_get_info_test_num (1 - %zu)] |\n" - "\t[-f btf_file_test_num (1 - %zu)] |\n" - "\t[-k btf_prog_info_raw_test_num (1 - %zu)] |\n" - "\t[-p (pretty print test)] |\n" - "\t[-d btf_dedup_test_num (1 - %zu)]]\n", - cmd, ARRAY_SIZE(raw_tests), ARRAY_SIZE(get_info_tests), - ARRAY_SIZE(file_tests), ARRAY_SIZE(info_raw_tests), - ARRAY_SIZE(dedup_tests)); -} - -static int parse_args(int argc, char **argv) -{ - const char *optstr = "hlpk:f:r:g:d:"; - int opt; - - while ((opt = getopt(argc, argv, optstr)) != -1) { - switch (opt) { - case 'l': - args.always_log = true; - break; - case 'f': - args.file_test_num = atoi(optarg); - args.file_test = true; - break; - case 'r': - args.raw_test_num = atoi(optarg); - args.raw_test = true; - break; - case 'g': - args.get_info_test_num = atoi(optarg); - args.get_info_test = true; - break; - case 'p': - args.pprint_test = true; - break; - case 'k': - args.info_raw_test_num = atoi(optarg); - args.info_raw_test = true; - break; - case 'd': - args.dedup_test_num = atoi(optarg); - args.dedup_test = true; - break; - case 'h': - usage(argv[0]); - exit(0); - default: - usage(argv[0]); - return -1; - } - } - - if (args.raw_test_num && - (args.raw_test_num < 1 || - args.raw_test_num > ARRAY_SIZE(raw_tests))) { - fprintf(stderr, "BTF raw test number must be [1 - %zu]\n", - ARRAY_SIZE(raw_tests)); - return -1; - } - - if (args.file_test_num && - (args.file_test_num < 1 || - args.file_test_num > ARRAY_SIZE(file_tests))) { - fprintf(stderr, "BTF file test number must be [1 - %zu]\n", - 
ARRAY_SIZE(file_tests)); - return -1; - } - - if (args.get_info_test_num && - (args.get_info_test_num < 1 || - args.get_info_test_num > ARRAY_SIZE(get_info_tests))) { - fprintf(stderr, "BTF get info test number must be [1 - %zu]\n", - ARRAY_SIZE(get_info_tests)); - return -1; - } - - if (args.info_raw_test_num && - (args.info_raw_test_num < 1 || - args.info_raw_test_num > ARRAY_SIZE(info_raw_tests))) { - fprintf(stderr, "BTF prog info raw test number must be [1 - %zu]\n", - ARRAY_SIZE(info_raw_tests)); - return -1; - } - - if (args.dedup_test_num && - (args.dedup_test_num < 1 || - args.dedup_test_num > ARRAY_SIZE(dedup_tests))) { - fprintf(stderr, "BTF dedup test number must be [1 - %zu]\n", - ARRAY_SIZE(dedup_tests)); - return -1; - } - - return 0; -} - -static void print_summary(void) -{ - fprintf(stderr, "PASS:%u SKIP:%u FAIL:%u\n", - pass_cnt - skip_cnt, skip_cnt, error_cnt); -} - -int main(int argc, char **argv) -{ - int err = 0; - - err = parse_args(argc, argv); - if (err) - return err; - - if (args.always_log) - libbpf_set_print(__base_pr); - - if (args.raw_test) - err |= test_raw(); - - if (args.get_info_test) - err |= test_get_info(); - - if (args.file_test) - err |= test_file(); - - if (args.pprint_test) - err |= test_pprint(); - - if (args.info_raw_test) - err |= test_info_raw(); - - if (args.dedup_test) - err |= test_dedup(); - - if (args.raw_test || args.get_info_test || args.file_test || - args.pprint_test || args.info_raw_test || args.dedup_test) - goto done; - - err |= test_raw(); - err |= test_get_info(); - err |= test_file(); - err |= test_info_raw(); - err |= test_dedup(); - -done: - print_summary(); - return err; + do_test_dedup(i); + test_pprint(); } diff --git a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c index f259085cca6a..9781d85cb223 100644 --- a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c @@ -12,10 +12,13 
@@ #include "progs/test_cls_redirect.h" #include "test_cls_redirect.skel.h" +#include "test_cls_redirect_subprogs.skel.h" #define ENCAP_IP INADDR_LOOPBACK #define ENCAP_PORT (1234) +static int duration = 0; + struct addr_port { in_port_t port; union { @@ -361,30 +364,18 @@ static void close_fds(int *fds, int n) close(fds[i]); } -void test_cls_redirect(void) +static void test_cls_redirect_common(struct bpf_program *prog) { - struct test_cls_redirect *skel = NULL; struct bpf_prog_test_run_attr tattr = {}; int families[] = { AF_INET, AF_INET6 }; struct sockaddr_storage ss; struct sockaddr *addr; socklen_t slen; int i, j, err; - int servers[__NR_KIND][ARRAY_SIZE(families)] = {}; int conns[__NR_KIND][ARRAY_SIZE(families)] = {}; struct tuple tuples[__NR_KIND][ARRAY_SIZE(families)]; - skel = test_cls_redirect__open(); - if (CHECK_FAIL(!skel)) - return; - - skel->rodata->ENCAPSULATION_IP = htonl(ENCAP_IP); - skel->rodata->ENCAPSULATION_PORT = htons(ENCAP_PORT); - - if (CHECK_FAIL(test_cls_redirect__load(skel))) - goto cleanup; - addr = (struct sockaddr *)&ss; for (i = 0; i < ARRAY_SIZE(families); i++) { slen = prepare_addr(&ss, families[i]); @@ -402,7 +393,7 @@ void test_cls_redirect(void) goto cleanup; } - tattr.prog_fd = bpf_program__fd(skel->progs.cls_redirect); + tattr.prog_fd = bpf_program__fd(prog); for (i = 0; i < ARRAY_SIZE(tests); i++) { struct test_cfg *test = &tests[i]; @@ -450,7 +441,58 @@ void test_cls_redirect(void) } cleanup: - test_cls_redirect__destroy(skel); close_fds((int *)servers, sizeof(servers) / sizeof(servers[0][0])); close_fds((int *)conns, sizeof(conns) / sizeof(conns[0][0])); } + +static void test_cls_redirect_inlined(void) +{ + struct test_cls_redirect *skel; + int err; + + skel = test_cls_redirect__open(); + if (CHECK(!skel, "skel_open", "failed\n")) + return; + + skel->rodata->ENCAPSULATION_IP = htonl(ENCAP_IP); + skel->rodata->ENCAPSULATION_PORT = htons(ENCAP_PORT); + + err = test_cls_redirect__load(skel); + if (CHECK(err, "skel_load", 
"failed: %d\n", err)) + goto cleanup; + + test_cls_redirect_common(skel->progs.cls_redirect); + +cleanup: + test_cls_redirect__destroy(skel); +} + +static void test_cls_redirect_subprogs(void) +{ + struct test_cls_redirect_subprogs *skel; + int err; + + skel = test_cls_redirect_subprogs__open(); + if (CHECK(!skel, "skel_open", "failed\n")) + return; + + skel->rodata->ENCAPSULATION_IP = htonl(ENCAP_IP); + skel->rodata->ENCAPSULATION_PORT = htons(ENCAP_PORT); + + err = test_cls_redirect_subprogs__load(skel); + if (CHECK(err, "skel_load", "failed: %d\n", err)) + goto cleanup; + + test_cls_redirect_common(skel->progs.cls_redirect); + +cleanup: + test_cls_redirect_subprogs__destroy(skel); +} + +void test_cls_redirect(void) +{ + if (test__start_subtest("cls_redirect_inlined")) + test_cls_redirect_inlined(); + if (test__start_subtest("cls_redirect_subprogs")) + test_cls_redirect_subprogs(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/d_path.c b/tools/testing/selftests/bpf/prog_tests/d_path.c index fc12e0d445ff..0a577a248d34 100644 --- a/tools/testing/selftests/bpf/prog_tests/d_path.c +++ b/tools/testing/selftests/bpf/prog_tests/d_path.c @@ -120,6 +120,16 @@ void test_d_path(void) if (err < 0) goto cleanup; + if (CHECK(!bss->called_stat, + "stat", + "trampoline for security_inode_getattr was not called\n")) + goto cleanup; + + if (CHECK(!bss->called_close, + "close", + "trampoline for filp_close was not called\n")) + goto cleanup; + for (int i = 0; i < MAX_FILES; i++) { CHECK(strncmp(src.paths[i], bss->paths_stat[i], MAX_PATH_LEN), "check", diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c index a550dab9ba7a..eda682727787 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c @@ -208,11 +208,18 @@ static void test_func_map_prog_compatibility(void) void test_fexit_bpf2bpf(void) { - test_target_no_callees(); - 
test_target_yes_callees(); - test_func_replace(); - test_func_replace_verify(); - test_func_sockmap_update(); - test_func_replace_return_code(); - test_func_map_prog_compatibility(); + if (test__start_subtest("target_no_callees")) + test_target_no_callees(); + if (test__start_subtest("target_yes_callees")) + test_target_yes_callees(); + if (test__start_subtest("func_replace")) + test_func_replace(); + if (test__start_subtest("func_replace_verify")) + test_func_replace_verify(); + if (test__start_subtest("func_sockmap_update")) + test_func_sockmap_update(); + if (test__start_subtest("func_replace_return_code")) + test_func_replace_return_code(); + if (test__start_subtest("func_map_prog_compatibility")) + test_func_map_prog_compatibility(); } diff --git a/tools/testing/selftests/bpf/prog_tests/global_data_init.c b/tools/testing/selftests/bpf/prog_tests/global_data_init.c index 3bdaa5a40744..ee46b11f1f9a 100644 --- a/tools/testing/selftests/bpf/prog_tests/global_data_init.c +++ b/tools/testing/selftests/bpf/prog_tests/global_data_init.c @@ -12,7 +12,8 @@ void test_global_data_init(void) size_t sz; obj = bpf_object__open_file(file, NULL); - if (CHECK_FAIL(!obj)) + err = libbpf_get_error(obj); + if (CHECK_FAIL(err)) return; map = bpf_object__find_map_by_name(obj, "test_glo.rodata"); diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms.c b/tools/testing/selftests/bpf/prog_tests/ksyms.c index e3d6777226a8..b771804b2342 100644 --- a/tools/testing/selftests/bpf/prog_tests/ksyms.c +++ b/tools/testing/selftests/bpf/prog_tests/ksyms.c @@ -32,6 +32,7 @@ out: void test_ksyms(void) { + __u64 per_cpu_start_addr = kallsyms_find("__per_cpu_start"); __u64 link_fops_addr = kallsyms_find("bpf_link_fops"); const char *btf_path = "/sys/kernel/btf/vmlinux"; struct test_ksyms *skel; @@ -63,8 +64,9 @@ void test_ksyms(void) "got %llu, exp %llu\n", data->out__bpf_link_fops1, (__u64)0); CHECK(data->out__btf_size != btf_size, "btf_size", "got %llu, exp %llu\n", data->out__btf_size, 
btf_size); - CHECK(data->out__per_cpu_start != 0, "__per_cpu_start", - "got %llu, exp %llu\n", data->out__per_cpu_start, (__u64)0); + CHECK(data->out__per_cpu_start != per_cpu_start_addr, "__per_cpu_start", + "got %llu, exp %llu\n", data->out__per_cpu_start, + per_cpu_start_addr); cleanup: test_ksyms__destroy(skel); diff --git a/tools/testing/selftests/bpf/prog_tests/l4lb_all.c b/tools/testing/selftests/bpf/prog_tests/l4lb_all.c index c2d373e294bb..8073105548ff 100644 --- a/tools/testing/selftests/bpf/prog_tests/l4lb_all.c +++ b/tools/testing/selftests/bpf/prog_tests/l4lb_all.c @@ -80,9 +80,8 @@ out: void test_l4lb_all(void) { - const char *file1 = "./test_l4lb.o"; - const char *file2 = "./test_l4lb_noinline.o"; - - test_l4lb(file1); - test_l4lb(file2); + if (test__start_subtest("l4lb_inline")) + test_l4lb("test_l4lb.o"); + if (test__start_subtest("l4lb_noinline")) + test_l4lb("test_l4lb_noinline.o"); } diff --git a/tools/testing/selftests/bpf/prog_tests/metadata.c b/tools/testing/selftests/bpf/prog_tests/metadata.c new file mode 100644 index 000000000000..2c53eade88e3 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/metadata.c @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Copyright 2020 Google LLC. 
+ */ + +#include <test_progs.h> +#include <cgroup_helpers.h> +#include <network_helpers.h> + +#include "metadata_unused.skel.h" +#include "metadata_used.skel.h" + +static int duration; + +static int prog_holds_map(int prog_fd, int map_fd) +{ + struct bpf_prog_info prog_info = {}; + struct bpf_prog_info map_info = {}; + __u32 prog_info_len; + __u32 map_info_len; + __u32 *map_ids; + int nr_maps; + int ret; + int i; + + map_info_len = sizeof(map_info); + ret = bpf_obj_get_info_by_fd(map_fd, &map_info, &map_info_len); + if (ret) + return -errno; + + prog_info_len = sizeof(prog_info); + ret = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &prog_info_len); + if (ret) + return -errno; + + map_ids = calloc(prog_info.nr_map_ids, sizeof(__u32)); + if (!map_ids) + return -ENOMEM; + + nr_maps = prog_info.nr_map_ids; + memset(&prog_info, 0, sizeof(prog_info)); + prog_info.nr_map_ids = nr_maps; + prog_info.map_ids = ptr_to_u64(map_ids); + prog_info_len = sizeof(prog_info); + + ret = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &prog_info_len); + if (ret) { + ret = -errno; + goto free_map_ids; + } + + ret = -ENOENT; + for (i = 0; i < prog_info.nr_map_ids; i++) { + if (map_ids[i] == map_info.id) { + ret = 0; + break; + } + } + +free_map_ids: + free(map_ids); + return ret; +} + +static void test_metadata_unused(void) +{ + struct metadata_unused *obj; + int err; + + obj = metadata_unused__open_and_load(); + if (CHECK(!obj, "skel-load", "errno %d", errno)) + return; + + err = prog_holds_map(bpf_program__fd(obj->progs.prog), + bpf_map__fd(obj->maps.rodata)); + if (CHECK(err, "prog-holds-rodata", "errno: %d", err)) + return; + + /* Assert that we can access the metadata in skel and the values are + * what we expect. 
+ */ + if (CHECK(strncmp(obj->rodata->bpf_metadata_a, "foo", + sizeof(obj->rodata->bpf_metadata_a)), + "bpf_metadata_a", "expected \"foo\", value differ")) + goto close_bpf_object; + if (CHECK(obj->rodata->bpf_metadata_b != 1, "bpf_metadata_b", + "expected 1, got %d", obj->rodata->bpf_metadata_b)) + goto close_bpf_object; + + /* Assert that binding metadata map to prog again succeeds. */ + err = bpf_prog_bind_map(bpf_program__fd(obj->progs.prog), + bpf_map__fd(obj->maps.rodata), NULL); + CHECK(err, "rebind_map", "errno %d, expected 0", errno); + +close_bpf_object: + metadata_unused__destroy(obj); +} + +static void test_metadata_used(void) +{ + struct metadata_used *obj; + int err; + + obj = metadata_used__open_and_load(); + if (CHECK(!obj, "skel-load", "errno %d", errno)) + return; + + err = prog_holds_map(bpf_program__fd(obj->progs.prog), + bpf_map__fd(obj->maps.rodata)); + if (CHECK(err, "prog-holds-rodata", "errno: %d", err)) + return; + + /* Assert that we can access the metadata in skel and the values are + * what we expect. + */ + if (CHECK(strncmp(obj->rodata->bpf_metadata_a, "bar", + sizeof(obj->rodata->bpf_metadata_a)), + "metadata_a", "expected \"bar\", value differ")) + goto close_bpf_object; + if (CHECK(obj->rodata->bpf_metadata_b != 2, "metadata_b", + "expected 2, got %d", obj->rodata->bpf_metadata_b)) + goto close_bpf_object; + + /* Assert that binding metadata map to prog again succeeds. 
*/ + err = bpf_prog_bind_map(bpf_program__fd(obj->progs.prog), + bpf_map__fd(obj->maps.rodata), NULL); + CHECK(err, "rebind_map", "errno %d, expected 0", errno); + +close_bpf_object: + metadata_used__destroy(obj); +} + +void test_metadata(void) +{ + if (test__start_subtest("unused")) + test_metadata_unused(); + + if (test__start_subtest("used")) + test_metadata_used(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/reference_tracking.c b/tools/testing/selftests/bpf/prog_tests/reference_tracking.c index fc0d7f4f02cf..ac1ee10cffd8 100644 --- a/tools/testing/selftests/bpf/prog_tests/reference_tracking.c +++ b/tools/testing/selftests/bpf/prog_tests/reference_tracking.c @@ -27,7 +27,7 @@ void test_reference_tracking(void) const char *title; /* Ignore .text sections */ - title = bpf_program__title(prog, false); + title = bpf_program__section_name(prog); if (strstr(title, ".text") != NULL) continue; diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c index a49a26f95a8b..3a469099f30d 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_assign.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c @@ -265,7 +265,7 @@ void test_sk_assign(void) TEST("ipv6 udp port redir", AF_INET6, SOCK_DGRAM, false), TEST("ipv6 udp addr redir", AF_INET6, SOCK_DGRAM, true), }; - int server = -1; + __s64 server = -1; int server_map; int self_net; int i; diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 0b79d78b98db..4b7a527e7e82 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -6,6 +6,9 @@ #include "test_skmsg_load_helpers.skel.h" #include "test_sockmap_update.skel.h" #include "test_sockmap_invalid_update.skel.h" +#include "bpf_iter_sockmap.skel.h" + +#include "progs/bpf_iter_sockmap.h" #define TCP_REPAIR 19 /* TCP sock is under repair right now */ @@ 
-171,6 +174,88 @@ static void test_sockmap_invalid_update(void) test_sockmap_invalid_update__destroy(skel); } +static void test_sockmap_iter(enum bpf_map_type map_type) +{ + DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); + int err, len, src_fd, iter_fd, duration = 0; + union bpf_iter_link_info linfo = {0}; + __s64 sock_fd[SOCKMAP_MAX_ENTRIES]; + __u32 i, num_sockets, max_elems; + struct bpf_iter_sockmap *skel; + struct bpf_link *link; + struct bpf_map *src; + char buf[64]; + + skel = bpf_iter_sockmap__open_and_load(); + if (CHECK(!skel, "bpf_iter_sockmap__open_and_load", "skeleton open_and_load failed\n")) + return; + + for (i = 0; i < ARRAY_SIZE(sock_fd); i++) + sock_fd[i] = -1; + + /* Make sure we have at least one "empty" entry to test iteration of + * an empty slot. + */ + num_sockets = ARRAY_SIZE(sock_fd) - 1; + + if (map_type == BPF_MAP_TYPE_SOCKMAP) { + src = skel->maps.sockmap; + max_elems = bpf_map__max_entries(src); + } else { + src = skel->maps.sockhash; + max_elems = num_sockets; + } + + src_fd = bpf_map__fd(src); + + for (i = 0; i < num_sockets; i++) { + sock_fd[i] = connected_socket_v4(); + if (CHECK(sock_fd[i] == -1, "connected_socket_v4", "cannot connect\n")) + goto out; + + err = bpf_map_update_elem(src_fd, &i, &sock_fd[i], BPF_NOEXIST); + if (CHECK(err, "map_update", "failed: %s\n", strerror(errno))) + goto out; + } + + linfo.map.map_fd = src_fd; + opts.link_info = &linfo; + opts.link_info_len = sizeof(linfo); + link = bpf_program__attach_iter(skel->progs.count_elems, &opts); + if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n")) + goto out; + + iter_fd = bpf_iter_create(bpf_link__fd(link)); + if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n")) + goto free_link; + + /* do some tests */ + while ((len = read(iter_fd, buf, sizeof(buf))) > 0) + ; + if (CHECK(len < 0, "read", "failed: %s\n", strerror(errno))) + goto close_iter; + + /* test results */ + if (CHECK(skel->bss->elems != max_elems, "elems", "got %u expected %u\n", + 
skel->bss->elems, max_elems)) + goto close_iter; + + if (CHECK(skel->bss->socks != num_sockets, "socks", "got %u expected %u\n", + skel->bss->socks, num_sockets)) + goto close_iter; + +close_iter: + close(iter_fd); +free_link: + bpf_link__destroy(link); +out: + for (i = 0; i < num_sockets; i++) { + if (sock_fd[i] >= 0) + close(sock_fd[i]); + } + bpf_iter_sockmap__destroy(skel); +} + void test_sockmap_basic(void) { if (test__start_subtest("sockmap create_update_free")) @@ -187,4 +272,8 @@ void test_sockmap_basic(void) test_sockmap_update(BPF_MAP_TYPE_SOCKHASH); if (test__start_subtest("sockmap update in unsafe context")) test_sockmap_invalid_update(); + if (test__start_subtest("sockmap iter")) + test_sockmap_iter(BPF_MAP_TYPE_SOCKMAP); + if (test__start_subtest("sockhash iter")) + test_sockmap_iter(BPF_MAP_TYPE_SOCKHASH); } diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c index 5f54c6aec7f0..b25c9c45c148 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c @@ -45,9 +45,9 @@ static int getsetsockopt(void) goto err; } - if (*(int *)big_buf != 0x08) { + if (*big_buf != 0x08) { log_err("Unexpected getsockopt(IP_TOS) optval 0x%x != 0x08", - *(int *)big_buf); + (int)*big_buf); goto err; } diff --git a/tools/testing/selftests/bpf/prog_tests/subprogs.c b/tools/testing/selftests/bpf/prog_tests/subprogs.c new file mode 100644 index 000000000000..a00abf58c037 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/subprogs.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include <test_progs.h> +#include <time.h> +#include "test_subprogs.skel.h" + +static int duration; + +void test_subprogs(void) +{ + struct test_subprogs *skel; + int err; + + skel = test_subprogs__open_and_load(); + if (CHECK(!skel, "skel_open", "failed to open skeleton\n")) + return; + + err = test_subprogs__attach(skel); + 
if (CHECK(err, "skel_attach", "failed to attach skeleton: %d\n", err)) + goto cleanup; + + usleep(1); + + CHECK(skel->bss->res1 != 12, "res1", "got %d, exp %d\n", skel->bss->res1, 12); + CHECK(skel->bss->res2 != 17, "res2", "got %d, exp %d\n", skel->bss->res2, 17); + CHECK(skel->bss->res3 != 19, "res3", "got %d, exp %d\n", skel->bss->res3, 19); + CHECK(skel->bss->res4 != 36, "res4", "got %d, exp %d\n", skel->bss->res4, 36); + +cleanup: + test_subprogs__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/tailcalls.c b/tools/testing/selftests/bpf/prog_tests/tailcalls.c index bb8fe646dd9f..ee27d68d2a1c 100644 --- a/tools/testing/selftests/bpf/prog_tests/tailcalls.c +++ b/tools/testing/selftests/bpf/prog_tests/tailcalls.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> +#include <network_helpers.h> /* test_tailcall_1 checks basic functionality by patching multiple locations * in a single program for a single tail call slot with nop->jmp, jmp->nop @@ -472,6 +473,329 @@ out: bpf_object__close(obj); } +/* test_tailcall_bpf2bpf_1 purpose is to make sure that tailcalls are working + * correctly in correlation with BPF subprograms + */ +static void test_tailcall_bpf2bpf_1(void) +{ + int err, map_fd, prog_fd, main_fd, i; + struct bpf_map *prog_array; + struct bpf_program *prog; + struct bpf_object *obj; + __u32 retval, duration; + char prog_name[32]; + + err = bpf_prog_load("tailcall_bpf2bpf1.o", BPF_PROG_TYPE_SCHED_CLS, + &obj, &prog_fd); + if (CHECK_FAIL(err)) + return; + + prog = bpf_object__find_program_by_title(obj, "classifier"); + if (CHECK_FAIL(!prog)) + goto out; + + main_fd = bpf_program__fd(prog); + if (CHECK_FAIL(main_fd < 0)) + goto out; + + prog_array = bpf_object__find_map_by_name(obj, "jmp_table"); + if (CHECK_FAIL(!prog_array)) + goto out; + + map_fd = bpf_map__fd(prog_array); + if (CHECK_FAIL(map_fd < 0)) + goto out; + + /* nop -> jmp */ + for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) { + 
snprintf(prog_name, sizeof(prog_name), "classifier/%i", i); + + prog = bpf_object__find_program_by_title(obj, prog_name); + if (CHECK_FAIL(!prog)) + goto out; + + prog_fd = bpf_program__fd(prog); + if (CHECK_FAIL(prog_fd < 0)) + goto out; + + err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY); + if (CHECK_FAIL(err)) + goto out; + } + + err = bpf_prog_test_run(main_fd, 1, &pkt_v4, sizeof(pkt_v4), 0, + 0, &retval, &duration); + CHECK(err || retval != 1, "tailcall", + "err %d errno %d retval %d\n", err, errno, retval); + + /* jmp -> nop, call subprog that will do tailcall */ + i = 1; + err = bpf_map_delete_elem(map_fd, &i); + if (CHECK_FAIL(err)) + goto out; + + err = bpf_prog_test_run(main_fd, 1, &pkt_v4, sizeof(pkt_v4), 0, + 0, &retval, &duration); + CHECK(err || retval != 0, "tailcall", "err %d errno %d retval %d\n", + err, errno, retval); + + /* make sure that subprog can access ctx and entry prog that + * called this subprog can properly return + */ + i = 0; + err = bpf_map_delete_elem(map_fd, &i); + if (CHECK_FAIL(err)) + goto out; + + err = bpf_prog_test_run(main_fd, 1, &pkt_v4, sizeof(pkt_v4), 0, + 0, &retval, &duration); + CHECK(err || retval != sizeof(pkt_v4) * 2, + "tailcall", "err %d errno %d retval %d\n", + err, errno, retval); +out: + bpf_object__close(obj); +} + +/* test_tailcall_bpf2bpf_2 checks that the count value of the tail call limit + * enforcement matches with expectations when tailcall is preceded with + * bpf2bpf call. 
+ */ +static void test_tailcall_bpf2bpf_2(void) +{ + int err, map_fd, prog_fd, main_fd, data_fd, i, val; + struct bpf_map *prog_array, *data_map; + struct bpf_program *prog; + struct bpf_object *obj; + __u32 retval, duration; + char buff[128] = {}; + + err = bpf_prog_load("tailcall_bpf2bpf2.o", BPF_PROG_TYPE_SCHED_CLS, + &obj, &prog_fd); + if (CHECK_FAIL(err)) + return; + + prog = bpf_object__find_program_by_title(obj, "classifier"); + if (CHECK_FAIL(!prog)) + goto out; + + main_fd = bpf_program__fd(prog); + if (CHECK_FAIL(main_fd < 0)) + goto out; + + prog_array = bpf_object__find_map_by_name(obj, "jmp_table"); + if (CHECK_FAIL(!prog_array)) + goto out; + + map_fd = bpf_map__fd(prog_array); + if (CHECK_FAIL(map_fd < 0)) + goto out; + + prog = bpf_object__find_program_by_title(obj, "classifier/0"); + if (CHECK_FAIL(!prog)) + goto out; + + prog_fd = bpf_program__fd(prog); + if (CHECK_FAIL(prog_fd < 0)) + goto out; + + i = 0; + err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY); + if (CHECK_FAIL(err)) + goto out; + + err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0, + &duration, &retval, NULL); + CHECK(err || retval != 1, "tailcall", "err %d errno %d retval %d\n", + err, errno, retval); + + data_map = bpf_object__find_map_by_name(obj, "tailcall.bss"); + if (CHECK_FAIL(!data_map || !bpf_map__is_internal(data_map))) + return; + + data_fd = bpf_map__fd(data_map); + if (CHECK_FAIL(map_fd < 0)) + return; + + i = 0; + err = bpf_map_lookup_elem(data_fd, &i, &val); + CHECK(err || val != 33, "tailcall count", "err %d errno %d count %d\n", + err, errno, val); + + i = 0; + err = bpf_map_delete_elem(map_fd, &i); + if (CHECK_FAIL(err)) + goto out; + + err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0, + &duration, &retval, NULL); + CHECK(err || retval != 0, "tailcall", "err %d errno %d retval %d\n", + err, errno, retval); +out: + bpf_object__close(obj); +} + +/* test_tailcall_bpf2bpf_3 checks that non-trivial amount of stack (up to + * 256 bytes) can be 
used within bpf subprograms that have the tailcalls + * in them + */ +static void test_tailcall_bpf2bpf_3(void) +{ + int err, map_fd, prog_fd, main_fd, i; + struct bpf_map *prog_array; + struct bpf_program *prog; + struct bpf_object *obj; + __u32 retval, duration; + char prog_name[32]; + + err = bpf_prog_load("tailcall_bpf2bpf3.o", BPF_PROG_TYPE_SCHED_CLS, + &obj, &prog_fd); + if (CHECK_FAIL(err)) + return; + + prog = bpf_object__find_program_by_title(obj, "classifier"); + if (CHECK_FAIL(!prog)) + goto out; + + main_fd = bpf_program__fd(prog); + if (CHECK_FAIL(main_fd < 0)) + goto out; + + prog_array = bpf_object__find_map_by_name(obj, "jmp_table"); + if (CHECK_FAIL(!prog_array)) + goto out; + + map_fd = bpf_map__fd(prog_array); + if (CHECK_FAIL(map_fd < 0)) + goto out; + + for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) { + snprintf(prog_name, sizeof(prog_name), "classifier/%i", i); + + prog = bpf_object__find_program_by_title(obj, prog_name); + if (CHECK_FAIL(!prog)) + goto out; + + prog_fd = bpf_program__fd(prog); + if (CHECK_FAIL(prog_fd < 0)) + goto out; + + err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY); + if (CHECK_FAIL(err)) + goto out; + } + + err = bpf_prog_test_run(main_fd, 1, &pkt_v4, sizeof(pkt_v4), 0, + &duration, &retval, NULL); + CHECK(err || retval != sizeof(pkt_v4) * 3, + "tailcall", "err %d errno %d retval %d\n", + err, errno, retval); + + i = 1; + err = bpf_map_delete_elem(map_fd, &i); + if (CHECK_FAIL(err)) + goto out; + + err = bpf_prog_test_run(main_fd, 1, &pkt_v4, sizeof(pkt_v4), 0, + &duration, &retval, NULL); + CHECK(err || retval != sizeof(pkt_v4), + "tailcall", "err %d errno %d retval %d\n", + err, errno, retval); + + i = 0; + err = bpf_map_delete_elem(map_fd, &i); + if (CHECK_FAIL(err)) + goto out; + + err = bpf_prog_test_run(main_fd, 1, &pkt_v4, sizeof(pkt_v4), 0, + &duration, &retval, NULL); + CHECK(err || retval != sizeof(pkt_v4) * 2, + "tailcall", "err %d errno %d retval %d\n", + err, errno, retval); +out: + 
bpf_object__close(obj); +} + +/* test_tailcall_bpf2bpf_4 checks that tailcall counter is correctly preserved + * across tailcalls combined with bpf2bpf calls. for making sure that tailcall + * counter behaves correctly, bpf program will go through following flow: + * + * entry -> entry_subprog -> tailcall0 -> bpf_func0 -> subprog0 -> + * -> tailcall1 -> bpf_func1 -> subprog1 -> tailcall2 -> bpf_func2 -> + * subprog2 [here bump global counter] --------^ + * + * We go through first two tailcalls and start counting from the subprog2 where + * the loop begins. At the end of the test make sure that the global counter is + * equal to 31, because tailcall counter includes the first two tailcalls + * whereas global counter is incremented only on loop presented on flow above. + */ +static void test_tailcall_bpf2bpf_4(void) +{ + int err, map_fd, prog_fd, main_fd, data_fd, i, val; + struct bpf_map *prog_array, *data_map; + struct bpf_program *prog; + struct bpf_object *obj; + __u32 retval, duration; + char prog_name[32]; + + err = bpf_prog_load("tailcall_bpf2bpf4.o", BPF_PROG_TYPE_SCHED_CLS, + &obj, &prog_fd); + if (CHECK_FAIL(err)) + return; + + prog = bpf_object__find_program_by_title(obj, "classifier"); + if (CHECK_FAIL(!prog)) + goto out; + + main_fd = bpf_program__fd(prog); + if (CHECK_FAIL(main_fd < 0)) + goto out; + + prog_array = bpf_object__find_map_by_name(obj, "jmp_table"); + if (CHECK_FAIL(!prog_array)) + goto out; + + map_fd = bpf_map__fd(prog_array); + if (CHECK_FAIL(map_fd < 0)) + goto out; + + for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) { + snprintf(prog_name, sizeof(prog_name), "classifier/%i", i); + + prog = bpf_object__find_program_by_title(obj, prog_name); + if (CHECK_FAIL(!prog)) + goto out; + + prog_fd = bpf_program__fd(prog); + if (CHECK_FAIL(prog_fd < 0)) + goto out; + + err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY); + if (CHECK_FAIL(err)) + goto out; + } + + err = bpf_prog_test_run(main_fd, 1, &pkt_v4, sizeof(pkt_v4), 0, + 
&duration, &retval, NULL); + CHECK(err || retval != sizeof(pkt_v4) * 3, "tailcall", "err %d errno %d retval %d\n", + err, errno, retval); + + data_map = bpf_object__find_map_by_name(obj, "tailcall.bss"); + if (CHECK_FAIL(!data_map || !bpf_map__is_internal(data_map))) + goto out; /* obj is loaded by now: close it rather than leak it */ + + data_fd = bpf_map__fd(data_map); + if (CHECK_FAIL(data_fd < 0)) /* validate the fd just fetched, not map_fd */ + goto out; + + i = 0; + err = bpf_map_lookup_elem(data_fd, &i, &val); + CHECK(err || val != 31, "tailcall count", "err %d errno %d count %d\n", + err, errno, val); + +out: + bpf_object__close(obj); +} + void test_tailcalls(void) { if (test__start_subtest("tailcall_1")) @@ -484,4 +808,12 @@ void test_tailcalls(void) test_tailcall_4(); if (test__start_subtest("tailcall_5")) test_tailcall_5(); + if (test__start_subtest("tailcall_bpf2bpf_1")) + test_tailcall_bpf2bpf_1(); + if (test__start_subtest("tailcall_bpf2bpf_2")) + test_tailcall_bpf2bpf_2(); + if (test__start_subtest("tailcall_bpf2bpf_3")) + test_tailcall_bpf2bpf_3(); + if (test__start_subtest("tailcall_bpf2bpf_4")) + test_tailcall_bpf2bpf_4(); } diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_noinline.c b/tools/testing/selftests/bpf/prog_tests/xdp_noinline.c index f284f72158ef..a1f06424cf83 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_noinline.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_noinline.c @@ -1,11 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> #include <network_helpers.h> +#include "test_xdp_noinline.skel.h" void test_xdp_noinline(void) { - const char *file = "./test_xdp_noinline.o"; unsigned int nr_cpus = bpf_num_possible_cpus(); + struct test_xdp_noinline *skel; struct vip key = {.protocol = 6}; struct vip_meta { __u32 flags; @@ -25,58 +26,42 @@ void test_xdp_noinline(void) } real_def = {.dst = MAGIC_VAL}; __u32 ch_key = 11, real_num = 3; __u32 duration, retval, size; - int err, i, prog_fd, map_fd; + int err, i; __u64 bytes = 0, pkts = 0; - struct bpf_object *obj; char buf[128]; u32 *magic = (u32 *)buf; - err = 
bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd); - if (CHECK_FAIL(err)) + skel = test_xdp_noinline__open_and_load(); + if (CHECK(!skel, "skel_open_and_load", "failed\n")) return; - map_fd = bpf_find_map(__func__, obj, "vip_map"); - if (map_fd < 0) - goto out; - bpf_map_update_elem(map_fd, &key, &value, 0); + bpf_map_update_elem(bpf_map__fd(skel->maps.vip_map), &key, &value, 0); + bpf_map_update_elem(bpf_map__fd(skel->maps.ch_rings), &ch_key, &real_num, 0); + bpf_map_update_elem(bpf_map__fd(skel->maps.reals), &real_num, &real_def, 0); - map_fd = bpf_find_map(__func__, obj, "ch_rings"); - if (map_fd < 0) - goto out; - bpf_map_update_elem(map_fd, &ch_key, &real_num, 0); - - map_fd = bpf_find_map(__func__, obj, "reals"); - if (map_fd < 0) - goto out; - bpf_map_update_elem(map_fd, &real_num, &real_def, 0); - - err = bpf_prog_test_run(prog_fd, NUM_ITER, &pkt_v4, sizeof(pkt_v4), + err = bpf_prog_test_run(bpf_program__fd(skel->progs.balancer_ingress_v4), + NUM_ITER, &pkt_v4, sizeof(pkt_v4), buf, &size, &retval, &duration); CHECK(err || retval != 1 || size != 54 || *magic != MAGIC_VAL, "ipv4", "err %d errno %d retval %d size %d magic %x\n", err, errno, retval, size, *magic); - err = bpf_prog_test_run(prog_fd, NUM_ITER, &pkt_v6, sizeof(pkt_v6), + err = bpf_prog_test_run(bpf_program__fd(skel->progs.balancer_ingress_v6), + NUM_ITER, &pkt_v6, sizeof(pkt_v6), buf, &size, &retval, &duration); CHECK(err || retval != 1 || size != 74 || *magic != MAGIC_VAL, "ipv6", "err %d errno %d retval %d size %d magic %x\n", err, errno, retval, size, *magic); - map_fd = bpf_find_map(__func__, obj, "stats"); - if (map_fd < 0) - goto out; - bpf_map_lookup_elem(map_fd, &stats_key, stats); + bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), &stats_key, stats); for (i = 0; i < nr_cpus; i++) { bytes += stats[i].bytes; pkts += stats[i].pkts; } - if (CHECK_FAIL(bytes != MAGIC_BYTES * NUM_ITER * 2 || - pkts != NUM_ITER * 2)) { - printf("test_xdp_noinline:FAIL:stats %lld %lld\n", - bytes, pkts); - 
} -out: - bpf_object__close(obj); + CHECK(bytes != MAGIC_BYTES * NUM_ITER * 2 || pkts != NUM_ITER * 2, + "stats", "bytes %lld pkts %lld\n", + (unsigned long long)bytes, (unsigned long long)pkts); + test_xdp_noinline__destroy(skel); } diff --git a/tools/testing/selftests/bpf/progs/bpf_iter.h b/tools/testing/selftests/bpf/progs/bpf_iter.h index c196280df90d..df682af75510 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter.h +++ b/tools/testing/selftests/bpf/progs/bpf_iter.h @@ -13,6 +13,7 @@ #define udp6_sock udp6_sock___not_used #define bpf_iter__bpf_map_elem bpf_iter__bpf_map_elem___not_used #define bpf_iter__bpf_sk_storage_map bpf_iter__bpf_sk_storage_map___not_used +#define bpf_iter__sockmap bpf_iter__sockmap___not_used #include "vmlinux.h" #undef bpf_iter_meta #undef bpf_iter__bpf_map @@ -26,6 +27,7 @@ #undef udp6_sock #undef bpf_iter__bpf_map_elem #undef bpf_iter__bpf_sk_storage_map +#undef bpf_iter__sockmap struct bpf_iter_meta { struct seq_file *seq; @@ -96,3 +98,10 @@ struct bpf_iter__bpf_sk_storage_map { struct sock *sk; void *value; }; + +struct bpf_iter__sockmap { + struct bpf_iter_meta *meta; + struct bpf_map *map; + void *key; + struct sock *sk; +}; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_sockmap.c b/tools/testing/selftests/bpf/progs/bpf_iter_sockmap.c new file mode 100644 index 000000000000..0e27f73dd803 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_sockmap.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Cloudflare */ +#include "bpf_iter.h" +#include "bpf_tracing_net.h" +#include "bpf_iter_sockmap.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <errno.h> + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); + __uint(max_entries, SOCKMAP_MAX_ENTRIES); + __type(key, __u32); + __type(value, __u64); +} sockmap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_SOCKHASH); + __uint(max_entries, SOCKMAP_MAX_ENTRIES); + 
__type(key, __u32); + __type(value, __u64); +} sockhash SEC(".maps"); + +__u32 elems = 0; +__u32 socks = 0; + +SEC("iter/sockmap") +int count_elems(struct bpf_iter__sockmap *ctx) +{ + struct sock *sk = ctx->sk; + __u32 tmp, *key = ctx->key; + int ret; + + if (key) + elems++; + + if (sk) + socks++; + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_sockmap.h b/tools/testing/selftests/bpf/progs/bpf_iter_sockmap.h new file mode 100644 index 000000000000..35a675d13c0f --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_sockmap.h @@ -0,0 +1,3 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#define SOCKMAP_MAX_ENTRIES (64) diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c index 8b787baa2654..b2f7c7c5f952 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c @@ -6,6 +6,9 @@ char _license[] SEC("license") = "GPL"; +int count = 0; +int tgid = 0; + SEC("iter/task_file") int dump_task_file(struct bpf_iter__task_file *ctx) { @@ -17,8 +20,13 @@ int dump_task_file(struct bpf_iter__task_file *ctx) if (task == (void *)0 || file == (void *)0) return 0; - if (ctx->meta->seq_num == 0) + if (ctx->meta->seq_num == 0) { + count = 0; BPF_SEQ_PRINTF(seq, " tgid gid fd file\n"); + } + + if (tgid == task->tgid && task->tgid != task->pid) + count++; BPF_SEQ_PRINTF(seq, "%8d %8d %8d %lx\n", task->tgid, task->pid, fd, (long)file->f_op); diff --git a/tools/testing/selftests/bpf/progs/map_ptr_kern.c b/tools/testing/selftests/bpf/progs/map_ptr_kern.c index 982a2d8aa844..c325405751e2 100644 --- a/tools/testing/selftests/bpf/progs/map_ptr_kern.c +++ b/tools/testing/selftests/bpf/progs/map_ptr_kern.c @@ -82,6 +82,14 @@ static inline int check_default(struct bpf_map *indirect, return 1; } +static __noinline int +check_default_noinline(struct bpf_map *indirect, struct bpf_map *direct) +{ + VERIFY(check(indirect, direct, 
sizeof(__u32), sizeof(__u32), + MAX_ENTRIES)); + return 1; +} + typedef struct { int counter; } atomic_t; @@ -107,7 +115,7 @@ static inline int check_hash(void) struct bpf_map *map = (struct bpf_map *)&m_hash; int i; - VERIFY(check_default(&hash->map, map)); + VERIFY(check_default_noinline(&hash->map, map)); VERIFY(hash->n_buckets == MAX_ENTRIES); VERIFY(hash->elem_size == 64); diff --git a/tools/testing/selftests/bpf/progs/metadata_unused.c b/tools/testing/selftests/bpf/progs/metadata_unused.c new file mode 100644 index 000000000000..672a0d19f8d0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/metadata_unused.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +volatile const char bpf_metadata_a[] SEC(".rodata") = "foo"; +volatile const int bpf_metadata_b SEC(".rodata") = 1; + +SEC("cgroup_skb/egress") +int prog(struct xdp_md *ctx) +{ + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/metadata_used.c b/tools/testing/selftests/bpf/progs/metadata_used.c new file mode 100644 index 000000000000..b7198e65383d --- /dev/null +++ b/tools/testing/selftests/bpf/progs/metadata_used.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +volatile const char bpf_metadata_a[] SEC(".rodata") = "bar"; +volatile const int bpf_metadata_b SEC(".rodata") = 2; + +SEC("cgroup_skb/egress") +int prog(struct xdp_md *ctx) +{ + return bpf_metadata_b ? 
1 : 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/pyperf.h b/tools/testing/selftests/bpf/progs/pyperf.h index cc615b82b56e..2fb7adafb6b6 100644 --- a/tools/testing/selftests/bpf/progs/pyperf.h +++ b/tools/testing/selftests/bpf/progs/pyperf.h @@ -67,7 +67,12 @@ typedef struct { void* co_name; // PyCodeObject.co_name } FrameData; -static __always_inline void *get_thread_state(void *tls_base, PidData *pidData) +#ifdef SUBPROGS +__noinline +#else +__always_inline +#endif +static void *get_thread_state(void *tls_base, PidData *pidData) { void* thread_state; int key; @@ -155,7 +160,9 @@ struct { } stackmap SEC(".maps"); #ifdef GLOBAL_FUNC -__attribute__((noinline)) +__noinline +#elif defined(SUBPROGS) +static __noinline #else static __always_inline #endif diff --git a/tools/testing/selftests/bpf/progs/pyperf_subprogs.c b/tools/testing/selftests/bpf/progs/pyperf_subprogs.c new file mode 100644 index 000000000000..60e27a7f0cca --- /dev/null +++ b/tools/testing/selftests/bpf/progs/pyperf_subprogs.c @@ -0,0 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#define STACK_MAX_LEN 50 +#define SUBPROGS +#include "pyperf.h" diff --git a/tools/testing/selftests/bpf/progs/strobemeta.h b/tools/testing/selftests/bpf/progs/strobemeta.h index ad61b722a9de..7de534f38c3f 100644 --- a/tools/testing/selftests/bpf/progs/strobemeta.h +++ b/tools/testing/selftests/bpf/progs/strobemeta.h @@ -266,8 +266,12 @@ struct tls_index { uint64_t offset; }; -static __always_inline void *calc_location(struct strobe_value_loc *loc, - void *tls_base) +#ifdef SUBPROGS +__noinline +#else +__always_inline +#endif +static void *calc_location(struct strobe_value_loc *loc, void *tls_base) { /* * tls_mode value is: @@ -327,10 +331,15 @@ static __always_inline void *calc_location(struct strobe_value_loc *loc, : NULL; } -static __always_inline void read_int_var(struct strobemeta_cfg *cfg, - size_t idx, void *tls_base, - struct 
strobe_value_generic *value, - struct strobemeta_payload *data) +#ifdef SUBPROGS +__noinline +#else +__always_inline +#endif +static void read_int_var(struct strobemeta_cfg *cfg, + size_t idx, void *tls_base, + struct strobe_value_generic *value, + struct strobemeta_payload *data) { void *location = calc_location(&cfg->int_locs[idx], tls_base); if (!location) @@ -440,8 +449,13 @@ static __always_inline void *read_map_var(struct strobemeta_cfg *cfg, * read_strobe_meta returns NULL, if no metadata was read; otherwise returns * pointer to *right after* payload ends */ -static __always_inline void *read_strobe_meta(struct task_struct *task, - struct strobemeta_payload *data) +#ifdef SUBPROGS +__noinline +#else +__always_inline +#endif +static void *read_strobe_meta(struct task_struct *task, + struct strobemeta_payload *data) { pid_t pid = bpf_get_current_pid_tgid() >> 32; struct strobe_value_generic value = {0}; diff --git a/tools/testing/selftests/bpf/progs/strobemeta_subprogs.c b/tools/testing/selftests/bpf/progs/strobemeta_subprogs.c new file mode 100644 index 000000000000..b6c01f8fc559 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/strobemeta_subprogs.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +// Copyright (c) 2019 Facebook + +#define STROBE_MAX_INTS 2 +#define STROBE_MAX_STRS 25 +#define STROBE_MAX_MAPS 13 +#define STROBE_MAX_MAP_ENTRIES 20 +#define NO_UNROLL +#define SUBPROGS +#include "strobemeta.h" diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf1.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf1.c new file mode 100644 index 000000000000..b5d9c8e778ae --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf1.c @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 2); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} jmp_table 
SEC(".maps"); + +#define TAIL_FUNC(x) \ + SEC("classifier/" #x) \ + int bpf_func_##x(struct __sk_buff *skb) \ + { \ + return x; \ + } +TAIL_FUNC(0) +TAIL_FUNC(1) + +static __noinline +int subprog_tail(struct __sk_buff *skb) +{ + bpf_tail_call(skb, &jmp_table, 0); + + return skb->len * 2; +} + +SEC("classifier") +int entry(struct __sk_buff *skb) +{ + bpf_tail_call(skb, &jmp_table, 1); + + return subprog_tail(skb); +} + +char __license[] SEC("license") = "GPL"; +int _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf2.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf2.c new file mode 100644 index 000000000000..a004ab28ce28 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf2.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include "bpf_legacy.h" + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} jmp_table SEC(".maps"); + +static __noinline +int subprog_tail(struct __sk_buff *skb) +{ + if (load_byte(skb, 0)) + bpf_tail_call(skb, &jmp_table, 1); + else + bpf_tail_call(skb, &jmp_table, 0); + return 1; +} + +static volatile int count; + +SEC("classifier/0") +int bpf_func_0(struct __sk_buff *skb) +{ + count++; + return subprog_tail(skb); +} + +SEC("classifier") +int entry(struct __sk_buff *skb) +{ + bpf_tail_call(skb, &jmp_table, 0); + + return 0; +} + +char __license[] SEC("license") = "GPL"; +int _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf3.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf3.c new file mode 100644 index 000000000000..96dbef2b6b7c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf3.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include "bpf_legacy.h" + +struct { + __uint(type, 
BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 2); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} jmp_table SEC(".maps"); + +__noinline +int subprog_tail2(struct __sk_buff *skb) +{ + volatile char arr[64] = {}; + + if (load_word(skb, 0) || load_half(skb, 0)) + bpf_tail_call(skb, &jmp_table, 10); + else + bpf_tail_call(skb, &jmp_table, 1); + + return skb->len; +} + +static __noinline +int subprog_tail(struct __sk_buff *skb) +{ + volatile char arr[64] = {}; + + bpf_tail_call(skb, &jmp_table, 0); + + return skb->len * 2; +} + +SEC("classifier/0") +int bpf_func_0(struct __sk_buff *skb) +{ + volatile char arr[128] = {}; + + return subprog_tail2(skb); +} + +SEC("classifier/1") +int bpf_func_1(struct __sk_buff *skb) +{ + volatile char arr[128] = {}; + + return skb->len * 3; +} + +SEC("classifier") +int entry(struct __sk_buff *skb) +{ + volatile char arr[128] = {}; + + return subprog_tail(skb); +} + +char __license[] SEC("license") = "GPL"; +int _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c new file mode 100644 index 000000000000..98b40a95bc67 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 3); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} jmp_table SEC(".maps"); + +static volatile int count; + +__noinline +int subprog_tail_2(struct __sk_buff *skb) +{ + bpf_tail_call(skb, &jmp_table, 2); + return skb->len * 3; +} + +__noinline +int subprog_tail_1(struct __sk_buff *skb) +{ + bpf_tail_call(skb, &jmp_table, 1); + return skb->len * 2; +} + +__noinline +int subprog_tail(struct __sk_buff *skb) +{ + bpf_tail_call(skb, &jmp_table, 0); + return skb->len; +} + +SEC("classifier/1") +int bpf_func_1(struct __sk_buff 
*skb) +{ + return subprog_tail_2(skb); +} + +SEC("classifier/2") +int bpf_func_2(struct __sk_buff *skb) +{ + count++; + return subprog_tail_2(skb); +} + +SEC("classifier/0") +int bpf_func_0(struct __sk_buff *skb) +{ + return subprog_tail_1(skb); +} + +SEC("classifier") +int entry(struct __sk_buff *skb) +{ + return subprog_tail(skb); +} + +char __license[] SEC("license") = "GPL"; +int _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect.c b/tools/testing/selftests/bpf/progs/test_cls_redirect.c index f0b72e86bee5..c9f8464996ea 100644 --- a/tools/testing/selftests/bpf/progs/test_cls_redirect.c +++ b/tools/testing/selftests/bpf/progs/test_cls_redirect.c @@ -22,6 +22,12 @@ #include "test_cls_redirect.h" +#ifdef SUBPROGS +#define INLINING __noinline +#else +#define INLINING __always_inline +#endif + #define offsetofend(TYPE, MEMBER) \ (offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER))) @@ -125,7 +131,7 @@ typedef struct buf { uint8_t *const tail; } buf_t; -static size_t buf_off(const buf_t *buf) +static __always_inline size_t buf_off(const buf_t *buf) { /* Clang seems to optimize constructs like * a - b + c @@ -145,7 +151,7 @@ static size_t buf_off(const buf_t *buf) return off; } -static bool buf_copy(buf_t *buf, void *dst, size_t len) +static __always_inline bool buf_copy(buf_t *buf, void *dst, size_t len) { if (bpf_skb_load_bytes(buf->skb, buf_off(buf), dst, len)) { return false; @@ -155,7 +161,7 @@ static bool buf_copy(buf_t *buf, void *dst, size_t len) return true; } -static bool buf_skip(buf_t *buf, const size_t len) +static __always_inline bool buf_skip(buf_t *buf, const size_t len) { /* Check whether off + len is valid in the non-linear part. */ if (buf_off(buf) + len > buf->skb->len) { @@ -173,7 +179,7 @@ static bool buf_skip(buf_t *buf, const size_t len) * If scratch is not NULL, the function will attempt to load non-linear * data via bpf_skb_load_bytes. On success, scratch is returned. 
*/ -static void *buf_assign(buf_t *buf, const size_t len, void *scratch) +static __always_inline void *buf_assign(buf_t *buf, const size_t len, void *scratch) { if (buf->head + len > buf->tail) { if (scratch == NULL) { @@ -188,7 +194,7 @@ static void *buf_assign(buf_t *buf, const size_t len, void *scratch) return ptr; } -static bool pkt_skip_ipv4_options(buf_t *buf, const struct iphdr *ipv4) +static INLINING bool pkt_skip_ipv4_options(buf_t *buf, const struct iphdr *ipv4) { if (ipv4->ihl <= 5) { return true; @@ -197,13 +203,13 @@ static bool pkt_skip_ipv4_options(buf_t *buf, const struct iphdr *ipv4) return buf_skip(buf, (ipv4->ihl - 5) * 4); } -static bool ipv4_is_fragment(const struct iphdr *ip) +static INLINING bool ipv4_is_fragment(const struct iphdr *ip) { uint16_t frag_off = ip->frag_off & bpf_htons(IP_OFFSET_MASK); return (ip->frag_off & bpf_htons(IP_MF)) != 0 || frag_off > 0; } -static struct iphdr *pkt_parse_ipv4(buf_t *pkt, struct iphdr *scratch) +static __always_inline struct iphdr *pkt_parse_ipv4(buf_t *pkt, struct iphdr *scratch) { struct iphdr *ipv4 = buf_assign(pkt, sizeof(*ipv4), scratch); if (ipv4 == NULL) { @@ -222,7 +228,7 @@ static struct iphdr *pkt_parse_ipv4(buf_t *pkt, struct iphdr *scratch) } /* Parse the L4 ports from a packet, assuming a layout like TCP or UDP. */ -static bool pkt_parse_icmp_l4_ports(buf_t *pkt, flow_ports_t *ports) +static INLINING bool pkt_parse_icmp_l4_ports(buf_t *pkt, flow_ports_t *ports) { if (!buf_copy(pkt, ports, sizeof(*ports))) { return false; @@ -237,7 +243,7 @@ static bool pkt_parse_icmp_l4_ports(buf_t *pkt, flow_ports_t *ports) return true; } -static uint16_t pkt_checksum_fold(uint32_t csum) +static INLINING uint16_t pkt_checksum_fold(uint32_t csum) { /* The highest reasonable value for an IPv4 header * checksum requires two folds, so we just do that always. 
@@ -247,7 +253,7 @@ static uint16_t pkt_checksum_fold(uint32_t csum) return (uint16_t)~csum; } -static void pkt_ipv4_checksum(struct iphdr *iph) +static INLINING void pkt_ipv4_checksum(struct iphdr *iph) { iph->check = 0; @@ -268,10 +274,11 @@ static void pkt_ipv4_checksum(struct iphdr *iph) iph->check = pkt_checksum_fold(acc); } -static bool pkt_skip_ipv6_extension_headers(buf_t *pkt, - const struct ipv6hdr *ipv6, - uint8_t *upper_proto, - bool *is_fragment) +static INLINING +bool pkt_skip_ipv6_extension_headers(buf_t *pkt, + const struct ipv6hdr *ipv6, + uint8_t *upper_proto, + bool *is_fragment) { /* We understand five extension headers. * https://tools.ietf.org/html/rfc8200#section-4.1 states that all @@ -336,7 +343,7 @@ static bool pkt_skip_ipv6_extension_headers(buf_t *pkt, * scratch is allocated on the stack. However, this usage should be safe since * it's the callers stack after all. */ -static inline __attribute__((__always_inline__)) struct ipv6hdr * +static __always_inline struct ipv6hdr * pkt_parse_ipv6(buf_t *pkt, struct ipv6hdr *scratch, uint8_t *proto, bool *is_fragment) { @@ -354,20 +361,20 @@ pkt_parse_ipv6(buf_t *pkt, struct ipv6hdr *scratch, uint8_t *proto, /* Global metrics, per CPU */ -struct bpf_map_def metrics_map SEC("maps") = { - .type = BPF_MAP_TYPE_PERCPU_ARRAY, - .key_size = sizeof(unsigned int), - .value_size = sizeof(metrics_t), - .max_entries = 1, -}; - -static metrics_t *get_global_metrics(void) +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __type(key, unsigned int); + __type(value, metrics_t); +} metrics_map SEC(".maps"); + +static INLINING metrics_t *get_global_metrics(void) { uint64_t key = 0; return bpf_map_lookup_elem(&metrics_map, &key); } -static ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap) +static INLINING ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap) { const int payload_off = sizeof(*encap) + @@ -388,8 +395,8 @@ static ret_t accept_locally(struct 
__sk_buff *skb, encap_headers_t *encap) return bpf_redirect(skb->ifindex, BPF_F_INGRESS); } -static ret_t forward_with_gre(struct __sk_buff *skb, encap_headers_t *encap, - struct in_addr *next_hop, metrics_t *metrics) +static INLINING ret_t forward_with_gre(struct __sk_buff *skb, encap_headers_t *encap, + struct in_addr *next_hop, metrics_t *metrics) { metrics->forwarded_packets_total_gre++; @@ -509,8 +516,8 @@ static ret_t forward_with_gre(struct __sk_buff *skb, encap_headers_t *encap, return bpf_redirect(skb->ifindex, 0); } -static ret_t forward_to_next_hop(struct __sk_buff *skb, encap_headers_t *encap, - struct in_addr *next_hop, metrics_t *metrics) +static INLINING ret_t forward_to_next_hop(struct __sk_buff *skb, encap_headers_t *encap, + struct in_addr *next_hop, metrics_t *metrics) { /* swap L2 addresses */ /* This assumes that packets are received from a router. @@ -546,7 +553,7 @@ static ret_t forward_to_next_hop(struct __sk_buff *skb, encap_headers_t *encap, return bpf_redirect(skb->ifindex, 0); } -static ret_t skip_next_hops(buf_t *pkt, int n) +static INLINING ret_t skip_next_hops(buf_t *pkt, int n) { switch (n) { case 1: @@ -566,8 +573,8 @@ static ret_t skip_next_hops(buf_t *pkt, int n) * pkt is positioned just after the variable length GLB header * iff the call is successful. */ -static ret_t get_next_hop(buf_t *pkt, encap_headers_t *encap, - struct in_addr *next_hop) +static INLINING ret_t get_next_hop(buf_t *pkt, encap_headers_t *encap, + struct in_addr *next_hop) { if (encap->unigue.next_hop > encap->unigue.hop_count) { return TC_ACT_SHOT; @@ -601,8 +608,8 @@ static ret_t get_next_hop(buf_t *pkt, encap_headers_t *encap, * return value, and calling code works while still being "generic" to * IPv4 and IPv6. 
*/ -static uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph, - uint64_t iphlen, uint16_t sport, uint16_t dport) +static INLINING uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph, + uint64_t iphlen, uint16_t sport, uint16_t dport) { switch (iphlen) { case sizeof(struct iphdr): { @@ -630,9 +637,9 @@ static uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph, } } -static verdict_t classify_tcp(struct __sk_buff *skb, - struct bpf_sock_tuple *tuple, uint64_t tuplen, - void *iph, struct tcphdr *tcp) +static INLINING verdict_t classify_tcp(struct __sk_buff *skb, + struct bpf_sock_tuple *tuple, uint64_t tuplen, + void *iph, struct tcphdr *tcp) { struct bpf_sock *sk = bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0); @@ -663,8 +670,8 @@ static verdict_t classify_tcp(struct __sk_buff *skb, return UNKNOWN; } -static verdict_t classify_udp(struct __sk_buff *skb, - struct bpf_sock_tuple *tuple, uint64_t tuplen) +static INLINING verdict_t classify_udp(struct __sk_buff *skb, + struct bpf_sock_tuple *tuple, uint64_t tuplen) { struct bpf_sock *sk = bpf_sk_lookup_udp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0); @@ -681,9 +688,9 @@ static verdict_t classify_udp(struct __sk_buff *skb, return UNKNOWN; } -static verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto, - struct bpf_sock_tuple *tuple, uint64_t tuplen, - metrics_t *metrics) +static INLINING verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto, + struct bpf_sock_tuple *tuple, uint64_t tuplen, + metrics_t *metrics) { switch (proto) { case IPPROTO_TCP: @@ -698,7 +705,7 @@ static verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto, } } -static verdict_t process_icmpv4(buf_t *pkt, metrics_t *metrics) +static INLINING verdict_t process_icmpv4(buf_t *pkt, metrics_t *metrics) { struct icmphdr icmp; if (!buf_copy(pkt, &icmp, sizeof(icmp))) { @@ -745,7 +752,7 @@ static verdict_t process_icmpv4(buf_t *pkt, metrics_t *metrics) sizeof(tuple.ipv4), metrics); } -static 
verdict_t process_icmpv6(buf_t *pkt, metrics_t *metrics) +static INLINING verdict_t process_icmpv6(buf_t *pkt, metrics_t *metrics) { struct icmp6hdr icmp6; if (!buf_copy(pkt, &icmp6, sizeof(icmp6))) { @@ -797,8 +804,8 @@ static verdict_t process_icmpv6(buf_t *pkt, metrics_t *metrics) metrics); } -static verdict_t process_tcp(buf_t *pkt, void *iph, uint64_t iphlen, - metrics_t *metrics) +static INLINING verdict_t process_tcp(buf_t *pkt, void *iph, uint64_t iphlen, + metrics_t *metrics) { metrics->l4_protocol_packets_total_tcp++; @@ -819,8 +826,8 @@ static verdict_t process_tcp(buf_t *pkt, void *iph, uint64_t iphlen, return classify_tcp(pkt->skb, &tuple, tuplen, iph, tcp); } -static verdict_t process_udp(buf_t *pkt, void *iph, uint64_t iphlen, - metrics_t *metrics) +static INLINING verdict_t process_udp(buf_t *pkt, void *iph, uint64_t iphlen, + metrics_t *metrics) { metrics->l4_protocol_packets_total_udp++; @@ -837,7 +844,7 @@ static verdict_t process_udp(buf_t *pkt, void *iph, uint64_t iphlen, return classify_udp(pkt->skb, &tuple, tuplen); } -static verdict_t process_ipv4(buf_t *pkt, metrics_t *metrics) +static INLINING verdict_t process_ipv4(buf_t *pkt, metrics_t *metrics) { metrics->l3_protocol_packets_total_ipv4++; @@ -874,7 +881,7 @@ static verdict_t process_ipv4(buf_t *pkt, metrics_t *metrics) } } -static verdict_t process_ipv6(buf_t *pkt, metrics_t *metrics) +static INLINING verdict_t process_ipv6(buf_t *pkt, metrics_t *metrics) { metrics->l3_protocol_packets_total_ipv6++; diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect_subprogs.c b/tools/testing/selftests/bpf/progs/test_cls_redirect_subprogs.c new file mode 100644 index 000000000000..eed26b70e3a2 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_cls_redirect_subprogs.c @@ -0,0 +1,2 @@ +#define SUBPROGS +#include "test_cls_redirect.c" diff --git a/tools/testing/selftests/bpf/progs/test_d_path.c b/tools/testing/selftests/bpf/progs/test_d_path.c index 61f007855649..84e1f883f97b 100644 
--- a/tools/testing/selftests/bpf/progs/test_d_path.c +++ b/tools/testing/selftests/bpf/progs/test_d_path.c @@ -15,7 +15,10 @@ char paths_close[MAX_FILES][MAX_PATH_LEN] = {}; int rets_stat[MAX_FILES] = {}; int rets_close[MAX_FILES] = {}; -SEC("fentry/vfs_getattr") +int called_stat = 0; +int called_close = 0; + +SEC("fentry/security_inode_getattr") int BPF_PROG(prog_stat, struct path *path, struct kstat *stat, __u32 request_mask, unsigned int query_flags) { @@ -23,6 +26,8 @@ int BPF_PROG(prog_stat, struct path *path, struct kstat *stat, __u32 cnt = cnt_stat; int ret; + called_stat = 1; + if (pid != my_pid) return 0; @@ -42,6 +47,8 @@ int BPF_PROG(prog_close, struct file *file, void *id) __u32 cnt = cnt_close; int ret; + called_close = 1; + if (pid != my_pid) return 0; diff --git a/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c b/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c index 28351936a438..b9e2753f4f91 100644 --- a/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c +++ b/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c @@ -17,9 +17,7 @@ #include "test_iptunnel_common.h" #include <bpf/bpf_endian.h> -int _version SEC("version") = 1; - -static __u32 rol32(__u32 word, unsigned int shift) +static __always_inline __u32 rol32(__u32 word, unsigned int shift) { return (word << shift) | (word >> ((-shift) & 31)); } @@ -52,7 +50,7 @@ static __u32 rol32(__u32 word, unsigned int shift) typedef unsigned int u32; -static u32 jhash(const void *key, u32 length, u32 initval) +static __noinline u32 jhash(const void *key, u32 length, u32 initval) { u32 a, b, c; const unsigned char *k = key; @@ -88,7 +86,7 @@ static u32 jhash(const void *key, u32 length, u32 initval) return c; } -static u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval) +static __noinline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval) { a += initval; b += initval; @@ -97,7 +95,7 @@ static u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval) return c; } -static u32 jhash_2words(u32 
a, u32 b, u32 initval) +static __noinline u32 jhash_2words(u32 a, u32 b, u32 initval) { return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2)); } @@ -200,8 +198,7 @@ struct { __type(value, struct ctl_value); } ctl_array SEC(".maps"); -static __u32 get_packet_hash(struct packet_description *pckt, - bool ipv6) +static __noinline __u32 get_packet_hash(struct packet_description *pckt, bool ipv6) { if (ipv6) return jhash_2words(jhash(pckt->srcv6, 16, MAX_VIPS), @@ -210,10 +207,10 @@ static __u32 get_packet_hash(struct packet_description *pckt, return jhash_2words(pckt->src, pckt->ports, CH_RINGS_SIZE); } -static bool get_packet_dst(struct real_definition **real, - struct packet_description *pckt, - struct vip_meta *vip_info, - bool is_ipv6) +static __noinline bool get_packet_dst(struct real_definition **real, + struct packet_description *pckt, + struct vip_meta *vip_info, + bool is_ipv6) { __u32 hash = get_packet_hash(pckt, is_ipv6); __u32 key = RING_SIZE * vip_info->vip_num + hash % RING_SIZE; @@ -233,8 +230,8 @@ static bool get_packet_dst(struct real_definition **real, return true; } -static int parse_icmpv6(void *data, void *data_end, __u64 off, - struct packet_description *pckt) +static __noinline int parse_icmpv6(void *data, void *data_end, __u64 off, + struct packet_description *pckt) { struct icmp6hdr *icmp_hdr; struct ipv6hdr *ip6h; @@ -255,8 +252,8 @@ static int parse_icmpv6(void *data, void *data_end, __u64 off, return TC_ACT_UNSPEC; } -static int parse_icmp(void *data, void *data_end, __u64 off, - struct packet_description *pckt) +static __noinline int parse_icmp(void *data, void *data_end, __u64 off, + struct packet_description *pckt) { struct icmphdr *icmp_hdr; struct iphdr *iph; @@ -280,8 +277,8 @@ static int parse_icmp(void *data, void *data_end, __u64 off, return TC_ACT_UNSPEC; } -static bool parse_udp(void *data, __u64 off, void *data_end, - struct packet_description *pckt) +static __noinline bool parse_udp(void *data, __u64 off, void 
*data_end, + struct packet_description *pckt) { struct udphdr *udp; udp = data + off; @@ -299,8 +296,8 @@ static bool parse_udp(void *data, __u64 off, void *data_end, return true; } -static bool parse_tcp(void *data, __u64 off, void *data_end, - struct packet_description *pckt) +static __noinline bool parse_tcp(void *data, __u64 off, void *data_end, + struct packet_description *pckt) { struct tcphdr *tcp; @@ -321,8 +318,8 @@ static bool parse_tcp(void *data, __u64 off, void *data_end, return true; } -static int process_packet(void *data, __u64 off, void *data_end, - bool is_ipv6, struct __sk_buff *skb) +static __noinline int process_packet(void *data, __u64 off, void *data_end, + bool is_ipv6, struct __sk_buff *skb) { void *pkt_start = (void *)(long)skb->data; struct packet_description pckt = {}; diff --git a/tools/testing/selftests/bpf/progs/test_subprogs.c b/tools/testing/selftests/bpf/progs/test_subprogs.c new file mode 100644 index 000000000000..d3c5673c0218 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_subprogs.c @@ -0,0 +1,103 @@ +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_core_read.h> + +const char LICENSE[] SEC("license") = "GPL"; + +__noinline int sub1(int x) +{ + return x + 1; +} + +static __noinline int sub5(int v); + +__noinline int sub2(int y) +{ + return sub5(y + 2); +} + +static __noinline int sub3(int z) +{ + return z + 3 + sub1(4); +} + +static __noinline int sub4(int w) +{ + return w + sub3(5) + sub1(6); +} + +/* sub5() is an identity function, just to test weirder function layouts and + * call patterns + */ +static __noinline int sub5(int v) +{ + return sub1(v) - 1; /* compensates sub1()'s + 1 */ +} + +/* unfortunately verifier rejects `struct task_struct *t` as an unknown pointer + * type, so we need to accept pointer as integer and then cast it inside the + * function + */ +__noinline int get_task_tgid(uintptr_t t) +{ + /* this ensures that CO-RE relocs work in multi-subprogs .text */ + return
BPF_CORE_READ((struct task_struct *)(void *)t, tgid); +} + +int res1 = 0; +int res2 = 0; +int res3 = 0; +int res4 = 0; + +SEC("raw_tp/sys_enter") +int prog1(void *ctx) +{ + /* perform some CO-RE relocations to ensure they work with multi-prog + * sections correctly + */ + struct task_struct *t = (void *)bpf_get_current_task(); + + if (!BPF_CORE_READ(t, pid) || !get_task_tgid((uintptr_t)t)) + return 1; + + res1 = sub1(1) + sub3(2); /* (1 + 1) + (2 + 3 + (4 + 1)) = 12 */ + return 0; +} + +SEC("raw_tp/sys_exit") +int prog2(void *ctx) +{ + struct task_struct *t = (void *)bpf_get_current_task(); + + if (!BPF_CORE_READ(t, pid) || !get_task_tgid((uintptr_t)t)) + return 1; + + res2 = sub2(3) + sub3(4); /* (3 + 2) + (4 + 3 + (4 + 1)) = 17 */ + return 0; +} + +/* prog3 has the same section name as prog1 */ +SEC("raw_tp/sys_enter") +int prog3(void *ctx) +{ + struct task_struct *t = (void *)bpf_get_current_task(); + + if (!BPF_CORE_READ(t, pid) || !get_task_tgid((uintptr_t)t)) + return 1; + + res3 = sub3(5) + 6; /* (5 + 3 + (4 + 1)) + 6 = 19 */ + return 0; +} + +/* prog4 has the same section name as prog2 */ +SEC("raw_tp/sys_exit") +int prog4(void *ctx) +{ + struct task_struct *t = (void *)bpf_get_current_task(); + + if (!BPF_CORE_READ(t, pid) || !get_task_tgid((uintptr_t)t)) + return 1; + + res4 = sub4(7) + sub1(8); /* (7 + (5 + 3 + (4 + 1)) + (6 + 1)) + (8 + 1) = 36 */ + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_sysctl_loop1.c b/tools/testing/selftests/bpf/progs/test_sysctl_loop1.c index 458b0d69133e..553a282d816a 100644 --- a/tools/testing/selftests/bpf/progs/test_sysctl_loop1.c +++ b/tools/testing/selftests/bpf/progs/test_sysctl_loop1.c @@ -18,11 +18,11 @@ #define MAX_ULONG_STR_LEN 7 #define MAX_VALUE_STR_LEN (TCP_MEM_LOOPS * MAX_ULONG_STR_LEN) +const char tcp_mem_name[] = "net/ipv4/tcp_mem/very_very_very_very_long_pointless_string"; static __always_inline int is_tcp_mem(struct bpf_sysctl *ctx) { - volatile char tcp_mem_name[] = 
"net/ipv4/tcp_mem/very_very_very_very_long_pointless_string"; unsigned char i; - char name[64]; + char name[sizeof(tcp_mem_name)]; int ret; memset(name, 0, sizeof(name)); diff --git a/tools/testing/selftests/bpf/progs/test_sysctl_loop2.c b/tools/testing/selftests/bpf/progs/test_sysctl_loop2.c index b2e6f9b0894d..2b64bc563a12 100644 --- a/tools/testing/selftests/bpf/progs/test_sysctl_loop2.c +++ b/tools/testing/selftests/bpf/progs/test_sysctl_loop2.c @@ -18,11 +18,11 @@ #define MAX_ULONG_STR_LEN 7 #define MAX_VALUE_STR_LEN (TCP_MEM_LOOPS * MAX_ULONG_STR_LEN) +const char tcp_mem_name[] = "net/ipv4/tcp_mem/very_very_very_very_long_pointless_string_to_stress_byte_loop"; static __attribute__((noinline)) int is_tcp_mem(struct bpf_sysctl *ctx) { - volatile char tcp_mem_name[] = "net/ipv4/tcp_mem/very_very_very_very_long_pointless_string_to_stress_byte_loop"; unsigned char i; - char name[64]; + char name[sizeof(tcp_mem_name)]; int ret; memset(name, 0, sizeof(name)); diff --git a/tools/testing/selftests/bpf/progs/test_sysctl_prog.c b/tools/testing/selftests/bpf/progs/test_sysctl_prog.c index 50525235380e..5489823c83fc 100644 --- a/tools/testing/selftests/bpf/progs/test_sysctl_prog.c +++ b/tools/testing/selftests/bpf/progs/test_sysctl_prog.c @@ -19,11 +19,11 @@ #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) #endif +const char tcp_mem_name[] = "net/ipv4/tcp_mem"; static __always_inline int is_tcp_mem(struct bpf_sysctl *ctx) { - char tcp_mem_name[] = "net/ipv4/tcp_mem"; unsigned char i; - char name[64]; + char name[sizeof(tcp_mem_name)]; int ret; memset(name, 0, sizeof(name)); diff --git a/tools/testing/selftests/bpf/progs/test_xdp_noinline.c b/tools/testing/selftests/bpf/progs/test_xdp_noinline.c index 8beecec166d9..3a67921f62b5 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_noinline.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_noinline.c @@ -16,7 +16,7 @@ #include <bpf/bpf_helpers.h> #include <bpf/bpf_endian.h> -static __u32 rol32(__u32 word, unsigned int 
shift) +static __always_inline __u32 rol32(__u32 word, unsigned int shift) { return (word << shift) | (word >> ((-shift) & 31)); } @@ -49,7 +49,7 @@ static __u32 rol32(__u32 word, unsigned int shift) typedef unsigned int u32; -static __attribute__ ((noinline)) +static __noinline u32 jhash(const void *key, u32 length, u32 initval) { u32 a, b, c; @@ -86,7 +86,7 @@ u32 jhash(const void *key, u32 length, u32 initval) return c; } -__attribute__ ((noinline)) +__noinline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval) { a += initval; @@ -96,7 +96,7 @@ u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval) return c; } -__attribute__ ((noinline)) +__noinline u32 jhash_2words(u32 a, u32 b, u32 initval) { return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2)); @@ -213,7 +213,7 @@ struct eth_hdr { unsigned short eth_proto; }; -static inline __u64 calc_offset(bool is_ipv6, bool is_icmp) +static __noinline __u64 calc_offset(bool is_ipv6, bool is_icmp) { __u64 off = sizeof(struct eth_hdr); if (is_ipv6) { @@ -797,8 +797,8 @@ out: return XDP_DROP; } -__attribute__ ((section("xdp-test"), used)) -int balancer_ingress(struct xdp_md *ctx) +SEC("xdp-test-v4") +int balancer_ingress_v4(struct xdp_md *ctx) { void *data = (void *)(long)ctx->data; void *data_end = (void *)(long)ctx->data_end; @@ -812,11 +812,27 @@ int balancer_ingress(struct xdp_md *ctx) eth_proto = bpf_ntohs(eth->eth_proto); if (eth_proto == ETH_P_IP) return process_packet(data, nh_off, data_end, 0, ctx); - else if (eth_proto == ETH_P_IPV6) + else + return XDP_DROP; +} + +SEC("xdp-test-v6") +int balancer_ingress_v6(struct xdp_md *ctx) +{ + void *data = (void *)(long)ctx->data; + void *data_end = (void *)(long)ctx->data_end; + struct eth_hdr *eth = data; + __u32 eth_proto; + __u32 nh_off; + + nh_off = sizeof(struct eth_hdr); + if (data + nh_off > data_end) + return XDP_DROP; + eth_proto = bpf_ntohs(eth->eth_proto); + if (eth_proto == ETH_P_IPV6) return process_packet(data, nh_off, data_end, 1, ctx); else 
return XDP_DROP; } -char _license[] __attribute__ ((section("license"), used)) = "GPL"; -int _version __attribute__ ((section("version"), used)) = 1; +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_bpftool_build.sh b/tools/testing/selftests/bpf/test_bpftool_build.sh index ac349a5cea7e..2db3c60e1e61 100755 --- a/tools/testing/selftests/bpf/test_bpftool_build.sh +++ b/tools/testing/selftests/bpf/test_bpftool_build.sh @@ -85,6 +85,23 @@ make_with_tmpdir() { echo } +make_doc_and_clean() { + echo -e "\$PWD: $PWD" + echo -e "command: make -s $* doc >/dev/null" + RST2MAN_OPTS="--exit-status=1" make $J -s $* doc + if [ $? -ne 0 ] ; then + ERROR=1 + printf "FAILURE: Errors or warnings when building documentation\n" + fi + ( + if [ $# -ge 1 ] ; then + cd ${@: -1} + fi + make -s doc-clean + ) + echo +} + echo "Trying to build bpftool" echo -e "... through kbuild\n" @@ -145,3 +162,7 @@ make_and_clean make_with_tmpdir OUTPUT make_with_tmpdir O + +echo -e "Checking documentation build\n" +# From tools/bpf/bpftool +make_doc_and_clean diff --git a/tools/testing/selftests/bpf/test_bpftool_metadata.sh b/tools/testing/selftests/bpf/test_bpftool_metadata.sh new file mode 100755 index 000000000000..1bf81b49457a --- /dev/null +++ b/tools/testing/selftests/bpf/test_bpftool_metadata.sh @@ -0,0 +1,82 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +TESTNAME=bpftool_metadata +BPF_FS=$(awk '$3 == "bpf" {print $2; exit}' /proc/mounts) +BPF_DIR=$BPF_FS/test_$TESTNAME + +_cleanup() +{ + set +e + rm -rf $BPF_DIR 2> /dev/null +} + +cleanup_skip() +{ + echo "selftests: $TESTNAME [SKIP]" + _cleanup + + exit $ksft_skip +} + +cleanup() +{ + if [ "$?" 
= 0 ]; then + echo "selftests: $TESTNAME [PASS]" + else + echo "selftests: $TESTNAME [FAILED]" + fi + _cleanup +} + +if [ $(id -u) -ne 0 ]; then + echo "selftests: $TESTNAME [SKIP] Need root privileges" + exit $ksft_skip +fi + +if [ -z "$BPF_FS" ]; then + echo "selftests: $TESTNAME [SKIP] Could not run test without bpffs mounted" + exit $ksft_skip +fi + +if ! bpftool version > /dev/null 2>&1; then + echo "selftests: $TESTNAME [SKIP] Could not run test without bpftool" + exit $ksft_skip +fi + +set -e + +trap cleanup_skip EXIT + +mkdir $BPF_DIR + +trap cleanup EXIT + +bpftool prog load metadata_unused.o $BPF_DIR/unused + +METADATA_PLAIN="$(bpftool prog)" +echo "$METADATA_PLAIN" | grep 'a = "foo"' > /dev/null +echo "$METADATA_PLAIN" | grep 'b = 1' > /dev/null + +bpftool prog --json | grep '"metadata":{"a":"foo","b":1}' > /dev/null + +bpftool map | grep 'metadata.rodata' > /dev/null + +rm $BPF_DIR/unused + +bpftool prog load metadata_used.o $BPF_DIR/used + +METADATA_PLAIN="$(bpftool prog)" +echo "$METADATA_PLAIN" | grep 'a = "bar"' > /dev/null +echo "$METADATA_PLAIN" | grep 'b = 2' > /dev/null + +bpftool prog --json | grep '"metadata":{"a":"bar","b":2}' > /dev/null + +bpftool map | grep 'metadata.rodata' > /dev/null + +rm $BPF_DIR/used + +exit 0 diff --git a/tools/testing/selftests/bpf/test_socket_cookie.c b/tools/testing/selftests/bpf/test_socket_cookie.c index 154a8fd2a48d..ca7ca87e91aa 100644 --- a/tools/testing/selftests/bpf/test_socket_cookie.c +++ b/tools/testing/selftests/bpf/test_socket_cookie.c @@ -151,7 +151,7 @@ static int run_test(int cgfd) } bpf_object__for_each_program(prog, pobj) { - prog_name = bpf_program__title(prog, /*needs_copy*/ false); + prog_name = bpf_program__section_name(prog); if (libbpf_attach_type_by_name(prog_name, &attach_type)) goto err; diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index 94258c6b5235..c4f5d909e58a 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ 
b/tools/testing/selftests/bpf/verifier/calls.c @@ -647,13 +647,14 @@ .result = REJECT, }, { - "calls: ld_abs with changing ctx data in callee", + "calls: subprog call with ld_abs in main prog", .insns = { BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), BPF_LD_ABS(BPF_B, 0), BPF_LD_ABS(BPF_H, 0), BPF_LD_ABS(BPF_W, 0), BPF_MOV64_REG(BPF_REG_7, BPF_REG_6), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 5), BPF_MOV64_REG(BPF_REG_6, BPF_REG_7), BPF_LD_ABS(BPF_B, 0), @@ -666,8 +667,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, - .errstr = "BPF_LD_[ABS|IND] instructions cannot be mixed", - .result = REJECT, + .result = ACCEPT, }, { "calls: two calls with bad fallthrough", diff --git a/tools/testing/selftests/bpf/verifier/map_ptr.c b/tools/testing/selftests/bpf/verifier/map_ptr.c index b52209db8250..637f9293bda8 100644 --- a/tools/testing/selftests/bpf/verifier/map_ptr.c +++ b/tools/testing/selftests/bpf/verifier/map_ptr.c @@ -60,3 +60,35 @@ .result = ACCEPT, .retval = 1, }, +{ + "bpf_map_ptr: r = 0, map_ptr = map_ptr + r", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_16b = { 4 }, + .result = ACCEPT, +}, +{ + "bpf_map_ptr: r = 0, r = r + map_ptr", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_MOV64_IMM(BPF_REG_1, 0), + BPF_LD_MAP_FD(BPF_REG_0, 0), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_16b = { 4 }, + .result = ACCEPT, +}, |